In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\SEM 6\\Mini Project\\Autism\\Datasets\\ASD_Toddlers\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.19, random_state=42)

# Define AdaBoost classifier with a decision tree base estimator
ada_boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2, min_samples_split=2),
                               n_estimators=100, learning_rate=0.5, algorithm='SAMME.R', random_state=42)

# Preprocessing pipeline with Normalizer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and AdaBoost classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ada_boost)
])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__learning_rate': [0.1, 0.5, 1.0]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Test the model on the separate testing set
y_pred_test = grid_search.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy:", test_accuracy_test)




Best Parameters: {'classifier__learning_rate': 1.0, 'classifier__n_estimators': 150}
Testing Set Accuracy: 0.9900497512437811


In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\SEM 6\\Mini Project\\Autism\\Datasets\\ASD_Toddlers\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.23, random_state=42)

# Define Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42)

# Preprocessing pipeline with Normalizer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and Random Forest classifier
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', random_forest)
])

# Define parameter grid for hyperparameter tuning
param_grid_rf = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10]
}

# Use GridSearchCV for hyperparameter tuning
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters (Random Forest):", grid_search_rf.best_params_)

# Test the model on the separate testing set
y_pred_test_rf = grid_search_rf.predict(X_test)
test_accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)
print("Testing Set Accuracy (Random Forest):", test_accuracy_test_rf)


 nan nan nan nan nan nan nan nan nan]


Best Parameters (Random Forest): {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Testing Set Accuracy (Random Forest): 0.9835390946502057


In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\SEM 6\\Mini Project\\Autism\\Datasets\\ASD_Toddlers\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.22, random_state=42)

# Define Decision Tree classifier
decision_tree = DecisionTreeClassifier(criterion='log_loss',splitter='random',max_depth=None, min_samples_split=2,random_state=35)

# Preprocessing pipeline with Normalizer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and Decision Tree classifier for training with cross-validation
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', decision_tree)
])

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=69)

# Perform cross-validation on training set
cv_scores = cross_val_score(pipeline_cv, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

# Train the model on the entire training set
pipeline_cv.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_cv = pipeline_cv.predict(X_test)
test_accuracy_cv = accuracy_score(y_test, y_pred_cv)
print("Testing Set Accuracy with cross-validation:", test_accuracy_cv)

# Create pipeline with preprocessing and Decision Tree classifier for testing without cross-validation
pipeline_test = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', decision_tree)
])

# Train the model on the entire training set without cross-validation
pipeline_test.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_test = pipeline_test.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy without cross-validation:", test_accuracy_test)




Cross-validation scores: [0.92771084 0.95180723 0.85365854 0.95121951 0.90243902 0.95121951
 0.86585366 0.92682927 0.95121951 0.93902439]
Mean CV accuracy: 0.9220981486923303
Testing Set Accuracy with cross-validation: 0.9741379310344828
Testing Set Accuracy without cross-validation: 0.9741379310344828


In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("D:\\SEM 6\\Mini Project\\Autism\\Datasets\\ASD_Toddlers\\Toddler Autism dataset July 2018.csv")
df.drop(df.columns[0], axis=1, inplace=True)

# Separate features and labels
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

# Select categorical features
category_features = features.iloc[:, [12, 13, 14, 15, 16]]

# Drop categorical columns
features.drop(features.columns[-6:], axis=1, inplace=True)

# Fill missing values with mean
features = features.fillna(features.mean())

# One Hot Encoding without changing column names
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
category_transformed = enc.fit_transform(category_features)
category_encoded_columns = enc.get_feature_names_out(category_features.columns)
transformed_df = pd.DataFrame(category_transformed, columns=category_encoded_columns)

# Concatenate encoded features with numerical features
result_df = pd.concat([features, transformed_df], axis=1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(result_df, labels, test_size=0.09, random_state=42)

# Define KNN classifier
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=11, weights='distance', algorithm='auto', p=2, n_jobs=3)

# Preprocessing pipeline with Normalizer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('normalizer', Normalizer())
        ]), ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'Age_Mons']),
        ('cat', Pipeline([
            ('onehot', OneHotEncoder())
        ]), transformed_df.columns)
    ],
    remainder='passthrough'  # Pass through any other columns without transformation
)

# Create pipeline with preprocessing and KNN classifier for training with cross-validation
pipeline_cv = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', knn)
])

# Define 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=69)

# Perform cross-validation on training set
cv_scores = cross_val_score(pipeline_cv, X_train, y_train, cv=cv, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))

# Train the model on the entire training set
pipeline_cv.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_cv = pipeline_cv.predict(X_test)
test_accuracy_cv = accuracy_score(y_test, y_pred_cv)
print("Testing Set Accuracy with cross-validation:", test_accuracy_cv)

# Create pipeline with preprocessing and KNN classifier for testing without cross-validation
pipeline_test = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', knn)
])

# Train the model on the entire training set without cross-validation
pipeline_test.fit(X_train, y_train)

# Test the model on the separate testing set
y_pred_test = pipeline_test.predict(X_test)
test_accuracy_test = accuracy_score(y_test, y_pred_test)
print("Testing Set Accuracy without cross-validation:", test_accuracy_test)




Cross-validation scores: [0.92708333 0.89583333 0.90625    0.91666667 0.86458333 0.88541667
 0.875      0.9375     0.89583333 0.85263158]
Mean CV accuracy: 0.8956798245614035
Testing Set Accuracy with cross-validation: 0.9894736842105263
Testing Set Accuracy without cross-validation: 0.9894736842105263
