In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
import matplotlib.pyplot as plot
# we can use the LabelEncoder to encode the gender feature
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# importing two different imputation methods that take into consideration all the features when predicting the missing values
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.dummy import DummyClassifier

# oversample the minority class using SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter
from itertools import combinations


# Seed NumPy's global random state so a fresh run draws the same numbers each time.
np.random.seed(42)

In [None]:
# Load the dataset from the local data directory.
df = pd.read_csv('./data/ToddlerAutismdatasetJuly2018.csv')
# Dimensionality of the dataframe as (rows, columns).
print(f"dataframe shape:\n{df.shape}\n")
# Names of the columns that can be used as features when training the model.
print(f"dataframe columns:\n{df.columns}\n")
# Data types found across the dataset. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly instead of being
# interpolated into an f-string (which printed a stray "None").
print("dataframe info:")
df.info()
print()

In [None]:
# Validate that there are no rows with missing values in the data.
# `df` was already loaded (and its shape, columns and info printed) by the
# previous cell, so it is not read from disk a second time here.
missing_rows = df[df.isnull().any(axis=1)]
# len() counts rows; DataFrame.size would report rows * columns instead.
print(f"rows with missing values: {len(missing_rows)}")

# Bar chart of the class balance: one bar per label, with the bar height equal
# to the number of rows in df carrying that label.
x_axis = ['No', 'Yes']
y_axis = [len(df.loc[df.Class == label]) for label in x_axis]
plot.bar(x_axis, y_axis)
plot.show()

# Median age (Age_Mons) of the rows labelled as positive.
# Elsewhere in this notebook the Class column is compared against the strings
# 'No' / 'Yes', so the positive class must be selected with 'Yes'; comparing
# against the integer 1 selects no rows and the median comes out as NaN.
print(df.loc[df.Class == 'Yes', 'Age_Mons'].median())

# Identify features that show a notable correlation with each other.
# The label and the case identifier are dropped, and any non-numeric columns
# are excluded up front: DataFrame.corr() cannot correlate strings, and
# pandas >= 2.0 raises on them instead of silently skipping them.
df_corr = df.drop(columns=['Class', 'Case_No']).select_dtypes(include='number')
corr = df_corr.corr()
for col in corr.columns:
    print(f"correlation with column: {col}")
    print(corr[col].sort_values(ascending=False))
    print()

# Hold out 30% of the rows as the test set, stratified on the class label.
# Only one split is ever used downstream, so exactly one is generated (the
# previous n_splits=10 loop threw away the first nine). The splitter is seeded
# so re-running this cell reproduces the same partition.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(split.split(df, df['Class']))

# split() yields positional row numbers, so rows are selected with iloc;
# loc would only be equivalent while df keeps its default 0..n-1 index.
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]


# print the dimensionality of the training dataset (0.5)
print(train_set.shape)
# print the dimensionality of the test dataset (0.5)
print(test_set.shape)

# print the proportional distribution of the classes to check whether the
# classes are (close to) equally distributed between the train and test
# datasets (1 + 1)
print(train_set.Class.value_counts()/len(train_set))
print(test_set.Class.value_counts()/len(test_set))


##### Model Development

In [None]:
# Separate the features and the labels to be used in model development (2).
# Case_No and 'Who completed the test' are dropped as irrelevant;
# Qchat-10-Score is dropped as well (presumably because it is derived from the
# question answers and would leak the label -- TODO confirm).
data = train_set.drop(columns=['Class', 'Case_No','Who completed the test', 'Qchat-10-Score'])
labels = train_set['Class'].to_numpy(copy=True)


# print the dimensionality of the dataset and the labels (0.5 + 0.5)
print(data.shape)
print(labels.shape)


# Numeric features are standardised with StandardScaler.
numerical_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

# Categorical columns are one-hot encoded; every remaining column is treated
# as numeric. The numeric names are derived from the categorical list so the
# two can never drift apart.
cat_feature_names = ['Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD']
num_feature_names = data.drop(columns=cat_feature_names).columns

full_pipeline = ColumnTransformer(
    [
        ("num", numerical_pipeline, num_feature_names),
        # handle_unknown='ignore': a category that only occurs in the held-out
        # data is encoded as all zeros instead of making transform() raise.
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_feature_names),
    ],
    # Always return a dense array: later cells delete columns from it with
    # np.delete and index it row by row, which a sparse matrix does not support.
    sparse_threshold=0,
)

input_x = full_pipeline.fit_transform(data)


# Column positions of the ten question features inside input_x.
# NOTE(review): assumes the "num" transformer output comes first and that its
# first ten columns are the questions -- confirm against num_feature_names.
questions = [0,1,2,3,4,5,6,7,8,9]

# Every way of leaving out three of the ten questions.
question_index_sets = list(combinations(questions, 3))
# Kept as comma-joined strings because the test-preparation cell parses them.
question_combinations = [",".join(map(str, comb)) for comb in question_index_sets]

# np.delete needs a dense array; densify defensively in case the pipeline
# returned a sparse matrix.
input_x_dense = input_x.toarray() if hasattr(input_x, "toarray") else input_x

# One training matrix per combination with those three question columns
# removed. Deleting whole columns at once replaces the per-row Python loop.
training_data_list = [
    np.delete(input_x_dense, list(comb), axis=1) for comb in question_index_sets
]

# Summarise instead of dumping whole matrices into the cell output.
print(f"feature subsets built: {len(training_data_list)}")
print(f"columns before removal: {input_x_dense.shape[1]}")
print(f"shape of each subset: {training_data_list[0].shape}")
print(training_data_list[0][:3])



##### Creating different models to train the data set on

In [None]:
# One instance of each candidate classifier, all with library-default
# hyper-parameters.
model_logreg = LogisticRegression()
model_nb = GaussianNB()
# The two tree-based learners are given a fixed random_state.
model_rforest = RandomForestClassifier(random_state=42)
model_dtree = DecisionTreeClassifier(random_state=42)
model_svc = SVC()

In [None]:

# Metrics recorded for every parameter candidate; the model refitted on the
# whole training data is the one with the best 'bal_accuracy'.
scoring = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}

# Hyper-parameter grids, one per model family.
svm_param_grid = {'kernel':('linear', 'rbf'), 'C':[1, 10], 'gamma':[0.001, 0.0001]}
dtc_param_grid = {'max_depth':(3, 4), 'min_samples_split':[5, 10], 'min_samples_leaf':[10, 20]}
nby_param_grid = {}  # nothing to tune: a single candidate with default settings
rdf_param_grid = {'n_estimators':[100, 200], 'max_depth':[3, 5], 'bootstrap':(True, False)}


def _new_grid_search(estimator, param_grid):
    """Return an unfitted GridSearchCV that uses this notebook's scoring setup."""
    return GridSearchCV(estimator, param_grid, scoring=scoring, n_jobs=-1, refit='bal_accuracy')


# Fitted searches, index-aligned with training_data_list.
gridSearchCV_dtc_list = []
gridSearchCV_rdf_list = []
gridSearchCV_svm_list = []
gridSearchCV_nby_list = []

# Only the first five feature subsets are searched; raise this to cover more.
n_fitted_subsets = min(5, len(training_data_list))

# fit the training data (0.5)
for i in range(n_fitted_subsets):
    # A fresh search per subset so every list entry keeps its own results.
    gridSearchCV_svm = _new_grid_search(model_svc, svm_param_grid)
    gridSearchCV_dtc = _new_grid_search(model_dtree, dtc_param_grid)
    gridSearchCV_nby = _new_grid_search(model_nb, nby_param_grid)
    gridSearchCV_rdf = _new_grid_search(model_rforest, rdf_param_grid)

    gridSearchCV_dtc.fit(training_data_list[i], labels)
    gridSearchCV_rdf.fit(training_data_list[i], labels)
    gridSearchCV_svm.fit(training_data_list[i], labels)
    gridSearchCV_nby.fit(training_data_list[i], labels)

    gridSearchCV_dtc_list.append(gridSearchCV_dtc)
    gridSearchCV_rdf_list.append(gridSearchCV_rdf)
    gridSearchCV_svm_list.append(gridSearchCV_svm)
    gridSearchCV_nby_list.append(gridSearchCV_nby)

# The names gridSearchCV_dtc / _rdf / _svm / _nby now refer to the searches
# fitted on the LAST subset of the loop; the figures below describe that
# subset only.
last_fitted_searches = (gridSearchCV_dtc, gridSearchCV_rdf, gridSearchCV_svm, gridSearchCV_nby)

# print the best parameters (0.5)
for search in last_fitted_searches:
    print(search.best_params_)

# print the best estimator (0.5)
for search in last_fitted_searches:
    print(search.best_estimator_)

# print the best score from trained GridSearchCV model (0.5)
for search in last_fitted_searches:
    print(search.best_score_)

In [None]:
def _report_best_models(search_list, family_header, metric_headers):
    """Print per-fold maxima for every fitted search and locate the best one.

    For each fitted GridSearchCV in ``search_list`` and each of the scoring
    keys 'accuracy', 'bal_accuracy' and 'F1_macro', prints the highest test
    score reached by any parameter candidate in each cross-validation fold.

    Args:
        search_list: fitted GridSearchCV objects, one per feature subset.
        family_header: line printed right after 'For Model <i>'.
        metric_headers: three lines printed before the accuracy, balanced
            accuracy and F1-macro figures, in that order.

    Returns:
        A list ``[accuracy_idx, balanced_accuracy_idx, f1_idx]`` giving, per
        metric, the position in ``search_list`` of the search that holds the
        highest per-fold score. The first search wins ties; the index stays 0
        when no score exceeds 0.
    """
    metric_keys = ("accuracy", "bal_accuracy", "F1_macro")
    # The trackers live outside the per-model loop so the comparison runs
    # across all models. Resetting them per model always reported the last one.
    best_index = {key: 0 for key in metric_keys}
    best_score = {key: 0 for key in metric_keys}

    for i, search in enumerate(search_list):
        print('For Model', i)
        print(family_header)
        for key, header in zip(metric_keys, metric_headers):
            print(header)
            # n_splits_ is the fold count the search actually used.
            for n in range(search.n_splits_):
                fold_best = max(search.cv_results_[f"split{n}_test_{key}"])
                if fold_best > best_score[key]:
                    best_score[key] = fold_best
                    best_index[key] = i
                print(fold_best)

    return [best_index[key] for key in metric_keys]


# print the grid search cross-validation results listing the above mentioned evaluation methods (3)
# best_value_models holds three indices per family (accuracy, balanced
# accuracy, F1-macro), in the order svm, dtc, rdf, nby.
best_value_models = []

best_svm_accuracy, best_svm_balanced, best_svm_f1 = _report_best_models(
    gridSearchCV_svm_list,
    "svm cross-validation",
    ("max acuracy:", "max balanced accuracy for svm:", "max f1_macro svm per "),
)
best_value_models.extend([best_svm_accuracy, best_svm_balanced, best_svm_f1])

best_dtc_accuracy, best_dtc_balanced, best_dtc_f1 = _report_best_models(
    gridSearchCV_dtc_list,
    "dtc cross-validation:",
    ("max acuracy dtc", "max balanced accuracy:", "max f1_macro:"),
)
best_value_models.extend([best_dtc_accuracy, best_dtc_balanced, best_dtc_f1])

best_rdf_accuracy, best_rdf_balanced, best_rdf_f1 = _report_best_models(
    gridSearchCV_rdf_list,
    "rdf cross-validation:",
    ("max acuracy:", "max balanced accuracy", "\nmax f1_macro"),
)
best_value_models.extend([best_rdf_accuracy, best_rdf_balanced, best_rdf_f1])

best_nby_accuracy, best_nby_balanced, best_nby_f1 = _report_best_models(
    gridSearchCV_nby_list,
    "nby cross-validation results:",
    ("max acuracy", "max balanced accuracy:", "max f1_macro"),
)
best_value_models.extend([best_nby_accuracy, best_nby_balanced, best_nby_f1])

print(best_value_models)




In [None]:
# use a dummy classifier to identify a simple baseline (i.e., a majority class baseline) so that you can compare your prediction results (3)
# strategy='most_frequent' always predicts the majority label and never looks
# at the features, so one fit gives the baseline for every feature subset;
# refitting per subset only repeated the same number.
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(training_data_list[0], labels)
# The score was previously computed and thrown away; report it.
baseline_accuracy = dummy_clf.score(training_data_list[0], labels)
print(f"majority-class baseline accuracy (training data): {baseline_accuracy}")



In [None]:
# prepare the test data to be predicted (2)
# Same columns are dropped as for the training features.
test_data = test_set.drop(columns=['Class', 'Case_No','Who completed the test', 'Qchat-10-Score'])
test_labels = test_set['Class'].to_numpy(copy=True)


# print the dimensionality of the dataset and the labels (0.5 + 0.5)
print(test_data.shape)
print(test_labels.shape)

# transform test data for prediction (2)
# transform(), not fit_transform(): the scaler and encoder keep the
# parameters learned from the training data.
input_x_test = full_pipeline.transform(test_data)

# np.delete needs a dense array; densify defensively in case the pipeline
# returned a sparse matrix.
input_x_test_dense = input_x_test.toarray() if hasattr(input_x_test, "toarray") else input_x_test

# One test matrix per question combination, index-aligned with
# question_combinations. Each entry of question_combinations is a
# comma-joined string of the column positions to remove; whole columns are
# deleted at once instead of looping over rows.
test_data_list = [
    np.delete(input_x_test_dense, [int(q) for q in c.split(',')], axis=1)
    for c in question_combinations
]

# Feature subset to evaluate. The model and the test matrix MUST use the same
# index: each search in gridSearchCV_*_list[k] was fitted on
# training_data_list[k], i.e. with the k-th question combination removed.
# Previously the models left over from the last training iteration were used
# to predict on test_data_list[0]; the column counts match, so nothing raised,
# but the columns did not line up with what the models were trained on.
eval_index = 0
eval_features = test_data_list[eval_index]

# obtain predictions on test data using the best model from GridSearchCV (i.e., .best_estimator_) (2)
test_predictions_svm = gridSearchCV_svm_list[eval_index].best_estimator_.predict(eval_features)
test_predictions_rdf = gridSearchCV_rdf_list[eval_index].best_estimator_.predict(eval_features)
test_predictions_nby = gridSearchCV_nby_list[eval_index].best_estimator_.predict(eval_features)
test_predictions_dtc = gridSearchCV_dtc_list[eval_index].best_estimator_.predict(eval_features)

# generate the classification report and the confusion matrix for test predictions (3)
cr_test_svm = classification_report(test_labels, test_predictions_svm)
cm_test_svm = confusion_matrix(test_labels, test_predictions_svm)

cr_test_rdf = classification_report(test_labels, test_predictions_rdf)
cm_test_rdf = confusion_matrix(test_labels, test_predictions_rdf)

cr_test_nby = classification_report(test_labels, test_predictions_nby)
cm_test_nby = confusion_matrix(test_labels, test_predictions_nby)

cr_test_dtc = classification_report(test_labels, test_predictions_dtc)
cm_test_dtc = confusion_matrix(test_labels, test_predictions_dtc)

# Print each model's report followed by its confusion matrix.
for model_name, report, matrix in (
    ("SVM", cr_test_svm, cm_test_svm),
    ('rdf', cr_test_rdf, cm_test_rdf),
    ('nby', cr_test_nby, cm_test_nby),
    ('dtc', cr_test_dtc, cm_test_dtc),
):
    print(model_name)
    print(report)
    print(matrix)