# Tester for LR-5

**Please run this code with Python 3, and install all packages that the classifier depends on.**

In [91]:
# Credit for the author of the classifier code used by this tester notebook.
__authoroftheclassifier__ = 'Sebastien Levy'

from processing import ADOS_Data
from cross_validation_self import CVP_Set
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from classifiers import RegClassifier, BinClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix, precision_recall_curve, auc

import matplotlib.pyplot as plt
import pandas as pd

In [92]:
def test_func(pred_score, predlabels):
    """Print a confusion matrix and summary metrics for binary predictions.

    Both inputs are flipped (``1 - value``) before being handed to
    ``confusion_matrix`` so that the original positive class (label 1) ends
    up in row/column 0 of the printed matrix.

    Args:
        pred_score: Iterable of predicted values. Anything strictly greater
            than 0.5 is counted as a positive prediction, so hard class
            labels above 0 are all treated as positive.
        predlabels: Iterable of ground-truth labels, expected to be 0 or 1.

    Returns:
        None. All results are written to stdout.
    """
    # Build the two flipped label vectors once; they feed both the matrix
    # and the classification report.
    y_true = [1 - x for x in predlabels]
    y_pred = [1 - int(x > 0.5) for x in pred_score]

    def _ratio(numerator, denominator):
        # An empty denominator means the metric is undefined for this data.
        return float(numerator) / denominator if denominator else float('nan')

    print('Confusion matrix:')
    # Pin the label order so the matrix is always 2x2, even when one of the
    # classes never appears in the data; otherwise the indexing below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(cm)

    # Row = true class, column = predicted class; index 0 is the positive
    # class because of the flip above.
    tp, fn = cm[0][0], cm[0][1]
    fp, tn = cm[1][0], cm[1][1]
    print('Precision: {}'.format(_ratio(tp, tp + fp)))
    print('Recall/Sensitivity: {}'.format(_ratio(tp, tp + fn)))
    print('Specificity: {}'.format(_ratio(tn, tn + fp)))
    print('class report')
    print(classification_report(y_true, y_pred))

# M2 training

In [93]:
# Module identifier; it is also the directory/file-name component of ADOS_FILE.
MODULE = "m2"

# Columns kept by the feature-selection step (an empty list skips that step).
FEATURE_SET = [
    "A3",
    "A5",
    "B1",
    "B2",
    "B10",
]

# Numeric experiment settings.
N_FOLD = 10
PRED_RATIO = 0.2
SCALING_PARAM = 4
POLY_DEGREE = 2

# Options forwarded to the data set's full_preprocessing call.
# MISSING_VALUE_STRATEGY: Binary or Replacement
MISSING_VALUE_STRATEGY = "Binary"
# PROCESSING_STRATEGY: poly, linear, indicator, interaction_ind, pca_comp
PROCESSING_STRATEGY = "linear"
NORMALIZE = True

# Path of the CSV export for the selected module.
ADOS_FILE = "{0}/data/ados_{0}_allData.csv".format(MODULE)

# Names of the label, age and gender columns in that CSV.
label_id = "ASD"
label_age = "age_months"
label_gender = "male"

# Columns removed from the frame right after loading.
columns_to_delete = ["Subject.Id", "Diagnosis"]
sub_diagnosis_id = [
    "social_affect_calc",
    "restricted_repetitive_calc",
    "SA_RRI_total_calc",
    "severity_calc",
]

In [94]:
#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(clf, X, y, cv=5)

In [95]:
# We import the data (ADOS_Data is provided by the project's `processing` module)
data = ADOS_Data.read_csv(ADOS_FILE)
# Keep a handle on the sub-diagnosis columns before the next call, which
# (per the note below) removes them from `data`.
sub_diagnosis = data[sub_diagnosis_id]

# We drop the columns that are not interesting for us, and the row with no label
data.select_good_columns(columns_to_delete + sub_diagnosis_id)

print('gendering')
# Sum of the gender column over the rows whose label equals 1. The column
# names come from the shared constants rather than repeated string literals.
print((data[data[label_id] == 1][[label_gender]]).sum())

data.full_preprocessing(NORMALIZE, MISSING_VALUE_STRATEGY, PROCESSING_STRATEGY, [label_age], label_gender, label_id)
# An empty FEATURE_SET means "keep every column".
if FEATURE_SET:
    data.select_good_columns(FEATURE_SET, keep_the_column=True)
    

hello!
      A1   A2  A3  A4  A5  A6  A7  A8  B1  B2  ...  D1  D2  D3  D4  E1  E2  \
0      0  1.0   2   0   1   2   0   2   2   1  ...   2   2   0   3   0   1   
1      1  0.0   1   2   1   1   0   1   2   1  ...   0   0   0   1   1   0   
2      1  2.0   2   1   2   2   2   2   2   2  ...   2   0   0   2   1   2   
3      0  1.0   1   0   1   2   0   0   2   2  ...   2   0   0   1   0   1   
4      1  0.0   1   2   2   2   0   2   2   1  ...   2   2   0   1   1   0   
...   ..  ...  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..   
1384   0  0.0   2   2   2   2   0   1   2   1  ...   0   0   0   2   0   0   
1385   1  2.0   2   2   1   2   1   1   2   2  ...   2   2   0   2   0   0   
1386   0  1.0   2   2   2   2   1   1   2   1  ...   2   2   0   2   1   0   
1387   0  1.0   2   1   2   1   0   0   2   1  ...   2   2   0   2   1   0   
1388   0  1.0   1   1   0   1   0   0   2   1  ...   0   0   0   1   0   0   

      E3  age_months  male  ASD  
0      2        65.0  

  self.labels = self[label_id]


In [107]:
# Notebook display of the data set's `labels` attribute.
data.labels

0       2
1       2
2       2
3       2
4       2
       ..
1384    2
1385    2
1386    2
1387    2
1388    1
Name: ASD, Length: 1389, dtype: int64

In [97]:
# Hold out PRED_RATIO of the rows for testing (the constant replaces a
# duplicated 0.2 literal); the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data, data.labels, test_size=PRED_RATIO, random_state=13
)

In [98]:
from pandas import set_option
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [99]:
# Using different models to get an idea which model would be good for tuning
def GetSimpleModel():
    """Return the baseline classifiers as a list of ``(name, estimator)`` pairs.

    Each call builds fresh estimator instances. Only the arguments spelled
    out below differ from the estimators' defaults.
    """
    return [
        ('Logistic Regression', LogisticRegression(max_iter=5000)),
        ('KNeighbors', KNeighborsClassifier()),
        ('DecisionTree', DecisionTreeClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('Support Vector Machine', SVC(probability=True)),
        ('ADABoost', AdaBoostClassifier()),
        ('GradientBoosting', GradientBoostingClassifier()),
        ('RandomForest', RandomForestClassifier()),
    ]

In [100]:
# Function to get cross validation score
def BasedLine(x_train, y_train, models, num_folds=5, scoring='accuracy'):
    """Cross-validate each model and tabulate its mean score.

    Args:
        x_train: Training features passed straight to ``cross_val_score``.
        y_train: Training labels passed straight to ``cross_val_score``.
        models: Iterable of ``(name, estimator)`` pairs.
        num_folds: Number of stratified folds (default 5, the value that was
            previously hard-coded).
        scoring: Scoring name handed to ``cross_val_score`` (default
            ``'accuracy'``, the value that was previously hard-coded).

    Returns:
        A ``pandas.DataFrame`` with one row per model and two columns:
        ``Model`` (the name) and ``Score`` (the mean fold score formatted as
        a string with four decimals).
    """
    names = []
    cv_scores = []
    for name, model in models:
        kfold = StratifiedKFold(n_splits=num_folds)
        fold_scores = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
        names.append(name)
        # Scores are stored as formatted strings, matching the original output.
        cv_scores.append("{:.4f}".format(fold_scores.mean()))

    return pd.DataFrame({'Model': names, 'Score': cv_scores})

In [101]:
# Cross-validate every baseline model on the training split and show the table.
models = GetSimpleModel()
cv_Score = BasedLine(x_train, y_train, models)
cv_Score

Unnamed: 0,Model,Score
0,Logistic Regression,0.8038
1,KNeighbors,0.7678
2,DecisionTree,0.7741
3,Naive Bayes,0.7687
4,Support Vector Machine,0.7975
5,ADABoost,0.7993
6,GradientBoosting,0.784
7,RandomForest,0.7813


## LR training

In [102]:
# L2-penalised logistic regression with balanced class weights, fitted on
# the training split.
lr = LogisticRegression(
    penalty='l2',
    C=0.05,
    class_weight='balanced',
)
lr.fit(x_train, y_train)

LogisticRegression(C=0.05, class_weight='balanced')

# M2 testing

In [103]:
FILENAME = "primary_dataset.csv"
# NOTE: this rebinds the module-level `columns_to_delete` that was used when
# loading the training data.
columns_to_delete = ['child_id', 'scorer_id', 'video_file']
pred_feat = ADOS_Data.read_csv(FILENAME)
# Ground-truth labels are captured before any column/row filtering.
# NOTE(review): if select_good_columns really drops unlabeled rows (see the
# note below), `ytrue` could end up longer than `pred_feat` - confirm.
ytrue = pred_feat[label_id]
# We drop the columns that are not interesting for us, and the row with no label
pred_feat.select_good_columns(columns_to_delete)

pred_feat.full_preprocessing(NORMALIZE, MISSING_VALUE_STRATEGY, PROCESSING_STRATEGY, [label_age], label_gender, label_id)
# An empty FEATURE_SET means "keep every column".
if FEATURE_SET:
    pred_feat.select_good_columns(FEATURE_SET, keep_the_column=True)

hello!
         updatedAt  question_set  A3  A5  B10  question3  question4  \
0    6/26/17 16:43             1   8   8    8          8          2   
1    6/26/17 16:57             1   0   0    0          0          1   
2    6/26/17 17:29             1   0   0    0          0          0   
3    6/26/17 17:35             1   2   3    3          1          2   
4    6/26/17 18:36             1   0   8    8          8          8   
..             ...           ...  ..  ..  ...        ...        ...   
462  11/5/17 17:35             1   0   1    1          1          1   
463  11/5/17 17:45             1   0   2    2          2          1   
464  11/5/17 17:50             1   0   1    1          1          1   
465  11/5/17 17:54             1   0   2    2          1          1   
466  11/5/17 17:57             1   0   2    2          2          1   

     question5  question6  question7  ...  question26  question27  question28  \
0            2          0          3  ...           3      

  self.labels = self[label_id]


## Logistic Regression

In [104]:
# Report metrics for the fitted model on pred_feat. lr.predict returns class
# labels rather than scores, so test_func's `> 0.5` threshold counts every
# predicted class above 0 as positive.
test_func(list(lr.predict(pred_feat)), ytrue)

Confusion matrix:
[[257  86]
 [ 38  86]]
Precision: 0.8711864406779661
Recall/Sensitivity: 0.749271137026239
Specificity: 0.6935483870967742
class report
              precision    recall  f1-score   support

           0       0.87      0.75      0.81       343
           1       0.50      0.69      0.58       124

    accuracy                           0.73       467
   macro avg       0.69      0.72      0.69       467
weighted avg       0.77      0.73      0.75       467



In [105]:
# Dump the per-class probability vector for every row of pred_feat.
print("---Printing probabilities---")
LogisticProbabilities = lr.predict_proba(pred_feat)
for class_probs in LogisticProbabilities:
    print(class_probs)
print("---End of Printing Probabilities---")

---Printing probabilities---
[0.52780485 0.25918228 0.21301287]
[0.87484337 0.09865102 0.02650561]
[0.87484337 0.09865102 0.02650561]
[0.00381638 0.16051484 0.83566878]
[0.28590134 0.29297358 0.42112508]
[0.29818976 0.44801445 0.25379579]
[0.74290631 0.12853337 0.12856032]
[0.74290631 0.12853337 0.12856032]
[0.04739664 0.26495513 0.68764823]
[0.28590134 0.29297358 0.42112508]
[0.34839477 0.24533597 0.40626926]
[0.87484337 0.09865102 0.02650561]
[0.04498438 0.23327554 0.72174009]
[0.02995643 0.19314569 0.77689788]
[0.01523986 0.17778128 0.80697886]
[0.34839477 0.24533597 0.40626926]
[0.01523986 0.17778128 0.80697886]
[0.51927225 0.34681415 0.1339136 ]
[0.02995643 0.19314569 0.77689788]
[0.43710277 0.24756298 0.31533425]
[0.8444396  0.10982723 0.04573316]
[0.43710277 0.24756298 0.31533425]
[0.60549815 0.25779504 0.13670681]
[0.00381638 0.16051484 0.83566878]
[0.35541622 0.26980135 0.37478243]
[0.51927225 0.34681415 0.1339136 ]
[0.80204264 0.12031205 0.07764531]
[0.00134285 0.14425286 0.8

In [106]:
# Print the predicted class label for every row of pred_feat as one list.
print(list(lr.predict(pred_feat)))

[0, 0, 0, 2, 2, 1, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 

In [10]:
# Pair each probability vector with its predicted label and write one pair
# per line. Each line is the str() of a Python tuple, not comma-separated
# fields, despite the .csv extension.
l = list(zip(LogisticProbabilities, lr.predict(pred_feat)))
with open('results_lr_5.csv', 'w') as f:
    f.writelines(str(pair) + '\n' for pair in l)