# Tester for LR-5

**Please use Python 3 to run this code, and install all packages that the classifier depends on.**

In [1]:
# Attribution metadata: names the author of the classifier that this notebook tests.
__authoroftheclassifier__ = 'Sebastien Levy'

from processing import ADOS_Data
from cross_validation import CVP_Set
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from classifiers import RegClassifier, BinClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix, precision_recall_curve, auc

import matplotlib.pyplot as plt
import pandas as pd



In [2]:
def test_func(pred_score, predlabels, threshold=0.5):
    """Print a confusion matrix and summary metrics for binary predictions.

    Both the true labels and the thresholded predictions are flipped
    (``1 - value``) before scoring, so the original label 1 becomes class 0
    and occupies the first row/column of the printed matrix. The printed
    precision, recall/sensitivity and specificity are therefore computed with
    the original label 1 as the positive class.

    Args:
        pred_score: Iterable of numeric scores (or hard 0/1 predictions); a
            value strictly greater than ``threshold`` counts as a prediction
            of 1.
        predlabels: Iterable of true labels encoded as 0/1, in the same order
            as ``pred_score``.
        threshold: Decision cut-off applied to ``pred_score``. Defaults to
            0.5, the value that used to be hard-coded.

    Returns:
        None. Everything is printed to standard output.
    """
    # Build the flipped sequences once so the matrix and the report are
    # guaranteed to be computed from the same data.
    flipped_true = [1 - label for label in predlabels]
    flipped_pred = [1 - int(score > threshold) for score in pred_score]

    print('Confusion matrix:')
    # Pinning the label order keeps the matrix 2x2 even when one class is
    # absent from the data, so the indexing below cannot raise IndexError.
    cm = confusion_matrix(flipped_true, flipped_pred, labels=[0, 1])
    print(cm)
    print('Precision: {}'.format(float(cm[0][0])/(cm[0][0]+cm[1][0])))
    print('Recall/Sensitivity: {}'.format(float(cm[0][0])/(cm[0][0]+cm[0][1])))
    print('Specificity: {}'.format(float(cm[1][1])/(cm[1][1]+cm[1][0])))
    print ('class report')
    print (classification_report(flipped_true, flipped_pred))

# M2 training

In [3]:
# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
MODULE = 'm2'

# Columns retained after preprocessing; an empty list disables the selection.
FEATURE_SET = [
    'A3',
    'A5',
    'B1',
    'B2',
    'B10',
]

# Both values are handed to CVP_Set when the cross-validation set is built.
N_FOLD = 10
PRED_RATIO = 0.2
SCALING_PARAM = 4

# Preprocessing options.
# Missing-value strategy: 'Binary' or 'Replacement'.
MISSING_VALUE_STRATEGY = 'Binary'
# Processing strategy: 'poly', 'linear', 'indicator', 'interaction_ind' or
# 'pca_comp'.
PROCESSING_STRATEGY = 'linear'
POLY_DEGREE = 2
NORMALIZE = True

# Location of the training CSV, derived from the module name.
ADOS_FILE = "{0}/data/ados_{0}_allData.csv".format(MODULE)

# Column names used by the loading / preprocessing cells.
label_id = "ASD"
label_age = "age_months"
label_gender = "male"
columns_to_delete = [
    "Subject.Id",
    "Diagnosis",
]
sub_diagnosis_id = [
    "social_affect_calc",
    "restricted_repetitive_calc",
    "SA_RRI_total_calc",
    "severity_calc",
]

In [4]:
# Load the training data for the configured module.
data = ADOS_Data.read_csv(ADOS_FILE)
# Keep a copy of the sub-diagnosis columns before they are removed below.
sub_diagnosis = data[sub_diagnosis_id]

# Drop the columns that are not of interest (identifiers and sub-diagnosis
# scores). Per the original note this step also discards rows without a
# label; that behaviour lives in select_good_columns and is not visible here.
data.select_good_columns(columns_to_delete+sub_diagnosis_id)

print('gendering')
# Sum of the gender column over the rows whose label equals 1. The column
# names come from the configuration cell instead of repeated literals, so
# this line stays in sync with the full_preprocessing call below.
print((data[data[label_id] == 1][[label_gender]]).sum())

data.full_preprocessing(NORMALIZE, MISSING_VALUE_STRATEGY, PROCESSING_STRATEGY, [label_age], label_gender, label_id)
# Truthiness check: an empty (or None) FEATURE_SET means "no column selection".
if FEATURE_SET:
    data.select_good_columns(FEATURE_SET, keep_the_column=True)

# NOTE(review): if CVP_Set splits the data randomly, it should be seeded for
# reproducible runs - confirm against its implementation.
cv_set = CVP_Set(data, data.labels, N_FOLD, PRED_RATIO)

hello!
      A1   A2  A3  A4  A5  A6  A7  A8  B1  B2 ...   D1  D2  D3  D4  E1  E2  \
0      0  1.0   2   0   1   2   0   2   2   1 ...    2   2   0   3   0   1   
1      1  0.0   1   2   1   1   0   1   2   1 ...    0   0   0   1   1   0   
2      1  2.0   2   1   2   2   2   2   2   2 ...    2   0   0   2   1   2   
3      0  1.0   1   0   1   2   0   0   2   2 ...    2   0   0   1   0   1   
4      1  0.0   1   2   2   2   0   2   2   1 ...    2   2   0   1   1   0   
5      0  1.0   2   1   2   2   0   1   2   1 ...    2   2   0   2   0   1   
6      0  1.0   1   2   1   2   2   3   2   2 ...    2   2   0   1   0   1   
7      0  2.0   2   0   0   2   3   1   2   2 ...    2   0   0   1   0   2   
8      1  1.0   2   0   0   1   1   1   2   1 ...    2   0   0   3   1   1   
9      1  2.0   2   2   2   2   1   0   2   1 ...    2   1   0   2   1   2   
10     0  0.0   2   2   0   2   1   0   2   1 ...    2   1   0   1   0   0   
11     1  1.0   1   1   0   2   1   2   2   1 ...    2   

## LR training

In [5]:
# L2-penalised, class-balanced logistic regression wrapped in the project's
# BinClassifier, then fitted on cv_set's cv_feat / cv_labels.
lr = BinClassifier(
    proc=LogisticRegression(
        penalty='l2',
        C=0.05,
        class_weight='balanced',
    ),
    severity=False,
)
lr.fit(cv_set.cv_feat, cv_set.cv_labels)

# M2 testing

In [6]:
FILENAME = "primary_dataset.csv"
# Stored under its own name so the training cell's `columns_to_delete` is not
# overwritten; re-running the training cell after this one would otherwise
# pass the wrong column list to select_good_columns.
pred_columns_to_delete = ['child_id', 'scorer_id', 'video_file']

pred_feat = ADOS_Data.read_csv(FILENAME)
# NOTE(review): the labels are captured before any filtering/preprocessing.
# If a later step drops or reorders rows, `ytrue` will no longer line up with
# the predictions - confirm against select_good_columns / full_preprocessing.
ytrue = pred_feat[label_id]
# Drop the columns that are not of interest (per the original note this also
# discards rows without a label; not visible from this cell).
pred_feat.select_good_columns(pred_columns_to_delete)

# Same preprocessing settings as the training data.
pred_feat.full_preprocessing(NORMALIZE, MISSING_VALUE_STRATEGY, PROCESSING_STRATEGY, [label_age], label_gender, label_id)
# Truthiness check: an empty (or None) FEATURE_SET means "no column selection".
if FEATURE_SET:
    pred_feat.select_good_columns(FEATURE_SET, keep_the_column=True)

hello!
         updatedAt  question_set  A3  A5  B10  question3  question4  \
0    6/26/17 16:43             1   8   8    8          8          2   
1    6/26/17 16:57             1   0   0    0          0          1   
2    6/26/17 17:29             1   0   0    0          0          0   
3    6/26/17 17:35             1   2   3    3          1          2   
4    6/26/17 18:36             1   0   8    8          8          8   
5    6/26/17 20:49             1   0   1    1          0          2   
6    6/26/17 21:07             1   0   8    8          7          2   
7    6/26/17 21:38             1   0   0    0          0          1   
8    6/26/17 22:21             1   0   3    3          7          2   
9    6/27/17 17:37             1   0   8    8          8          8   
10   6/27/17 19:58             1   0   1    1          1          1   
11   6/27/17 22:07             1   0   8    8          8          8   
12   6/27/17 22:28             1   0   2    2          2          2   

  result = result.union(other)
  result = result.union(other)
  result = result.union(other)
  result = result.union(other)
  result = result.union(other)
  index = _union_indexes(indexes)


## Logistic Regression

In [7]:
# Score the fitted classifier's hard predictions against the held-back labels.
test_func(
    pred_score=list(lr.predict(pred_feat)),
    predlabels=ytrue,
)

Confusion matrix:
[[261  82]
 [ 37  87]]
Precision: 0.8758389261744967
Recall/Sensitivity: 0.760932944606414
Specificity: 0.7016129032258065
class report
             precision    recall  f1-score   support

          0       0.88      0.76      0.81       343
          1       0.51      0.70      0.59       124

avg / total       0.78      0.75      0.76       467



In [8]:
# Print every row returned by predict_proba, one per line, between two
# banner lines. The result is kept in LogisticProbabilities for later cells.
print("---Printing probabilities---")
LogisticProbabilities = lr.predict_proba(pred_feat)
for proba_row in LogisticProbabilities:
    print(proba_row)
print("---End of Printing Probabilities---")

---Printing probabilities---
[ 0.5919002  0.4080998]
[ 0.86087581  0.13912419]
[ 0.86087581  0.13912419]
[ 0.03560753  0.96439247]
[ 0.41252128  0.58747872]
[ 0.50671985  0.49328015]
[ 0.68427803  0.31572197]
[ 0.68427803  0.31572197]
[ 0.16326719  0.83673281]
[ 0.41252128  0.58747872]
[ 0.4327033  0.5672967]
[ 0.86087581  0.13912419]
[ 0.15566441  0.84433559]
[ 0.12091323  0.87908677]
[ 0.08631399  0.91368601]
[ 0.4327033  0.5672967]
[ 0.08631399  0.91368601]
[ 0.66719569  0.33280431]
[ 0.12091323  0.87908677]
[ 0.50553293  0.49446707]
[ 0.8134962  0.1865038]
[ 0.50553293  0.49446707]
[ 0.67294163  0.32705837]
[ 0.03560753  0.96439247]
[ 0.44667868  0.55332132]
[ 0.66719569  0.33280431]
[ 0.75458106  0.24541894]
[ 0.01877075  0.98122925]
[ 0.41252128  0.58747872]
[ 0.749736  0.250264]
[ 0.5919002  0.4080998]
[ 0.27578453  0.72421547]
[ 0.07366278  0.92633722]
[ 0.21162422  0.78837578]
[ 0.02642118  0.97357882]
[ 0.50553293  0.49446707]
[ 0.02709805  0.97290195]
[ 0.58560723  0.4143927

In [9]:
# Show the predicted labels for the test set as a plain Python list.
print(
    list(lr.predict(pred_feat))
)

[0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 

In [10]:
# Pair each probability row with the corresponding predicted label.
scored_predictions = zip(LogisticProbabilities, lr.predict(pred_feat))
with open('results_lr_5.csv', 'w') as results_file:
    # NOTE(review): every line is the str() of a Python tuple, not
    # comma-separated fields, despite the .csv extension. The format is kept
    # unchanged so anything that already reads this file keeps working.
    for scored_pair in scored_predictions:
        results_file.write(str(scored_pair) + '\n')