In [None]:
import pandas as pd
import numpy as np

# Pin NumPy's global random state so stochastic steps repeat on a full re-run.
np.random.seed(0)

In [None]:
# Folder that holds the processed input files read by this notebook.
folderpath = "/opt/helthcare-final-project-autism/notebooks/data/processed"
# Gzip-compressed survey CSV located inside that folder.
filepath = "/".join((folderpath, "2017-2018_NSCH_DRC.csv.gzip"))

In [None]:
# Read the gzip-compressed survey CSV, using the HHID column as the row index.
df = pd.read_csv(filepath, compression='gzip', index_col='HHID')

# Split off the K2Q35B column as a one-column target frame; missing values become 2.
target_column = 'K2Q35B'
have_autism = df[[target_column]].fillna(2)

# Every other column stays in df as a candidate feature.
df = df.drop(columns=[target_column])

In [None]:
# Lookup table of column descriptions, indexed by its "column" field.
variables_csv = f"{folderpath}/2017-2018_description_columns.csv"
df_variables = pd.read_csv(variables_csv, index_col="column")

In [None]:
# Keywords that flag a variable description as autism-related (matched case-insensitively).
description_itens_list = ['screener', 'asd', 'autism']

# Use a non-capturing group: a plain "(...)" group makes pandas' str.contains
# warn that the pattern has match groups, although only a boolean mask is needed.
autism_pattern = f"(?:{'|'.join(description_itens_list)})"
# Missing descriptions are treated as empty strings, so they never match.
is_autism_related = df_variables.description.fillna("").str.contains(autism_pattern, case=False)
autism_related_columns = df_variables[is_autism_related].index.tolist()

In [None]:
# Remove every column that was flagged as autism-related from the feature frame.
excluded_columns = set(autism_related_columns)
kept_columns = [name for name in df.columns if name not in excluded_columns]
df = df[kept_columns]

In [None]:
from scipy.stats import spearmanr

# Minimum absolute Spearman correlation a column needs to be kept as a model feature.
SPEARMAN_MIN_ABS_CORR = 0.1

# Rank-correlate every remaining column with the target; NaNs are omitted per column.
cor_col = []
p_value_col = []
for col in df.columns.values:
    cor, p_value = spearmanr(df[col].values, have_autism.values, nan_policy='omit')
    cor_col.append(cor)
    # Record the p-value (not the correlation) so the 'p_value' column is meaningful.
    p_value_col.append(p_value)

df_spearman_corr = pd.DataFrame({'spearman': cor_col,
                                 'p_value': p_value_col},
                                index=df.columns.values)
df_spearman_corr = df_spearman_corr.sort_values(by=['spearman'], ascending=False)

# Attach the column descriptions from df_variables (joined on the column-name index).
df_spearman_corr = df_spearman_corr.join(df_variables)


def filter_func(x, lim):
    """Return True when x lies strictly outside [-lim, lim]; NaN yields False."""
    return x > lim or x < -lim


# Keep the columns whose correlation with the target is strong enough in either direction.
is_correlated = df_spearman_corr.spearman.apply(lambda x: filter_func(x, SPEARMAN_MIN_ABS_CORR))
selected_columns = df_spearman_corr[is_correlated].index.values.tolist()

df_to_model = df[selected_columns]
df_to_model.head()

In [None]:
# Flatten the one-column target frame into the 1-D label vector scikit-learn expects;
# an (n, 1) column vector triggers DataConversionWarning in several estimators.
y = have_autism.values.ravel()
# Encode missing feature values with the numeric sentinel 999; a string fill value
# would force the numeric columns (and therefore X) to object dtype.
X = df_to_model.fillna(999).values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; stratify on y and fix the seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state, as the other estimators in this notebook do, so the fitted tree
# does not depend on NumPy's global RNG state or on the order cells were executed in.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Classification report for `clf` on the held-out test split.
tree_test_predictions = clf.predict(X_test)
print(classification_report(y_test, tree_test_predictions))

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Feature selector driven by `clf`, fitted on the training split with a 0.005 threshold.
selector = SelectFromModel(estimator=clf, threshold=0.005)
selector.fit(X_train, y_train)

# Reduce both splits to the selected features once, then reuse them below.
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Logistic regression (default settings) trained on the reduced feature set.
logisticRegression = LogisticRegression()
logisticRegression.fit(X_train_selected, y_train)

print(classification_report(y_test, logisticRegression.predict(X_test_selected)))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest limited to depth 3, with a fixed seed, trained on all screened features.
randomForestClassifier = RandomForestClassifier(random_state=0, max_depth=3)
randomForestClassifier.fit(X_train, y_train)

forest_test_predictions = randomForestClassifier.predict(X_test)
print(classification_report(y_test, forest_test_predictions))

In [None]:
from sklearn import svm

# Support vector classifier with default hyper-parameters and a fixed seed.
svmClassifier = svm.SVC(random_state=0)
svmClassifier.fit(X_train, y_train)

svm_test_predictions = svmClassifier.predict(X_test)
print(classification_report(y_test, svm_test_predictions))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report

# Stage 1: random-forest-based feature selection, thresholded at 0.667 * median
# importance and capped at 25 features.
forest_selector = SelectFromModel(
    RandomForestClassifier(max_depth=3, random_state=0),
    threshold="0.667*median",
    max_features=25,
)
# Stage 2: SVC trained on whatever stage 1 keeps.
svc_stage = svm.SVC(random_state=0)

clf_pipeline = Pipeline(steps=[
    ('feature_selection', forest_selector),
    ('classification', svc_stage),
])
clf_pipeline.fit(X_train, y_train)

print(classification_report(y_test, clf_pipeline.predict(X_test)))

In [None]:
# Map the selector's boolean support mask back onto the screened column names.
support_mask = clf_pipeline['feature_selection'].get_support()
columns_of_model = [
    column
    for keep, column in zip(support_mask, selected_columns)
    if keep
]
print(len(columns_of_model))

In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before each
# execution, so edits to project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys

# Parent of the current working directory; added to sys.path only if absent,
# presumably so that the project's `src` package can be imported from this notebook.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from src.variablesName import VariablesName

# Print each column the pipeline selected next to the description VariablesName returns.
vNames = VariablesName()
for col in columns_of_model:
    description = vNames.descriptionOfColumn(col)
    print(f"{col}   {description}")