In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(seed=0)

In [None]:
# Directory with the processed data files, and the gzip-compressed CSV read below.
folderpath = "/opt/helthcare-final-project-autism/notebooks/data/processed"
filepath = folderpath + "/2017-2018_NSCH_DRC.csv.gzip"

In [None]:
# Read the gzip-compressed CSV, using the HHID column as the row index.
df = pd.read_csv(filepath, compression='gzip', index_col='HHID')

# Set the K2Q35B column aside as a one-column frame, replacing missing values with 2.
# NOTE(review): presumably K2Q35B is the autism indicator used as the target - confirm against the codebook.
have_autism = df[['K2Q35B']].fillna(2)

# Every other column stays in df as a candidate predictor.
df = df.loc[:, df.columns != 'K2Q35B']

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation of every column of df against the have_autism target.
# Both lists are filled in df.columns order so they can later be indexed by column name.
autism_target = have_autism.values.ravel()  # flatten the one-column frame once, outside the loop
cor_col = []
p_value_col = []
for col in df.columns.values:
    # nan_policy='omit' makes spearmanr ignore missing values in the column
    cor, p_value = spearmanr(df[col].values, autism_target, nan_policy='omit')
    cor_col.append(cor)
    # p_value_col must receive the p-value, not the correlation coefficient
    p_value_col.append(p_value)

In [None]:
# Load the column-description CSV, indexed by its "column" field.
df_variables = pd.read_csv(
    f"{folderpath}/2017-2018_description_columns.csv",
    index_col="column",
)

In [None]:
# One row per column of df holding its Spearman coefficient and p-value,
# ordered from the largest coefficient to the smallest.
df_spearman_corr = (
    pd.DataFrame(
        {'spearman': cor_col, 'p_value': p_value_col},
        index=df.columns.values,
    )
    .sort_values(by='spearman', ascending=False)
)

In [None]:
# Attach the df_variables columns to each correlation row (left join on the index).
# Only columns not already present are joined: because the result is assigned back to
# the same name, re-running this cell would otherwise raise
# "columns overlap but no suffix specified".
new_columns = [c for c in df_variables.columns if c not in df_spearman_corr.columns]
df_spearman_corr = df_spearman_corr.join(df_variables[new_columns])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise Spearman correlation between all columns of df.
corr = df.corr(method='spearman')

# Boolean mask that is True on and above the diagonal (upper triangle).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))


# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

In [None]:
%matplotlib notebook
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap)

In [None]:
%matplotlib notebook

ax = sns.barplot(x='index', y='spearman', data = df_spearman_corr.reset_index())
plt.show()

In [None]:
# Variables whose description contains any of these terms (case-insensitive) are dropped.
# The terms are joined with "|" and interpreted as a regular expression.
description_itens_list = ['screener']

# A non-capturing group keeps the alternation semantics without triggering the pandas
# UserWarning about match groups that a plain "(...)" group causes in str.contains.
description_pattern = f"(?:{'|'.join(description_itens_list)})"
is_excluded = df_spearman_corr.description.fillna('').str.contains(description_pattern, case=False)
exclude_columns = df_spearman_corr[is_excluded].index.values

# Boolean indexing replaces the per-label Python membership test against a NumPy array.
df_spearman_corr_ = df_spearman_corr.loc[~is_excluded]

In [None]:
def filter_func(x, lim):
    """Return True when x is greater than lim or smaller than -lim."""
    return x > lim or x < -lim


# Keep the variables whose Spearman coefficient passes filter_func with a 0.1 limit.
is_correlated = df_spearman_corr_.spearman.apply(lambda value: filter_func(value, 0.1))
selected_columns = df_spearman_corr_[is_correlated].index.values.tolist()

# Restrict the data to the selected variables and preview the first rows.
df_to_model = df[selected_columns]
df_to_model.head()

In [None]:
# Target vector: flatten the one-column have_autism frame to shape (n_samples,).
# scikit-learn estimators expect a 1-D y; a column vector makes several of them
# emit DataConversionWarning.
y = have_autism.values.ravel()

# Feature matrix: missing values are replaced by the numeric sentinel 999.
# A string '999' would turn the affected columns into mixed-type object arrays.
X = df_to_model.fillna(999).values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
    test_size=0.3,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree fitted on the training split. random_state is fixed, as it is for the
# train/test split and the random forest, so that re-running this cell rebuilds the
# same tree instead of depending on the current state of NumPy's global generator.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the decision tree on the held-out split.
tree_predictions = clf.predict(X_test)
print(classification_report(y_test, tree_predictions))

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Fit a feature selector built around the decision tree estimator, using an
# importance threshold of 0.005.
selector = SelectFromModel(estimator=clf, threshold=0.005)
selector.fit(X_train, y_train)

# Reduce both splits to the selected features once, then reuse the reduced matrices.
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Logistic regression trained on the reduced feature set.
logisticRegression = LogisticRegression()
logisticRegression.fit(X_train_selected, y_train)

print(classification_report(y_test, logisticRegression.predict(X_test_selected)))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 3 and a fixed seed, fitted on the training split.
randomForestClassifier = RandomForestClassifier(random_state=0, max_depth=3).fit(X_train, y_train)

# Evaluate on the held-out split.
forest_predictions = randomForestClassifier.predict(X_test)
print(classification_report(y_test, forest_predictions))