In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Make the parent of the current working directory importable, so the
# `src` package imported right below can be resolved from this notebook.
nb_dir = os.path.dirname(os.getcwd())
already_on_path = nb_dir in sys.path
if not already_on_path:
    sys.path.append(nb_dir)

from src.variablesName import VariablesName
import src.pathconfig as pathcfg

# Shared VariablesName instance; later cells use it to look up column
# descriptions (descriptionOfColumn) and to enumerate columns (allColumns).
vNames = VariablesName()

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global random generator so that any np.random-based
# step executed after this cell gives the same result on every run.
np.random.seed(0)

In [None]:
# Location of the gzip-compressed survey extract, relative to the data
# folder configured in src.pathconfig.
folderpath = pathcfg.folderpath
dataset_filename = "2017-2018_NSCH_DRC.csv.gzip"
filepath = f"{folderpath}/{dataset_filename}"

In [None]:
# Read the gzip-compressed CSV, using the HHID column as the row index.
df = pd.read_csv(filepath, compression='gzip', index_col='HHID')

# Target: K2Q35B, kept as a one-column DataFrame with missing answers replaced by 2.
# NOTE(review): 2 presumably encodes a negative answer in the survey codebook - confirm.
have_autism = df.loc[:, ['K2Q35B']].fillna(2)

# Features: every column except the target.
df = df.drop(columns='K2Q35B')

In [None]:
from scipy.stats import kendalltau
from time import time

# Kendall's tau between every feature column and the K2Q35B target.
# For each column we keep the coefficient and its p-value; the resulting
# table (one row per column) drives the filtering and the plots below.
cor_col = []
p_value_col = []
start_time = time()
for i, col in enumerate(df.columns.values):
    # nan_policy='omit' ignores missing answers in the feature column;
    # method='asymptotic' selects the asymptotic p-value computation.
    cor, p_value = kendalltau(df[col].values, have_autism.values, nan_policy='omit', method='asymptotic')
    cor_col.append(cor)
    # The 'p_value' column must hold the p-value, not a second copy of tau.
    p_value_col.append(p_value)
    # Progress log every 10 columns (the first column is skipped).
    if i % 10 == 0 and i != 0:
        print(f"i:{i} dt: {time()-start_time}")

# One row per feature column. The index is named 'index' so that a later
# reset_index() exposes the column names under the 'index' column.
df_kendall_corr = pd.DataFrame(
    {'kendall': cor_col, 'p_value': p_value_col},
    index=pd.Index(df.columns.values, name='index'),
)

# Human-readable description of each column, looked up through vNames.
df_kendall_corr['description'] = [
    vNames.descriptionOfColumn(col) for col in df_kendall_corr.index
]

In [None]:
# Lower-case fragments that flag a column description as autism-related.
description_itens_list = ['screener', 'asd', 'autism', 'sc ', 'cshcn', 'indicator']


def _mentions_any_keyword(col):
    """Return True when the description of `col` contains any listed fragment."""
    description = vNames.descriptionOfColumn(col).lower()
    for fragment in description_itens_list:
        if fragment in description:
            return True
    return False


# Every column known to vNames whose description matches at least one fragment.
autism_related_columns = [col for col in vNames.allColumns() if _mentions_any_keyword(col)]

In [None]:
# for col in autism_related_columns:
#     print(f"{col}   {vNames.descriptionOfColumn(col)}")

In [None]:
# Drop the columns listed in autism_related_columns, keeping the original
# order of the remaining ones.
is_autism_related = df.columns.isin(autism_related_columns)
df = df.loc[:, ~is_autism_related]

In [None]:
def filter_func(x, lim):
    """Return True when `x` lies strictly outside the interval [-lim, lim]."""
    return x < -lim or x > lim


# Questions whose Kendall tau with the target is stronger than 0.1 in
# either direction.
selected_columns = [
    question
    for question, tau in df_kendall_corr.kendall.items()
    if filter_func(tau, 0.1)
]

# Of those, keep only the questions still present in df after the
# autism-related columns were removed.
remaining_columns = df.columns
model_columns = [col for col in selected_columns if col in remaining_columns]
df_to_model = df[model_columns]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from seaborn import palplot, color_palette

# First and last colours of seaborn's "Set1" palette mark, respectively,
# the questions kept for the model and the ones left out.
color_pal = color_palette("Set1")
notExcludedModelColor, excludedColor = color_pal[0], color_pal[-1]

# Tag every row of the correlation table with the colour of its group.
df_kendall_corr['color'] = pd.Series(
    [
        notExcludedModelColor if question in model_columns else excludedColor
        for question in df_kendall_corr.index
    ],
    index=df_kendall_corr.index,
)

# Legend handles reused by the plots below.
excluded_patch = mpatches.Patch(label='Excluded Question', color=excludedColor)
notExcluded_patch = mpatches.Patch(label='Not Excluded Question', color=notExcludedModelColor)

In [None]:
%matplotlib inline

# Marker styling passed to the scatter call.
cfg = dict(linewidth=0.5, marker='o')

# One point per question, in the table's current row order; the colour
# comes from the 'color' column of df_kendall_corr.
plt.figure(figsize=(20, 10))
fig = plt.scatter(
    df_kendall_corr.index,
    df_kendall_corr.kendall,
    c=df_kendall_corr.color,
    **cfg,
)

plt.title("Kendall correlation of questions")
plt.xlabel('Questions')
plt.ylabel('Correlation')
# An empty tick list hides the x tick labels (one per question).
plt.xticks([])
plt.legend(handles=[excluded_patch, notExcluded_patch])

plt.show()

In [None]:
%matplotlib inline 


# Marker styling passed to the scatter call.
cfg = dict(linewidth=0.5, marker='o')

# Same scatter as before, but with the questions ordered from the highest
# to the lowest Kendall tau.
df_plot = df_kendall_corr.sort_values(by=['kendall'], ascending=False)

plt.figure(figsize=(20, 10))
fig = plt.scatter(
    df_plot.index,
    df_plot.kendall,
    c=df_plot.color,
    **cfg,
)

plt.title("Kendall correlation of questions")
plt.xlabel('Questions')
plt.ylabel('Correlation')
# An empty tick list hides the x tick labels (one per question).
plt.xticks([])
plt.legend(handles=[excluded_patch, notExcluded_patch])

plt.show()