In [None]:
import pandas as pd
import numpy as np

# Directory of the 2017-2018 NSCH topical release, and the two files read from it.
folderpath = "/opt/helthcare-final-project-autism/notebooks/dataset/childhealthdata/2017-2018_NSCH_Topical_CSV_DRC_Jan2020"
filepath = folderpath + "/2017-2018 NSCH_Topical_DRC_Dec 2019.csv"
column_description_path = folderpath + "/2017-2018 NSCH_Topical_DRC_Variable List.xlsx"

# Seed NumPy's global random generator so repeated runs match.
np.random.seed(0)

In [None]:
# Load the survey CSV into a DataFrame, using the HHID column as the row index.
df = pd.read_csv(
    filepath,
    index_col='HHID',
)

### Convert missing-value codes to NaN
Source: `SPSS Codebook_ 2017-2018NSCH_DRCv1_12.31.19.pdf`, page 6 (missing-value codes).

In [None]:
# Swap the codebook's missing-value codes for NaN.
# NOTE(review): this is applied to every column of df, not only to coded survey
# items — confirm no column holds 90/95/96/99 as a legitimate value.
df = df.replace(
    to_replace=[90, 95, 96, 99],
    value=np.nan,
)

In [None]:
# Summary statistics of df, computed after the missing-value codes were replaced with NaN.
df.describe()

In [None]:
# Show five randomly drawn rows as a quick sanity check of the loaded data.
df.sample(n=5)

In [None]:
# Fraction of missing entries per column (despite the name, this is a ratio,
# not a count), ordered from the most to the least incomplete column.
n_rows = len(df.index)
missing_values_count = (df.isna().sum() / n_rows).sort_values(ascending=False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Histogram (20 fixed bins) of the per-column missing-value fraction.
# `sns.distplot` is deprecated; `sns.histplot` draws the same count histogram.
# Note: despite its name, `fig` holds the matplotlib Axes returned by seaborn.
fig = sns.histplot(missing_values_count, bins=20, kde=False, color="b")
fig.set(
    title="Distribution of missing-value fraction across columns",
    xlabel="Fraction of missing values",
    ylabel="Number of columns",
);

In [None]:
# For every column print its name and number of distinct values; when there are
# fewer than 100 distinct values, list them as well.
for col in df.columns:
    # Compute the distinct values once per column instead of up to three times.
    distinct = df[col].unique()
    listing = f" -- {distinct}" if distinct.size < 100 else ''
    print(f"{col} -- {distinct.size}{listing}")

### Columns related to autism

* **K2Q35A_1_YEARS** - Autism ASD - First Told Age in Years
* **K2Q35A** - Autism ASD
* **K2Q35B** -	Autism ASD Currently
* **AUTISMMED** - Autism ASD - Medication Currently
* **AUTISMTREAT** - Autism ASD - Behavioral Treatment
* **K2Q35C** - Autism ASD Severity Description
* **K2Q35D** - Autism ASD - First Told Doctor Type 
* **autism_1718** - Children who currently have Autism or Autism Spectrum Disorder including Asperger's Disorder, pervasive developmental disorder, age 3-17 years
* **AutismSev_1718** - Parent-rated severity of child's current autism/ASD, age 3-17 years
* **AutismInd_1718** - Indicator 2.8: Children who currently have Autism or Autism Spectrum Disorder including Asperger's Disorder, pervasive developmental disorder, age 3-17 years
* **ASDSevInd_1718** -	Indicator 2.8a: Parent-rated severity of current Autism or Autism Spectrum Disorder, age 3-17 years
* **ASDMed_1718** -	Indicator 2.8b: Children currently taking medication for Autism, ASD, Asperger's Disorder or PDD, age 3-17 years
* **ASDBehTreat_1718** - Indicator 2.8c: Received behavioral treatment for Autism, ASD, Asperger's Disorder or PDD, age 3-17 years
* **ASDAge_1718** -	Indicator 2.8d: Age of diagnosis for Autism or ASD, age 3-17 years
* **ASDDrType_1718** -	Indicator 2.8e: Type of doctor or other health care provider who was the first to tell that this child had Autism, ASD, difficulties with emotions, concentration, or behavior, age 3-17 years
* **MedEmotion_1718** -	Indicator 2.9: Children who are taking medication for ADD/ADHD, ASD, difficulties with emotions, concentration, or behavior, age 3-17 years
* **MEDB10ScrQ5_1718** - Children qualifying CSHCN Screener question 5 or experience one of the 10 MEDB conditions, age 3-17 (ADHD, depression, anxiety, behavior, autism, dev delay, Tourette, speech, intellectual disability or learning disability)
* **nom17_3ASD_1718** -	National Outcome Measure 17.3: Percent of children, ages 3 through 17, diagnosed with an autism spectrum disorder

In [None]:
# Load the variable list and show the rows whose description mentions
# "autism" (case-insensitive); rows without a description are treated as empty text.
df_vars = pd.read_excel(column_description_path)
mentions_autism = df_vars['description'].fillna('').str.contains('autism', case=False)
df_vars[mentions_autism]

In [None]:
# Row counts per distinct value for each raw autism-related column.
# Each column is listed once (the original list repeated K2Q35C and K2Q35D,
# printing their tables twice).
autism_cols = ['K2Q35A_1_YEARS', 'K2Q35A', 'K2Q35B', 'K2Q35C', 'K2Q35D', 'AUTISMMED', 'AUTISMTREAT']

# Fill NaN with the sentinel 9999 so missing values form their own group, and do
# it once on just the needed columns rather than copying the full frame per column.
autism_filled = df[autism_cols].reset_index().fillna(9999)

for col in autism_cols:
    print(autism_filled.groupby([col]).count()['HHID'])
    print('')

In [None]:
# Show three of the autism-related columns next to each other.
df.loc[:, ['K2Q35A_1_YEARS', 'K2Q35A', 'ASDAge_1718']]

In [None]:
# Column names containing "autism" or "asd", ignoring case.
[name for name in df.columns if any(tag in name.lower() for tag in ('autism', 'asd'))]

In [None]:
# Row counts per value of npm6DSc_1718; NaN is filled with the sentinel 9999
# first so that missing values show up as their own group.
(
    df.reset_index()
      .fillna(9999)
      .groupby(['npm6DSc_1718'])['HHID']
      .count()
)