# EDA
This document runs a basic EDA and data cleaning in order to formulate an appropriate classification algorithm and to create a clean dataset to use as training data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
# Load the 'Data' sheet of the registry workbook.
df = pd.read_excel(
    'data/NF Registry NF1 092420 v2.xls',
    sheet_name='Data',
)

In [29]:
# Load the 'Report' sheet of the same registry workbook.
refs = pd.read_excel(
    'data/NF Registry NF1 092420 v2.xls',
    sheet_name='Report',
)

Remove the patient ID column, since there are no duplicates.

In [22]:
# The identifier column is not used by the rest of the analysis, so drop it.
df = df.drop(columns=['Patient Id'])

In [34]:
# One row per record: report how many rows were loaded.
print(len(df), 'patients data')

8223 patients data


## EDA
Since we are interested in the correlations between symptoms, I'll only look at the co-occurrence of different symptoms.

In [224]:
# Select the candidate symptom columns by position: column index 30 up to
# (but excluding) the last two columns of df.
# NOTE(review): the 30 / -2 offsets depend on the workbook's column layout -- confirm
# they still match if the export format changes.
# .copy() makes `symptoms` an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning or depend on view semantics.
symptoms = df.iloc[:, 30:-2].copy()

In [236]:
# Binary pain flag: 1 where 'Has Pain' equals True, 0 otherwise (False or missing).
symptoms['pain'] = np.where(df['Has Pain'].eq(True), 1, 0)

In [225]:
# Replace missing values with empty strings so the substring checks in later cells
# can call string methods on every cell.
symptoms = symptoms.fillna(value='')

In [226]:
def just_yes(series_):
    """Convert a column of answers into a 0/1 indicator of 'yes'.

    Parameters
    ----------
    series_ : pandas.Series
        Answers to encode. Values are expected to be strings, but other values
        (None, NaN, numbers) are tolerated.

    Returns
    -------
    pandas.Series
        Same index as ``series_``; 1 where the value is a string containing
        'yes' (case-insensitive), 0 everywhere else.
    """
    # Non-string values have no `.lower()`; treat them as "not yes" instead of raising.
    return series_.apply(lambda x: 1 if isinstance(x, str) and 'yes' in x.lower() else 0)

In [None]:
# manually check through each and log ones that require manual cleaning
# For every column with fewer than 10 distinct values, show the values and ask whether
# the column can be reduced to a yes/no indicator. Columns answered '1' are encoded
# with just_yes(); everything else is queued in manual_cols for the next step.
manual_cols = []
for col in symptoms.columns:
    # Columns that are already numeric (e.g. the engineered 0/1 'pain' flag) contain no
    # text to clean; passing them on would fail on `.lower()` / `.split()` later.
    if pd.api.types.is_numeric_dtype(symptoms[col]):
        continue

    # Compute the distinct values once and reuse them for the size check and the display.
    values = symptoms[col].unique()
    if len(values) < 10:
        print(col, end = ' -- ')
        print(values)
        ans = input('yes only? (Y:1 / N:0)')
        print('---------------------')

        # Ignore surrounding whitespace so an answer like '1 ' is not read as "no".
        if ans.strip() == '1':
            symptoms[col] = just_yes(symptoms[col])
        else:
            manual_cols.append(col)
    else:
        manual_cols.append(col)

In [None]:
# drop if not indicative of symptoms
# For each column queued for manual review, show the distinct comma-separated options
# it contains and ask whether it describes symptoms. Columns answered '1' are recorded
# in `important`; all others are dropped from `symptoms`.
important = []
for col in manual_cols:
    print(col, end = ' -- ')
    # Flatten with a set comprehension rather than np.sum over a list of lists: that
    # only concatenates when NumPy builds a ragged object array, and raises otherwise.
    # str() keeps the split working if a non-string value is present.
    options = {
        part.lstrip()
        for value in symptoms[col].unique()
        for part in str(value).split(',')
    }
    print(options)
    ans = input('Symptoms? (Y:1 / N:0)')
    print('---------------------')

    # Ignore surrounding whitespace so an answer like '1 ' is not read as "no".
    if ans.strip() == '1':
        important.append(col)
    else:
        symptoms = symptoms.drop(col, axis = 1)

In [245]:
# Remove the 'Itching Details' column from the symptom table.
symptoms = symptoms.drop(columns=['Itching Details'])

In [247]:
# Derive one 0/1 column per problem mentioned in the 'Plexiforms Problems' answer.
# Keys are the new column names; values are the lowercase phrases searched for
# (the answer text is lowercased before the check, so matching is case-insensitive).
plexiform_flags = {
    'bowel or bladder control problem': 'bowel or bladder control',
    'breathing problem': 'breathing',
    'Problem with movement': 'movement or ability to walk',
    'Numbness': 'numbness',
}
for new_col, phrase in plexiform_flags.items():
    symptoms[new_col] = symptoms['Plexiforms Problems'].apply(
        lambda text: int(phrase in text.lower())
    )

In [249]:
# Remove the 'Plexiforms Nf Locations' column from the symptom table.
symptoms = symptoms.drop(columns=['Plexiforms Nf Locations'])

In [251]:
# ADHD: divide into learning_disability, attention_issue, and ADD/ADHD
# Each new column is 1 when the lowercased 'Adhd' answer mentions the phrase, else 0.
adhd_lower = symptoms['Adhd'].apply(lambda x: x.lower())
symptoms['learning disabilities'] = adhd_lower.apply(lambda x: 1 if 'learning disabilities' in x else 0)
symptoms['attention issues'] = adhd_lower.apply(lambda x: 1 if 'attention issues' in x else 0)
# The text is lowercased, so the needle must be lowercase too: the previous check for
# 'ADD' against x.lower() could never match and left this column all zeros.
# 'adhd' is tested as well because it does not contain the substring 'add'.
symptoms['ADD_ADHD'] = adhd_lower.apply(lambda x: 1 if ('add' in x or 'adhd' in x) else 0)
# The source column is fully encoded by the three indicators above.
symptoms = symptoms.drop('Adhd', axis = 1)

In [252]:
# CVS problems: add one 0/1 indicator column per condition listed below. A condition
# is flagged when its name appears in the lowercased 'Has Cvs Problems' answer.
cvs_types = [
    'heart murmur',
    'vascular or blood vessel issues',
    'high blood pressure',
    'moya moya',
    'heart defect',
    'pulmonic stenosis',
    'heart valve problems',
    'renal artery stenosis',
]

for typ in cvs_types:
    symptoms[typ] = symptoms['Has Cvs Problems'].apply(
        lambda answer: int(typ in answer.lower())
    )

# The free-text column is replaced by the indicators above.
symptoms = symptoms.drop(columns=['Has Cvs Problems'])

In [253]:
# Cysts / tumors: add one 0/1 indicator column per type listed below. A type is
# flagged when its name appears in the lowercased 'Has Cyst Tumor' answer.
cyst_tumor_types = [
    'adrenal tumor',
    'bone cysts',
    'brain tumor',
    'breast cancer',
    'gist',
    'gastrointestinal stromal tumors',
    'glomus tumors',
    'leukemia',
]

for typ in cyst_tumor_types:
    symptoms[typ] = symptoms['Has Cyst Tumor'].apply(
        lambda answer: int(typ in answer.lower())
    )

# The free-text column is replaced by the indicators above.
symptoms = symptoms.drop(columns=['Has Cyst Tumor'])

In [254]:
# growth hormonal problems
# Add one 0/1 indicator column per option below. Matching is case-sensitive: the option
# text must appear verbatim in the 'Growth Hormonal Problems' answer.
hormonal_types = [
    'Chewing/swallowing problems',
    'Constipation',
    'Early onset of puberty (prior to the age of 8)',
    'Hormonal imbalance that affected pregnancy or birth control',
    'Large head size',
    'Late onset of puberty (after the age of 17)',
    'Over (hyper) or under (hypo) active thyroid',
    'Poor weight gain in childhood',
    'Reflux or gastroesophageal reflux (GERD)',
    'Short stature',
]

for typ in hormonal_types:
    symptoms[typ] = symptoms['Growth Hormonal Problems'].apply(
        lambda answer: int(typ in answer)
    )

# The free-text column is replaced by the indicators above.
symptoms = symptoms.drop(columns=['Growth Hormonal Problems'])

In [255]:
# CNS
# Add one 0/1 indicator column per option below. Matching is case-sensitive: the option
# text must appear verbatim in the 'Has Cns Problems' answer.
cns_types = [
    'Anxiety',
    'Arthritis',
    'Depression',
    'Difficulties with social interactions',
    'Fatigue',
    'Headaches or migraines',
    'Hydrocephalus (extra fluid or "water on the brain")',
    'Joint pain',
    'Loose (hyperextensible) joints',
    'Muscle tone or coordination issues',
    'Other mental health diagnosis',
    'Seizures or epilepsy',
    'Sleep disturbances',
]

for typ in cns_types:
    symptoms[typ] = symptoms['Has Cns Problems'].apply(
        lambda answer: int(typ in answer)
    )

# The free-text column is replaced by the indicators above.
symptoms = symptoms.drop(columns=['Has Cns Problems'])

In [261]:
# Collapse the graded answer into a binary flag: 1 for any reported level of change
# (mild, moderate or severe), 0 for every other value.
symptoms['Vision Changes'] = (
    symptoms['Vision Changes']
    .isin(['Severe changes', 'Mild changes', 'Moderate changes'])
    .astype('int64')
)

In [267]:
# Remaining columns to remove from the symptom table in a single pass.
tmp = [
    'Gt Blood Done',
    'Gt Confirmed Nf1',
    'Spots Where',
    'Itching Treatment',
    'Bone Bowing',
    'Alternative Medicines',
    'Plexiforms Problems',
    'Family Nf1',
    'Ct Nf1 Participation',
    'Research Nf1',
]
symptoms = symptoms.drop(columns=tmp)