# EDA
This notebook performs basic exploratory data analysis (EDA) and data cleaning in order to choose an appropriate classification algorithm and to create a clean dataset to use as training data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Show what is in the data directory.
ls data

NF Registry NF1 092420 v2.xls


In [13]:
# Read the 'Data' sheet of the registry workbook into a DataFrame.
df = pd.read_excel(
    'data/NF Registry NF1 092420 v2.xls',
    sheet_name='Data',
)

In [29]:
# Read the 'Report' sheet of the same workbook into a separate DataFrame.
refs = pd.read_excel(
    'data/NF Registry NF1 092420 v2.xls',
    sheet_name='Report',
)

Remove the patient ID column, given that there are no duplicates.

In [22]:
# Drop the 'Patient Id' column.
# Note: re-running this cell without reloading df raises KeyError, because the column is already gone.
df = df.drop(columns='Patient Id')

In [34]:
# Report the number of rows in df.
print(len(df), 'patients data')

8223 patients data


## EDA
Since we are interested in the correlations between symptoms, I'll only look at the co-occurrence of different symptoms.

In [176]:
# Keep every row and the columns from position 30 up to, but not including, the last two.
# NOTE(review): presumably these positions bracket the symptom columns — confirm against the sheet layout.
symptoms = df.iloc[:, slice(30, -2)]

In [177]:
# Column names of the symptoms processed below. Later cells refer to these
# entries by position, so the order must not change.
target_symptoms = [
    'Plexiform Nf',              # 0
    'Spinal Nf',                 # 1
    'Adhd',                      # 2
    'Has Autism Disorder',       # 3
    'Has Cvs Problems',          # 4
    'Has Cyst Tumor',            # 5
    'Growth Hormonal Problems',  # 6
    'Has Cns Problems',          # 7
    'Optic Glioma Diagnosed',    # 8
    'Lisch Nodules',             # 9
    'Mpnst',                     # 10
    'Sphenoid Wing Dysplasia',   # 11
    'Osteoporosis',              # 12
    'Scolosis',                  # 13
]

In [178]:
# Replace missing values with empty strings so that the string checks in the
# following cells (`.lower()` and `in`) can be applied to every value.
symptoms = symptoms.fillna(value='')

In [179]:
# Plexiform NF: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[0]] = symptoms[target_symptoms[0]].map(
    lambda answer: int('yes' in answer.lower())
)

In [180]:
# Spinal NF: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[1]] = symptoms[target_symptoms[1]].map(
    lambda answer: int('yes' in answer.lower())
)

In [181]:
# ADHD: split the free-text answer into three 0/1 indicator columns
# (learning disabilities, attention issues, ADD/ADHD), then drop the raw column.
# Each answer is lower-cased before matching, so every search term must be
# lower case too; an upper-case term such as 'ADD' can never match and would
# leave the ADD_ADHD column all zeros.
symptoms['learning disabilities'] = symptoms[target_symptoms[2]].apply(lambda x: 1 if 'learning disabilities' in x.lower() else 0)
symptoms['attention issues'] = symptoms[target_symptoms[2]].apply(lambda x: 1 if 'attention issues' in x.lower() else 0)
# NOTE(review): these are plain substring matches — confirm that no unrelated
# answer text contains "add".
symptoms['ADD_ADHD'] = symptoms[target_symptoms[2]].apply(
    lambda x: 1 if ('add' in x.lower() or 'adhd' in x.lower()) else 0
)
symptoms = symptoms.drop(target_symptoms[2], axis = 1)

In [182]:
# Autism: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[3]] = symptoms[target_symptoms[3]].map(
    lambda answer: int('yes' in answer.lower())
)

In [183]:
# CVS problems: create one 0/1 indicator column per listed condition, set to 1
# when the lower-cased answer contains the condition name, then drop the raw
# answer column.
cvs_types = [
    'heart murmur',
    'vascular or blood vessel issues',
    'high blood pressure',
    'moya moya',
    'heart defect',
    'pulmonic stenosis',
    'heart valve problems',
    'renal artery stenosis',
]

for typ in cvs_types:
    # The lambda is evaluated inside this iteration, so it sees the current typ.
    symptoms[typ] = symptoms[target_symptoms[4]].map(
        lambda answer: int(typ in answer.lower())
    )
symptoms = symptoms.drop(columns=target_symptoms[4])

In [184]:
# Cysts / tumors: create one 0/1 indicator column per listed type, set to 1
# when the lower-cased answer contains the type name, then drop the raw
# answer column.
cyst_tumor_types = [
    'adrenal tumor',
    'bone cysts',
    'brain tumor',
    'breast cancer',
    'gist',
    'gastrointestinal stromal tumors',
    'glomus tumors',
    'leukemia',
]

for typ in cyst_tumor_types:
    # The lambda is evaluated inside this iteration, so it sees the current typ.
    symptoms[typ] = symptoms[target_symptoms[5]].map(
        lambda answer: int(typ in answer.lower())
    )
symptoms = symptoms.drop(columns=target_symptoms[5])

In [185]:
# Growth / hormonal problems: create one 0/1 indicator column per listed
# option, then drop the raw answer column. Unlike the cells above, the match
# here is case-sensitive: the option text is compared against the answer as-is.
hormonal_types = [
    'Chewing/swallowing problems',
    'Constipation',
    'Early onset of puberty (prior to the age of 8)',
    'Hormonal imbalance that affected pregnancy or birth control',
    'Large head size',
    'Late onset of puberty (after the age of 17)',
    'Over (hyper) or under (hypo) active thyroid',
    'Poor weight gain in childhood',
    'Reflux or gastroesophageal reflux (GERD)',
    'Short stature',
]
for typ in hormonal_types:
    # The lambda is evaluated inside this iteration, so it sees the current typ.
    symptoms[typ] = symptoms[target_symptoms[6]].map(
        lambda answer: int(typ in answer)
    )
symptoms = symptoms.drop(columns=target_symptoms[6])

In [186]:
# CNS problems: create one 0/1 indicator column per listed option, then drop
# the raw answer column. The match is case-sensitive: the option text is
# compared against the answer as-is.
cns_types = [
    'Anxiety',
    'Arthritis',
    'Depression',
    'Difficulties with social interactions',
    'Fatigue',
    'Headaches or migraines',
    'Hydrocephalus (extra fluid or "water on the brain")',
    'Joint pain',
    'Loose (hyperextensible) joints',
    'Muscle tone or coordination issues',
    'Other mental health diagnosis',
    'Seizures or epilepsy',
    'Sleep disturbances',
]
for typ in cns_types:
    # The lambda is evaluated inside this iteration, so it sees the current typ.
    symptoms[typ] = symptoms[target_symptoms[7]].map(
        lambda answer: int(typ in answer)
    )

symptoms = symptoms.drop(columns=target_symptoms[7])

In [187]:
# Optic glioma: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[8]] = symptoms[target_symptoms[8]].map(
    lambda answer: int('yes' in answer.lower())
)

In [188]:
# Lisch nodules: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[9]] = symptoms[target_symptoms[9]].map(
    lambda answer: int('yes' in answer.lower())
)

In [189]:
# MPNST: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[10]] = symptoms[target_symptoms[10]].map(
    lambda answer: int('yes' in answer.lower())
)

In [190]:
# Sphenoid wing dysplasia: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[11]] = symptoms[target_symptoms[11]].map(
    lambda answer: int('yes' in answer.lower())
)

In [191]:
# Osteoporosis: 1 when the answer contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[12]] = symptoms[target_symptoms[12]].map(
    lambda answer: int('yes' in answer.lower())
)

In [192]:
# Scoliosis (the column is spelled 'Scolosis' in the data): 1 when the answer
# contains "yes" (case-insensitive), 0 otherwise.
symptoms[target_symptoms[13]] = symptoms[target_symptoms[13]].map(
    lambda answer: int('yes' in answer.lower())
)