# Data Preprocessing

This document performs a basic cleaning of the existing patient registry data, in order to inform the choice of an appropriate classification algorithm and to create a clean dataset to use as training data.

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
# Load the 'Data' sheet of the registry workbook into `df`.
df = pd.read_excel(
    'data/NF Registry NF1 092420 v2.xls',
    sheet_name='Data',
)

In [52]:
# Load the 'Report' sheet of the same workbook into `refs`.
refs = pd.read_excel(
    'data/NF Registry NF1 092420 v2.xls',
    sheet_name='Report',
)

In [53]:
# Load the key table; its first CSV column becomes the index.
# Its `symptom` column is used later to rename the cleaned columns.
keys = pd.read_csv(
    'data/keys.csv',
    index_col=0,
)

Remove the patient ID column, given that there are no duplicates.

In [54]:
# Drop the identifier column; it is not used in the cleaning steps that follow.
df = df.drop(columns='Patient Id')

In [55]:
# Report how many rows (patients) were loaded.
print(len(df), 'patients data')

8223 patients data


## Subset
We are only interested in the symptom information.

In [56]:
# Keep the columns from position 30 up to (but excluding) the last two.
# NOTE(review): the 30 / -2 offsets are positional and depend on the workbook's
# column layout after 'Patient Id' was dropped — confirm if the export changes.
# Take an explicit copy: later cells add and overwrite columns on `symptoms`,
# and assigning into a bare slice of `df` triggers SettingWithCopyWarning and
# leaves it ambiguous whether `df` itself is modified.
symptoms = df.iloc[:, 30:-2].copy()

In [57]:
# Encode 'Has Pain' as 'yes'/'no' text so it can be handled like the other
# yes/no columns listed in `yes_list`.
has_pain = df['Has Pain'].eq(True)
symptoms['pain'] = np.where(has_pain, 'yes', 'no')

In [58]:
# Replace missing answers with empty strings so the substring checks in later
# cells (which call `.lower()` on each value) do not hit NaN.
symptoms = symptoms.fillna(value='')

In [59]:
# Columns removed from `symptoms` before encoding (see the "Drop list" section).
drop_list = [
    'Gt Blood Done', 'Gt Confirmed Nf1', 'Nf Severity', 'Spots Where',
    'Itching Details', 'Itching Treatment', 'Rx Response', 'Oral Medication',
    'Cream', 'Ketotifen', 'Turmeric', 'Vitamin', 'Other',
    'Age Neurofibromas Develop', 'Nf Treated',
    'Plexiforms Nf Locations', 'Plexiform Nf Rx', 'Spinal Nf Rx',
    'Cognitive Interventions', 'Cognitive Therapy',
    'Optic Glioma Age', 'Optic Glioma Rx',
    'Mpnst Diagnosed Age', 'Mpnst Rx',
    'Osteoporosis Rx', 'Scolosis Type', 'Scolosis Rx',
    'Bone Bowing Confirmed', 'Bone Bowing Rx',
    'Alternative Medicines', 'Alternative Medicines List',
    'Family Nf1', 'Family Members Nf',
    'Bc Age', 'Bc Rx',
    'Pregnancy', 'Pregnancy Nf',
    'Ct Nf1 Participation', 'Ct Symptoms Rx', 'Ct Date Enrolled',
    'Other Ct Treatment', 'Research Nf1',
    'Test Date', 'Gene', 'Source Tested', 'Source Tumor', 'Test Method',
    'Category', 'Std Nomenclature', 'Start Exon', 'Nucleotide', 'Amin Acid',
    'Pathogenicity', 'Frame',
]

In [60]:
# Symptom columns whose answers can contain several values; each is expanded
# individually in the "Expand list" section.
expand_list = [
    'Plexiforms Problems', 'Adhd', 'Has Cvs Problems', 'Has Cyst Tumor',
    'Growth Hormonal Problems', 'Has Cns Problems', 'Vision Changes',
]

In [61]:
# Symptom columns with yes/no style answers; these are converted to 0/1
# by `just_yes`.
yes_list = [
    'Has Spots', 'Freckles Armpit', 'Freckles Groin', 'Has Itching',
    'Plexiform Nf', 'Plexiforms Visible', 'Spinal Nf',
    'Has Cognitive Difficulties', 'Has Autism Disorder',
    'Optic Glioma Diagnosed', 'Lisch Nodules', 'Mpnst',
    'Sphenoid Wing Dysplasia', 'Fractures', 'Osteoporosis', 'Bone Bowing',
    'Breast Cancer', 'pain', 'Cutaneous Nf No', 'Scolosis',
]

#### Drop list
Remove all non-symptom columns

In [62]:
# Remove every column named in `drop_list`.
symptoms = symptoms.drop(columns=drop_list)

#### Yes list
If a column contains only yes/no answers, convert it to a binary (1/0) indicator.

In [63]:
def just_yes(series_):
    """Map each string in `series_` to 1 if it contains 'yes' (case-insensitive), else 0.

    The check is a substring match, so any value containing the letters
    'yes' counts as a yes. Every element must be a string, because
    `.lower()` is called on it.
    """
    def _contains_yes(answer):
        return int('yes' in answer.lower())

    return series_.apply(_contains_yes)

In [64]:
# Convert every yes/no column to a 0/1 indicator, one column at a time.
symptoms[yes_list] = symptoms[yes_list].apply(just_yes)

#### Expand list
For the rest, we will look at them individually.

In [65]:
# Show the multi-answer columns that the following cells expand one by one.
print(expand_list)

['Plexiforms Problems', 'Adhd', 'Has Cvs Problems', 'Has Cyst Tumor', 'Growth Hormonal Problems', 'Has Cns Problems', 'Vision Changes']


In [66]:
# plexiforms problems: one 0/1 column per phrase found (case-insensitively)
# in the 'Plexiforms Problems' answer. Pairs are (new column name, phrase),
# and the order here is the order in which the columns are appended.
plexiform_flags = [
    ('bowel or bladder control problem', 'bowel or bladder control'),
    ('breathing problem', 'breathing'),
    ('Problem with movement', 'movement or ability to walk'),
    ('Numbness', 'numbness'),
]
for flag_name, phrase in plexiform_flags:
    symptoms[flag_name] = symptoms['Plexiforms Problems'].apply(
        lambda answer: 1 if phrase in answer.lower() else 0
    )
symptoms = symptoms.drop(columns='Plexiforms Problems')

In [67]:
# Adhd: split the multi-answer column into three 0/1 flags:
# learning disabilities, attention issues, and ADD/ADHD.
adhd_answers = symptoms['Adhd'].apply(lambda x: x.lower())
symptoms['learning disabilities'] = adhd_answers.apply(lambda x: 1
                                                       if 'learning disabilities' in x else 0)
symptoms['attention issues'] = adhd_answers.apply(lambda x: 1
                                                  if 'attention issues' in x else 0)
# The answers are lowercased before matching, so the tokens must be lowercase
# too: the previous uppercase 'ADD' could never match a lowercased string and
# left this flag at 0 for every row.
# NOTE(review): confirm the exact wording of the ADD/ADHD option in the registry.
symptoms['ADD_ADHD'] = adhd_answers.apply(lambda x: 1
                                          if ('add' in x or 'adhd' in x) else 0)
symptoms = symptoms.drop('Adhd', axis = 1)

In [68]:
# Has Cvs Problems: add one 0/1 column per phrase in `cvs_types`, set to 1
# when the phrase occurs (case-insensitively) in the row's answer.
cvs_types = [
    'heart murmur',
    'vascular or blood vessel issues',
    'high blood pressure',
    'moya moya',
    'heart defect',
    'pulmonic stenosis',
    'heart valve problems',
    'renal artery stenosis',
]

cvs_answers = symptoms['Has Cvs Problems']
for typ in cvs_types:
    symptoms[typ] = [1 if typ in answer.lower() else 0 for answer in cvs_answers]

symptoms = symptoms.drop(columns='Has Cvs Problems')

In [69]:
# Has Cyst Tumor: add one 0/1 column per phrase in `cyst_tumor_types`, set to 1
# when the phrase occurs (case-insensitively) in the row's answer.
cyst_tumor_types = [
    'adrenal tumor',
    'bone cysts',
    'brain tumor',
    'breast cancer',
    'gist',
    'gastrointestinal stromal tumors',
    'glomus tumors',
    'leukemia',
]

# Lowercase each answer once instead of once per phrase.
cyst_answers_lower = [answer.lower() for answer in symptoms['Has Cyst Tumor']]
for typ in cyst_tumor_types:
    symptoms[typ] = [int(typ in text) for text in cyst_answers_lower]

symptoms = symptoms.drop(columns='Has Cyst Tumor')

In [70]:
# Growth Hormonal Problems: add one 0/1 column per option in `hormonal_types`.
# Matching is case-sensitive here: the option text must appear verbatim in
# the answer.
hormonal_types = [
    'Chewing/swallowing problems',
    'Constipation',
    'Early onset of puberty (prior to the age of 8)',
    'Hormonal imbalance that affected pregnancy or birth control',
    'Large head size',
    'Late onset of puberty (after the age of 17)',
    'Over (hyper) or under (hypo) active thyroid',
    'Poor weight gain in childhood',
    'Reflux or gastroesophageal reflux (GERD)',
    'Short stature',
]

hormonal_answers = symptoms['Growth Hormonal Problems']
for typ in hormonal_types:
    symptoms[typ] = hormonal_answers.apply(lambda answer: int(typ in answer))

symptoms = symptoms.drop(columns='Growth Hormonal Problems')

In [71]:
# Has Cns Problems: add one 0/1 column per option in `cns_types`.
# Matching is case-sensitive: the option text must appear verbatim in the answer.
cns_types = [
    'Anxiety',
    'Arthritis',
    'Depression',
    'Difficulties with social interactions',
    'Fatigue',
    'Headaches or migraines',
    'Hydrocephalus (extra fluid or "water on the brain")',
    'Joint pain',
    'Loose (hyperextensible) joints',
    'Muscle tone or coordination issues',
    'Other mental health diagnosis',
    'Seizures or epilepsy',
    'Sleep disturbances',
]

cns_answers = symptoms['Has Cns Problems']
for typ in cns_types:
    symptoms[typ] = [1 if typ in answer else 0 for answer in cns_answers]

symptoms = symptoms.drop(columns='Has Cns Problems')

In [72]:
# vision changes: 1 when the answer is exactly one of the listed levels,
# 0 for anything else (including blanks).
vision_change_levels = ['Severe changes', 'Mild changes', 'Moderate changes']
has_vision_change = symptoms['Vision Changes'].isin(vision_change_levels)
symptoms['Vision Changes'] = np.where(has_vision_change, 1, 0)

In [73]:
# 'Breast Cancer' (yes/no column) and 'breast cancer' (from the cyst/tumor
# expansion) describe the same thing: keep one column that is 1 when either is set.
breast_cancer_total = symptoms['Breast Cancer'] + symptoms['breast cancer']
symptoms['Breast Cancer'] = np.where(breast_cancer_total == 0, 0, 1)
symptoms = symptoms.drop(columns='breast cancer')

In [74]:
# 'gist' and 'gastrointestinal stromal tumors' were both produced by the
# cyst/tumor expansion: keep one column that is 1 when either is set.
gist_total = symptoms['gastrointestinal stromal tumors'] + symptoms['gist']
symptoms['gastrointestinal stromal tumors'] = np.where(gist_total == 0, 0, 1)
symptoms = symptoms.drop(columns='gist')

In [75]:
# Relabel the columns with the names in keys['symptom'].
# The assignment is positional, so `keys` must list one name per column in
# the same order as the current `symptoms` columns.
symptoms.columns = keys['symptom']

In [76]:
# Preview the first five rows of the encoded table.
symptoms.head()

symptom,Spots,Freckles on armpit,Freckles on groin,Itching,Cutaneous Neurofibromas,Plexiform Neurofibromas,Visible Plexiform Neurofibromas,Spinal Neurofibromas,Cognitive Difficulties,Autism Spectrum Disorder,...,Difficulties with social interactions,Fatigue,Headaches or migraines,Hydrocephalus,Joint pain,Loose joints,Muscle coordination issues,Other mental health problems,Seizures or epilepsy,Sleep disturbances
0,1,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
# Save the encoded symptom table as CSV; the row index is written as the
# first column because `index` is left at its default.
symptoms.to_csv('data/symptoms_existing_data.csv')