# Data Preprocessing

This notebook performs basic cleaning of the existing patient registry in order to inform the choice of an appropriate classification algorithm and to create a clean dataset to use as training data.

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [113]:
# Load the registry's 'Data' sheet; the symptom table built below is sliced from this frame.
df = pd.read_excel(
    io='data/NF Registry NF1 092420 v2.xls',
    sheet_name='Data',
)

In [114]:
# Load the 'Report' sheet of the same workbook.
# NOTE(review): `refs` is not referenced by the cells visible here — confirm it is still needed.
refs = pd.read_excel(
    io='data/NF Registry NF1 092420 v2.xls',
    sheet_name='Report',
)

In [115]:
# Load the key table, using its first column as the index.
# NOTE(review): `keys` is not referenced by the cells visible here — confirm it is still needed.
keys = pd.read_csv(
    filepath_or_buffer='data/keys.csv',
    index_col=0,
)

Remove the patient ID column, given that there are no duplicates.

In [116]:
# The identifier column is not needed for the symptom table built below.
df = df.drop(columns='Patient Id')

In [117]:
# Report how many registry rows (one per patient) were loaded.
print(len(df), 'patients data')

8223 patients data


## Subset
We are only interested in the symptom information.

In [118]:
# Keep only the symptom information: column positions 30 up to (not including)
# the last two.
# NOTE(review): these offsets are tied to the current sheet layout — confirm
# them if the workbook changes.
# .copy() makes `symptoms` an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning or depend on `df`.
symptoms = df.iloc[:, 30:-2].copy()

In [119]:
# Derive a textual yes/no 'pain' answer so it can be encoded like the other yes/no columns.
# NOTE(review): assumes 'Has Pain' holds real booleans; any other value becomes 'no' — confirm.
symptoms['pain'] = np.where(df['Has Pain'].eq(True), 'yes', 'no')

In [120]:
# Replace missing answers with empty strings so the substring checks below
# (which call .lower() / `in` on every cell) work on every row.
symptoms = symptoms.fillna(value='')

In [121]:
# Columns removed from `symptoms` in the "Drop list" step below
# (symptoms.drop(drop_list, axis = 1)); the notebook treats these as
# non-symptom columns.
drop_list = ['Gt Blood Done',
 'Gt Confirmed Nf1',
 'Nf Severity',
 'Spots Where',
 'Itching Details',
 'Itching Treatment',
 'Rx Response',
 'Oral Medication',
 'Cream',
 'Ketotifen',
 'Turmeric',
 'Vitamin',
 'Other',
 'Age Neurofibromas Develop',
 'Nf Treated',
 'Plexiforms Nf Locations',
 'Plexiform Nf Rx',
 'Spinal Nf Rx',
 'Cognitive Interventions',
 'Cognitive Therapy',
 'Optic Glioma Age',
 'Optic Glioma Rx',
 'Mpnst Diagnosed Age',
 'Mpnst Rx',
 'Osteoporosis Rx',
 'Scolosis Type',
 'Scolosis Rx',
 'Bone Bowing Confirmed',
 'Bone Bowing Rx',
 'Alternative Medicines',
 'Alternative Medicines List',
 'Family Nf1',
 'Family Members Nf',
 'Bc Age',
 'Bc Rx',
 'Pregnancy',
 'Pregnancy Nf',
 'Ct Nf1 Participation',
 'Ct Symptoms Rx',
 'Ct Date Enrolled',
 'Other Ct Treatment',
 'Research Nf1',
 'Test Date',
 'Gene',
 'Source Tested',
 'Source Tumor',
 'Test Method',
 'Category',
 'Std Nomenclature',
 'Start Exon',
 'Nucleotide',
 'Amin Acid',
 'Pathogenicity',
 'Frame']

In [122]:
# symptom criteria with multiple answers
# Each of these columns is handled individually in the "Expand list" section:
# all but 'Vision Changes' are expanded into one 0/1 column per answer option
# and then dropped; 'Vision Changes' is collapsed in place to a single 0/1 flag.
expand_list = [
 'Plexiforms Problems',
 'Adhd',
 'Has Cvs Problems',
 'Has Cyst Tumor',
 'Growth Hormonal Problems',
 'Has Cns Problems',
 'Vision Changes']

In [123]:
# symptom criteria with boolean answers
# These columns are converted to 0/1 by just_yes() in the "Yes list" step:
# a cell becomes 1 when its text contains 'yes' (case-insensitive), else 0.
# 'pain' is the helper column derived from df['Has Pain'] earlier in the notebook.
yes_list = ['Has Spots',
 'Freckles Armpit',
 'Freckles Groin',
 'Has Itching',
 'Plexiform Nf',
 'Plexiforms Visible',
 'Spinal Nf',
 'Has Cognitive Difficulties',
 'Has Autism Disorder',
 'Optic Glioma Diagnosed',
 'Lisch Nodules',
 'Mpnst',
 'Sphenoid Wing Dysplasia',
 'Fractures',
 'Osteoporosis',
 'Bone Bowing',
 'Breast Cancer', 
 'pain', 
 'Cutaneous Nf No', 
 'Scolosis']

#### Drop list
Remove all non-symptom columns

In [124]:
# Discard every column named in drop_list; raises KeyError if any is missing.
symptoms = symptoms.drop(columns=drop_list)

#### Yes list
If a column contains only yes/no answers, convert it to a binary (1/0) column.

In [125]:
def just_yes(series_):
    """Encode a yes/no answer column as 1/0.

    Parameters
    ----------
    series_ : pandas.Series
        Raw answers. Cells may be of any type.

    Returns
    -------
    pandas.Series
        1 where the cell's text contains 'yes' (case-insensitive), 0 otherwise.
        The index of `series_` is preserved.
    """
    # str() lets non-string cells (NaN, None, numbers, booleans) count as
    # "not yes" instead of raising AttributeError on .lower().
    return series_.apply(lambda x: 1 if 'yes' in str(x).lower() else 0)

In [126]:
# DataFrame.apply hands each yes/no column to just_yes as a Series.
symptoms[yes_list] = symptoms[yes_list].apply(just_yes)

#### Expand list
For the rest, we will look at them individually.

In [127]:
# Show the multi-answer columns that the following cells handle one by one.
print(expand_list)

['Plexiforms Problems', 'Adhd', 'Has Cvs Problems', 'Has Cyst Tumor', 'Growth Hormonal Problems', 'Has Cns Problems', 'Vision Changes']


In [128]:
# plexiforms problems
# New indicator column -> phrase that must appear (case-insensitively) in the
# free-text 'Plexiforms Problems' answer for the indicator to be 1.
plexiform_phrases = {
    'bowel or bladder control problem': 'bowel or bladder control',
    'breathing problem': 'breathing',
    'Problem with movement': 'movement or ability to walk',
    'Numbness': 'numbness',
}
for flag_column, phrase in plexiform_phrases.items():
    symptoms[flag_column] = [
        1 if phrase in answer.lower() else 0
        for answer in symptoms['Plexiforms Problems']
    ]

# The raw answer column is no longer needed once it has been expanded.
symptoms = symptoms.drop(columns='Plexiforms Problems')

In [129]:
# ADHD: divide into learning_disability, attention_issue, and ADD/ADHD
# Each flag is 1 when the lower-cased 'Adhd' answer contains the keyword, else 0.
symptoms['learning disabilities'] = symptoms['Adhd'].apply(lambda x: 1 
                                                           if 'learning disabilities' in x.lower() else 0)
symptoms['attention issues'] = symptoms['Adhd'].apply(lambda x: 1 
                                                      if 'attention issues' in x.lower() else 0)
# The keywords must be lower-case because they are compared against x.lower();
# an upper-case keyword such as 'ADD' can never match and would leave the
# column all zeros.
# NOTE(review): 'add' also matches unrelated words such as "additional" —
# confirm against the actual answer options.
symptoms['ADD_ADHD'] = symptoms['Adhd'].apply(lambda x: 1 
                                              if 'add' in x.lower() or 'adhd' in x.lower() else 0)
# The raw answer column is no longer needed once it has been expanded.
symptoms = symptoms.drop('Adhd', axis = 1)

In [130]:
# Has Cvs Problems
# Expand the free-text cardiovascular answer into one 0/1 column per condition.
# A condition is flagged when its name appears in the lower-cased answer.
cvs_types = [
    'heart murmur',
    'vascular or blood vessel issues',
    'high blood pressure',
    'moya moya',
    'heart defect',
    'pulmonic stenosis',
    'heart valve problems',
    'renal artery stenosis',
]

for condition in cvs_types:
    symptoms[condition] = [
        1 if condition in answer.lower() else 0
        for answer in symptoms['Has Cvs Problems']
    ]

# The raw answer column is no longer needed once it has been expanded.
symptoms = symptoms.drop(columns='Has Cvs Problems')

In [131]:
# Has Cyst Tumor
# Expand the free-text cyst/tumor answer into one 0/1 column per tumor type,
# matched case-insensitively. 'gist' and 'breast cancer' are merged into
# their sibling columns in later cells.
cyst_tumor_types = [
    'adrenal tumor',
    'bone cysts',
    'brain tumor',
    'breast cancer',
    'gist',
    'gastrointestinal stromal tumors',
    'glomus tumors',
    'leukemia',
]

for tumor in cyst_tumor_types:
    symptoms[tumor] = [
        1 if tumor in answer.lower() else 0
        for answer in symptoms['Has Cyst Tumor']
    ]

# The raw answer column is no longer needed once it has been expanded.
symptoms = symptoms.drop(columns='Has Cyst Tumor')

In [132]:
# growth hormonal problems
# Expand the answer into one 0/1 column per option. Unlike the cells above,
# the match here is case-sensitive: the option text must appear verbatim.
hormonal_types = [
    'Chewing/swallowing problems',
    'Constipation',
    'Early onset of puberty (prior to the age of 8)',
    'Hormonal imbalance that affected pregnancy or birth control',
    'Large head size',
    'Late onset of puberty (after the age of 17)',
    'Over (hyper) or under (hypo) active thyroid',
    'Poor weight gain in childhood',
    'Reflux or gastroesophageal reflux (GERD)',
    'Short stature',
]

for option in hormonal_types:
    symptoms[option] = [
        1 if option in answer else 0
        for answer in symptoms['Growth Hormonal Problems']
    ]

# The raw answer column is no longer needed once it has been expanded.
symptoms = symptoms.drop(columns='Growth Hormonal Problems')

In [133]:
# Has Cns Problems
# Expand the answer into one 0/1 column per option; the option text must
# appear verbatim (case-sensitive) in the answer.
cns_types = [
    'Anxiety',
    'Arthritis',
    'Depression',
    'Difficulties with social interactions',
    'Fatigue',
    'Headaches or migraines',
    'Hydrocephalus (extra fluid or "water on the brain")',
    'Joint pain',
    'Loose (hyperextensible) joints',
    'Muscle tone or coordination issues',
    'Other mental health diagnosis',
    'Seizures or epilepsy',
    'Sleep disturbances',
]

for option in cns_types:
    symptoms[option] = [
        1 if option in answer else 0
        for answer in symptoms['Has Cns Problems']
    ]

# The raw answer column is no longer needed once it has been expanded.
symptoms = symptoms.drop(columns='Has Cns Problems')

In [134]:
# vision changes
# Collapse the graded answer into a flag: 1 when the cell is exactly one of
# the listed change levels, 0 for anything else.
vision_change_levels = ['Severe changes', 'Mild changes', 'Moderate changes']
symptoms['Vision Changes'] = symptoms['Vision Changes'].isin(vision_change_levels).astype(int)

In [135]:
# there are two breast cancer columns, combine them
# The merged flag is 1 unless the two indicators sum to zero.
breast_cancer_hits = symptoms['Breast Cancer'] + symptoms['breast cancer']
symptoms['Breast Cancer'] = np.where(breast_cancer_hits != 0, 1, 0)
symptoms = symptoms.drop(columns='breast cancer')

In [136]:
# combine two gist columns
# The merged flag is 1 unless the full-name and 'gist' indicators sum to zero.
gist_hits = symptoms['gastrointestinal stromal tumors'] + symptoms['gist']
symptoms['gastrointestinal stromal tumors'] = np.where(gist_hits != 0, 1, 0)
symptoms = symptoms.drop(columns='gist')

For this particular dataset, we combine similar symptoms that are separated only by location (Spots / Freckles) and add a few other symptoms that may come up even though they are not directly related to NF.

In [137]:
# 'Spots' is 1 when any of the three location-specific spot/freckle flags is set.
spot_hits = symptoms['Has Spots'] + symptoms['Freckles Armpit'] + symptoms['Freckles Groin']
symptoms['Spots'] = np.where(spot_hits > 0, 1, 0)

In [138]:
# The location-specific columns are redundant once combined into 'Spots'.
symptoms = symptoms.drop(columns=['Has Spots', 'Freckles Armpit', 'Freckles Groin'])

In [139]:
# 'Heart Problem' is 1 when any of the three heart-specific flags is set;
# the individual flags are kept alongside it.
heart_hits = (
    symptoms['heart murmur']
    + symptoms['heart defect']
    + symptoms['heart valve problems']
)
symptoms['Heart Problem'] = np.where(heart_hits > 0, 1, 0)

In [140]:
# adding a few more columns
# These symptoms are not recorded in the registry, so every row gets 0.
for placeholder in ('Fever', 'Nosebleed'):
    symptoms[placeholder] = 0

In [142]:
# Maps each working column name in `symptoms` to the display label applied by
# symptoms.rename(columns = labels) in the next step. Entries whose key and
# value are identical leave that column name unchanged.
labels = {'Has Itching': 'Itching',
 'Cutaneous Nf No': 'Cutaneous Neurofibromas', 
 'Plexiform Nf': 'Plexiform Neurofibromas', 
 'Plexiforms Visible': 'Visible Plexiform Neurofibromas',
 'Spinal Nf': 'Spinal Neurofibromas', 
 'Has Cognitive Difficulties': 'Cognitive Difficulties', 
 'Has Autism Disorder': 'Autism Spectrum Disorder',
 'Optic Glioma Diagnosed': 'Optic Glioma', 
 'Vision Changes': 'Vision Changes', 
 'Lisch Nodules': 'Lisch Nodules', 
 'Mpnst': 'Malignant Peripheral Nerve Sheath Tumor (MPNST)',
 'Sphenoid Wing Dysplasia': 'Sphenoid Wing Dysplasia', 
 'Fractures': 'Fractures', 
 'Osteoporosis': 'Osteoporosis', 
 'Scolosis': 'Scoliosis',
 'Bone Bowing': 'Bone Bowing', 
 'Breast Cancer': 'Breast Cancer', 
 'pain': 'Pain',
 'bowel or bladder control problem': 'Bowel or bladder control problems', 
 'breathing problem': 'Breathing problems',
 'Problem with movement': 'Problem with movement', 
 'Numbness': 'Numbness', 
 'learning disabilities': 'Learning difficulties',
 'attention issues': 'Attention issues', 
 'ADD_ADHD': 'ADD or ADHD', 
 'Heart Problem': 'Heart Problem', 
 'heart murmur': 'Heart murmur',
 'vascular or blood vessel issues': 'Vascular or blood vessel issues',
 'high blood pressure': 'High blood pressure', 
 'moya moya': 'Moyamoya',
 'heart defect': 'Heart defect',
 'pulmonic stenosis': 'Pulmonic Stenosis',
 'heart valve problems': 'Heart valve problems',
 'renal artery stenosis': 'Renal artery stenosis',
 'adrenal tumor': 'Adrenal tumor',
 'bone cysts': 'Bone cysts', 
 'brain tumor': 'Brain tumor',
 'gastrointestinal stromal tumors': 'Gastrointestinal stromal tumors (GISTs)',
 'glomus tumors': 'Glomus tumors',
 'leukemia': 'Leukemia',
 'Chewing/swallowing problems': 'Chewing or swallowing problems', 
 'Constipation': 'Constipation',
 'Early onset of puberty (prior to the age of 8)': 'Early puberty',
 'Hormonal imbalance that affected pregnancy or birth control': 'Hormonal imbalance',
 'Large head size': 'Large head size',
 'Late onset of puberty (after the age of 17)': 'Late puberty',
 'Over (hyper) or under (hypo) active thyroid': 'Hyper or hypoactive thyroid',
 'Poor weight gain in childhood': 'Poor weight gain',
 'Reflux or gastroesophageal reflux (GERD)': 'Gastroesophageal reflux', 
 'Short stature': 'Short stature',
 'Anxiety': 'Anxiety',
 'Arthritis': 'Arthritis',
 'Depression': 'Depression', 
 'Difficulties with social interactions': 'Difficulties with social interactions',
 'Fatigue': 'Fatigue', 
 'Headaches or migraines': 'Headaches or migraines',
 'Hydrocephalus (extra fluid or "water on the brain")': 'Hydrocephalus',
 'Joint pain': 'Joint pain',
 'Loose (hyperextensible) joints': 'Loose joints', 
 'Muscle tone or coordination issues': 'Muscle coordination issues',
 'Other mental health diagnosis': 'Other mental health problems', 
 'Seizures or epilepsy': 'Seizures or epilepsy',
 'Sleep disturbances': 'Sleep disturbances', 
 'Fever': 'Fever',
 'Spots': 'Spots', 
 'Nosebleed': 'Nosebleed'}

In [144]:
# Swap the working column names for their display labels; columns without an
# entry in `labels` keep their current name.
symptoms = symptoms.rename(labels, axis='columns')

In [145]:
# Preview the first five rows of the cleaned table (head() defaults to n=5).
symptoms.head()

Unnamed: 0,Itching,Cutaneous Neurofibromas,Plexiform Neurofibromas,Visible Plexiform Neurofibromas,Spinal Neurofibromas,Cognitive Difficulties,Autism Spectrum Disorder,Optic Glioma,Vision Changes,Lisch Nodules,...,Joint pain,Loose joints,Muscle coordination issues,Other mental health problems,Seizures or epilepsy,Sleep disturbances,Spots,Heart Problem,Fever,Nosebleed
0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [146]:
# save symptoms
# The row index is written as the first, unnamed CSV column (to_csv default).
symptoms.to_csv(path_or_buf='data/symptoms_existing_data.csv')