Dataset from https://www.kaggle.com/datasets/aibuzz/predict-the-genetic-disorders-datasetof-genomes

In [1]:
# import libraries
import math
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
sns.set_style('darkgrid')

In [2]:
#load the data into a dataframe
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

#inspect the dimensons
print('There are {} rows and {} columns for training.'.format(train_df.shape[0], train_df.shape[1]))
print('There are {} rows and {} columns for testing.'.format(test_df.shape[0], test_df.shape[1]))

There are 22083 rows and 45 columns for training.
There are 9465 rows and 43 columns for testing.


In [3]:
pd.options.display.max_rows = 4000

In [4]:
train_df.head(10)

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,,Larre,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,,Brycen,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,,Nashon,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.70528,Jeffery,Hoelscher,Aayaan,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,,Yes,4.720703,Johanna,Stutzman,Suave,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer
5,PID0x44fe,14.0,Yes,No,Yes,No,5.103188,Richard,,Coleston,...,Multiple,10.27223,normal,1.0,0.0,0.0,1.0,0.0,Single-gene inheritance diseases,Cystic fibrosis
6,PID0x28de,3.0,Yes,No,Yes,Yes,4.90108,Mary,,Aydun,...,Multiple,6.825974,normal,0.0,0.0,0.0,0.0,0.0,Single-gene inheritance diseases,Tay-Sachs
7,PID0x4f8f,3.0,No,No,Yes,Yes,4.964816,Emma,Bryant,Keng,...,Singular,9.836352,inconclusive,0.0,0.0,1.0,,0.0,Single-gene inheritance diseases,Tay-Sachs
8,PID0x8ce3,11.0,No,No,Yes,No,5.209058,Willie,Camacho,Tr,...,Multiple,6.669552,slightly abnormal,1.0,1.0,1.0,0.0,1.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
9,PID0x8660,4.0,No,Yes,Yes,Yes,4.752272,John,Sandoval,Gregori,...,Multiple,6.397702,abnormal,0.0,0.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes


In [5]:
# get types for each variable
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22083 entries, 0 to 22082
Data columns (total 45 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Patient Id                                        22083 non-null  object 
 1   Patient Age                                       20656 non-null  float64
 2   Genes in mother's side                            22083 non-null  object 
 3   Inherited from father                             21777 non-null  object 
 4   Maternal gene                                     19273 non-null  object 
 5   Paternal gene                                     22083 non-null  object 
 6   Blood cell count (mcL)                            22083 non-null  float64
 7   Patient First Name                                22083 non-null  object 
 8   Family Name                                       12392 non-null  object 
 9   Father's name    

In [6]:
#get rid of stuff like id and names
train_df = train_df[["Patient Age", 
                    "Genes in mother's side", 
                    "Inherited from father", 
                    "Maternal gene", 
                    "Paternal gene", 
                    "Blood cell count (mcL)",
                    "Status", 
                    "Respiratory Rate (breaths/min)", 
                    "Heart Rate (rates/min", 
                    "Test 1",
                    "Test 2",
                    "Test 3",
                    "Test 4",
                    "Test 5",
                    "Follow-up", 
                    "Gender", 
                    "Birth asphyxia", 
                    "Autopsy shows birth defect (if applicable)",
                    "Folic acid details (peri-conceptional)", 
                    "H/O serious maternal illness", 
                    "H/O radiation exposure (x-ray)", 
                    "H/O substance abuse", 
                    "Assisted conception IVF/ART", 
                    "History of anomalies in previous pregnancies", 
                    "No. of previous abortion", 
                    "Birth defects", 
                    "White Blood cell count (thousand per microliter)", 
                    "Blood test result", 
                    "Symptom 1", 
                    "Symptom 2", 
                    "Symptom 3", 
                    "Symptom 4", 
                    "Symptom 5", 
                    "Genetic Disorder", 
                    "Disorder Subclass"]]

train_df.head(10)

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Status,Respiratory Rate (breaths/min),Heart Rate (rates/min,Test 1,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,2.0,Yes,No,Yes,No,4.760603,Alive,Normal (30-60),Normal,0.0,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,Deceased,Tachypnea,Normal,,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,Alive,Normal (30-60),Tachycardia,0.0,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.70528,Deceased,Tachypnea,Normal,0.0,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,,Yes,4.720703,Alive,Tachypnea,Tachycardia,0.0,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer
5,14.0,Yes,No,Yes,No,5.103188,Deceased,,Normal,0.0,...,Multiple,10.27223,normal,1.0,0.0,0.0,1.0,0.0,Single-gene inheritance diseases,Cystic fibrosis
6,3.0,Yes,No,Yes,Yes,4.90108,Alive,Normal (30-60),,,...,Multiple,6.825974,normal,0.0,0.0,0.0,0.0,0.0,Single-gene inheritance diseases,Tay-Sachs
7,3.0,No,No,Yes,Yes,4.964816,Alive,Tachypnea,Normal,0.0,...,Singular,9.836352,inconclusive,0.0,0.0,1.0,,0.0,Single-gene inheritance diseases,Tay-Sachs
8,11.0,No,No,Yes,No,5.209058,Alive,Tachypnea,Tachycardia,0.0,...,Multiple,6.669552,slightly abnormal,1.0,1.0,1.0,0.0,1.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
9,4.0,No,Yes,Yes,Yes,4.752272,Alive,Tachypnea,Tachycardia,0.0,...,Multiple,6.397702,abnormal,0.0,0.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes


In [7]:
# get types for each variable
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22083 entries, 0 to 22082
Data columns (total 35 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Patient Age                                       20656 non-null  float64
 1   Genes in mother's side                            22083 non-null  object 
 2   Inherited from father                             21777 non-null  object 
 3   Maternal gene                                     19273 non-null  object 
 4   Paternal gene                                     22083 non-null  object 
 5   Blood cell count (mcL)                            22083 non-null  float64
 6   Status                                            22083 non-null  object 
 7   Respiratory Rate (breaths/min)                    19934 non-null  object 
 8   Heart Rate (rates/min                             19970 non-null  object 
 9   Test 1           

In [8]:
#Check VendorID for invalid values
for col in train_df.columns:
    print(train_df[col].value_counts(dropna=False))
    print()
    
# get rid of all the tests and autopsy info

4.0     1435
12.0    1435
NaN     1427
9.0     1415
2.0     1396
5.0     1394
0.0     1386
13.0    1384
3.0     1383
6.0     1374
1.0     1364
11.0    1353
7.0     1351
8.0     1340
14.0    1333
10.0    1313
Name: Patient Age, dtype: int64

Yes    13143
No      8940
Name: Genes in mother's side, dtype: int64

No     13133
Yes     8644
NaN      306
Name: Inherited from father, dtype: int64

Yes    10647
No      8626
NaN     2810
Name: Maternal gene, dtype: int64

No     12508
Yes     9575
Name: Paternal gene, dtype: int64

4.760603    1
4.800413    1
4.874316    1
4.592124    1
5.190047    1
           ..
5.128740    1
4.769116    1
4.916183    1
4.717727    1
4.738067    1
Name: Blood cell count (mcL), Length: 22083, dtype: int64

Alive       11083
Deceased    11000
Name: Status, dtype: int64

Normal (30-60)    10065
Tachypnea          9869
NaN                2149
Name: Respiratory Rate (breaths/min), dtype: int64

Normal         10187
Tachycardia     9783
NaN             2113
Name: He

In [9]:
train_df = train_df.replace('-', np.nan)

test_df = test_df.replace('-', np.nan)
test_df = test_df.replace('-99', np.nan)

In [10]:
#Check VendorID for invalid values
for col in train_df.columns:
    print(train_df[col].value_counts(dropna=False))
    print()

4.0     1435
12.0    1435
NaN     1427
9.0     1415
2.0     1396
5.0     1394
0.0     1386
13.0    1384
3.0     1383
6.0     1374
1.0     1364
11.0    1353
7.0     1351
8.0     1340
14.0    1333
10.0    1313
Name: Patient Age, dtype: int64

Yes    13143
No      8940
Name: Genes in mother's side, dtype: int64

No     13133
Yes     8644
NaN      306
Name: Inherited from father, dtype: int64

Yes    10647
No      8626
NaN     2810
Name: Maternal gene, dtype: int64

No     12508
Yes     9575
Name: Paternal gene, dtype: int64

4.760603    1
4.800413    1
4.874316    1
4.592124    1
5.190047    1
           ..
5.128740    1
4.769116    1
4.916183    1
4.717727    1
4.738067    1
Name: Blood cell count (mcL), Length: 22083, dtype: int64

Alive       11083
Deceased    11000
Name: Status, dtype: int64

Normal (30-60)    10065
Tachypnea          9869
NaN                2149
Name: Respiratory Rate (breaths/min), dtype: int64

Normal         10187
Tachycardia     9783
NaN             2113
Name: He

In [11]:
percent_missing = train_df.isnull().sum() * 100 / len(train_df)
print(percent_missing)

# get rid of H/O radiation exposure (x-ray) and H/O substance abuse and birth asphyxia

Patient Age                                          6.461984
Genes in mother's side                               0.000000
Inherited from father                                1.385681
Maternal gene                                       12.724720
Paternal gene                                        0.000000
Blood cell count (mcL)                               0.000000
Status                                               0.000000
Respiratory Rate (breaths/min)                       9.731468
Heart Rate (rates/min                                9.568446
Test 1                                               9.631843
Test 2                                               9.745053
Test 3                                               9.722411
Test 4                                               9.690712
Test 5                                               9.826563
Follow-up                                            9.808450
Gender                                               9.840149
Birth as

In [12]:
train_df = train_df[["Patient Age", 
                    "Genes in mother's side", 
                    "Inherited from father", 
                    "Maternal gene", 
                    "Paternal gene", 
                    "Blood cell count (mcL)",
                    "Status", 
                    "Respiratory Rate (breaths/min)", 
                    "Heart Rate (rates/min", 
                    "Follow-up", 
                    "Gender", 
                    "Folic acid details (peri-conceptional)", 
                    "H/O serious maternal illness", 
                    "Assisted conception IVF/ART", 
                    "History of anomalies in previous pregnancies", 
                    "No. of previous abortion", 
                    "Birth defects", 
                    "White Blood cell count (thousand per microliter)", 
                    "Blood test result", 
                    "Symptom 1", 
                    "Symptom 2", 
                    "Symptom 3", 
                    "Symptom 4", 
                    "Symptom 5", 
                    "Genetic Disorder", 
                    "Disorder Subclass"]]

test_df = test_df[["Patient Age", 
                    "Genes in mother's side", 
                    "Inherited from father", 
                    "Maternal gene", 
                    "Paternal gene", 
                    "Blood cell count (mcL)",
                    "Status", 
                    "Respiratory Rate (breaths/min)", 
                    "Heart Rate (rates/min", 
                    "Follow-up", 
                    "Gender", 
                    "Folic acid details (peri-conceptional)", 
                    "H/O serious maternal illness", 
                    "Assisted conception IVF/ART", 
                    "History of anomalies in previous pregnancies", 
                    "No. of previous abortion", 
                    "Birth defects", 
                    "White Blood cell count (thousand per microliter)", 
                    "Blood test result", 
                    "Symptom 1", 
                    "Symptom 2", 
                    "Symptom 3", 
                    "Symptom 4", 
                    "Symptom 5"]]

train_df.head(10)

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Status,Respiratory Rate (breaths/min),Heart Rate (rates/min,Follow-up,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,2.0,Yes,No,Yes,No,4.760603,Alive,Normal (30-60),Normal,High,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,Deceased,Tachypnea,Normal,High,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,Alive,Normal (30-60),Tachycardia,Low,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.70528,Deceased,Tachypnea,Normal,High,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,,Yes,4.720703,Alive,Tachypnea,Tachycardia,Low,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer
5,14.0,Yes,No,Yes,No,5.103188,Deceased,,Normal,Low,...,Multiple,10.27223,normal,1.0,0.0,0.0,1.0,0.0,Single-gene inheritance diseases,Cystic fibrosis
6,3.0,Yes,No,Yes,Yes,4.90108,Alive,Normal (30-60),,Low,...,Multiple,6.825974,normal,0.0,0.0,0.0,0.0,0.0,Single-gene inheritance diseases,Tay-Sachs
7,3.0,No,No,Yes,Yes,4.964816,Alive,Tachypnea,Normal,Low,...,Singular,9.836352,inconclusive,0.0,0.0,1.0,,0.0,Single-gene inheritance diseases,Tay-Sachs
8,11.0,No,No,Yes,No,5.209058,Alive,Tachypnea,Tachycardia,Low,...,Multiple,6.669552,slightly abnormal,1.0,1.0,1.0,0.0,1.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
9,4.0,No,Yes,Yes,Yes,4.752272,Alive,Tachypnea,Tachycardia,Low,...,Multiple,6.397702,abnormal,0.0,0.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes


In [13]:
train_df.isnull().sum(axis=1).sort_values(ascending = False) #count the frequency of missing values in each row (i.e. how many columns are empty)

16845    11
15402    10
5050     10
18594    10
8911     10
         ..
2432      0
12911     0
12913     0
12915     0
22082     0
Length: 22083, dtype: int64

In [14]:
#Handle missing data -- deletion
train_df = train_df[train_df.isnull().sum(axis=1) < 5] #delete rows with 1 or more missing variables (in each row)

train_df.isnull().sum(axis=1).sort_values(ascending = False) #count the frequency of missing values in each row (i.e. how many columns are empty)

1        4
15887    4
15972    4
4638     4
15959    4
        ..
4446     0
16229    0
16228    0
16227    0
22082    0
Length: 19554, dtype: int64

In [15]:
# get types for each variable
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19554 entries, 1 to 22082
Data columns (total 26 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Patient Age                                       18587 non-null  float64
 1   Genes in mother's side                            19554 non-null  object 
 2   Inherited from father                             19354 non-null  object 
 3   Maternal gene                                     17602 non-null  object 
 4   Paternal gene                                     19554 non-null  object 
 5   Blood cell count (mcL)                            19554 non-null  float64
 6   Status                                            19554 non-null  object 
 7   Respiratory Rate (breaths/min)                    18087 non-null  object 
 8   Heart Rate (rates/min                             18115 non-null  object 
 9   Follow-up        

In [16]:
train_df["Genes in mother's side"] = train_df["Genes in mother's side"].map(dict(Yes=True, No=False))
train_df["Inherited from father"] = train_df["Inherited from father"].map(dict(Yes=True, No=False))
train_df["Maternal gene"] = train_df["Maternal gene"].map(dict(Yes=True, No=False))
train_df["Paternal gene"] = train_df["Paternal gene"].map(dict(Yes=True, No=False))
train_df["Folic acid details (peri-conceptional)"] = train_df["Folic acid details (peri-conceptional)"].map(dict(Yes=True, No=False))
train_df["H/O serious maternal illness"] = train_df["H/O serious maternal illness"].map(dict(Yes=True, No=False))
train_df["Assisted conception IVF/ART"] = train_df["Assisted conception IVF/ART"].map(dict(Yes=True, No=False))
train_df["History of anomalies in previous pregnancies"] = train_df["History of anomalies in previous pregnancies"].map(dict(Yes=True, No=False))


test_df["Genes in mother's side"] = test_df["Genes in mother's side"].map(dict(Yes=True, No=False))
test_df["Inherited from father"] = test_df["Inherited from father"].map(dict(Yes=True, No=False))
test_df["Maternal gene"] = test_df["Maternal gene"].map(dict(Yes=True, No=False))
test_df["Paternal gene"] = test_df["Paternal gene"].map(dict(Yes=True, No=False))
test_df["Folic acid details (peri-conceptional)"] = test_df["Folic acid details (peri-conceptional)"].map(dict(Yes=True, No=False))
test_df["H/O serious maternal illness"] = test_df["H/O serious maternal illness"].map(dict(Yes=True, No=False))
test_df["Assisted conception IVF/ART"] = test_df["Assisted conception IVF/ART"].map(dict(Yes=True, No=False))
test_df["History of anomalies in previous pregnancies"] = test_df["History of anomalies in previous pregnancies"].map(dict(Yes=True, No=False))

In [17]:
train_df["Genes in mother's side"]  = train_df["Genes in mother's side"].astype('boolean')
train_df["Inherited from father"]  = train_df["Inherited from father"].astype('boolean')
train_df["Maternal gene"]  = train_df["Maternal gene"].astype('boolean')
train_df["Paternal gene"]  = train_df["Paternal gene"].astype('boolean')
train_df["Folic acid details (peri-conceptional)"]  = train_df["Folic acid details (peri-conceptional)"].astype('boolean')
train_df["H/O serious maternal illness"]  = train_df["H/O serious maternal illness"].astype('boolean')
train_df["Assisted conception IVF/ART"]  = train_df["Assisted conception IVF/ART"].astype('boolean')
train_df["History of anomalies in previous pregnancies"]  = train_df["History of anomalies in previous pregnancies"].astype('boolean')

train_df = pd.get_dummies(train_df, columns=["Status", 
                    "Respiratory Rate (breaths/min)", 
                    "Heart Rate (rates/min", 
                    "Follow-up", 
                    "Gender", 
                    "Birth defects",
                    "Blood test result",
                    "Genetic Disorder", 
                    "Disorder Subclass"])

test_df["Genes in mother's side"]  = test_df["Genes in mother's side"].astype('boolean')
test_df["Inherited from father"]  = test_df["Inherited from father"].astype('boolean')
test_df["Maternal gene"]  = test_df["Maternal gene"].astype('boolean')
test_df["Paternal gene"]  = test_df["Paternal gene"].astype('boolean')
test_df["Folic acid details (peri-conceptional)"]  = test_df["Folic acid details (peri-conceptional)"].astype('boolean')
test_df["H/O serious maternal illness"]  = test_df["H/O serious maternal illness"].astype('boolean')
test_df["Assisted conception IVF/ART"]  = test_df["Assisted conception IVF/ART"].astype('boolean')
test_df["History of anomalies in previous pregnancies"]  = test_df["History of anomalies in previous pregnancies"].astype('boolean')

test_df = pd.get_dummies(test_df, columns=["Status", 
                    "Respiratory Rate (breaths/min)", 
                    "Heart Rate (rates/min", 
                    "Follow-up", 
                    "Gender", 
                    "Birth defects",
                    "Blood test result"])


# get types for each variable
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19554 entries, 1 to 22082
Data columns (total 46 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   Patient Age                                                    18587 non-null  float64
 1   Genes in mother's side                                         19554 non-null  boolean
 2   Inherited from father                                          19354 non-null  boolean
 3   Maternal gene                                                  17602 non-null  boolean
 4   Paternal gene                                                  19554 non-null  boolean
 5   Blood cell count (mcL)                                         19554 non-null  float64
 6   Folic acid details (peri-conceptional)                         18088 non-null  boolean
 7   H/O serious maternal illness                              

In [18]:
train_df.head(10)

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Folic acid details (peri-conceptional),H/O serious maternal illness,Assisted conception IVF/ART,History of anomalies in previous pregnancies,...,Genetic Disorder_Single-gene inheritance diseases,Disorder Subclass_Alzheimer's,Disorder Subclass_Cancer,Disorder Subclass_Cystic fibrosis,Disorder Subclass_Diabetes,Disorder Subclass_Hemochromatosis,Disorder Subclass_Leber's hereditary optic neuropathy,Disorder Subclass_Leigh syndrome,Disorder Subclass_Mitochondrial myopathy,Disorder Subclass_Tay-Sachs
1,4.0,True,True,False,False,4.910669,True,True,False,True,...,0,0,0,1,0,0,0,0,0,0
2,6.0,True,False,False,False,4.893297,True,False,True,True,...,0,0,0,0,1,0,0,0,0,0
3,12.0,True,False,True,False,4.70528,False,True,,True,...,0,0,0,0,0,0,0,1,0,0
4,11.0,True,False,,True,4.720703,False,True,True,False,...,0,0,1,0,0,0,0,0,0,0
5,14.0,True,False,True,False,5.103188,False,False,,False,...,1,0,0,1,0,0,0,0,0,0
6,3.0,True,False,True,True,4.90108,,True,True,False,...,1,0,0,0,0,0,0,0,0,1
7,3.0,False,False,True,True,4.964816,True,True,False,True,...,1,0,0,0,0,0,0,0,0,1
8,11.0,False,False,True,False,5.209058,True,True,False,True,...,0,0,0,0,0,0,0,1,0,0
9,4.0,False,True,True,True,4.752272,True,False,True,True,...,0,0,0,0,1,0,0,0,0,0
11,7.0,False,False,False,True,4.848795,True,True,False,True,...,1,0,0,1,0,0,0,0,0,0


In [19]:
for col in train_df.columns:
    print('"' + col + '",')

"Patient Age",
"Genes in mother's side",
"Inherited from father",
"Maternal gene",
"Paternal gene",
"Blood cell count (mcL)",
"Folic acid details (peri-conceptional)",
"H/O serious maternal illness",
"Assisted conception IVF/ART",
"History of anomalies in previous pregnancies",
"No. of previous abortion",
"White Blood cell count (thousand per microliter)",
"Symptom 1",
"Symptom 2",
"Symptom 3",
"Symptom 4",
"Symptom 5",
"Status_Alive",
"Status_Deceased",
"Respiratory Rate (breaths/min)_Normal (30-60)",
"Respiratory Rate (breaths/min)_Tachypnea",
"Heart Rate (rates/min_Normal",
"Heart Rate (rates/min_Tachycardia",
"Follow-up_High",
"Follow-up_Low",
"Gender_Ambiguous",
"Gender_Female",
"Gender_Male",
"Birth defects_Multiple",
"Birth defects_Singular",
"Blood test result_abnormal",
"Blood test result_inconclusive",
"Blood test result_normal",
"Blood test result_slightly abnormal",
"Genetic Disorder_Mitochondrial genetic inheritance disorders",
"Genetic Disorder_Multifactorial genetic inhe

In [20]:
#Handle missing data -- deletion
train_df = train_df[train_df.isnull().sum(axis=1) < 1] #delete rows with 1 or more missing variables (in each row)
test_df = test_df[test_df.isnull().sum(axis=1) < 1]

train_df.isnull().sum(axis=1).sort_values(ascending = False) #count the frequency of missing values in each row (i.e. how many columns are empty)

8        0
14832    0
14810    0
14812    0
14818    0
        ..
7409     0
7408     0
7406     0
7404     0
22082    0
Length: 8789, dtype: int64

In [21]:
train_x_df = train_df[["Patient Age",
                        "Genes in mother's side",
                        "Inherited from father",
                        "Maternal gene",
                        "Paternal gene",
                        "Blood cell count (mcL)",
                        "Folic acid details (peri-conceptional)",
                        "H/O serious maternal illness",
                        "Assisted conception IVF/ART",
                        "History of anomalies in previous pregnancies",
                        "No. of previous abortion",
                        "White Blood cell count (thousand per microliter)",
                        "Symptom 1",
                        "Symptom 2",
                        "Symptom 3",
                        "Symptom 4",
                        "Symptom 5",
                        "Status_Alive",
                        "Status_Deceased",
                        "Respiratory Rate (breaths/min)_Normal (30-60)",
                        "Respiratory Rate (breaths/min)_Tachypnea",
                        "Heart Rate (rates/min_Normal",
                        "Heart Rate (rates/min_Tachycardia",
                        "Follow-up_High",
                        "Follow-up_Low",
                        "Gender_Ambiguous",
                        "Gender_Female",
                        "Gender_Male",
                        "Birth defects_Multiple",
                        "Birth defects_Singular",
                        "Blood test result_abnormal",
                        "Blood test result_inconclusive",
                        "Blood test result_normal",
                        "Blood test result_slightly abnormal"]]

train_y_df = train_df[["Genetic Disorder_Mitochondrial genetic inheritance disorders",
                        "Genetic Disorder_Multifactorial genetic inheritance disorders",
                        "Genetic Disorder_Single-gene inheritance diseases",
                        "Disorder Subclass_Alzheimer's",
                        "Disorder Subclass_Cancer",
                        "Disorder Subclass_Cystic fibrosis",
                        "Disorder Subclass_Diabetes",
                        "Disorder Subclass_Hemochromatosis",
                        "Disorder Subclass_Leber's hereditary optic neuropathy",
                        "Disorder Subclass_Leigh syndrome",
                        "Disorder Subclass_Mitochondrial myopathy",
                        "Disorder Subclass_Tay-Sachs"]]

test_x_df = test_df

In [22]:
#inspect the dimensons
print('There are {} rows and {} columns for training.'.format(train_x_df.shape[0], train_x_df.shape[1]))
print('There are {} rows and {} columns for training.'.format(train_y_df.shape[0], train_y_df.shape[1]))
print('There are {} rows and {} columns for testing.'.format(test_x_df.shape[0], test_x_df.shape[1]))

There are 8789 rows and 34 columns for training.
There are 8789 rows and 12 columns for training.
There are 1964 rows and 34 columns for testing.


In [23]:
neigh = KNeighborsClassifier(n_neighbors=3)

neigh.fit(train_x_df, train_y_df)

train_pred_df = neigh.predict(train_x_df)

In [24]:
#find and print mse
mse = mean_squared_error(train_y_df, train_pred_df)
print('The mse of the model is: {}'.format(round(mse, 2)))

The mse of the model is: 0.11


In [25]:
# # suggested evaluation
# # Genetic Disorder
# score1 = max(0, 100*metrics.f1_score(actual["Genetic Disorder"], predicted["Genetic Disorder"], average = "macro"))

# # Disorder Subclass
# score2 = max(0, 100*metrics.f1_score(actual["Disorder Subclass"], predicted["Disorder Subclass"], average = "macro"))

# # Final score
# score = (score1/2) + (score2/2)