## Part A: Data Preprocessing Pipeline

This section describes the data preprocessing steps applied to the dataset,
including data cleaning, data transformation, and data reduction.

In [50]:
import pandas as pd

In [51]:
# Load the raw training table into the notebook-wide frame `df`.
# NOTE(review): the directory is spelled 'achieve' — confirm this is not a typo for 'archive'.
df = pd.read_csv(filepath_or_buffer='../achieve/train_genetic_disorders.csv')

In [52]:
# Row count of the frame as loaded (before any cleaning).
total_rows = len(df)
# Per-column number of missing cells; counting NaNs directly gives the same
# result as subtracting the non-null count from the row count.
missing_counts = df.isna().sum()
# Display the frame itself as the cell output.
df

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,,Larre,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,,Brycen,...,Multiple,5.522560,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,,Nashon,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.705280,Jeffery,Hoelscher,Aayaan,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,,Yes,4.720703,Johanna,Stutzman,Suave,...,Multiple,4.098210,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22078,,,,,,,,,,,...,,,,,,,,,,
22079,,,,,,,,,,,...,,,,,,,,,,
22080,,,,,,,,,,,...,,,,,,,,,,
22081,,,,,,,,,,,...,,,,,,,,,,


## Cleaning step 1: drop the rows in which every column is missing

In [53]:
# Discard only the rows in which every single column is NaN; rows that are
# partially filled are kept. Then report the resulting (rows, columns).
df = df.dropna(axis='index', how='all')
print(df.shape)

(21011, 45)


## Find all the missing values

In [54]:
# Number of missing cells per column, and the same figure as a percentage
# of the current row count.
missing_count = df.isna().sum()
missing_percent = missing_count / len(df) * 100

# One row per column of `df`, worst-affected columns first.
missing_summary = pd.DataFrame(
    {
        'missing_count': missing_count,
        'missing_percent': missing_percent,
    }
)
missing_summary.sort_values(by='missing_percent', ascending=False)

Unnamed: 0,missing_count,missing_percent
Family Name,9240,43.976964
Mother's age,5718,27.214316
Father's age,5689,27.076293
Institute Name,4860,23.130741
Autopsy shows birth defect (if applicable),4164,19.81819
Maternal gene,2694,12.821855
Symptom 2,2112,10.051878
H/O substance abuse,2090,9.947171
Test 5,2072,9.861501
Follow-up,2070,9.851982


## `Family Name` is missing in more than 30% of the rows and is not expected to have any effect on the genetic disorder, so dropping the column is the safest way to reduce noise.

In [55]:
# Remove the 'Family Name' column; every other column is kept.
df = df.drop('Family Name', axis='columns')

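## Missing rate between 20% and 30%: check the importance of each column before deciding

`Institute Name` is dropped here; `Mother's age` and `Father's age` fall in the same range but are kept and imputed in the next step.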

In [56]:
# Remove the 'Institute Name' column; every other column is kept.
df = df.drop('Institute Name', axis='columns')


## Missing rate < 20%: fill in the data (median for numeric columns, most frequent value for text columns)

In [59]:
from sklearn.impute import SimpleImputer

# Every int64/float64 column gets its missing cells replaced by that
# column's median.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
imputer_num = SimpleImputer(strategy='median')
imputer_num.fit(df[num_cols])
df[num_cols] = imputer_num.transform(df[num_cols])

# Sanity check: the five largest remaining per-column null counts.
df[num_cols].isna().sum().sort_values(ascending=False).head()

Patient Age               0
Blood cell count (mcL)    0
Mother's age              0
Father's age              0
Test 1                    0
dtype: int64

In [58]:
# Every object-dtype column gets its missing cells replaced by that column's
# most frequent value. `SimpleImputer` must already be imported by an
# earlier cell.
# NOTE(review): the selection is by dtype only, so it also covers identifier
# and label columns such as 'Patient Id', 'Genetic Disorder' and
# 'Disorder Subclass' — confirm that filling missing labels with the mode
# is intended.
cat_cols = df.select_dtypes(include='object').columns
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(df[cat_cols])
df[cat_cols] = imputer_cat.transform(df[cat_cols])
df

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Father's name,Mother's age,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,Larre,35.0,...,Singular,9.857562,slightly abnormal,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,Brycen,35.0,...,Multiple,5.522560,normal,1.0,1.0,1.0,1.0,0.0,Mitochondrial genetic inheritance disorders,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,Nashon,41.0,...,Singular,7.473071,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.705280,Jeffery,Aayaan,21.0,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,Yes,Yes,4.720703,Johanna,Suave,32.0,...,Multiple,4.098210,slightly abnormal,0.0,0.0,0.0,0.0,0.0,Multifactorial genetic inheritance disorders,Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21006,PID0x26c,11.0,No,Yes,Yes,No,5.090495,Betty,Letwan,46.0,...,Multiple,7.473071,normal,1.0,1.0,0.0,0.0,0.0,Single-gene inheritance diseases,Tay-Sachs
21007,PID0xa01,7.0,No,No,No,Yes,5.214750,Beth,Trayvion,35.0,...,Singular,7.086312,slightly abnormal,1.0,1.0,1.0,0.0,1.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
21008,PID0x15,5.0,Yes,No,Yes,Yes,5.084831,Craig,Shivaan,28.0,...,Multiple,7.924746,slightly abnormal,1.0,0.0,0.0,1.0,1.0,Multifactorial genetic inheritance disorders,Leigh syndrome
21009,PID0x8d32,1.0,Yes,No,No,Yes,5.224828,Francisco,Dyer,33.0,...,Multiple,12.000000,inconclusive,0.0,1.0,1.0,1.0,0.0,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy


In [60]:
# Show the dtype of each of the five 'Test N' columns.
df.dtypes.loc[['Test 1', 'Test 2', 'Test 3', 'Test 4', 'Test 5']]


Test 1    float64
Test 2    float64
Test 3    float64
Test 4    float64
Test 5    float64
dtype: object