## Part A: Data Preprocessing Pipeline

This section describes the data preprocessing steps applied to the dataset,
including data cleaning, data transformation, and data reduction.

## 1. Data Cleaning

In [176]:
import pandas as pd

In [177]:
df=pd.read_csv('../achieve/train_genetic_disorders.csv')

In [178]:
total_rows = len(df)
missing_counts = total_rows - df.count()
df

Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,,Larre,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,,Brycen,...,Multiple,5.522560,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,,Nashon,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.705280,Jeffery,Hoelscher,Aayaan,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,,Yes,4.720703,Johanna,Stutzman,Suave,...,Multiple,4.098210,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22078,,,,,,,,,,,...,,,,,,,,,,
22079,,,,,,,,,,,...,,,,,,,,,,
22080,,,,,,,,,,,...,,,,,,,,,,
22081,,,,,,,,,,,...,,,,,,,,,,


## The first step of cleaning: Delete all blank lines.

In [179]:
df = df.dropna(how='all')
print(df.shape)

(21011, 45)


## Find all the missing values

In [180]:
missing_count = df.isnull().sum()
missing_percent = df.isnull().sum() / len(df) * 100
missing_summary = pd.concat([missing_count, missing_percent], axis=1)
missing_summary.columns = ['missing_count', 'missing_percent']
missing_summary.sort_values(by='missing_percent', ascending=False)

Unnamed: 0,missing_count,missing_percent
Family Name,9240,43.976964
Mother's age,5718,27.214316
Father's age,5689,27.076293
Institute Name,4860,23.130741
Autopsy shows birth defect (if applicable),4164,19.81819
Maternal gene,2694,12.821855
Symptom 2,2112,10.051878
H/O substance abuse,2090,9.947171
Test 5,2072,9.861501
Follow-up,2070,9.851982


 ## Since Family Name missing value >30%,didn't caused any effect towards genetic disorder. Drop might be the most safety way to reduce noise.

In [181]:
df = df.drop(columns=['Family Name'])

 ## Missing value between 20% - 30 % , check the importance of the data

In [182]:
df = df.drop(columns=['Institute Name'])

## Missing rate < 20% , Fill in the data

In [183]:
cat_cols = df.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])


In [184]:
from sklearn.impute import SimpleImputer

num_cols = df.select_dtypes(include=['int64','float64']).columns
imputer_num = SimpleImputer(strategy='median')
df[num_cols] = imputer_num.fit_transform(df[num_cols])
df[num_cols].isnull().sum().sort_values(ascending=False).head()

Patient Age               0
Blood cell count (mcL)    0
Mother's age              0
Father's age              0
Test 1                    0
dtype: int64

## Check Duplicated

In [186]:
df.duplicated().sum()

np.int64(0)

## Check the data

In [187]:
df[num_cols].describe()

Unnamed: 0,Patient Age,Blood cell count (mcL),Mother's age,Father's age,Test 1,Test 2,Test 3,Test 4,Test 5,No. of previous abortion,White Blood cell count (thousand per microliter),Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
count,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0,21011.0
mean,6.976489,4.899004,34.652468,41.958022,0.0,0.0,0.0,1.0,0.0,2.000095,7.48355,0.631955,0.59783,0.581505,0.449479,0.415735
std,4.179495,0.199829,8.403752,11.124985,0.0,0.0,0.0,0.0,0.0,1.340718,2.521118,0.482285,0.490348,0.493324,0.497453,0.49286
min,0.0,4.092727,18.0,20.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,4.76323,29.0,35.0,0.0,0.0,0.0,1.0,0.0,1.0,5.655274,0.0,0.0,0.0,0.0,0.0
50%,7.0,4.899548,35.0,42.0,0.0,0.0,0.0,1.0,0.0,2.0,7.473071,1.0,1.0,1.0,0.0,0.0
75%,10.0,5.033977,40.0,49.0,0.0,0.0,0.0,1.0,0.0,3.0,9.276613,1.0,1.0,1.0,1.0,1.0
max,14.0,5.609829,51.0,64.0,0.0,0.0,0.0,1.0,0.0,4.0,12.0,1.0,1.0,1.0,1.0,1.0


## 2. Data Transformation

In [188]:
low_variance_cols = df.columns[df.nunique() <= 1]
df = df.drop(columns=low_variance_cols)

In [189]:
cat_cols = df.select_dtypes(include=['object']).columns

In [190]:
binary_cols = []

for col in cat_cols:
    if df[col].nunique() == 2:
        binary_cols.append(col)

binary_cols
df[binary_cols] = df[binary_cols].replace({'Yes':1, 'No':0})
df

  df[binary_cols] = df[binary_cols].replace({'Yes':1, 'No':0})


Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Father's name,Mother's age,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,1,0,1,0,4.760603,Richard,Larre,35.0,...,Singular,9.857562,slightly abnormal,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,1,1,0,0,4.910669,Mike,Brycen,35.0,...,Multiple,5.522560,normal,1.0,1.0,1.0,1.0,0.0,Mitochondrial genetic inheritance disorders,Cystic fibrosis
2,PID0x4a82,6.0,1,0,0,0,4.893297,Kimberly,Nashon,41.0,...,Singular,7.473071,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,1,0,1,0,4.705280,Jeffery,Aayaan,21.0,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,1,0,1,1,4.720703,Johanna,Suave,32.0,...,Multiple,4.098210,slightly abnormal,0.0,0.0,0.0,0.0,0.0,Multifactorial genetic inheritance disorders,Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21006,PID0x26c,11.0,0,1,1,0,5.090495,Betty,Letwan,46.0,...,Multiple,7.473071,normal,1.0,1.0,0.0,0.0,0.0,Single-gene inheritance diseases,Tay-Sachs
21007,PID0xa01,7.0,0,0,0,1,5.214750,Beth,Trayvion,35.0,...,Singular,7.086312,slightly abnormal,1.0,1.0,1.0,0.0,1.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
21008,PID0x15,5.0,1,0,1,1,5.084831,Craig,Shivaan,28.0,...,Multiple,7.924746,slightly abnormal,1.0,0.0,0.0,1.0,1.0,Multifactorial genetic inheritance disorders,Leigh syndrome
21009,PID0x8d32,1.0,1,0,0,1,5.224828,Francisco,Dyer,33.0,...,Multiple,12.000000,inconclusive,0.0,1.0,1.0,1.0,0.0,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy


In [191]:
multi_cat_cols = [c for c in cat_cols if c not in binary_cols]

df = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True)
df

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Genetic Disorder_Multifactorial genetic inheritance disorders,Genetic Disorder_Single-gene inheritance diseases,Disorder Subclass_Cancer,Disorder Subclass_Cystic fibrosis,Disorder Subclass_Diabetes,Disorder Subclass_Hemochromatosis,Disorder Subclass_Leber's hereditary optic neuropathy,Disorder Subclass_Leigh syndrome,Disorder Subclass_Mitochondrial myopathy,Disorder Subclass_Tay-Sachs
0,2.0,1,0,1,0,4.760603,35.0,42.0,Alive,Normal (30-60),...,False,False,False,False,False,False,True,False,False,False
1,4.0,1,1,0,0,4.910669,35.0,23.0,Deceased,Tachypnea,...,False,False,False,True,False,False,False,False,False,False
2,6.0,1,0,0,0,4.893297,41.0,22.0,Alive,Normal (30-60),...,True,False,False,False,True,False,False,False,False,False
3,12.0,1,0,1,0,4.705280,21.0,42.0,Deceased,Tachypnea,...,False,False,False,False,False,False,False,True,False,False
4,11.0,1,0,1,1,4.720703,32.0,42.0,Alive,Tachypnea,...,True,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21006,11.0,0,1,1,0,5.090495,46.0,42.0,Alive,Normal (30-60),...,False,True,False,False,False,False,False,False,False,True
21007,7.0,0,0,0,1,5.214750,35.0,38.0,Alive,Tachypnea,...,False,False,False,False,False,False,False,True,False,False
21008,5.0,1,0,1,1,5.084831,28.0,28.0,Alive,Normal (30-60),...,True,False,False,False,False,False,False,True,False,False
21009,1.0,1,0,0,1,5.224828,33.0,24.0,Deceased,Tachypnea,...,False,False,False,False,False,False,False,False,True,False


## 3. Feature Scaling by using Standardzation

In [206]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

## Extra： Data Validation

In [210]:
# ========== Data Validation After Feature Scaling ==========
import numpy as np

# 1. Re-select numeric columns in the current df (avoid issues from column name changes)
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(f"Current numeric columns to validate:\n{num_cols.tolist()}\n")

# 2. Validation 1: No missing values (required for PCA/model training)
missing_check = df[num_cols].isnull().sum()
print("Missing value check for numeric columns (should be 0):")
print(missing_check)
if missing_check.sum() == 0:
    print("✅ Missing value validation passed: All numeric columns have no missing values\n")
else:
    print("❌ Missing value validation failed: Unhandled NaN values exist\n")

# 3. Validation 2: Mean ≈ 0 and variance ≈ 1 after standard scaling (core validation)
# Calculate mean and variance for each column (rounded to 4 decimal places)
scaling_check = pd.DataFrame({
    'Mean': df[num_cols].mean().round(4),
    'Variance': df[num_cols].var().round(4)
})
print("Mean/variance validation after feature scaling (Mean ≈ 0, Variance ≈ 1):")
print(scaling_check)

# Batch check if scaling meets requirements (|Mean| < 0.01, 0.9 < Variance < 1.1)
mean_pass = np.all(np.abs(scaling_check['Mean']) < 0.01)
var_pass = np.all((scaling_check['Variance'] > 0.9) & (scaling_check['Variance'] < 1.1))

if mean_pass and var_pass:
    print("✅ Feature scaling validation passed: Mean ≈ 0, Variance ≈ 1\n")
else:
    print("⚠️ Feature scaling not fully compliant: Mean/variance of some columns deviate from standard values\n")

# 4. Optional: Validate data types (ensure numeric, no string混入)
dtype_check = df[num_cols].dtypes
print("Data type validation for numeric columns (should be int64/float64):")
print(dtype_check)
if all([dtype in ['int64', 'float64'] for dtype in dtype_check]):
    print("✅ Data type validation passed\n")
else:
    print("❌ Data type validation failed: Non-numeric columns exist\n")

# 5. Optional: Preview first 5 rows of scaled data (visual check)
print("Preview of first 5 rows of scaled data:")
print(df[num_cols].head())

Current numeric columns to validate:
['Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Blood cell count (mcL)', "Mother's age", "Father's age", 'Folic acid details (peri-conceptional)', 'H/O serious maternal illness', 'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion', 'White Blood cell count (thousand per microliter)', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5']

Missing value check for numeric columns (should be 0):
Patient Age                                         0
Genes in mother's side                              0
Inherited from father                               0
Maternal gene                                       0
Paternal gene                                       0
Blood cell count (mcL)                              0
Mother's age                                        0
Father's age                                        0
Folic acid details