In [1]:
# IMPORT & SETUP
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay, \
    f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder

In [4]:
file_path = 'data/train.csv' 
dataset = pd.read_csv(file_path)
dataset['Patient Age'].unique()

array([ 2.,  4.,  6., 12., 11., 14.,  3.,  7.,  1.,  0., nan, 10.,  5.,
       13.,  8.,  9.])

In [60]:
label_encoder = LabelEncoder()

file_path = 'data/train.csv' 
dataset = pd.read_csv(file_path)

#----------------------------------PRE PROCESS--------------------------------------#
#----------------------------------%%%%%%%%%%%%%--------------------------------------#

# remove rows where both targets NaN
dataset = dataset.dropna(subset=['Genetic Disorder', 'Disorder Subclass'], how='all')

# remove irrelevant features
columns_to_drop = ['Institute Name', 'Location of Institute', "Father's name", 
                   'Patient First Name', 'Family Name', 'Patient Id', 'Parental consent', 
                'Test 1', 'Test 2', 'Test 3', 'Test 4', 'Test 5']
dataset = dataset.drop(columns=columns_to_drop)


# Fill missing values in 'Genetic Disorder' based on 'Disorder Subclass'
subclass_to_disorder = {
    'Leber\'s hereditary optic neuropathy': 'Mitochondrial genetic inheritance disorders',
    'Leigh syndrome': 'Mitochondrial genetic inheritance disorders',
    'Mitochondrial myopathy': 'Mitochondrial genetic inheritance disorders',
    'Alzheimer\'s': 'Multifactorial genetic inheritance disorders',
    'Cancer': 'Multifactorial genetic inheritance disorders',
    'Diabetes': 'Multifactorial genetic inheritance disorders',
    'Cystic fibrosis': 'Single-gene inheritance diseases',
    'Hemochromatosis': 'Single-gene inheritance diseases',
    'Tay-Sachs': 'Single-gene inheritance diseases',
}


dataset['Genetic Disorder'] = dataset.apply(
    lambda row: subclass_to_disorder[row['Disorder Subclass']] 
    if pd.isna(row['Genetic Disorder']) else row['Genetic Disorder'],
    axis=1
)


#----------------------------------PRE PROCESS--------------------------------------#
#----------------------------------%%%%%%%%%%%%%--------------------------------------#



#  NUMERICAL --> MEDIAN IMPUTER




# CATEGORIAL BINARY
mode_imputer = SimpleImputer(strategy='most_frequent')
dataset['Autopsy shows birth defect (if applicable)'] = dataset['Autopsy shows birth defect (if applicable)'].replace({'Yes': 1.0, 'No': 0.0, 'Not applicable': 0.0, 'None': 0.0})
#dataset['Parental consent'] = dataset['Parental consent'].replace('Yes', 1.0)
dataset['H/O substance abuse'] = dataset['H/O substance abuse'].replace({'No': 0.0, 'Yes':1.0, 'Not applicable':0.0, '-':0.0})
dataset['Inherited from father'] = dataset['Inherited from father'].replace({'No': 0.0, 'Yes':1.0})
dataset['Maternal gene'] = dataset['Maternal gene'].replace({'No': 0.0, 'Yes':1.0})
dataset['History of anomalies in previous pregnancies'] = dataset['History of anomalies in previous pregnancies'].replace({'Yes': 1.0, 'No': 0.0})
dataset['H/O serious maternal illness'] = dataset['H/O serious maternal illness'].replace({'Yes': 1.0, 'No': 0.0})
dataset['H/O radiation exposure (x-ray)'] = dataset['H/O radiation exposure (x-ray)'].replace({'No': 0.0, '-': 0.0, 'Not applicable': 0.0, 'Yes':1.0})

dataset['Place of birth'] = dataset['Place of birth'].replace({'Institute': 1.0, 'Home': 0.0})
dataset['Place of birth'] = mode_imputer.fit_transform(dataset[['Place of birth']])
# NaN non imputed --> -1.0
dataset['Birth asphyxia'] = dataset['Birth asphyxia'].replace({'No': 0.0, 'No record': 0.0, 'Not available': -1.0, 'Yes':1.0})

dataset['Respiratory Rate (breaths/min)'] = dataset['Respiratory Rate (breaths/min)'].replace({'Normal (30-60)': 0.0, 'Tachypnea': 1.0})
dataset['Respiratory Rate (breaths/min)'] = mode_imputer.fit_transform(dataset[['Respiratory Rate (breaths/min)']]) 

dataset['Heart Rate (rates/min'] = dataset['Heart Rate (rates/min'].replace({'Normal': 0.0, 'Tachycardia': 1.0})
dataset['Heart Rate (rates/min'] = mode_imputer.fit_transform(dataset[['Heart Rate (rates/min']])
dataset['Follow-up'] = dataset['Follow-up'].replace({'High': 1.0, 'Low': 0.0})
dataset['Follow-up'] = mode_imputer.fit_transform(dataset[['Follow-up']])

dataset['Folic acid details (peri-conceptional)'] = dataset['Folic acid details (peri-conceptional)'].replace({'Yes': 1.0, 'No': 0.0})
dataset['Assisted conception IVF/ART'] = dataset['Assisted conception IVF/ART'].replace({'Yes': 1.0, 'No': 0.0})
dataset['Birth defects'] = dataset['Birth defects'].replace({'Multiple': 1.0, 'Singular': 0.0})

# CATEGORIAL MULTICLASS
dataset['Gender'] = dataset['Gender'].replace({'Male': 1.0, 'Female': 0.0, 'Ambiguous': 2.0})
dataset['Blood test result'] = dataset['Blood test result'].replace({'normal': 0.0, 'inconclusive': -1.0, 'slightly abnormal': 0.5, 'abnormal': 1.0})

# BINARY FEATURES WITH NO NaN
dataset["Genes in mother's side"] = dataset["Genes in mother's side"].replace({'Yes': 1.0, 'No': 0.0})
dataset["Paternal gene"] = dataset["Paternal gene"].replace({'Yes': 1.0, 'No': 0.0})
dataset["Status"] = dataset["Status"].replace({'Alive': 0.0, 'Deceased': 1.0})

dataset['Symptom 1'] = dataset['Symptom 1']
dataset['Symptom 2'] = dataset['Symptom 2']
dataset['Symptom 3'] = dataset['Symptom 3']
dataset['Symptom 4'] = dataset['Symptom 4']
dataset['Symptom 5'] = dataset['Symptom 5']


#-------------------------------------------------------%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%--------------------------------#




heat_data = dataset.copy().dropna()
heat_data['Genetic Disorder'] = label_encoder.fit_transform(heat_data['Genetic Disorder'])
heat_data['Disorder Subclass'] = label_encoder.fit_transform(heat_data['Disorder Subclass'])


In [61]:
heat_data

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
8,11.0,0.0,0.0,1.0,0.0,5.209058,45.0,44.0,0.0,1.0,...,1.0,6.669552,0.5,1.0,1.0,1.0,0.0,1.0,0,6
9,4.0,0.0,1.0,1.0,1.0,4.752272,44.0,42.0,0.0,1.0,...,1.0,6.397702,1.0,0.0,0.0,1.0,1.0,1.0,1,3
12,1.0,1.0,1.0,0.0,0.0,4.612265,50.0,56.0,1.0,0.0,...,0.0,7.995115,0.5,1.0,1.0,0.0,1.0,0.0,0,6
14,6.0,1.0,0.0,1.0,0.0,4.620420,41.0,20.0,0.0,1.0,...,1.0,3.000000,0.5,1.0,0.0,1.0,0.0,1.0,0,6
18,10.0,1.0,1.0,1.0,0.0,4.751452,40.0,57.0,1.0,0.0,...,0.0,9.382407,1.0,1.0,1.0,0.0,0.0,0.0,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22056,12.0,1.0,0.0,1.0,0.0,4.652915,49.0,40.0,0.0,0.0,...,1.0,8.035952,0.5,0.0,1.0,1.0,1.0,0.0,0,6
22057,6.0,0.0,1.0,0.0,1.0,5.117296,48.0,27.0,0.0,1.0,...,0.0,3.453291,0.5,0.0,0.0,1.0,1.0,0.0,0,7
22066,13.0,0.0,1.0,0.0,1.0,4.777036,27.0,58.0,0.0,0.0,...,0.0,3.000736,0.0,0.0,0.0,0.0,0.0,1.0,0,6
22068,4.0,1.0,0.0,0.0,0.0,5.077554,44.0,34.0,1.0,0.0,...,0.0,9.566549,-1.0,1.0,1.0,1.0,1.0,1.0,1,3


In [64]:
robust_scaler = RobustScaler()

X_train, X_test, Y_train, Y_test = train_test_split(
    robust_scaler.fit_transform(heat_data.drop(columns=['Genetic Disorder', 'Disorder Subclass'])),
    heat_data.loc[:, 'Genetic Disorder'],
    test_size=0.2, random_state=43)
from imblearn.over_sampling import BorderlineSMOTE

sm = BorderlineSMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(X_train, Y_train)


0        2
1        0
2        2
3        2
4        2
        ..
26731    2
26732    2
26733    2
26734    2
26735    2
Name: Genetic Disorder, Length: 26736, dtype: int64

In [65]:

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(bootstrap=True, max_depth=54, max_features='sqrt',
                               min_samples_leaf=6, min_samples_split=4, n_estimators=400)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)

f1_score(Y_test, y_pred, average='macro'), accuracy_score(Y_test, y_pred), precision_score(Y_test, y_pred,
                                                                                           average='macro'), recall_score(Y_test, y_pred, average='macro')

(0.5446772900126726, 0.6185344827586207, 0.569406201655692, 0.5548717796079484)