In [1]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
#loading the datset
train=pd.read_csv('dataset/train.csv')
test=pd.read_csv('dataset/test.csv')
data_train=train.copy()
data_test=test.copy()

In [3]:
#subset where no all rows being null
data_fea_train=data_train[data_train.isnull().all(1)!=True]

In [4]:
#subset where no all rows being null
data_fea_test=data_test[data_test.isnull().all(1)!=True]

In [5]:
# Dropping the features
data_fea_train=data_fea_train.drop(columns=['Patient Id','Patient First Name','Family Name','Father\'s name','Institute Name','Location of Institute','Test 1','Test 2','Test 3','Test 4',
'Test 5','Parental consent'])
data_fea_test=data_fea_test.drop(columns=['Patient Id','Patient First Name','Family Name','Father\'s name','Institute Name','Location of Institute','Test 1','Test 2','Test 3','Test 4',
'Test 5','Parental consent'])

In [6]:
# renaming the columns
data_fea_train=data_fea_train.rename(columns={"Genes in mother's side":'defective_mother',
                    'Inherited from father':'defective_father',
                    'Maternal gene':'maternal_gene','Paternal gene':'paternal_gene',
                    'Respiratory Rate (breaths/min)':'respiratory_rate','Heart Rate (rates/min':'heart_rate',
                    'Parental consent':'parental_consent','Follow-up':'follow_up','Birth asphyxia':'birth_asphyxia',
                    'Autopsy shows birth defect (if applicable)':'birth_defect_autopsy','Place of birth':'birth_place',
                    'Folic acid details (peri-conceptional)':'folic_acid_periconceptional',
                    'H/O serious maternal illness':'maternal_illness','H/O radiation exposure (x-ray)':'radiation_exposure',
                    'H/O substance abuse':'substance_abuse','Assisted conception IVF/ART':'assisted_conception',
                    'History of anomalies in previous pregnancies':'previous_pregnancy_anomalies',
                    'Birth defects':'birth_defects','Blood test result':'blood_test_result','Genetic Disorder':'genetic_disorder',
                    'Disorder Subclass':'disorder_subclass','Patient Age':'patient_age','Blood cell count (mcL)':'blood_cell_count',
                    "Mother's age":'mother_age',"Father's age":'father_age','No. of previous abortion':'num_previous_abortion',
                    'White Blood cell count (thousand per microliter)':'WBC_count'})

In [7]:
data_fea_test=data_fea_test.rename(columns={"Genes in mother's side":'defective_mother',
                    'Inherited from father':'defective_father',
                    'Maternal gene':'maternal_gene','Paternal gene':'paternal_gene',
                    'Respiratory Rate (breaths/min)':'respiratory_rate','Heart Rate (rates/min':'heart_rate',
                    'Parental consent':'parental_consent','Follow-up':'follow_up','Birth asphyxia':'birth_asphyxia',
                    'Autopsy shows birth defect (if applicable)':'birth_defect_autopsy','Place of birth':'birth_place',
                    'Folic acid details (peri-conceptional)':'folic_acid_periconceptional',
                    'H/O serious maternal illness':'maternal_illness','H/O radiation exposure (x-ray)':'radiation_exposure',
                    'H/O substance abuse':'substance_abuse','Assisted conception IVF/ART':'assisted_conception',
                    'History of anomalies in previous pregnancies':'previous_pregnancy_anomalies',
                    'Birth defects':'birth_defects','Blood test result':'blood_test_result','Genetic Disorder':'genetic_disorder',
                    'Disorder Subclass':'disorder_subclass','Patient Age':'patient_age','Blood cell count (mcL)':'blood_cell_count',
                    "Mother's age":'mother_age',"Father's age":'father_age','No. of previous abortion':'num_previous_abortion',
                    'White Blood cell count (thousand per microliter)':'WBC_count'})

In [8]:
# removing rows of missing target variables
data_fea_train=data_fea_train[(data_fea_train['genetic_disorder'].isnull()!=True)&(data_fea_train['disorder_subclass'].isnull()!=True)]

In [9]:
data_fea_train.shape

(18047, 33)

In [10]:
#Subsetting
X=data_fea_train.iloc[:,:-2]
y1=data_fea_train.iloc[:,-2]
y2=data_fea_train.iloc[:,-1]

In [12]:
# test data
X_test=data_fea_test

In [13]:
#converting dissimilar datatype to one
for i in X_test.columns:
    if X_test[i].dtype!=X[i].dtype:
        X_test[i]=X_test[i].astype(X[i].dtype.name)

In [14]:
# Data Cleaning
X_test=X_test.replace('-99',np.nan)
# Cleaning_data
# replace '-' with other values
X['radiation_exposure']=X['radiation_exposure'].replace('-','others')
X['substance_abuse']=X['substance_abuse'].replace('-','others')
# Cleaning_data
X_test['radiation_exposure']=X_test['radiation_exposure'].replace('-','others')
X_test['substance_abuse']=X_test['substance_abuse'].replace('-','others')

In [15]:
# Data Cleaning
X_test['WBC_count']=X_test['WBC_count'].mask(X_test['WBC_count']<0,np.nan)
X_test['num_previous_abortion']=X_test['num_previous_abortion'].mask(X_test['num_previous_abortion']<0,np.nan)

In [16]:
#Splitting the data
X_train1,X_val1,y_train1,y_val1= train_test_split(X,y1,stratify=y1,test_size=0.20)
X_train2,X_val2,y_train2,y_val2= train_test_split(X,y2,stratify=y2,test_size=0.20)

In [17]:
# Missing value imputation
from sklearn.impute import SimpleImputer
imp_mode=SimpleImputer(strategy='most_frequent')
imp_mode_num=SimpleImputer(strategy='most_frequent')
imp_median=SimpleImputer(strategy='median')

In [18]:
pd.options.mode.chained_assignment = None  

In [19]:
# missing value imputation
for i in X.columns:
    if (X[i].dtype.name!='object')&(X[i].nunique()<=3):
        imp_mode_num.fit(np.array(X_train1[i]).reshape(-1,1))
        X_train1[i]=imp_mode_num.transform(np.array(X_train1[i]).reshape(-1,1))
        X_val1[i]=imp_mode_num.transform(np.array(X_val1[i]).reshape(-1,1))
        X_test[i]=imp_mode_num.transform(np.array(X_test[i]).reshape(-1,1))
    elif (X[i].dtype.name!='object')&(X[i].nunique()>3):
        imp_median.fit(np.array(X_train1[i]).reshape(-1,1))
        X_train1[i]=imp_median.transform(np.array(X_train1[i]).reshape(-1,1))
        X_val1[i]=imp_median.transform(np.array(X_val1[i]).reshape(-1,1))
        X_test[i]=imp_median.transform(np.array(X_test[i]).reshape(-1,1))
    else:
        imp_mode.fit(np.array(X_train1[i]).reshape(-1,1))
        X_train1[i]=imp_mode.transform(np.array(X_train1[i]).reshape(-1,1))
        X_val1[i]=imp_mode.transform(np.array(X_val1[i]).reshape(-1,1))
        X_test[i]=imp_mode.transform(np.array(X_test[i]).reshape(-1,1))

In [20]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,LabelEncoder,MinMaxScaler
ord_enc=OrdinalEncoder()
ohe_enc=OneHotEncoder()
min_max=MinMaxScaler()

In [21]:
# reset index
X_train1.reset_index(inplace=True)
X_val1.reset_index(inplace=True)

In [22]:
# encoding the features
for i in X.columns:
    if (X[i].dtype.name=='object'):
        if i in X and X[i].nunique()<=2:
            ord_enc.fit(np.array(X_train1[i]).reshape(-1,1))
            X_train1.loc[:,i]=ord_enc.transform(np.array(X_train1[i]).reshape(-1,1))
            X_val1.loc[:,i]=ord_enc.transform(np.array(X_val1[i]).reshape(-1,1))
            X_test.loc[:,i]=ord_enc.transform(np.array(X_test[i]).reshape(-1,1))
        else:
            ohe_enc.fit(np.array(X_train1[i]).reshape(-1,1))
            X_encode_tr1=pd.DataFrame(ohe_enc.transform(np.array(X_train1[i]).reshape(-1,1)).toarray(),columns=ohe_enc.get_feature_names_out([i]))
            X_encode_va1=pd.DataFrame(ohe_enc.transform(np.array(X_val1[i]).reshape(-1,1)).toarray(),columns=ohe_enc.get_feature_names_out([i]))
            X_encode1=pd.DataFrame(ohe_enc.transform(np.array(X_test[i]).reshape(-1,1)).toarray(),columns=ohe_enc.get_feature_names_out([i]))
            X_train1=pd.concat([X_train1,X_encode_tr1],axis=1)
            X_val1=pd.concat([X_val1,X_encode_va1],axis=1)
            X_test=pd.concat([X_test,X_encode1],axis=1)
            X_train1.drop(columns=[i],inplace=True)
            X_val1.drop(columns=[i],inplace=True)
            X_test.drop(columns=[i],inplace=True)

  X_train1.loc[:,i]=ord_enc.transform(np.array(X_train1[i]).reshape(-1,1))
  X_val1.loc[:,i]=ord_enc.transform(np.array(X_val1[i]).reshape(-1,1))
  X_test.loc[:,i]=ord_enc.transform(np.array(X_test[i]).reshape(-1,1))
  X_train1.loc[:,i]=ord_enc.transform(np.array(X_train1[i]).reshape(-1,1))
  X_val1.loc[:,i]=ord_enc.transform(np.array(X_val1[i]).reshape(-1,1))
  X_test.loc[:,i]=ord_enc.transform(np.array(X_test[i]).reshape(-1,1))
  X_train1.loc[:,i]=ord_enc.transform(np.array(X_train1[i]).reshape(-1,1))
  X_val1.loc[:,i]=ord_enc.transform(np.array(X_val1[i]).reshape(-1,1))
  X_test.loc[:,i]=ord_enc.transform(np.array(X_test[i]).reshape(-1,1))
  X_train1.loc[:,i]=ord_enc.transform(np.array(X_train1[i]).reshape(-1,1))
  X_val1.loc[:,i]=ord_enc.transform(np.array(X_val1[i]).reshape(-1,1))
  X_test.loc[:,i]=ord_enc.transform(np.array(X_test[i]).reshape(-1,1))
  X_train1.loc[:,i]=ord_enc.transform(np.array(X_train1[i]).reshape(-1,1))
  X_val1.loc[:,i]=ord_enc.transform(np.array(X_val1[i]).r

In [24]:
X_train1.drop(columns='index',inplace=True)
X_val1.drop(columns='index',inplace=True)

In [25]:
from sklearn.preprocessing import MinMaxScaler
min_max=MinMaxScaler()
X2=min_max.fit_transform(X_train1)

In [26]:
# normalised minmax
X2=pd.DataFrame(X2,columns=X_train1.columns)

In [27]:
#normalised val1
X2_val=min_max.transform(X_val1)
X2_val=pd.DataFrame(X2_val,columns=X_val1.columns)

In [28]:
#normalised test
X2_test=min_max.transform(X_test)
X2_test=pd.DataFrame(X2_test,columns=X_test.columns)

In [29]:
# enoding the target variables1
lab_enc1=LabelEncoder()
y1_en=lab_enc1.fit_transform(y_train1)
y1_en_val=lab_enc1.transform(y_val1)

In [31]:
# enoding the target variables2
lab_enc2=LabelEncoder()
y2_en=lab_enc2.fit_transform(y_train2)
y2_en_val=lab_enc2.transform(y_val2)

### genetic_disorder

In [33]:
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X2, pd.DataFrame(y1_en))
print(f'''shape of X before SMOTE: {X2.shape} 
shape of X after SMOTE: {X_sm.shape}''')
print('balanced class (%):')
y_sm.value_counts(normalize=True) * 100

shape of X before SMOTE: (14437, 48) 
shape of X after SMOTE: (22176, 48)
balanced class (%):


0    33.333333
1    33.333333
2    33.333333
dtype: float64

In [34]:
y1_enco=np.array(y_sm).ravel()

In [35]:
from sklearn.feature_selection import SelectKBest,chi2

In [61]:
sel1=SelectKBest(chi2, k=15).fit(X_sm,y1_enco)

In [62]:
cols=sel1.get_support(indices=True)
print(X_sm.iloc[:,cols].shape)
result_kbest_20=X_sm.iloc[:,cols]

(22176, 15)


In [63]:
sele_fea= X2.columns[(sel1.get_support())]
print(sele_fea)

Index(['defective_mother', 'defective_father', 'maternal_gene',
       'paternal_gene', 'folic_acid_periconceptional',
       'previous_pregnancy_anomalies', 'Symptom 1', 'Symptom 2', 'Symptom 3',
       'Symptom 4', 'Symptom 5', 'birth_defect_autopsy_None',
       'blood_test_result_abnormal', 'blood_test_result_normal',
       'blood_test_result_slightly abnormal'],
      dtype='object')


In [64]:
print(X2_val.iloc[:,cols].shape)
result_kbest_val=X2_val.iloc[:,cols]

(3610, 15)


In [65]:
print(X2_test.iloc[:,cols].shape)
result_kbest_test20=X2_test.iloc[:,cols]

(9465, 15)


In [66]:
from sklearn.metrics import f1_score

In [67]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=50,random_state=42)
DT.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)
predict_y =cal_clf .predict(result_kbest_20)
print ('The train f1_macro is:',f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_val)
print('The cross validation f1_macro is:',f1_score(y1_en_val, predict_y,average='macro'))

The train f1_macro is: 0.6832191567014755
The cross validation f1_macro is: 0.5588130468927824


In [43]:
rfc=RandomForestClassifier(random_state=42)
params1={'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
random_rfc=RandomizedSearchCV(rfc,param_distributions=params1,verbose=10,n_jobs=-1,random_state=42)
random_rfc.fit(result_kbest_20,y1_enco)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV 3/5; 3/10] START bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1400
[CV 3/5; 3/10] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1400;, score=0.713 total time=  33.8s
[CV 4/5; 4/10] START bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1800
[CV 4/5; 4/10] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1800;, score=0.745 total time= 1.0min


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


KeyboardInterrupt: 

In [None]:
print(random_rfc.best_estimator_)

In [None]:
print(random_rfc.best_score_)

In [60]:
rfc = RandomForestClassifier(n_estimators=100,max_depth=20,max_features='sqrt',bootstrap=False, min_samples_leaf=2, min_samples_split=10,random_state=42)
rfc.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(rfc, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)
predict_y =cal_clf .predict(result_kbest_20)
print ('The train f1_macro is:',f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_val)
print('The cross validation f1_macro is:',f1_score(y1_en_val, predict_y,average='macro'))

The train f1_macro is: 0.7458470950466963
The cross validation f1_macro is: 0.5472886257931504


#### Disorder_subclass

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

In [None]:
smd=BorderlineSMOTE(random_state=42)
X_smd, y_smd = smd.fit_resample(X2, pd.DataFrame(y2_en))
print(f'''shape of X before SMOTE: {X2.shape} 
shape of X after SMOTE: {X_smd.shape}''')
print('balanced class (%):')
y_smd.value_counts(normalize=True) * 100

In [None]:
X_smd.head(2)

In [None]:
y2_enco=np.array(y_smd).ravel()

In [None]:
from sklearn.feature_selection import SelectKBest,chi2

In [None]:
# feature selection 
sel2=SelectKBest(chi2, k=25).fit(X_smd,y2_enco)

In [None]:
cols=sel2.get_support(indices=True)
print(X_smd.iloc[:,cols].shape)
result_kbest_20d=X_smd.iloc[:,cols]

In [None]:
sele_fead= X2.columns[(sel2.get_support())]
print(sele_fead)

In [None]:
print(X2_val.iloc[:,cols].shape)
result_kbest_vald=X2_val.iloc[:,cols]

In [None]:
print(X2_test.iloc[:,cols].shape)
result_kbest_test20d=X2_test.iloc[:,cols]

In [None]:
nn=[x for x in range(1, 15, 2)]
cv_f1_macro=[]
for i in nn:
    knn=KNeighborsClassifier(n_neighbors=i,n_jobs=-1)
    knn.fit(result_kbest_20d,y2_enco)
    cal_clf = CalibratedClassifierCV(knn, method="sigmoid")
    cal_clf.fit(result_kbest_20d,y2_enco)
    predict_y=cal_clf.predict(result_kbest_vald)
    cv_f1_macro.append(f1_score(y2_en_val, predict_y,average='macro'))
for i in range(len(cv_f1_macro)):
    print ('f1_macro for k = ',nn[i],'is',cv_f1_macro[i])
best_nn = np.argmax(cv_f1_macro)
knn=KNeighborsClassifier(n_neighbors=nn[best_nn])
knn.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(knn, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)

predict_y =cal_clf .predict(result_kbest_20d)
print ('For values of best nn = ', nn[best_nn], "The train f1_macro is:",f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('For values of best nn = ', nn[best_nn], "The cross validation f1_macro is:",f1_score(y2_en_val, predict_y,average='macro'))

In [None]:
C1= [10 ** x for x in range(-5, 4)]
cv_f1_macro=[]
for i in C1:
    logisticR=LogisticRegression(penalty='l2',C=i,class_weight='balanced',max_iter=1000)
    logisticR.fit(result_kbest_20d,y2_enco)
    cal_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    cal_clf.fit(result_kbest_20d,y2_enco)
    predict_y=cal_clf.predict(result_kbest_vald)
    cv_f1_macro.append(f1_score(y2_en_val, predict_y,average='macro'))
for i in range(len(cv_f1_macro)):
    print ('f1_macro for C = ',C1[i],'is',cv_f1_macro[i])
best_C1 = np.argmax(cv_f1_macro)
logisticR=LogisticRegression(penalty='l2',C=C1[best_C1],class_weight='balanced',max_iter=1000)
logisticR.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)

predict_y =cal_clf .predict(result_kbest_20d)
print ('For values of best C = ',C1[best_C1], "The train f1_macro is:",f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('For values of best C = ',C1[best_C1], "The cross validation f1_macro is:",f1_score(y2_en_val, predict_y,average='macro'))

In [None]:
DT = DecisionTreeClassifier(random_state=42)
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}
random_dt=RandomizedSearchCV(DT,param_distributions=params,n_jobs=-1,random_state=42)
random_dt.fit(result_kbest_20d,y2_enco)

In [None]:
print(random_dt.best_estimator_)

In [None]:
print(random_dt.best_score_)

In [None]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=10,random_state=42)
DT.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)
predict_y =cal_clf .predict(result_kbest_20d)
print ('The train f1_macro is:',f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('The cross validation f1_macro is:',f1_score(y2_en_val, predict_y,average='macro'))

In [None]:
rfc1=r_cfl=RandomForestClassifier(random_state=42)
params1={'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10,15],
 'n_estimators': [200, 400, 500, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
random_rfc1=RandomizedSearchCV(rfc1,param_distributions=params1,n_jobs=-1,random_state=42)
random_rfc1.fit(result_kbest_20d,y2_enco)

In [None]:
print(random_rfc1.best_estimator_)

In [None]:
print(random_rfc1.best_score_)

In [None]:
rfc1= RandomForestClassifier(n_estimators=500,max_depth=30,min_samples_leaf=2,min_samples_split=5,bootstrap=False,random_state=42)
rfc1.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(rfc1, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)
predict_y =cal_clf .predict(result_kbest_20d)
print ('The train f1_macro is:',f1_score(y2_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_vald)
print('The cross validation f1_macro is:',f1_score(y2_en_val, predict_y,average='macro'))

In [None]:
from prettytable import PrettyTable

In [None]:
Pred_gene = PrettyTable(["Model", "Average_F1_Score"])
Pred_gene.add_row(['KNN','28.50'])
Pred_gene.add_row(['Logistic_Regression','25.80'])
Pred_gene.add_row(['Decision_Tree','32.69'])
Pred_gene.add_row(['RFC','31.49'])

In [None]:
print(Pred_gene)

#### Final Model

In [None]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=50,random_state=42)
DT.fit(result_kbest_20,y1_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20,y1_enco)
predict_y =cal_clf .predict(result_kbest_20)
print ('The train f1_macro is:',f1_score(y1_enco, predict_y,average='macro'))
predict_y = cal_clf.predict(result_kbest_test20)

In [None]:
Genetic_disorder=lab_enc1.inverse_transform(predict_y)

In [None]:
DT = DecisionTreeClassifier(max_depth=20,min_samples_leaf=10,random_state=42)
DT.fit(result_kbest_20d,y2_enco)
cal_clf = CalibratedClassifierCV(DT, method="sigmoid")
cal_clf.fit(result_kbest_20d,y2_enco)
predict_y =cal_clf .predict(result_kbest_20d)
print ('The train f1_macro is:',f1_score(y2_enco, predict_y,average='macro'))
predict_yd = cal_clf.predict(result_kbest_test20)

In [None]:
Disorder_Subclass=lab_enc2.inverse_transform(predict_yd)

In [None]:
data_fea_test1=data_test[data_test.isnull().all(1)!=True]

In [None]:
ids=data_fea_test1['Patient Id']
output=pd.DataFrame({'Patient Id': ids,'Genetic_Disorder':Genetic_disorder,'Disorder_Subclass':Disorder_Subclass})
output.to_csv('submission.csv',index=False)