# HackerEarth ML - Of Genomes And Genetics

# Step 1: Reading and Understanding the Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Plot styling: seaborn grid theme, then two matplotlib style sheets applied in order.
sns.set_style('whitegrid')
plt.style.use('seaborn-deep')
plt.style.use('fivethirtyeight')
# NOTE(review): the font family is 'sans-serif' but 'Ubuntu' is registered under
# 'font.serif' - confirm whether 'font.sans-serif' was intended.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
# Default text sizes and figure size for every plot in the notebook.
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['figure.titlesize'] = 14
plt.rcParams['figure.figsize'] = (12, 8)

# pandas options: silence the chained-assignment warning, show floats with two
# decimals, and widen the DataFrame display.
pd.options.mode.chained_assignment = None
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 400)
import warnings
warnings.filterwarnings('ignore')
import sklearn.base as skb
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import sklearn.utils as sku
import sklearn.linear_model as sklm
import sklearn.neighbors as skn
import sklearn.ensemble as ske
import catboost as cb
import scipy.stats as sstats
import random
seed = 12
np.random.seed(seed)

from datetime import date

In [None]:
!pip install pandas-profiling --quiet
import pandas_profiling as pp

In [None]:
# helper functions
def datasetShape(df):
    """Print how many rows and columns ``df`` has."""
    n_rows, n_cols = df.shape
    print(f"The dataframe has {n_rows} rows and {n_cols} columns.")
    
# select numerical and categorical features
def divideFeatures(df):
    """Split ``df`` into its numeric columns and its object-typed columns.

    Args:
        df: DataFrame to split.

    Returns:
        Tuple ``(numerical_features, categorical_features)``; each is a DataFrame
        holding the matching subset of columns, in their original order.
    """
    numerical_features = df.select_dtypes(include=[np.number])
    # The builtin `object` replaces the `np.object` alias, which NumPy removed.
    categorical_features = df.select_dtypes(include=[object])
    return numerical_features, categorical_features

In [None]:
# Load the training split. `base` is the input directory and is reused when the
# test split is loaded.
base = '/kaggle/input/of-genomes-and-genetics-hackerearth-ml/'
data_file = base + "train.csv"
df = pd.read_csv(data_file)
df.head()

In [None]:
# Load the test split from the same directory (rebinds `data_file`).
data_file = base + "test.csv"
df_test = pd.read_csv(data_file)
df_test.head()

In [None]:
# set target features: the two label columns this notebook predicts
targetFeature='Genetic Disorder'
targetFeature2='Disorder Subclass'

In [None]:
# check dataset shape (prints the row and column counts of the training frame)
datasetShape(df)

In [None]:
# remove ID from train data (in place); the test frame keeps 'Patient Id' because
# the submission file is built from it
df.drop(['Patient Id'], inplace=True, axis=1)

In [None]:
# check for duplicates: print the shape before and after dropping identical rows
print(df.shape)
df.drop_duplicates(inplace=True)
print(df.shape)

In [None]:
# column dtypes and non-null counts of the training frame
df.info()

In [None]:
# column dtypes and non-null counts of the test frame
df_test.info()

# Step 2: EDA

In [None]:
# remove columns treated as irrelevant, identically in train and test
irrelevant_cols = [
    'Patient First Name', 'Family Name', "Father's name", "Father's age", "Mother's age",
    'Institute Name', 'Location of Institute', 'Status', 'Parental consent',
    'Autopsy shows birth defect (if applicable)', 'Place of birth', 'No. of previous abortion',
]
df.drop(columns=irrelevant_cols, inplace=True)
df_test.drop(columns=irrelevant_cols, inplace=True)
df.describe()

In [None]:
# split the training frame into its numeric columns and its object-typed columns
cont_features, cat_features = divideFeatures(df)
cat_features.head()

### Univariate Analysis

In [None]:
# check target feature distribution ('Genetic Disorder')
df[targetFeature].hist()
plt.show()

In [None]:
# check second target feature distribution ('Disorder Subclass')
df[targetFeature2].hist()
plt.show()

In [None]:
# boxplots of numerical features for outlier detection, one panel per column on a 4x4 grid

fig = plt.figure(figsize=(16,16))
for position, column in enumerate(cont_features.columns, start=1):
    fig.add_subplot(4, 4, position)
    sns.boxplot(y=cont_features[column])
plt.tight_layout()
plt.show()

In [None]:
# sns.pairplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(...) only left an empty figure in the output. Use pairplot's `height`
# argument to change the size of each panel.
sns.pairplot(df)
plt.show()

In [None]:
# correlation heatmap for the numerical features
# Select the numeric columns explicitly: DataFrame.corr() raises on object columns in
# recent pandas versions instead of silently skipping them.
corr = df.select_dtypes(include=[np.number]).corr()
# The builtin `bool` replaces the `np.bool` alias, which NumPy removed.
mask = np.zeros_like(corr, dtype=bool)
# hide the upper triangle (including the diagonal); the matrix is symmetric
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask = mask, annot=True)
plt.show()

### Profiling for Whole Data

In [None]:
# build the profiling report for the training frame and save it as profile.html
profile = pp.ProfileReport(df, title='Pandas Profiling Report', explorative=True)
profile.to_file("profile.html")

In [None]:
# show the profiling report inside the notebook
profile.to_notebook_iframe()

# Step 3: Data Preparation

### Handle Missing

In [None]:
# remove columns that are entirely empty (train and test) and rows that are
# entirely empty (train only)
df.dropna(axis=1, how="all", inplace=True)
df_test.dropna(axis=1, how="all", inplace=True)
df.dropna(axis=0, how="all", inplace=True)

# drop rows where either target feature is missing
df.dropna(subset=['Genetic Disorder', 'Disorder Subclass'], how='any', inplace=True)
datasetShape(df)

In [None]:
# keep only the training columns that have more than one distinct value, then apply
# the same selection to the test frame (without the targets, with 'Patient Id' first)
keep = df.columns[df.nunique() > 1].tolist()
df = df[keep]
for target_col in ('Disorder Subclass', 'Genetic Disorder'):
    keep.remove(target_col)
keep.insert(0, 'Patient Id')
df_test = df_test[keep]
datasetShape(df)

In [None]:
# plot missing values

def calc_missing(df):
    """Return per-column NaN counts and percentages for columns with missing data.

    Returns:
        ``(missing, missing_perc)``: two Series indexed by column name, sorted by
        count in descending order; columns without NaN are left out.
    """
    null_counts = df.isna().sum().sort_values(ascending=False)
    missing = null_counts.loc[null_counts != 0]
    n_rows = df.shape[0]
    return missing, missing / n_rows * 100

# Plot the per-column NaN counts when at least one column has missing values;
# `missing_perc` is computed but not used by the plot.
if df.isna().any().sum()>0:
    missing, missing_perc = calc_missing(df)
    missing.plot(kind='bar',figsize=(30,8))
    plt.title('Missing Values')
    plt.show()
else:
    print("No Missing Values")

In [None]:
def fillNan(df, col, value):
    """Replace the NaN entries of ``df[col]`` with ``value``, modifying ``df``.

    Args:
        df: DataFrame to update in place.
        col: Name of the column to fill.
        value: Replacement for missing entries.

    Returns:
        None.
    """
    # Assign the filled column back to the frame. Calling
    # `df[col].fillna(..., inplace=True)` acts on an intermediate Series and does
    # not update `df` when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(value)

In [None]:
# setting Assisted conception IVF/ART missing values to 'Yes' (train and test)
# NOTE(review): this fill was previously described as "Not available" - confirm that
# 'Yes' is the intended value.
fillNan(df, 'Assisted conception IVF/ART', 'Yes')
fillNan(df_test, 'Assisted conception IVF/ART', 'Yes')
df['Assisted conception IVF/ART'].isna().any()

In [None]:
# setting H/O radiation exposure (x-ray) missing values to '-' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'H/O radiation exposure (x-ray)', '-')
fillNan(df_test, 'H/O radiation exposure (x-ray)', '-')
df['H/O radiation exposure (x-ray)'].isna().any()

In [None]:
# setting Respiratory Rate (breaths/min) missing values to 'Normal (30-60)' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Respiratory Rate (breaths/min)', 'Normal (30-60)')
fillNan(df_test, 'Respiratory Rate (breaths/min)', 'Normal (30-60)')
df['Respiratory Rate (breaths/min)'].isna().any()

In [None]:
# setting Folic acid details (peri-conceptional) missing values to 'Yes' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Folic acid details (peri-conceptional)', 'Yes')
fillNan(df_test, 'Folic acid details (peri-conceptional)', 'Yes')
df['Folic acid details (peri-conceptional)'].isna().any()

In [None]:
# setting H/O serious maternal illness missing values to 'No' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'H/O serious maternal illness', 'No')
fillNan(df_test, 'H/O serious maternal illness', 'No')
df['H/O serious maternal illness'].isna().any()

In [None]:
# setting Birth asphyxia missing values to 'Not available' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Birth asphyxia', 'Not available')
fillNan(df_test, 'Birth asphyxia', 'Not available')
df['Birth asphyxia'].isna().any()

In [None]:
# setting Birth defects missing values to 'Singular' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Birth defects', 'Singular')
fillNan(df_test, 'Birth defects', 'Singular')
df['Birth defects'].isna().any()

In [None]:
# setting Blood test result missing values to 'inconclusive' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Blood test result', 'inconclusive')
fillNan(df_test, 'Blood test result', 'inconclusive')
df['Blood test result'].isna().any()

In [None]:
# setting H/O substance abuse missing values to '-' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'H/O substance abuse', '-')
fillNan(df_test, 'H/O substance abuse', '-')
df['H/O substance abuse'].isna().any()

In [None]:
# setting White Blood cell count missing values to the training mean.
# The test frame is filled with the mean of the training column as well; that mean
# is recomputed after the training column has already been filled.
fillNan(df, 'White Blood cell count (thousand per microliter)', df['White Blood cell count (thousand per microliter)'].mean())
fillNan(df_test, 'White Blood cell count (thousand per microliter)', df['White Blood cell count (thousand per microliter)'].mean())
df['White Blood cell count (thousand per microliter)'].isna().any()

In [None]:
# setting History of anomalies in previous pregnancies missing values to 'No' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'History of anomalies in previous pregnancies', 'No')
fillNan(df_test, 'History of anomalies in previous pregnancies', 'No')
df['History of anomalies in previous pregnancies'].isna().any()

In [None]:
# setting Inherited from father missing values to 'No' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Inherited from father', 'No')
fillNan(df_test, 'Inherited from father', 'No')
df['Inherited from father'].isna().any()

In [None]:
# setting Gender missing values to 'Ambiguous' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Gender', 'Ambiguous')
fillNan(df_test, 'Gender', 'Ambiguous')
df['Gender'].isna().any()

In [None]:
# setting Follow-up missing values to 'Low' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Follow-up', 'Low')
fillNan(df_test, 'Follow-up', 'Low')
df['Follow-up'].isna().any()

In [None]:
# setting Maternal gene missing values to 'No' (train and test);
# the last line shows whether train still has NaN in this column
fillNan(df, 'Maternal gene', 'No')
fillNan(df_test, 'Maternal gene', 'No')
df['Maternal gene'].isna().any()

In [None]:
# setting Patient Age missing values to the training mean.
# The test frame is filled with the mean of the training column as well; that mean
# is recomputed after the training column has already been filled.
fillNan(df, 'Patient Age', df['Patient Age'].mean())
fillNan(df_test, 'Patient Age', df['Patient Age'].mean())
df['Patient Age'].isna().any()

In [None]:
# setting missing values to the most frequent training value, column by column;
# train is filled first and test is then filled with the training mode
mode_filled_cols = [
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5',
    'Heart Rate (rates/min',
]
for mode_col in mode_filled_cols:
    fillNan(df, mode_col, df[mode_col].mode()[0])
    fillNan(df_test, mode_col, df[mode_col].mode()[0])

In [None]:
# number of columns that still contain at least one NaN, for each frame
print("Train Missing:",df.isna().any().sum())
print("Test Missing:",df_test.isna().any().sum())

### One-hot Encoding

In [None]:
# recompute the numeric / object-typed column split after imputation;
# `cont_features.columns` is what the scaling cells later select
cont_features, cat_features = divideFeatures(df)
cat_features

In [None]:
# categorical columns that are one-hot encoded in both the train and the test frame
custom_feat = ["Genes in mother's side",
 'Inherited from father',
 'Maternal gene',
 'Paternal gene',
 'Respiratory Rate (breaths/min)',
 'Heart Rate (rates/min',
 'Follow-up',
 'Gender',
 'Birth asphyxia',
 'Folic acid details (peri-conceptional)',
 'H/O serious maternal illness',
 'H/O radiation exposure (x-ray)',
 'H/O substance abuse',
 'Assisted conception IVF/ART',
 'History of anomalies in previous pregnancies',
 'Birth defects',
 'Blood test result']

In [None]:
# one-hot encode every column listed in custom_feat (first level dropped); the
# original columns are removed and the dummy columns are appended in list order
dummy_frames = [
    pd.get_dummies(df[feat], drop_first=True, prefix=feat+"_")
    for feat in custom_feat
]
df = pd.concat([df.drop(custom_feat, axis=1)] + dummy_frames, axis=1)
datasetShape(df)

df.head()

In [None]:
# one-hot encode the same categorical columns in the test frame
for feat in custom_feat:
    dummyVars = pd.get_dummies(df_test[feat], drop_first=True, prefix=feat+"_")
    df_test = pd.concat([df_test, dummyVars], axis=1)
    df_test.drop(feat, axis=1, inplace=True)

# Train and test are encoded independently, so their dummy columns differ whenever a
# category occurs in only one of them. Align the test frame to the training feature
# columns: dummies missing from test become all-zero columns, test-only dummies are
# dropped, and 'Patient Id' is kept for the submission.
train_feature_cols = [c for c in df.columns if c not in (targetFeature, targetFeature2)]
df_test = df_test.reindex(columns=['Patient Id'] + train_feature_cols, fill_value=0)
datasetShape(df_test)

df_test.head()

# Step 4: Data Modelling

### Split Train-Test Data

In [None]:
# helper functions

def printScore(y_train, y_train_pred):
    """Print the macro-averaged F1 score of ``y_train_pred`` against ``y_train``."""
    macro_f1 = skm.f1_score(y_train, y_train_pred, average="macro")
    print(macro_f1)

In [None]:
# Two shuffled copies of the training frame, drawn with the same seed: df_f1 is used
# for the 'Genetic Disorder' target, df_f2 for the 'Disorder Subclass' target.
df_f1 = df.sample(frac=1, random_state=seed).reset_index(drop=True)
df_f2 = df.sample(frac=1, random_state=seed).reset_index(drop=True)

In [None]:
# remove Disorder Subclass from df1 so it cannot be used as a feature for the first target
df_f1.drop('Disorder Subclass', inplace=True, axis=1)

# convert Genetic Disorder to one-hot (alternative encoding, currently disabled)
# dummyVars = pd.get_dummies(df_f2['Genetic Disorder'], drop_first=True, prefix="GeneticDisorder_")
# df_f2 = pd.concat([df_f2, dummyVars], axis=1)
# df_f2.drop('Genetic Disorder', axis=1, inplace=True)

# convert Genetic Disorder to label-encoding; in df_f2 it stays in the frame as an
# integer input feature for the 'Disorder Subclass' models
gdle = skp.LabelEncoder()
df_f2['Genetic Disorder'] = gdle.fit_transform(df_f2['Genetic Disorder'])

In [None]:
# shuffle samples (a second shuffle with the same seed, on top of the one that built df_f1/df_f2)
df_f1_shuffle = df_f1.sample(frac=1, random_state=seed).reset_index(drop=True)
df_f2_shuffle = df_f2.sample(frac=1, random_state=seed).reset_index(drop=True)

# separate target feature; pop() removes the column from df_f1_shuffle in place,
# so df_f1_X and df_f1_shuffle are the same feature-only frame afterwards
df_f1_y = df_f1_shuffle.pop(targetFeature)
df_f1_X = df_f1_shuffle

# transform the text label to integers; f1_le is reused later to encode the full
# training labels and to decode the test predictions
f1_le = skp.LabelEncoder()
df_f1_y = f1_le.fit_transform(df_f1_y)
# print(f1_le.classes_)

# split into train (80%) and hold-out test (20%)
X_f1_train, X_f1_test, y_f1_train, y_f1_test = skms.train_test_split(df_f1_X, df_f1_y, train_size=0.8, random_state=seed)
print(f"Train set has {X_f1_train.shape[0]} records out of {len(df_f1_shuffle)} which is {round(X_f1_train.shape[0]/len(df_f1_shuffle)*100)}%")
print(f"Test set has {X_f1_test.shape[0]} records out of {len(df_f1_shuffle)} which is {round(X_f1_test.shape[0]/len(df_f1_shuffle)*100)}%")

# separate target feature for the second target (same in-place pop as above)
df_f2_y = df_f2_shuffle.pop(targetFeature2)
df_f2_X = df_f2_shuffle

# transform the text label to integers; f2_le is reused later to encode the full
# training labels and to decode the test predictions
f2_le = skp.LabelEncoder()
df_f2_y = f2_le.fit_transform(df_f2_y)
# print(f2_le.classes_)

# split into train (80%) and hold-out test (20%)
X_f2_train, X_f2_test, y_f2_train, y_f2_test = skms.train_test_split(df_f2_X, df_f2_y, train_size=0.8, random_state=seed)
print(f"Train set has {X_f2_train.shape[0]} records out of {len(df_f2_shuffle)} which is {round(X_f2_train.shape[0]/len(df_f2_shuffle)*100)}%")
print(f"Test set has {X_f2_test.shape[0]} records out of {len(df_f2_shuffle)} which is {round(X_f2_test.shape[0]/len(df_f2_shuffle)*100)}%")

### Feature Scaling

In [None]:
# reset index for X_train and X_test (f1 split) so both start at 0; the scaling cell
# assigns a freshly built DataFrame back into these frames and relies on matching indexes
X_f1_train.reset_index(drop=True, inplace=True)
X_f1_test.reset_index(drop=True, inplace=True)
X_f1_train.index[:5]

In [None]:
# reset index for X_train and X_test (f2 split) so both start at 0; the scaling cell
# assigns a freshly built DataFrame back into these frames and relies on matching indexes
X_f2_train.reset_index(drop=True, inplace=True)
X_f2_test.reset_index(drop=True, inplace=True)
X_f2_train.index[:5]

In [None]:
# scaler = skp.RobustScaler()
# scaler = skp.MinMaxScaler()
scaler = skp.StandardScaler()

# apply scaling to all numerical variables except dummy variables as they are already between 0 and 1
# The scaled values are wrapped in a new DataFrame with a default 0..n-1 index, so
# this assignment depends on X_f1_train having had its index reset beforehand.
X_f1_train[cont_features.columns] = pd.DataFrame(scaler.fit_transform(X_f1_train[cont_features.columns]), columns=cont_features.columns)

# scale test data with transform() only, reusing the statistics fitted on the training split
X_f1_test[cont_features.columns] = pd.DataFrame(scaler.transform(X_f1_test[cont_features.columns]), columns=cont_features.columns)

# view sample data
X_f1_train.describe()

In [None]:
# scaler = skp.RobustScaler()
# scaler = skp.MinMaxScaler()
# Rebinds the module-level name `scaler` to a new scaler for the f2 split.
scaler = skp.StandardScaler()

# apply scaling to all numerical variables except dummy variables as they are already between 0 and 1
# The scaled values are wrapped in a new DataFrame with a default 0..n-1 index, so
# this assignment depends on X_f2_train having had its index reset beforehand.
X_f2_train[cont_features.columns] = pd.DataFrame(scaler.fit_transform(X_f2_train[cont_features.columns]), columns=cont_features.columns)

# scale test data with transform() only, reusing the statistics fitted on the training split
X_f2_test[cont_features.columns] = pd.DataFrame(scaler.transform(X_f2_test[cont_features.columns]), columns=cont_features.columns)

# view sample data
X_f2_train.describe()

## Model Building

In [None]:
# Balanced class weights for the f1 training labels, as a {class index: weight} dict.
# `classes` and `y` are passed by keyword: current scikit-learn accepts them only
# as keyword arguments.
class_weights_f1 = sku.class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y_f1_train), y=y_f1_train)
class_weights_f1 = dict(enumerate(class_weights_f1))
class_weights_f1

In [None]:
# Balanced class weights for the f2 training labels, as a {class index: weight} dict.
# `classes` and `y` are passed by keyword: current scikit-learn accepts them only
# as keyword arguments.
class_weights_f2 = sku.class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y_f2_train), y=y_f2_train)
class_weights_f2 = dict(enumerate(class_weights_f2))
class_weights_f2

In [None]:
# one balanced weight per training row of the f1 split
sample_weights_f1 = sku.class_weight.compute_sample_weight('balanced', y_f1_train)
sample_weights_f1

In [None]:
# one balanced weight per training row of the f2 split
sample_weights_f2 = sku.class_weight.compute_sample_weight('balanced', y_f2_train)
sample_weights_f2

### CatBoost

In [None]:
import catboost as cb

# CatBoost for 'Genetic Disorder': 70 iterations with the balanced class weights.
# The hold-out split is supplied as eval_set and best_score_ is printed after fitting.
cat_model_f1 = cb.CatBoostClassifier(verbose=0, iterations=70, 
#                                   eval_metric='F1', 
                                  class_weights=class_weights_f1, 
#                                   use_best_model=True
                                 )
cat_model_f1.fit(X_f1_train, y_f1_train, eval_set=(X_f1_test, y_f1_test))
print(cat_model_f1.best_score_)

# accuracy, then macro-F1, each for the training split followed by the hold-out split
y_f1_train_pred = cat_model_f1.predict(X_f1_train)
y_f1_test_pred = cat_model_f1.predict(X_f1_test)
print(skm.accuracy_score(y_f1_train, y_f1_train_pred))
print(skm.accuracy_score(y_f1_test, y_f1_test_pred))
printScore(y_f1_train, y_f1_train_pred)
printScore(y_f1_test, y_f1_test_pred)

In [None]:
import catboost as cb

# CatBoost for 'Disorder Subclass': 80 iterations with the balanced class weights.
# The hold-out split is supplied as eval_set and best_score_ is printed after fitting.
cat_model_f2 = cb.CatBoostClassifier(verbose=0, iterations=80, 
#                                   eval_metric='F1', 
                                  class_weights=class_weights_f2, 
#                                   use_best_model=True
                                 )
cat_model_f2.fit(X_f2_train, y_f2_train, eval_set=(X_f2_test, y_f2_test))
print(cat_model_f2.best_score_)

# accuracy, then macro-F1, each for the training split followed by the hold-out split
y_f2_train_pred = cat_model_f2.predict(X_f2_train)
y_f2_test_pred = cat_model_f2.predict(X_f2_test)
print(skm.accuracy_score(y_f2_train, y_f2_train_pred))
print(skm.accuracy_score(y_f2_test, y_f2_test_pred))
printScore(y_f2_train, y_f2_train_pred)
printScore(y_f2_test, y_f2_test_pred)

### RandomForest

In [None]:
# Random forest for 'Genetic Disorder' with per-bootstrap balanced class weights.
rf_model_f1 = ske.RandomForestClassifier(verbose=0, random_state=1, n_jobs=-1, class_weight='balanced_subsample',
                                 n_estimators=100,max_depth=10, 
                                 min_samples_split = 5, min_samples_leaf = 3
                                )
rf_model_f1.fit(X_f1_train, y_f1_train)

# predict, then print accuracy and macro-F1 for the training and hold-out splits
y_f1_train_pred = rf_model_f1.predict(X_f1_train)
y_f1_test_pred = rf_model_f1.predict(X_f1_test)
print(skm.accuracy_score(y_f1_train, y_f1_train_pred))
print(skm.accuracy_score(y_f1_test, y_f1_test_pred))
printScore(y_f1_train, y_f1_train_pred)
printScore(y_f1_test, y_f1_test_pred)

In [None]:
# Random forest for 'Disorder Subclass' with per-bootstrap balanced class weights.
rf_model_f2 = ske.RandomForestClassifier(verbose=0, random_state=1, n_jobs=-1, class_weight='balanced_subsample',
                                 n_estimators=300,max_depth=10, 
                                 min_samples_split = 10, min_samples_leaf = 5
                                )
rf_model_f2.fit(X_f2_train, y_f2_train)

# predict, then print accuracy and macro-F1 for the training and hold-out splits
y_f2_train_pred = rf_model_f2.predict(X_f2_train)
y_f2_test_pred = rf_model_f2.predict(X_f2_test)
print(skm.accuracy_score(y_f2_train, y_f2_train_pred))
print(skm.accuracy_score(y_f2_test, y_f2_test_pred))
printScore(y_f2_train, y_f2_train_pred)
printScore(y_f2_test, y_f2_test_pred)

### XGBoost

In [None]:
import xgboost as xg

In [None]:
# # Grid used for parameter tuning
# param_test1 = {
#     'max_depth': np.arange(5, 12, 2),
#     'learning_rate': np.arange(0.04, 0.07, 0.01)
# }
# xgb_cv1 = skms.GridSearchCV(estimator = xg.XGBClassifier(n_estimators=100, objective='macro', nthread=4, seed=seed), 
#                              param_grid = param_test1, scoring='f1', n_jobs=4, 
#                              cv=3, verbose=1)
# xgb_cv1.fit(X_f1_train, y_f1_train)
# print(xgb_cv1.best_params_, xgb_cv1.best_score_)
# # max_depth = 10
# # learning_rate = 0.04

In [None]:
# # Grid used for parameter tuning
# param_test2 = {
#  'subsample': np.arange(0.5, 1, 0.1),
#  'min_child_weight': range(1, 6, 1)
# }
# xgb_cv2 = skms.GridSearchCV(estimator = xg.XGBClassifier(n_estimators=500, max_depth = 10, 
#                                                      objective= 'multi:softprob', nthread=4, seed=seed), 
#                             param_grid = param_test2, scoring='f1', n_jobs=4,
#                             cv=5, verbose=1)
# xgb_cv2.fit(X_train_small, y_train_small)
# print(xgb_cv2.best_params_, xgb_cv2.best_score_)
# print(xgb_cv2.best_estimator_)
# # subsample = 0.5
# # min_child_weight = 2

In [None]:
# XGBoost for 'Genetic Disorder'.
# `scoring` and `sample_weight` are not XGBClassifier constructor parameters, so the
# balanced sample weights are passed to fit(), where they are actually applied.
xgb_model_f1 = xg.XGBClassifier(objective ='multi:softprob', random_state=seed,
                             learning_rate=0.0001, subsample=0.5, n_jobs=-1,
                             n_estimators=100, max_depth = 8)
xgb_model_f1.fit(X_f1_train, y_f1_train, sample_weight=sample_weights_f1)

# predict, then print accuracy and macro-F1 for the training and hold-out splits
y_f1_train_pred = xgb_model_f1.predict(X_f1_train)
y_f1_test_pred = xgb_model_f1.predict(X_f1_test)
print(skm.accuracy_score(y_f1_train, y_f1_train_pred))
print(skm.accuracy_score(y_f1_test, y_f1_test_pred))
printScore(y_f1_train, y_f1_train_pred)
printScore(y_f1_test, y_f1_test_pred)

In [None]:
# XGBoost for 'Disorder Subclass'.
# `scoring` and `sample_weight` are not XGBClassifier constructor parameters, so the
# balanced sample weights are passed to fit(), where they are actually applied.
xgb_model_f2 = xg.XGBClassifier(objective ='multi:softprob', random_state=seed,
                             learning_rate=0.15, subsample=1, n_jobs=-1,
                             n_estimators=100, max_depth = 5)
xgb_model_f2.fit(X_f2_train, y_f2_train, sample_weight=sample_weights_f2)

# predict, then print accuracy and macro-F1 for the training and hold-out splits
y_f2_train_pred = xgb_model_f2.predict(X_f2_train)
y_f2_test_pred = xgb_model_f2.predict(X_f2_test)
print(skm.accuracy_score(y_f2_train, y_f2_train_pred))
print(skm.accuracy_score(y_f2_test, y_f2_test_pred))
printScore(y_f2_train, y_f2_train_pred)
printScore(y_f2_test, y_f2_test_pred)

### LightGBM

In [None]:
import lightgbm as lgb
# LightGBM for 'Genetic Disorder' with the balanced class-weight dict.
# NOTE(review): objective='multi' is passed as written - confirm it is accepted by
# the installed LightGBM version.
lgb_model_f1 = lgb.LGBMClassifier(objective='multi', random_state=1, n_jobs=-1, 
                               class_weight=class_weights_f1,
                               learning_rate=0.1, n_estimators=70)
lgb_model_f1.fit(X_f1_train, y_f1_train)

# predict, then print accuracy and macro-F1 for the training and hold-out splits
y_f1_train_pred = lgb_model_f1.predict(X_f1_train)
y_f1_test_pred = lgb_model_f1.predict(X_f1_test)
print(skm.accuracy_score(y_f1_train, y_f1_train_pred))
print(skm.accuracy_score(y_f1_test, y_f1_test_pred))
printScore(y_f1_train, y_f1_train_pred)
printScore(y_f1_test, y_f1_test_pred)

In [None]:
import lightgbm as lgb
# LightGBM for 'Disorder Subclass'; the class_weight argument is commented out, so
# this model is trained without class weights.
# NOTE(review): objective='multi' is passed as written - confirm it is accepted by
# the installed LightGBM version.
lgb_model_f2 = lgb.LGBMClassifier(objective='multi', random_state=1, n_jobs=-1, 
#                                class_weight=class_weights_f2,
                               learning_rate=0.08, n_estimators=100)
lgb_model_f2.fit(X_f2_train, y_f2_train)

# predict, then print accuracy and macro-F1 for the training and hold-out splits
y_f2_train_pred = lgb_model_f2.predict(X_f2_train)
y_f2_test_pred = lgb_model_f2.predict(X_f2_test)
print(skm.accuracy_score(y_f2_train, y_f2_train_pred))
print(skm.accuracy_score(y_f2_test, y_f2_test_pred))
printScore(y_f2_train, y_f2_train_pred)
printScore(y_f2_test, y_f2_test_pred)

# Step 5: Test Evaluation & Submission

In [None]:
# Generate Ensembles

def rmse_cv(model):
    '''
    Return the per-fold RMSE of ``model`` from a 5-fold cross-validation.

    Reads the module-level names X_train and y_train.
    NOTE(review): no X_train / y_train is defined in the visible notebook cells and
    the only calls to this function are commented out - confirm before using it.
    '''
    neg_mse_per_fold = skms.cross_val_score(
        model, X_train, y_train,
        scoring="neg_mean_squared_error", cv=5, n_jobs=-1,
    )
    return np.sqrt(-neg_mse_per_fold)

class MixModel(skb.BaseEstimator, skb.ClassifierMixin, skb.TransformerMixin):
    '''
    Hard-voting ensemble classifier.

    Takes a list of estimators, refits an independent clone of each one in fit(),
    and predicts the class label that receives the most votes among the clones.
    Labels must be non-negative integers (for example LabelEncoder output).
    '''
    def __init__(self, algs):
        self.algs = algs

    def fit(self, X, y):
        '''Clone every estimator in ``algs`` and fit the clones on (X, y).'''
        # Clone so the estimators passed in are left untouched.
        self.algs_ = [skb.clone(x) for x in self.algs]

        # Train cloned base models
        for alg in self.algs_:
            alg.fit(X, y)

        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        '''Return the majority-vote label for each row of X as a 1-D array.'''
        # One column of votes per model. ravel() flattens predictions returned as an
        # (n, 1) array, and the integer cast satisfies np.bincount.
        votes = np.column_stack([
            np.asarray(model.predict(X)).ravel() for model in self.algs_
        ]).astype(int)
        # bincount(...).argmax() picks the most frequent label; a tie goes to the
        # smallest label.
        return np.apply_along_axis(lambda row: np.bincount(row).argmax(), axis=1, arr=votes)

In [None]:
# Majority-vote ensemble of the four 'Genetic Disorder' models; fitting it clones
# each model and refits the clone on the f1 training split.
mixed_model_f1 = MixModel(algs = [
    cat_model_f1,
    rf_model_f1,
    xgb_model_f1,
    lgb_model_f1
])
# score = rmse_cv(mixed_model)
# print("\nAveraged base algs score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

mixed_model_f1.fit(X_f1_train, y_f1_train)

# predict, then print macro-F1 for the training and hold-out splits
y_f1_train_pred = mixed_model_f1.predict(X_f1_train)
y_f1_test_pred = mixed_model_f1.predict(X_f1_test)
printScore(y_f1_train, y_f1_train_pred)
printScore(y_f1_test, y_f1_test_pred)

In [None]:
# Ensemble for 'Disorder Subclass'; only the random forest is enabled, the other
# three models are commented out.
mixed_model_f2 = MixModel(algs = [
#     cat_model_f2,
    rf_model_f2,
#     xgb_model_f2,
#     lgb_model_f2
])
# score = rmse_cv(mixed_model)
# print("\nAveraged base algs score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

mixed_model_f2.fit(X_f2_train, y_f2_train)

# predict, then print macro-F1 for the training and hold-out splits
y_f2_train_pred = mixed_model_f2.predict(X_f2_train)
y_f2_test_pred = mixed_model_f2.predict(X_f2_test)
printScore(y_f2_train, y_f2_train_pred)
printScore(y_f2_test, y_f2_test_pred)

In [None]:
# generate test results for targetFeature
def getTestResults():
    """Refit the 'Genetic Disorder' ensemble on all training rows and predict the test set.

    Reads the module-level names ``df_f1``, ``df_test``, ``cont_features``,
    ``targetFeature``, ``f1_le`` and ``xgb_model_f1``. Prints the training accuracy
    and macro-F1.

    Returns:
        1-D array of label-encoded predictions (encoded with ``f1_le``), one per
        row of ``df_test``.
    """
    df_final_f1 = df_f1.sample(frac=1, random_state=1).reset_index(drop=True)
    test_cols_f1 = [x for x in df_final_f1.columns if targetFeature not in x]
    # Explicit copy: the scaled values are written into this frame, not into df_test.
    df_final_test_f1 = df_test[test_cols_f1].copy()
    df_y_f1 = df_final_f1.pop(targetFeature)
    df_X_f1 = df_final_f1

    df_y_f1 = f1_le.transform(df_y_f1)

    # Fit the scaler created here. The local scaler used to be left unused while
    # the module-level `scaler` from the validation cells was refit instead.
    scaler_f1 = skp.RobustScaler()
    num_cols = cont_features.columns

    # Assign the scaled arrays directly (by position), so the result does not
    # depend on how the frames are indexed.
    df_X_f1[num_cols] = scaler_f1.fit_transform(df_X_f1[num_cols])
    df_final_test_f1[num_cols] = scaler_f1.transform(df_final_test_f1[num_cols])

    # Only XGBoost is enabled for the final 'Genetic Disorder' model.
    model_f1 = MixModel(algs = [
#         cat_model_f1,
#         rf_model_f1,
        xgb_model_f1,
#         lgb_model_f1
    ])

    model_f1.fit(df_X_f1, df_y_f1)

    # predict
    y_train_pred_f1 = model_f1.predict(df_X_f1)
    y_test_pred_f1 = model_f1.predict(df_final_test_f1)
    print("Accuracy Score for Train:",skm.accuracy_score(df_y_f1, y_train_pred_f1))
    printScore(df_y_f1, y_train_pred_f1)
    return y_test_pred_f1

# ML models: run the 'Genetic Disorder' pipeline and keep the encoded test predictions
results = getTestResults()

In [None]:
# Start the submission frame: patient ids plus the predicted 'Genetic Disorder'
# labels decoded back to their original text values.
submission = pd.DataFrame({
    'Patient Id': df_test['Patient Id'],
    targetFeature: f1_le.inverse_transform(results.ravel()),
})
print(submission[targetFeature].value_counts())

In [None]:
# generate test results for targetFeature2
def getTestResults():
    """Refit the 'Disorder Subclass' ensemble on all training rows and predict the test set.

    This definition replaces the earlier function of the same name. It reads the
    module-level names ``df_f2``, ``df_test``, ``cont_features``, ``targetFeature2``,
    ``f2_le`` and ``rf_model_f2``, and expects ``df_test`` to already contain every
    feature column of ``df_f2``, including the encoded 'Genetic Disorder' column.
    Prints the training accuracy and macro-F1.

    Returns:
        1-D array of label-encoded predictions (encoded with ``f2_le``), one per
        row of ``df_test``.
    """
    df_final_f2 = df_f2.sample(frac=1, random_state=1).reset_index(drop=True)
    test_cols_f2 = [x for x in df_final_f2.columns if targetFeature2 not in x]
    # Explicit copy: the scaled values are written into this frame, not into df_test.
    df_final_test_f2 = df_test[test_cols_f2].copy()
    df_y_f2 = df_final_f2.pop(targetFeature2)
    df_X_f2 = df_final_f2

    df_y_f2 = f2_le.transform(df_y_f2)

    # Fit the scaler created here. The local scaler used to be left unused while
    # the module-level `scaler` from the validation cells was refit instead.
    scaler_f2 = skp.RobustScaler()
    num_cols = cont_features.columns

    # Assign the scaled arrays directly (by position), so the result does not
    # depend on how the frames are indexed.
    df_X_f2[num_cols] = scaler_f2.fit_transform(df_X_f2[num_cols])
    df_final_test_f2[num_cols] = scaler_f2.transform(df_final_test_f2[num_cols])

    # Only the random forest is enabled for the final 'Disorder Subclass' model.
    model_f2 = MixModel(algs = [
#         cat_model_f2,
        rf_model_f2,
#         xgb_model_f2,
#         lgb_model_f2
    ])

    model_f2.fit(df_X_f2, df_y_f2)

    # predict
    y_train_pred_f2 = model_f2.predict(df_X_f2)
    y_test_pred_f2 = model_f2.predict(df_final_test_f2)
    print("Accuracy Score for Train:",skm.accuracy_score(df_y_f2, y_train_pred_f2))
    printScore(df_y_f2, y_train_pred_f2)
    return y_test_pred_f2

# ML models
# Add the encoded 'Genetic Disorder' predictions to the test frame as an input
# feature for the subclass model, then run the 'Disorder Subclass' pipeline.
df_test[targetFeature] = results.ravel()
results2 = getTestResults()

In [None]:
# Add the predicted 'Disorder Subclass' labels, decoded back to their text values.
submission[targetFeature2] = f2_le.inverse_transform(results2.ravel())
print(submission[targetFeature2].value_counts())

In [None]:
# generate submission file (CSV in the working directory, without the index column)
submission.to_csv('./submission_XGB_RF1.csv', index=False)

With XGBoost for Genetic Disorder and Random Forest for Disorder Subclass, this notebook scores 33.87 on the leaderboard. You can clone it and try other models.