# Backward and Forward Selection

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('data_processed.csv')

cols = df.select_dtypes(include=['int64']).columns
df[cols]=df[cols].astype('category')
df.dtypes

X_df = df.iloc[:, 2:]
y_df = df.iloc[:, 1]

X_df.drop('OTHER',axis=1,inplace=True)

In [None]:
# We use an approx 6:4 train test splitting
cases = ['train','test']
np.random.seed(12)
case_list = np.random.choice(cases,size=X_df.shape[0],replace=True,p=[0.6,0.4])

X_df_train = X_df.iloc[case_list=='train',:]
X_df_test = X_df.iloc[case_list=='test',:]
y_df_train = y_df.iloc[case_list=='train']
y_df_test = y_df.iloc[case_list=='test']

In [None]:
# Tools in sklearn to select best model
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

# We use f1 score to test model performance
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression as logit # use build-in logistic regression model in sklearn

In [None]:
parameters = {
    'C':np.arange(start=0.005,stop=0.1,step=0.005)
}
parameters

In [None]:
stratifiedCV = StratifiedKFold(n_splits=10)
model_L1 = logit(penalty='l1', solver='liblinear')
BestL1 = GridSearchCV(
    model_L1,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1
)
BestL1.fit(X_df,y_df)

In [None]:
print(BestL1.best_estimator_)

print(BestL1.best_score_)

In [None]:
import pickle
from sklearn.linear_model import LogisticRegression as logit # use build-in logistic regression model in sklearn
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import roc_curve, precision_recall_curve

In [None]:
model = logit(penalty='l1',C=1/40,solver='liblinear') # c: 1/(strength of L1 regularization)

# Forward feature selection.
forward_selection = SFS(
    model, n_features_to_select="auto", direction="forward", tol=1e-4
).fit(X_df_train, y_df_train)

# save as a pickle file
model_pkl_file = "model_forwardSelect.pkl"

with open(model_pkl_file, 'wb') as file:
    pickle.dump(forward_selection, file)



# Backward feature selection.
backward_selection = SFS(
    model, n_features_to_select="auto", direction="backward", tol=-1e-4
).fit(X_df_train, y_df_train)


# save as a pickle file
model_pkl_file = "model_backwardSelect.pkl"

with open(model_pkl_file, 'wb') as file:
    pickle.dump(backward_selection, file)


print(forward_selection.get_feature_names_out())

print(backward_selection.get_feature_names_out())

['gender' 'admission_age' 'sbp_min' 'dbp_min' 'mbp_max' 'resp_rate_max'
 'temperature_min' 'temperature_max' 'spo2_min' 'ph_max' 'totalco2_max'
 'calcium_min' 'bun_min' 'ptt_max' 'height' 'weight_admit']
['gender' 'admission_age' 'heart_rate_max' 'sbp_min' 'sbp_max' 'dbp_min'
 'dbp_mean' 'mbp_min' 'mbp_max' 'resp_rate_max' 'temperature_min'
 'temperature_max' 'spo2_min' 'ph_max' 'so2_max' 'po2_min' 'pco2_max'
 'pao2fio2ratio_min' 'pao2fio2ratio_max' 'bicarbonate_min'
 'bicarbonate_max' 'totalco2_max' 'chloride_max' 'calcium_max'
 'albumin_min' 'aniongap_min' 'bun_min' 'abs_neutrophils_max' 'inr_max'
 'ptt_min' 'ptt_max' 'gcs_min' 'weight_admit']


In [None]:
len(forward_selection.get_feature_names_out())

16

In [None]:
len(backward_selection.get_feature_names_out())

33

In [None]:
# Full model
model.fit(X_df_train,y_df_train)
y_pred_full = model.predict_proba(X_df_test)

# Model with forward selected features
model.fit(forward_selection.transform(X_df_train),y_df_train)
y_pred_FS = model.predict_proba(forward_selection.transform(X_df_test))

# Model with backward selected features
model.fit(backward_selection.transform(X_df_train),y_df_train)
y_pred_BS = model.predict_proba(backward_selection.transform(X_df_test))

In [None]:
# roc_curve
fpr_full, tpr_full, _ = roc_curve(y_df_test,y_pred_full[:,1]) #the function returns 3 values: false pos values, true pos values and a thershold using which the model categorizes the output
fpr_FS, tpr_FS, _ = roc_curve(y_df_test,y_pred_FS[:,1])
fpr_BS, tpr_BS, _ = roc_curve(y_df_test,y_pred_BS[:,1])

roc_df = pd.DataFrame(
    {
        'False Positive Rate':np.hstack([fpr_full,fpr_FS,fpr_BS]),
        'True Positive Rate':np.hstack([tpr_full,tpr_FS,tpr_BS]),
        'method':['full_model']*len(fpr_full)+['FS']*len(fpr_FS)+['BS']*len(fpr_BS)
    }
)


plt.figure(figsize=(5, 5))

# Plot ROC curves
plt.plot(fpr_full, tpr_full, label='full_model')
plt.plot(fpr_FS, tpr_FS, label='FS')
plt.plot(fpr_BS, tpr_BS, label='BS')

# Customize the plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()

# Show the plot
plt.show()

In [None]:
# precision recall curves
p_full, r_full, _ = precision_recall_curve(y_df_test,y_pred_full[:,1])
p_FS, r_FS, _ = precision_recall_curve(y_df_test,y_pred_FS[:,1])
p_BS, r_BS, _ = precision_recall_curve(y_df_test,y_pred_BS[:,1])

pr_df = pd.DataFrame(
    {
        'Precision':np.hstack([p_full,p_FS,p_BS]),
        'Recall':np.hstack([r_full,r_FS,r_BS]),
        'method':['Full Model']*len(p_full)+['Forward Selection']*len(p_FS)+['Backward Selection']*len(p_BS)
    }
)

# Visualize precision recall curve
# Visualize Precision-Recall curve using Matplotlib
plt.figure(figsize=(5, 5))

# Plot Precision-Recall curves
plt.plot(r_full, p_full, label='Full Model')
plt.plot(r_FS, p_FS, label='Forward Selection')
plt.plot(r_BS, p_BS, label='Backward Selection')

# Customize the plot
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()

# Show the plot
plt.show()

# Hyperparameter tuning of different models

In [None]:
# Tools in sklearn to select best model
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

# We use f1 score to test model performance
from sklearn.metrics import f1_score, accuracy_score

# Import matplotlib.pyplot to visualize tree models
import matplotlib.pyplot as plt

In [None]:
cols_select = forward_selection.get_feature_names_out()
print(len(cols_select))
print(cols_select)

16
['gender' 'admission_age' 'sbp_min' 'dbp_min' 'mbp_max' 'resp_rate_max'
 'temperature_min' 'temperature_max' 'spo2_min' 'ph_max' 'totalco2_max'
 'calcium_min' 'bun_min' 'ptt_max' 'height' 'weight_admit']


In [None]:
X_df = df.iloc[:, 2:]
y_df = df.iloc[:, 1]

X_select = X_df[cols_select]

X_train, X_test, y_train, y_test = train_test_split(X_select, y_df, test_size=0.4, random_state=12)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.1, random_state=12)

In [None]:
from imblearn.over_sampling import SMOTENC

# Use SMOTE to resample minority class.
smote_sampler = SMOTENC(random_state=12,sampling_strategy='minority',categorical_features=['gender'])
X_train_SMOTE, y_train_SMOTE = smote_sampler.fit_resample(X_train, y_train)

## Logsitic Regressions

In [None]:
from sklearn.linear_model import LogisticRegression as logit

model_LR = logit(solver='liblinear')
model_LR.fit(X_train_SMOTE,y_train_SMOTE)

pred = model_LR.predict_proba(X_test)

threshold = 0.5

y_pred = np.where(pred[:,1]>threshold, 1, 0)

print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Pred']
)

F1 score on test set: 0.7308
accuracy score on test set: 0.6677


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2109,1162
1,2076,4396


In [None]:
pred = model_LR.predict_proba(X_val)

threshold = 0.5

y_pred = np.where(pred[:,1]>threshold, 1, 0)

print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(
    index=y_val,
    columns=y_pred,
    rownames=['True'],
    colnames=['Pred']
)

F1 score on validation set: 0.7260
accuracy score on validation set: 0.6620


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,232,135
1,231,485


In [None]:
model_L2 = logit(penalty='l2', solver='liblinear')
model_L2.fit(X_train_SMOTE,y_train_SMOTE)

pred_L2 = model_L2.predict_proba(X_test)

threshold = 0.5

y_pred = np.where(pred_L2[:,1]>threshold, 1, 0)

print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Pred']
)

F1 score on test set: 0.7308
accuracy score on test set: 0.6677


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2109,1162
1,2076,4396


In [None]:
pred = model_L2.predict_proba(X_val)

threshold = 0.5

y_pred = np.where(pred[:,1]>threshold, 1, 0)

print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(
    index=y_val,
    columns=y_pred,
    rownames=['True'],
    colnames=['Pred']
)

F1 score on validation set: 0.7260
accuracy score on validation set: 0.6620


Pred,0,1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
0,232,135
1,231,485


## Decision Trees

In [None]:
# Decision tree classifier in sklearn
from sklearn.tree import DecisionTreeClassifier as DTC, plot_tree

### Basic Tree

In [None]:
# The `max_depth` parameter is important for decision tree.
# We use `GridSearchCV` to select the best `max_depth`.

parameters = {'max_depth':np.arange(start=1,stop=16,step=1)}
parameters

{'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])}

In [None]:
stratifiedCV = StratifiedKFold(n_splits=10)
TreeModel = DTC(criterion='entropy')
BestTree = GridSearchCV(
    TreeModel,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV
)
BestTree.fit(X_train_SMOTE,y_train_SMOTE)

In [None]:
print(BestTree.best_estimator_)

print(BestTree.best_score_)

DecisionTreeClassifier(criterion='entropy', max_depth=15)
0.6660705509866454


In [None]:
y_pred = BestTree.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.6928
accuracy score on test set: 0.6152


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1767,1504
1,2245,4227


In [None]:
y_pred = BestTree.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validation set: 0.6946
accuracy score on validation set: 0.6159


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,194,173
1,243,473


### Gradient Boosting

In [None]:
from xgboost import XGBClassifier as XGBC

In [None]:
parameters = {
    'n_estimators':np.arange(start=2,stop=150,step=10),
    'max_depth':np.arange(start=2,stop=17,step=2),
    'learning_rate':np.arange(start=0.05,stop=0.4,step=0.05)
}

parameters

{'n_estimators': array([  2,  12,  22,  32,  42,  52,  62,  72,  82,  92, 102, 112, 122,
        132, 142]),
 'max_depth': array([ 2,  4,  6,  8, 10, 12, 14, 16]),
 'learning_rate': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35])}

In [None]:
stratifiedCV = StratifiedKFold(n_splits=10)
# XGBC: XGBoost classifier
XGBoostModel = XGBC(enable_categorical=True)
BestXGBoost = GridSearchCV(
    XGBoostModel,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1 # use all cpu cores to speedup grid search
)
BestXGBoost.fit(X_train_SMOTE,y_train_SMOTE)

Fitting 10 folds for each of 840 candidates, totalling 8400 fits


In [None]:
import pickle

file_n = 'xgboost_gridcv.pkl'
with open(file_n, 'wb') as file:
    pickle.dump(BestXGBoost, file)

In [None]:
print(BestXGBoost.best_params_)

print(BestXGBoost.best_score_)

{'learning_rate': 0.1, 'max_depth': 14, 'n_estimators': 132}
0.8017640920802431


In [None]:
y_pred = BestXGBoost.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.7854
accuracy score on test set: 0.7003


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1480,1791
1,1129,5343


In [None]:
y_pred = BestXGBoost.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validation set: 0.7888
accuracy score on validation set: 0.7082


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,177,190
1,126,590


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC

parameters = {
    'n_estimators':np.arange(start=100,stop=250,step=10),
    'max_depth':np.arange(start=2,stop=17,step=2)
}

parameters

{'n_estimators': array([100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220,
        230, 240]),
 'max_depth': array([ 2,  4,  6,  8, 10, 12, 14, 16])}

In [None]:
stratifiedCV = StratifiedKFold(n_splits=10)
# RFC: Random Forest classifier
RandForest = RFC()
BestRForest = GridSearchCV(
    RandForest,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1 # use all cpu cores to speedup grid search
)
BestRForest.fit(X_train_SMOTE,y_train_SMOTE)

file_n = 'randomForest_gridcv.pkl'
with open(file_n, 'wb') as file:
    pickle.dump(BestRForest, file)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


In [None]:
print(BestRForest.best_params_)

print(BestRForest.best_score_)

{'max_depth': 16, 'n_estimators': 220}
0.7751867233536436


In [None]:
y_pred = BestRForest.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.7674
accuracy score on test set: 0.6943


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1853,1418
1,1560,4912


In [None]:
y_pred = BestRForest.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validation set: 0.7586
accuracy score on validation set: 0.6833


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,201,166
1,177,539


In [4]:
from sklearn.ensemble import AdaBoostClassifier

parameters = {
    'n_estimators':np.arange(start=100,stop=360,step=20),
    'learning_rate':np.arange(start=0.05,stop=2.5,step=0.1)
}

parameters

{'n_estimators': array([100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300, 320, 340]),
 'learning_rate': array([0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1.05,
        1.15, 1.25, 1.35, 1.45, 1.55, 1.65, 1.75, 1.85, 1.95, 2.05, 2.15,
        2.25, 2.35, 2.45])}

In [None]:
stratifiedCV = StratifiedKFold(n_splits=10)
# RFC: Random Forest classifier
AdaBoost = AdaBoostClassifier()
BestAdBoost = GridSearchCV(
    AdaBoost,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1 # use all cpu cores to speedup grid search
)
BestAdBoost.fit(X_train_SMOTE,y_train_SMOTE)

file_n = 'adaBoost_gridcv_new.pkl'
with open(file_n, 'wb') as file:
    pickle.dump(BestAdBoost, file)

In [None]:
print(BestAdBoost.best_params_)

print(BestAdBoost.best_score_)

{'learning_rate': 1.8000000000000007, 'n_estimators': 300}
0.7877447958541812


In [None]:
y_pred = BestAdBoost.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.7909
accuracy score on test set: 0.7053


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1444,1827
1,1044,5428


In [None]:
y_pred = BestAdBoost.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validation set: 0.7815
accuracy score on validation set: 0.6953


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,163,204
1,126,590


## SVM

In [None]:
# Support Vector Classifier
from sklearn.svm import SVC

#### Linear

In [None]:
# 'C': strength of L2 regularization on linear SVM. Larger 'C' --> smaller regularization.
parameters = {
    'C':np.arange(start=0.5,stop=40,step=2)
}
parameters

{'C': array([ 0.5,  2.5,  4.5,  6.5,  8.5, 10.5, 12.5, 14.5, 16.5, 18.5, 20.5,
        22.5, 24.5, 26.5, 28.5, 30.5, 32.5, 34.5, 36.5, 38.5])}

In [None]:

stratifiedCV = StratifiedKFold(n_splits=10)
SVCModel = SVC(kernel='linear')
BestSVC = GridSearchCV(
    SVCModel,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1
)
BestSVC.fit(X_train_SMOTE,y_train_SMOTE)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [None]:
print(BestSVC.best_estimator_)

print(BestSVC.best_score_)

SVC(C=32.5, kernel='linear')
0.6762594020176682


In [None]:
y_pred = BestSVC.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.7278
accuracy score on test set: 0.6640


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2093,1178
1,2096,4376


In [None]:
y_pred = BestSVC.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validation set: 0.7217
accuracy score on validation set: 0.6574


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,231,136
1,235,481


### Non-linear SVM

#### RBF kernel

In [None]:
parameters = {
    'C':np.arange(start=0.5,stop=40,step=2)
}

stratifiedCV = StratifiedKFold(n_splits=10)
SVM_rbf = SVC(kernel='rbf')
BestSVC_rbf = GridSearchCV(
    SVM_rbf,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1
)
BestSVC_rbf.fit(X_train_SMOTE,y_train_SMOTE)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [None]:
print(BestSVC_rbf.best_estimator_)

print(BestSVC_rbf.best_score_)

SVC(C=12.5)
0.7105357242371673


In [None]:
y_pred = BestSVC_rbf.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.7151
accuracy score on test set: 0.6463


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1972,1299
1,2147,4325


In [None]:
y_pred = BestSVC_rbf.predict(X_val)
print('F1 score on validaiton set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validaiton set: 0.7131
accuracy score on validation set: 0.6464


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,224,143
1,240,476


#### Sigmoid kernel

In [None]:
parameters = {
    'C':np.arange(start=0.5,stop=40,step=2)
}

stratifiedCV = StratifiedKFold(n_splits=10)
SVM_sig = SVC(kernel='sigmoid')
BestSVC_sig = GridSearchCV(
    SVM_sig,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1
)
BestSVC_sig.fit(X_train_SMOTE,y_train_SMOTE)


Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [None]:
print(BestSVC_sig.best_estimator_)

print(BestSVC_sig.best_score_)

SVC(C=0.5, kernel='sigmoid')
0.5912651178597199


In [None]:
y_pred = BestSVC_sig.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.6489
accuracy score on test set: 0.5792


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1855,1416
1,2684,3788


In [None]:
y_pred = BestSVC_sig.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validation set: 0.6723
accuracy score on validation set: 0.6039


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,214,153
1,276,440


#### Polynomial kernel

In [None]:
parameters = {
    'C':np.arange(start=0.5,stop=40,step=2)
}

stratifiedCV = StratifiedKFold(n_splits=10)
SVM_poly = SVC(kernel='poly')
BestSVC_poly = GridSearchCV(
    SVM_poly,
    param_grid=parameters,
    scoring='f1',
    cv=stratifiedCV,
    verbose=1,
    n_jobs=-1
)
BestSVC_poly.fit(X_train_SMOTE,y_train_SMOTE)


Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [None]:
print(BestSVC_poly.best_estimator_)

print(BestSVC_poly.best_score_)

SVC(C=0.5, kernel='poly')
0.6793603770296458


In [None]:
y_pred = BestSVC_poly.predict(X_test)
print('F1 score on test set: {:.4f}'.format(f1_score(y_test,y_pred)))
print('accuracy score on test set: {:.4f}'.format(accuracy_score(y_test,y_pred)))
pd.crosstab(y_test,y_pred)

F1 score on test set: 0.7170
accuracy score on test set: 0.6557


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2137,1134
1,2221,4251


In [None]:
y_pred = BestSVC_sig.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred)))
pd.crosstab(y_val,y_pred)

F1 score on validation set: 0.6723
accuracy score on validation set: 0.6039


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,214,153
1,276,440


## Ensemble

In [None]:
y_pred_xgb = BestXGBoost.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred_xgb)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred_xgb)))
pd.crosstab(y_val,y_pred_xgb)

F1 score on validation set: 0.7888
accuracy score on validation set: 0.7082


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,177,190
1,126,590


In [None]:
y_pred_rf = BestRForest.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred_rf)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred_rf)))
pd.crosstab(y_val,y_pred_rf)

F1 score on validation set: 0.7586
accuracy score on validation set: 0.6833


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,201,166
1,177,539


In [None]:
y_pred_ab = BestAdaBoost.predict(X_val)
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred_ab)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred_ab)))
pd.crosstab(y_val,y_pred_ab)

F1 score on validation set: 0.7815
accuracy score on validation set: 0.6953


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,163,204
1,126,590


In [None]:
pred_df = pd.DataFrame({'XGB': y_pred_xgb, 'RF': y_pred_rf, 'AB': y_pred_ab})
pred_df

Unnamed: 0,XGB,RF,AB
0,0,0,0
1,1,1,1
2,1,1,1
3,1,0,0
4,1,1,1
...,...,...,...
1078,1,0,1
1079,1,1,1
1080,1,1,0
1081,1,1,1


In [None]:
y_pred_en = pred_df.mode(axis=1).squeeze()

In [None]:
print('F1 score on validation set: {:.4f}'.format(f1_score(y_val,y_pred_en)))
print('accuracy score on validation set: {:.4f}'.format(accuracy_score(y_val,y_pred_en)))
pd.crosstab(y_val,y_pred_ab)

F1 score on validation set: 0.7802
accuracy score on validation set: 0.7008


col_0,0,1
aki,Unnamed: 1_level_1,Unnamed: 2_level_1
0,163,204
1,126,590
