Scoreboard

* RandomForest
  - CV: 0.8641400684149557
  - submission: 0.7352 (508 rank)

* RF (hyper paramteter optimized)
  - CV: 0.8647371805748448
  - submission: 0.7433 (510)

In [79]:
!pip install scikit-optimize --quiet

[?25l[K     |███▎                            | 10kB 18.2MB/s eta 0:00:01[K     |██████▌                         | 20kB 23.1MB/s eta 0:00:01[K     |█████████▊                      | 30kB 15.4MB/s eta 0:00:01[K     |█████████████                   | 40kB 16.3MB/s eta 0:00:01[K     |████████████████▏               | 51kB 8.1MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 9.4MB/s eta 0:00:01[K     |██████████████████████▊         | 71kB 8.9MB/s eta 0:00:01[K     |██████████████████████████      | 81kB 9.2MB/s eta 0:00:01[K     |█████████████████████████████▏  | 92kB 9.0MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 5.2MB/s 
[?25h

In [104]:
## Imports
import joblib
import os
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import model_selection
from xgboost import XGBClassifier

from skopt import gp_minimize
from skopt import space
from functools import partial
from skopt.utils import use_named_args

In [55]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
dir = '/content/drive/MyDrive/Colab/ML-Projects/Predict-Flu-Vaccine/'

test = pd.read_csv(dir + 'test_set_features.csv')
train = pd.read_csv(dir + 'training_set_features.csv')
train_labels = pd.read_csv(dir + 'training_set_labels.csv')

train_folds_h1n1 = pd.read_csv(dir + 'train_folds_h1n1.csv')
train_folds_seasonal = pd.read_csv(dir + 'train_folds_seasonal.csv')

imputed_test = pd.read_csv(dir + 'imputed_test.csv')

predictions = pd.DataFrame()
predictions['respondent_id'] = test.respondent_id

In [57]:
train_folds_h1n1.shape

(26707, 60)

In [114]:
models = {
    "LR": LogisticRegression(),
    "decision_tree_gini": tree.DecisionTreeClassifier(criterion='gini'),
    "decision_tree_entropy": tree.DecisionTreeClassifier(criterion='entropy'),
    "rf": RandomForestClassifier(),
    "rf-optimized": RandomForestClassifier(
        max_depth= 15,
        n_estimators= 1500,
        criterion= 'entropy',
        max_features= 1.0
    ),
    "xgb": XGBClassifier()
}

In [115]:
num_folds = 5

def run_fold(fold, df, target, model):
  df_train = df[df.kfold != fold].reset_index(drop=True)
  df_valid = df[df.kfold == fold].reset_index(drop=True)

  x_train = df_train.drop([target, 'kfold'], axis=1).values
  y_train = df_train[target].values

  x_valid = df_valid.drop([target, 'kfold'], axis=1).values
  y_valid = df_valid[target].values

  clf = models[model]
  clf.fit(x_train, y_train)

  preds = clf.predict(x_valid)
  if model == 'xgb':
    preds = [round(value) for value in preds]
    score = roc_auc_score(y_valid, preds, average='micro')
  else:  
    score = roc_auc_score(y_valid, clf.predict_proba(x_valid)[:, 1], average='micro')

  print(f"{model} - Fold {fold}, roc_auc_score {score}")
  joblib.dump(clf, os.path.join(dir + f"/models/", f"{model}_{fold}.bin"))

  if fold == num_folds-1:
    predictions[target] = np.float32(clf.predict_proba(imputed_test))

In [28]:
for i in range(num_folds):
  run_fold(i, train_folds_h1n1, 'h1n1_vaccine', 'decision_tree_gini')

decision_tree_gini - Fold 0, roc_auc_score 0.6749757327047746
decision_tree_gini - Fold 1, roc_auc_score 0.6672839163592459
decision_tree_gini - Fold 2, roc_auc_score 0.6846100121197182
decision_tree_gini - Fold 3, roc_auc_score 0.6869695274843364
decision_tree_gini - Fold 4, roc_auc_score 0.6712225664615893


In [116]:
model = 'rf'
for i in range(num_folds):
  run_fold(i, train_folds_h1n1, 'h1n1_vaccine', model)
  run_fold(i, train_folds_seasonal, 'seasonal_vaccine', model)

predictions.to_csv(f"{model}_submission.csv", index=False)  

rf - Fold 0, roc_auc_score 0.8554836757282022
rf - Fold 0, roc_auc_score 0.8482143071516833
rf - Fold 1, roc_auc_score 0.8572611831131038
rf - Fold 1, roc_auc_score 0.8602139470531662
rf - Fold 2, roc_auc_score 0.8595977184242773
rf - Fold 2, roc_auc_score 0.8550589834807955
rf - Fold 3, roc_auc_score 0.8643446220105115
rf - Fold 3, roc_auc_score 0.8489344570981596
rf - Fold 4, roc_auc_score 0.8538177053548424
rf - Fold 4, roc_auc_score 0.8483289785587247


In [None]:
model = 'rf-optimized'
for i in range(num_folds):
  run_fold(i, train_folds_h1n1, 'h1n1_vaccine', model)
  run_fold(i, train_folds_seasonal, 'seasonal_vaccine', model)

predictions.to_csv(f"{model}_submission.csv", index=False)  

In [82]:
# model = 'xgb'
# for i in range(num_folds):
#   run_fold(i, train_folds_h1n1, 'h1n1_vaccine', model)
#   run_fold(i, train_folds_seasonal, 'seasonal_vaccine', model)

# predictions.to_csv(f"{model}_submission.csv", index=False)  

In [108]:
def optimize(params, param_names, x, y):
    params = dict((zip(param_names, params)))

    model = RandomForestClassifier(**params)

    kf = model_selection.StratifiedKFold(n_splits=5)
    roc_auc_scores = []
    
    for idx in kf.split(X=x, y=y):
      train_idx, test_idx = idx[0], idx[1]
      xtrain = x[train_idx]
      ytrain = y[train_idx]

      xtest = x[test_idx]
      ytest = y[test_idx]

      model.fit(xtrain, ytrain)

      preds = model.predict(xtest)

      score = roc_auc_score(ytest, preds)
      roc_auc_scores.append(score)

    return -1*np.mean(roc_auc_scores)  

In [109]:
df = train_folds_h1n1
target = 'h1n1_vaccine' 

X = df.drop([target, 'kfold'], axis=1).values
y = df[target].values

param_space = [
    space.Integer(3, 15, name="max_depth"),
    space.Integer(100, 1500, name="n_estimators"),
    space.Categorical(["gini", "entropy"], name="criterion"),
    space.Real(0.01, 1, prior="uniform", name="max_features")
]

param_names = [
    "max_depth",
    "n_estimators",
    "criterion",
    "max_features"
]

optimization_function = partial(optimize, param_names=param_names, x=X, y=y)

result = gp_minimize(
    optimization_function,
    dimensions=param_space,
    n_calls=15,
    n_random_starts=10,
    verbose=10
)

best_params = dict(zip(param_names, result.x))

print(best_params)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 665.7040
Function value obtained: -0.7187
Current minimum: -0.7187
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 515.1678
Function value obtained: -0.7148
Current minimum: -0.7187
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 291.7896
Function value obtained: -0.7181
Current minimum: -0.7187
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 210.8257
Function value obtained: -0.7169
Current minimum: -0.7187
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 119.6853
Function value obtained: -0.7099
Current minimum: -0.7187
Iteration No: 6

In [131]:
!cd drive/MyDrive

In [126]:
!git add .

fatal: not a git repository (or any of the parent directories): .git


In [132]:
ls

[0m[01;34mdrive[0m/  rf_submission.csv  RF_submission.csv  [01;34msample_data[0m/
