In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb

import optuna
from optuna import Trial, visualization


  from pandas import MultiIndex, Int64Index


In [2]:
!nvidia-smi

Sun Apr 24 21:28:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   49C    P8     4W /  N/A |      0MiB /  6144MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
data = pd.read_csv("../Data/train_features_computed_tabular.csv")
labels = pd.read_csv("../Data/train_labels.csv")
data = data.merge(labels, on="sequence", how="left")
X, y = data.drop(["sequence", "state"], axis=1).values, data["state"].values


In [4]:
scaler = StandardScaler()
model = scaler.fit(X)
X = model.transform(X)


In [5]:
test = pd.read_csv("../Data/test_features_computed_tabular.csv")
test.drop("sequence", axis=1, inplace=True)
test = scaler.transform(test.values)


In [5]:
def Objective(trial):

    param = {
        "tree_method": "gpu_hist",  # this parameter means using the GPU when training our model to speedup the training process
        "lambda": trial.suggest_loguniform("lambda", 1e-3, 10.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-3, 10.0),
        "colsample_bytree": trial.suggest_categorical(
            "colsample_bytree", [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        ),
        "subsample": trial.suggest_categorical(
            "subsample", [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
        ),
        "learning_rate": trial.suggest_categorical(
            "learning_rate", [0.008, 0.009, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]
        ),
        "n_estimators": 4000,
        "max_depth": trial.suggest_categorical(
            "max_depth", [5, 7, 9, 11, 13, 15, 17, 20]
        ),
        "random_state": trial.suggest_categorical("random_state", [24, 48, 2020]),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
    }

    kf = KFold(n_splits=3, random_state=42, shuffle=True)
    CV_score_array = []
    for train_index, test_index in kf.split(X):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]
        clf = xgb.XGBClassifier(**param, use_label_encoder=False)
        clf.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
        )

        preds = clf.predict(X_valid)
        auc = roc_auc_score(y_valid, preds)

        CV_score_array.append(auc)
    avg = np.mean(CV_score_array)
    return avg


In [6]:
study = optuna.create_study(direction="maximize", study_name="xgb optimization")
study.optimize(Objective, timeout=6 * 60)  # 5 hours

best = study.best_params
print("The best parameters are ", best)


[32m[I 2022-04-12 13:26:15,293][0m A new study created in memory with name: xgb optimization[0m


[0]	validation_0-logloss:0.68897
[1]	validation_0-logloss:0.68490
[2]	validation_0-logloss:0.68087
[3]	validation_0-logloss:0.67691
[4]	validation_0-logloss:0.67334
[5]	validation_0-logloss:0.66957
[6]	validation_0-logloss:0.66588
[7]	validation_0-logloss:0.66236
[8]	validation_0-logloss:0.65889
[9]	validation_0-logloss:0.65537
[10]	validation_0-logloss:0.65198
[11]	validation_0-logloss:0.64867
[12]	validation_0-logloss:0.64538
[13]	validation_0-logloss:0.64227
[14]	validation_0-logloss:0.63915
[15]	validation_0-logloss:0.63609
[16]	validation_0-logloss:0.63312
[17]	validation_0-logloss:0.63018
[18]	validation_0-logloss:0.62736
[19]	validation_0-logloss:0.62453
[20]	validation_0-logloss:0.62174
[21]	validation_0-logloss:0.61910
[22]	validation_0-logloss:0.61643
[23]	validation_0-logloss:0.61382
[24]	validation_0-logloss:0.61130
[25]	validation_0-logloss:0.60885
[26]	validation_0-logloss:0.60639
[27]	validation_0-logloss:0.60402
[28]	validation_0-logloss:0.60179
[29]	validation_0-loglos

[32m[I 2022-04-12 13:27:34,534][0m Trial 0 finished with value: 0.836301238022444 and parameters: {'lambda': 0.015601071351228516, 'alpha': 4.077946373932309, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 24, 'min_child_weight': 40}. Best is trial 0 with value: 0.836301238022444.[0m


[0]	validation_0-logloss:0.68801
[1]	validation_0-logloss:0.68348
[2]	validation_0-logloss:0.67851
[3]	validation_0-logloss:0.67375
[4]	validation_0-logloss:0.66896
[5]	validation_0-logloss:0.66438
[6]	validation_0-logloss:0.65990
[7]	validation_0-logloss:0.65575
[8]	validation_0-logloss:0.65153
[9]	validation_0-logloss:0.64770
[10]	validation_0-logloss:0.64397
[11]	validation_0-logloss:0.64006
[12]	validation_0-logloss:0.63611
[13]	validation_0-logloss:0.63257
[14]	validation_0-logloss:0.62902
[15]	validation_0-logloss:0.62557
[16]	validation_0-logloss:0.62215
[17]	validation_0-logloss:0.61871
[18]	validation_0-logloss:0.61541
[19]	validation_0-logloss:0.61213
[20]	validation_0-logloss:0.60887
[21]	validation_0-logloss:0.60570
[22]	validation_0-logloss:0.60276
[23]	validation_0-logloss:0.59982
[24]	validation_0-logloss:0.59706
[25]	validation_0-logloss:0.59446
[26]	validation_0-logloss:0.59161
[27]	validation_0-logloss:0.58900
[28]	validation_0-logloss:0.58652
[29]	validation_0-loglos

[32m[I 2022-04-12 13:30:55,299][0m Trial 1 finished with value: 0.8419950300706085 and parameters: {'lambda': 0.6783614934699764, 'alpha': 1.7965845621375391, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.014, 'max_depth': 20, 'random_state': 2020, 'min_child_weight': 19}. Best is trial 1 with value: 0.8419950300706085.[0m


[0]	validation_0-logloss:0.68966
[1]	validation_0-logloss:0.68622
[2]	validation_0-logloss:0.68289
[3]	validation_0-logloss:0.67962
[4]	validation_0-logloss:0.67641
[5]	validation_0-logloss:0.67325
[6]	validation_0-logloss:0.67017
[7]	validation_0-logloss:0.66740
[8]	validation_0-logloss:0.66444
[9]	validation_0-logloss:0.66166
[10]	validation_0-logloss:0.65871
[11]	validation_0-logloss:0.65587
[12]	validation_0-logloss:0.65310
[13]	validation_0-logloss:0.65041
[14]	validation_0-logloss:0.64768
[15]	validation_0-logloss:0.64519
[16]	validation_0-logloss:0.64254
[17]	validation_0-logloss:0.63989
[18]	validation_0-logloss:0.63737
[19]	validation_0-logloss:0.63486
[20]	validation_0-logloss:0.63239
[21]	validation_0-logloss:0.62998
[22]	validation_0-logloss:0.62767
[23]	validation_0-logloss:0.62528
[24]	validation_0-logloss:0.62300
[25]	validation_0-logloss:0.62079
[26]	validation_0-logloss:0.61864
[27]	validation_0-logloss:0.61661
[28]	validation_0-logloss:0.61447
[29]	validation_0-loglos

[32m[I 2022-04-12 13:32:24,293][0m Trial 2 finished with value: 0.8366935505789516 and parameters: {'lambda': 0.03039710666087754, 'alpha': 0.006540356765704074, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 11, 'random_state': 2020, 'min_child_weight': 95}. Best is trial 1 with value: 0.8419950300706085.[0m


The best parameters are  {'lambda': 0.6783614934699764, 'alpha': 1.7965845621375391, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.014, 'max_depth': 20, 'random_state': 2020, 'min_child_weight': 19}


### Fit the above output

In [6]:
clf = xgb.XGBClassifier(reg_lambda = 0.6783614934699764, alpha = 1.7965845621375391, colsample_bytree = 0.6, subsample= 0.6, learning_rate= 0.014, max_depth= 20, random_state = 2020, min_child_weight= 19, use_label_encoder=False)
clf.fit(
    X,
    y,
)

preds = clf.predict(test)



In [15]:
sub = pd.read_csv('../Data/sample_submission.csv')
sub['state'] = preds
sub.to_csv('../Submissions/xgb_sub.csv', index = False)