In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import VotingClassifier
from tqdm import tqdm
import matplotlib.pyplot as plt
from hyperopt import fmin, STATUS_OK, tpe, Trials, hp, space_eval
from boruta import BorutaPy

In [2]:
# Never truncate rows when a DataFrame/Series is displayed.
pd.options.display.max_rows = None

In [3]:
# The Kaggle dataset lives at an absolute location, so build the path directly.
# (Joining an absolute segment onto Path().cwd() discards the cwd part, so the
# previous spelling resolved to this same directory while suggesting otherwise.)
base_path = Path('/kaggle/input/molecular-descriptors/')
train_df = pd.read_csv(base_path / 'train_molecular_data.csv')
test_df = pd.read_csv(base_path / 'test_molecular_data.csv')

In [5]:
# Replace every missing value with 0 in both frames, then show the per-column
# null count of the training frame to confirm nothing is left.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)
train_df.isnull().sum()

Assay Id                    0
MaxEStateIndex              0
MinEStateIndex              0
MaxAbsEStateIndex           0
MinAbsEStateIndex           0
qed                         0
MolWt                       0
HeavyAtomMolWt              0
ExactMolWt                  0
NumValenceElectrons         0
NumRadicalElectrons         0
MaxPartialCharge            0
MinPartialCharge            0
MaxAbsPartialCharge         0
MinAbsPartialCharge         0
FpDensityMorgan1            0
FpDensityMorgan2            0
FpDensityMorgan3            0
BCUT2D_MWHI                 0
BCUT2D_MWLOW                0
BCUT2D_CHGHI                0
BCUT2D_CHGLO                0
BCUT2D_LOGPHI               0
BCUT2D_LOGPLOW              0
BCUT2D_MRHI                 0
BCUT2D_MRLOW                0
BalabanJ                    0
BertzCT                     0
Chi0                        0
Chi0n                       0
Chi0v                       0
Chi1                        0
Chi1n                       0
Chi1v     

In [7]:
# Separate the target column ('Expected') from the remaining columns.
train_X = train_df.drop(columns=['Expected'])
train_y = train_df['Expected']

In [8]:
# 5-fold stratified splitter, shuffled with a fixed seed so the folds are
# reproducible across the CV runs that pass `cv=skf`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

In [9]:
# Encode the raw target labels as integers; the fitted encoder is kept so that
# predictions can be mapped back with `lab.inverse_transform`.
lab = LabelEncoder().fit(train_y)
train_yT = lab.transform(train_y)

In [16]:
# Default-hyper-parameter GPU XGBoost model, used as the Boruta base estimator.
xgb = XGBClassifier(tree_method='gpu_hist', n_jobs=-1)

In [19]:
# Boruta feature selector wrapped around the XGBoost model: up to 200
# iterations, fixed seed, per-iteration progress printed (verbose=2).
feat_select = BorutaPy(
    xgb,
    n_estimators='auto',
    max_iter=200,
    random_state=10,
    verbose=2,
)

In [20]:
# BorutaPy is given a plain NumPy array here, not the DataFrame itself.
feat_select.fit(X=train_X.to_numpy(), y=train_yT)

Iteration: 	1 / 200
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	2 / 200
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	3 / 200
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	4 / 200
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	5 / 200
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	6 / 200
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	7 / 200
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	8 / 200
Confirmed: 	0
Tentative: 	124
Rejected: 	85
Iteration: 	9 / 200
Confirmed: 	6
Tentative: 	118
Rejected: 	85
Iteration: 	10 / 200
Confirmed: 	6
Tentative: 	118
Rejected: 	85
Iteration: 	11 / 200
Confirmed: 	6
Tentative: 	118
Rejected: 	85
Iteration: 	12 / 200
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	13 / 200
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	14 / 200
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	15 / 200
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	16 / 200
Confirmed: 	11
T

BorutaPy(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                 callbacks=None, colsample_bylevel=1,
                                 colsample_bynode=1, colsample_bytree=1,
                                 early_stopping_rounds=None,
                                 enable_categorical=False, eval_metric=None,
                                 gamma=0, gpu_id=0, grow_policy='depthwise',
                                 importance_type=None,
                                 interaction_constraints='',
                                 learning_rate=0.300000012, max_bin=256,
                                 max_cat_to_onehot=4, max_delta_step=0,
                                 max_depth=6, max_leaves=0, min_child_weight=1,
                                 missing=nan, monotone_constraints='()',
                                 n_estimators=206, n_jobs=-1,
                                 num_parallel_tree=1, predictor='auto',
                           

In [21]:
feat_select.ranking_

array([  1,  48,   1, 130,  68,  36,  77,   1,   1,  10,   2,  36,   1,
         1,   2,   1,   1,  40,  53,   1,  75,   1,   1,  13,  63,  32,
        64,   1,   1,   1,   1,   2,   1,   1,  33,  42,   1,  25,  78,
        48,   1,   1,  57,   1,   1,  48,  65,  39,  34,  55,   1,   1,
        63,   1,  59,   2,  52,  45,  73,  83,  95,   1,   2,  48,  30,
        71,   1,   1, 130,  94,  58,   2,  60,  72,   1,  27,  74,   1,
        48,  27,  91, 130,   1,  81,  92, 119,  86,  30,  87,  69,  68,
        90,   1,  52,   2,   1,   1,   1,   1,   1,   1,  57,  66,   1,
         1, 104, 101,  88,   1,  96, 105,   1,  23,   1,  98,  97,  54,
        71,   1,  83,  81,  79,   1,   1,  61, 101,  99,   1,  94,  15,
        23,  23,   1,  11,   5,   5,   1, 114,  39,   2,   1,   1, 119,
       114,  16, 130,  42,   1, 109,  12,  85,  44, 106,   1,  89, 130,
        83, 130,  30, 130,  23, 130, 130,   8,  42,   2, 109, 107,   1,
       114, 130,   3,  19, 130, 130,   3,  77, 130,   1, 102, 13

In [22]:
# Pair every training column name with the rank Boruta assigned to it.
combined_dict = dict(
    features=list(train_X.columns),
    rankings=feat_select.ranking_,
)
feature_ranks = pd.DataFrame(combined_dict)

In [24]:
feature_ranks

Unnamed: 0,features,rankings
0,Assay Id,1
1,MaxEStateIndex,48
2,MinEStateIndex,1
3,MaxAbsEStateIndex,130
4,MinAbsEStateIndex,68
5,qed,36
6,MolWt,77
7,HeavyAtomMolWt,1
8,ExactMolWt,1
9,NumValenceElectrons,10


In [29]:
# Persist the ranking table (without the index) to the working directory.
feature_ranks.to_csv(path_or_buf='feature_ranks_v2.csv', index=False)

In [10]:
# Load a stored ranking table from the Kaggle input directory.
# NOTE(review): presumably the feature_ranks_v2.csv written earlier and
# re-uploaded as a dataset — confirm.
feature_ranks = pd.read_csv(
    filepath_or_buffer='/kaggle/input/selected-features/feature_ranks_v2.csv'
)

In [11]:
# Order the features from best (lowest) rank to worst and renumber the rows.
feature_ranks = feature_ranks.sort_values('rankings').reset_index(drop=True)

In [12]:
feature_ranks

Unnamed: 0,features,rankings
0,Assay Id,1
1,fr_NH1,1
2,fr_C_S,1
3,fr_COO,1
4,fr_ArN,1
5,MolMR,1
6,MolLogP,1
7,NumAromaticRings,1
8,Kappa2,1
9,FractionCSP3,1


In [13]:
# Training columns restricted to the features whose `rankings` value is <= 10.
train_fs = train_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].le(10), 'features']]

In [14]:
train_fs.head()

Unnamed: 0,Assay Id,fr_NH1,fr_C_S,fr_COO,fr_ArN,MolMR,MolLogP,NumAromaticRings,Kappa2,FractionCSP3,...,fr_NH0,fr_imidazole,fr_ketone,fr_C_O_noCOO,fr_nitro,fr_C_O,fr_unbrch_alkane,fr_epoxide,fr_nitro_arom,NumValenceElectrons
0,1644,0,0,0,0,78.3466,4.5999,2,5.494957,0.142857,...,0,0,0,0,0,0,0,0,0,100
1,2451,0,0,0,0,48.674,3.326,0,7.773413,0.9,...,0,0,1,1,0,1,4,0,0,66
2,1384,0,0,0,0,107.0624,4.3482,0,19.573095,1.0,...,1,0,0,0,0,0,12,0,0,148
3,16,1,0,0,0,62.0891,0.6879,1,4.960836,0.333333,...,4,0,0,0,1,0,0,0,0,90
4,1856,0,0,0,0,0.0,-5.992,0,2.73,0.0,...,0,0,0,0,0,0,0,0,0,8


In [15]:
# Test columns restricted to the features whose `rankings` value is <= 10.
test_fs = test_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].le(10), 'features']]

In [16]:
# Settings shared by every XGBClassifier built from `**params`
# (max_bin is left at the library default).
params = dict(tree_method='gpu_hist', n_jobs=-1)

In [69]:
# Single XGBoost candidate, scored by mean F1 over the `skf` folds.
xgb_x = XGBClassifier(
    n_estimators=600,
    max_depth=15,
    learning_rate=0.13,
    subsample=0.96,
    colsample_bytree=0.63,
    gamma=0.015,
    **params,
)
scores = cross_validate(xgb_x, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9480115632845866


In [17]:
# Per-member overrides for a six-model XGBoost ensemble; each entry is merged
# with the shared `params` when the model is constructed.
member_overrides = [
    dict(n_estimators=500),
    dict(n_estimators=500, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=450, max_depth=12, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=650, max_depth=10, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=500, max_depth=10, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=800, learning_rate=0.13, colsample_bytree=0.7),
]
ensemble_members = [XGBClassifier(**params, **overrides) for overrides in member_overrides]
xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6 = ensemble_members

# Named (label, estimator) pairs in the same order as `weights` below.
classifier_list = [
    (f'xgb_{position}', model)
    for position, model in enumerate(ensemble_members, start=1)
]
# Soft voting: weighted average of predicted probabilities; xgb_2 counts double.
voting_clf = VotingClassifier(
    classifier_list,
    voting='soft',
    weights=[1, 2, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [18]:
# Mean F1 of the voting ensemble over the `skf` folds.
scores = cross_validate(voting_clf, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9500678367549013
[Voting] .................... (2 of 6) Processing xgb_2, total=  23.1s
[Voting] .................... (1 of 6) Processing xgb_1, total=  23.2s
[Voting] .................... (4 of 6) Processing xgb_4, total= 1.4min
[Voting] .................... (3 of 6) Processing xgb_3, total= 1.6min
[Voting] .................... (6 of 6) Processing xgb_6, total=  25.2s
[Voting] .................... (5 of 6) Processing xgb_5, total=  49.5s
[Voting] .................... (2 of 6) Processing xgb_2, total=  14.7s
[Voting] .................... (1 of 6) Processing xgb_1, total=  15.1s
[Voting] .................... (4 of 6) Processing xgb_4, total= 1.4min
[Voting] .................... (3 of 6) Processing xgb_3, total= 1.6min
[Voting] .................... (6 of 6) Processing xgb_6, total=  24.6s
[Voting] .................... (5 of 6) Processing xgb_5, total=  47.9s
[Voting] .................... (2 of 6) Processing xgb_2, total=  23.1s
[Voting] .................... (1 of 6) Processing xgb_1, t

In [19]:
# Refit the ensemble on the full (feature-selected) training set.
voting_clf.fit(X=train_fs, y=train_yT)

VotingClassifier(estimators=[('xgb_1',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=None,
                                            gpu_id=None, grow_policy=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None, max_bin=N...
                                            interaction_constraints=None,
                                          

In [23]:
# Encoded class predictions for the test set.
y_pred = voting_clf.predict(X=test_fs)

In [24]:
# Map the encoded predictions back to the original labels and write them out
# next to the ids read from test_II.csv.
predT = lab.inverse_transform(y_pred)
print(predT)
final_df = pd.read_csv("/kaggle/input/initial-data/test_II.csv").assign(Predicted=predT)
# NOTE(review): assumes test_II.csv holds exactly one column, which is renamed to 'Id' — confirm.
final_df.columns = ['Id', 'Predicted']
final_df.to_csv('submission44.csv', index=False)

[2 2 2 ... 2 2 2]


In [30]:
feature_ranks

Unnamed: 0,features,rankings
0,Assay Id,1
1,fr_NH1,1
2,fr_C_S,1
3,fr_COO,1
4,fr_ArN,1
5,MolMR,1
6,MolLogP,1
7,NumAromaticRings,1
8,Kappa2,1
9,FractionCSP3,1


In [31]:
# Widen the feature subset: keep every feature whose `rankings` value is <= 99.
train_fs = train_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].le(99), 'features']]
test_fs = test_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].le(99), 'features']]

In [60]:
# Hold out a stratified 25% of the selected-feature training data; the
# held-out part serves as the early-stopping evaluation set in `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    train_fs,
    train_yT,
    test_size=0.25,
    stratify=train_yT,
    random_state=10,
)
# Fixed XGBoost settings applied to every candidate the tuner evaluates.
standard_params = dict(
    n_jobs=-1,
    tree_method='gpu_hist',
    booster='gbtree',
    max_bin=256,
)
def objective(space):
    """Hyperopt objective: negated mean CV F1 of an XGBoost model built from `space`.

    Parameters
    ----------
    space : mapping
        Sampled hyper-parameters, forwarded as keyword arguments to
        ``XGBClassifier`` together with the global ``standard_params``.

    Returns
    -------
    dict
        ``{'loss': -mean_f1, 'status': STATUS_OK}``; the score is negated
        because it is reported as a loss.

    Notes
    -----
    Reads the globals ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
    ``skf``. Every fold is fitted with the same ``(X_val, y_val)`` evaluation
    set, which is what the 10-round early stopping on "error" monitors.
    """
    candidate = XGBClassifier(
        eval_metric="error",
        early_stopping_rounds=10,
        **standard_params,
        **space,
    )
    fold_f1 = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='f1',
        cv=skf,
        fit_params={'eval_set': [[X_val, y_val]], 'verbose': False},
    )
    return {'loss': -fold_f1.mean(), 'status': STATUS_OK}
# Search space: tree count, learning rate, depth, row/column sampling ratios
# and min_child_weight (gamma, reg_alpha and reg_lambda are not searched here).
hyper_space = dict(
    n_estimators=hp.randint('n_estimators', 100, 1000),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    max_depth=hp.randint('max_depth', 3, 18),
    subsample=hp.uniform('subsample', 0.5, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
)
# 100 TPE evaluations of `objective`; `trials` collects the per-evaluation history.
trials = Trials()
best_params = fmin(objective, hyper_space, algo=tpe.suggest, max_evals=100, trials=trials)

100%|██████████| 100/100 [30:50<00:00, 18.51s/trial, best loss: -0.9466631434360927]


In [61]:
best_params

{'colsample_bytree': 0.5280605127857014,
 'learning_rate': 0.1380537386450915,
 'max_depth': 15,
 'min_child_weight': 7.0,
 'n_estimators': 905,
 'subsample': 0.9892366661343831}

In [70]:
# Single XGBoost candidate, scored by mean F1 over the `skf` folds.
xgb_x = XGBClassifier(
    n_estimators=600,
    max_depth=10,
    learning_rate=0.14,
    subsample=0.99,
    colsample_bytree=0.63,
    **params,
)
scores = cross_validate(xgb_x, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9491659531456745


In [72]:
# Per-member overrides for an eight-model XGBoost ensemble; each entry is
# merged with the shared `params` when the model is constructed.
member_overrides = [
    dict(n_estimators=500),
    dict(n_estimators=700, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=450, max_depth=12, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=800, max_depth=15, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=800, max_depth=12, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=700, max_depth=10, learning_rate=0.2, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=0.3),
    dict(n_estimators=700, max_depth=15, learning_rate=0.14, subsample=0.99, colsample_bytree=0.53, min_child_weight=7.0),
    dict(n_estimators=600, max_depth=10, learning_rate=0.14, subsample=0.99, colsample_bytree=0.63),
]
ensemble_members = [XGBClassifier(**params, **overrides) for overrides in member_overrides]
xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8 = ensemble_members

# Named (label, estimator) pairs in the same order as `weights` below.
classifier_list = [
    (f'xgb_{position}', model)
    for position, model in enumerate(ensemble_members, start=1)
]
# Soft voting; xgb_3 and xgb_5 count double.
voting_clf = VotingClassifier(
    classifier_list,
    voting='soft',
    weights=[1, 1, 2, 1, 2, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [73]:
# Mean F1 of the voting ensemble over the `skf` folds.
scores = cross_validate(voting_clf, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

[Voting] .................... (1 of 8) Processing xgb_1, total=  22.6s
[Voting] .................... (2 of 8) Processing xgb_2, total=  28.2s
[Voting] .................... (4 of 8) Processing xgb_4, total=  57.3s
[Voting] .................... (5 of 8) Processing xgb_5, total=  46.0s
[Voting] .................... (3 of 8) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 8) Processing xgb_6, total=  52.8s
[Voting] .................... (7 of 8) Processing xgb_7, total= 2.1min
[Voting] .................... (8 of 8) Processing xgb_8, total= 1.5min
[Voting] .................... (1 of 8) Processing xgb_1, total=  20.5s
[Voting] .................... (2 of 8) Processing xgb_2, total=  25.6s
[Voting] .................... (4 of 8) Processing xgb_4, total=  56.0s
[Voting] .................... (5 of 8) Processing xgb_5, total=  46.2s
[Voting] .................... (3 of 8) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 8) Processing xgb_6, total=  53.3s
[Votin



[Voting] .................... (1 of 8) Processing xgb_1, total=  22.4s
[Voting] .................... (2 of 8) Processing xgb_2, total=  28.5s
[Voting] .................... (4 of 8) Processing xgb_4, total=  57.3s
[Voting] .................... (5 of 8) Processing xgb_5, total=  45.4s
[Voting] .................... (3 of 8) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 8) Processing xgb_6, total=  53.6s
[Voting] .................... (8 of 8) Processing xgb_8, total= 1.5min
[Voting] .................... (7 of 8) Processing xgb_7, total= 2.1min
[Voting] .................... (1 of 8) Processing xgb_1, total=  21.2s
[Voting] .................... (2 of 8) Processing xgb_2, total=  28.2s
[Voting] .................... (4 of 8) Processing xgb_4, total=  57.1s
[Voting] .................... (5 of 8) Processing xgb_5, total=  46.7s
[Voting] .................... (3 of 8) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 8) Processing xgb_6, total=  51.4s
[Votin

In [75]:
def pred_and_sub(classifier, sub_no):
    """Fit `classifier` on the full training set and write a submission CSV.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X=..., y=...)`` and ``predict(...)``.
    sub_no
        Value interpolated into the output file name ``submission{sub_no}.csv``.

    Notes
    -----
    Reads the globals ``train_fs``, ``train_yT``, ``test_fs`` and ``lab``.
    Calls ``classifier.fit`` on the training data, prints the encoded and the
    decoded test predictions, and writes the CSV to the working directory.
    Returns ``None``.
    """
    classifier.fit(X=train_fs, y=train_yT)
    encoded = classifier.predict(test_fs)
    print(encoded)
    decoded = lab.inverse_transform(encoded)
    print(decoded)
    submission = pd.read_csv("/kaggle/input/initial-data/test_II.csv").assign(Predicted=decoded)
    # NOTE(review): assumes test_II.csv holds exactly one column, which is renamed to 'Id' — confirm.
    submission.columns = ['Id', 'Predicted']
    submission.to_csv(f'submission{sub_no}.csv', index=False)

In [76]:
# Refit the current ensemble on all training rows and write submission45.csv.
pred_and_sub(classifier=voting_clf, sub_no=45)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
[Voting] .................... (1 of 8) Processing xgb_1, total=  11.4s
[Voting] .................... (2 of 8) Processing xgb_2, total=  13.7s
[Voting] .................... (4 of 8) Processing xgb_4, total=  22.6s
[Voting] .................... (5 of 8) Processing xgb_5, total=  17.2s
[Voting] .................... (3 of 8) Processing xgb_3, total=  48.7s
[Voting] .................... (6 of 8) Processing xgb_6, total=  20.5s
[Voting] .................... (8 of 8) Processing xgb_8, total=  33.7s
[Voting] .................... (7 of 8) Processing xgb_7, total=  48.8s
[Voting] .................... (1 of 8) Processing xgb_1, total=   7.5s
[Voting] .................... (3 of 8) Processing xgb_3, total= 1.1min
[Voting] .................... (6 of 8) Processing xgb_6, total=  30.8s
[Voting] .................... (8 of 8) Processing xgb_8, total=  48.3s
[Voting] .................... (2 of 8) Processing xgb_2, total=  18.0s
[Voting] .................... (4 of 8) Pr

In [77]:
# Per-member overrides for a twelve-model XGBoost ensemble; each entry is
# merged with the shared `params` when the model is constructed.
member_overrides = [
    dict(n_estimators=500),
    dict(n_estimators=700, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=450, max_depth=12, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=800, max_depth=15, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=800, max_depth=12, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=700, max_depth=10, learning_rate=0.2, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=0.3),
    dict(n_estimators=700, max_depth=15, learning_rate=0.14, subsample=0.99, colsample_bytree=0.53, min_child_weight=7.0),
    dict(n_estimators=600, max_depth=10, learning_rate=0.14, subsample=0.99, colsample_bytree=0.63),
    dict(n_estimators=700, max_depth=8, learning_rate=0.14),
    dict(n_estimators=600, max_depth=10, learning_rate=0.2),
    dict(n_estimators=700, max_depth=12, learning_rate=0.14, subsample=0.99, colsample_bytree=0.7),
    dict(n_estimators=650, max_depth=10, learning_rate=0.2),
]
ensemble_members = [XGBClassifier(**params, **overrides) for overrides in member_overrides]
(xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6,
 xgb_7, xgb_8, xgb_9, xgb_10, xgb_11, xgb_12) = ensemble_members

# Named (label, estimator) pairs in the same order as `weights` below.
classifier_list = [
    (f'xgb_{position}', model)
    for position, model in enumerate(ensemble_members, start=1)
]
# Soft voting; xgb_3 and xgb_5 count double.
voting_clf_2 = VotingClassifier(
    classifier_list,
    voting='soft',
    weights=[1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [78]:
# Mean F1 of the twelve-model ensemble over the `skf` folds.
scores = cross_validate(voting_clf_2, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9503831590649063


# **Best Submission**

In [79]:
# Refit the twelve-model ensemble on all training rows and write submission46.csv.
pred_and_sub(classifier=voting_clf_2, sub_no=46)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
[Voting] ................... (1 of 12) Processing xgb_1, total=  27.4s
[Voting] ................... (2 of 12) Processing xgb_2, total=  33.1s
[Voting] ................... (4 of 12) Processing xgb_4, total=  57.0s
[Voting] ................... (5 of 12) Processing xgb_5, total=  45.4s
[Voting] ................... (3 of 12) Processing xgb_3, total= 2.0min
[Voting] ................... (6 of 12) Processing xgb_6, total=  52.5s
[Voting] ................... (7 of 12) Processing xgb_7, total= 2.1min
[Voting] ................... (8 of 12) Processing xgb_8, total= 1.5min
[Voting] ................... (9 of 12) Processing xgb_9, total= 1.1min
[Voting] ................. (10 of 12) Processing xgb_10, total= 1.6min
[Voting] ................. (12 of 12) Processing xgb_12, total= 1.7min
[Voting] ................. (11 of 12) Processing xgb_11, total= 2.4min
[Voting] ................... (1 of 12) Processing xgb_1, total=  21.2s
[Voting] ................... (2 of 12) Pr

In [80]:
# Narrow the feature subset: keep every feature whose `rankings` value is <= 75.
train_fs = train_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].le(75), 'features']]
test_fs = test_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].le(75), 'features']]

In [81]:
# Hold out a stratified 25% of the selected-feature training data; the
# held-out part serves as the early-stopping evaluation set in `objective`.
X_train, X_val, y_train, y_val = train_test_split(
    train_fs,
    train_yT,
    test_size=0.25,
    stratify=train_yT,
    random_state=10,
)
# Fixed XGBoost settings applied to every candidate the tuner evaluates.
standard_params = dict(
    n_jobs=-1,
    tree_method='gpu_hist',
    booster='gbtree',
    max_bin=256,
)
def objective(space):
    """Hyperopt objective: negated mean CV F1 of an XGBoost model built from `space`.

    Parameters
    ----------
    space : mapping
        Sampled hyper-parameters, forwarded as keyword arguments to
        ``XGBClassifier`` together with the global ``standard_params``.

    Returns
    -------
    dict
        ``{'loss': -mean_f1, 'status': STATUS_OK}``; the score is negated
        because it is reported as a loss.

    Notes
    -----
    Reads the globals ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
    ``skf``. Every fold is fitted with the same ``(X_val, y_val)`` evaluation
    set, which is what the 10-round early stopping on "error" monitors.
    """
    candidate = XGBClassifier(
        eval_metric="error",
        early_stopping_rounds=10,
        **standard_params,
        **space,
    )
    fold_f1 = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring='f1',
        cv=skf,
        fit_params={'eval_set': [[X_val, y_val]], 'verbose': False},
    )
    return {'loss': -fold_f1.mean(), 'status': STATUS_OK}
# Search space: tree count, learning rate, depth, row/column sampling ratios,
# gamma and min_child_weight (reg_alpha and reg_lambda are not searched here).
hyper_space = dict(
    n_estimators=hp.randint('n_estimators', 100, 1000),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    max_depth=hp.randint('max_depth', 3, 18),
    subsample=hp.uniform('subsample', 0.5, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    gamma=hp.uniform('gamma', 0.01, 9),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
)
# 100 TPE evaluations of `objective`; `trials` collects the per-evaluation history.
trials = Trials()
best_params = fmin(objective, hyper_space, algo=tpe.suggest, max_evals=100, trials=trials)

100%|██████████| 100/100 [19:39<00:00, 11.80s/trial, best loss: -0.946003188916184]


In [84]:
best_params

{'colsample_bytree': 0.6895392474881956,
 'gamma': 1.908100072278277,
 'learning_rate': 0.09352980239795879,
 'max_depth': 17,
 'min_child_weight': 5.0,
 'n_estimators': 574,
 'subsample': 0.7701277124090848}

In [99]:
# Single XGBoost candidate, scored by mean F1 over the `skf` folds.
xgb_x = XGBClassifier(
    n_estimators=600,
    max_depth=10,
    learning_rate=0.14,
    subsample=0.99,
    colsample_bytree=0.63,
    **params,
)
scores = cross_validate(xgb_x, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9494140361679799


In [101]:
# Per-member overrides for a twelve-model XGBoost ensemble; each entry is
# merged with the shared `params` when the model is constructed.
member_overrides = [
    dict(n_estimators=574, max_depth=17, learning_rate=0.09, subsample=0.77, colsample_bytree=0.69, gamma=1.9),
    dict(n_estimators=674, max_depth=12, learning_rate=0.09, subsample=0.77, colsample_bytree=0.69, gamma=1.9),
    dict(n_estimators=450, max_depth=12, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=800, max_depth=15, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=800, max_depth=12, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=700, max_depth=10, learning_rate=0.2, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=0.3),
    dict(n_estimators=600, max_depth=10, learning_rate=0.2, subsample=0.97, colsample_bytree=0.69),
    dict(n_estimators=600, max_depth=10, learning_rate=0.14, subsample=0.99, colsample_bytree=0.63),
    dict(n_estimators=700, max_depth=8, learning_rate=0.14),
    dict(n_estimators=600, max_depth=10, learning_rate=0.2),
    dict(n_estimators=700, max_depth=12, learning_rate=0.14, subsample=0.99, colsample_bytree=0.7),
    dict(n_estimators=650, max_depth=10, learning_rate=0.2),
]
ensemble_members = [XGBClassifier(**params, **overrides) for overrides in member_overrides]
(xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6,
 xgb_7, xgb_8, xgb_9, xgb_10, xgb_11, xgb_12) = ensemble_members

# Named (label, estimator) pairs in the same order as `weights` below.
classifier_list = [
    (f'xgb_{position}', model)
    for position, model in enumerate(ensemble_members, start=1)
]
# Soft voting; xgb_1, xgb_5 and xgb_8 count double.
voting_clf_3 = VotingClassifier(
    classifier_list,
    voting='soft',
    weights=[2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [102]:
# Mean F1 of the twelve-model ensemble over the `skf` folds.
scores = cross_validate(voting_clf_3, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

[Voting] ................... (2 of 12) Processing xgb_2, total=  34.1s
[Voting] ................... (1 of 12) Processing xgb_1, total=  41.6s
[Voting] ................... (4 of 12) Processing xgb_4, total=  52.7s
[Voting] ................... (5 of 12) Processing xgb_5, total=  43.0s
[Voting] ................... (3 of 12) Processing xgb_3, total= 1.9min
[Voting] ................... (6 of 12) Processing xgb_6, total=  51.1s
[Voting] ................... (7 of 12) Processing xgb_7, total= 1.4min
[Voting] ................... (8 of 12) Processing xgb_8, total= 1.4min
[Voting] ................... (9 of 12) Processing xgb_9, total= 1.0min
[Voting] ................. (10 of 12) Processing xgb_10, total= 1.5min
[Voting] ................. (11 of 12) Processing xgb_11, total= 2.6min
[Voting] ................. (12 of 12) Processing xgb_12, total= 1.5min




[Voting] ................... (2 of 12) Processing xgb_2, total=  34.5s
[Voting] ................... (1 of 12) Processing xgb_1, total=  42.3s
[Voting] ................... (4 of 12) Processing xgb_4, total=  53.1s
[Voting] ................... (5 of 12) Processing xgb_5, total=  43.7s
[Voting] ................... (3 of 12) Processing xgb_3, total= 1.9min
[Voting] ................... (6 of 12) Processing xgb_6, total=  50.1s
[Voting] ................... (7 of 12) Processing xgb_7, total= 1.4min
[Voting] ................... (8 of 12) Processing xgb_8, total= 1.4min
[Voting] ................... (9 of 12) Processing xgb_9, total= 1.0min
[Voting] ................. (10 of 12) Processing xgb_10, total= 1.5min
[Voting] ................. (11 of 12) Processing xgb_11, total= 2.6min
[Voting] ................. (12 of 12) Processing xgb_12, total= 1.6min
0.9503621012358241


In [103]:
# Refit the ensemble on all training rows and write submission47.csv.
pred_and_sub(classifier=voting_clf_3, sub_no=47)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
[Voting] ................... (2 of 12) Processing xgb_2, total=  36.9s
[Voting] ................... (1 of 12) Processing xgb_1, total=  44.6s
[Voting] ................... (4 of 12) Processing xgb_4, total=  51.9s
[Voting] ................... (5 of 12) Processing xgb_5, total=  41.4s
[Voting] ................... (3 of 12) Processing xgb_3, total= 1.9min
[Voting] ................... (6 of 12) Processing xgb_6, total=  51.8s
[Voting] ................... (7 of 12) Processing xgb_7, total= 1.4min
[Voting] ................... (8 of 12) Processing xgb_8, total= 1.4min
[Voting] ................... (9 of 12) Processing xgb_9, total=  58.9s
[Voting] ................. (10 of 12) Processing xgb_10, total= 1.5min
[Voting] ................. (11 of 12) Processing xgb_11, total= 2.6min
[Voting] ................. (12 of 12) Processing xgb_12, total= 1.6min
[Voting] ................... (2 of 12) Processing xgb_2, total=  13.9s
[Voting] ................... (1 of 12) Pr

In [104]:
# Per-member overrides for an eight-model XGBoost ensemble; each entry is
# merged with the shared `params` when the model is constructed.
member_overrides = [
    dict(n_estimators=574, max_depth=14, learning_rate=0.19, subsample=0.97, colsample_bytree=0.63),
    dict(n_estimators=500, max_depth=12, learning_rate=0.09, subsample=0.77, colsample_bytree=0.69),
    dict(n_estimators=450, max_depth=8, learning_rate=0.13, subsample=0.96, colsample_bytree=0.63),
    dict(n_estimators=800, max_depth=15, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=800, max_depth=12, learning_rate=0.09, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0, gamma=1.3),
    dict(n_estimators=700, max_depth=10, learning_rate=0.2, subsample=0.90, colsample_bytree=0.68, min_child_weight=0.0),
    dict(n_estimators=650, max_depth=8, learning_rate=0.2, subsample=0.97, colsample_bytree=0.69),
    dict(n_estimators=600, max_depth=10, learning_rate=0.14, subsample=0.99, colsample_bytree=0.63),
]
ensemble_members = [XGBClassifier(**params, **overrides) for overrides in member_overrides]
xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8 = ensemble_members

# Named (label, estimator) pairs in the same order as `weights` below.
classifier_list = [
    (f'xgb_{position}', model)
    for position, model in enumerate(ensemble_members, start=1)
]
# Soft voting; xgb_5 and xgb_8 count double.
voting_clf_4 = VotingClassifier(
    classifier_list,
    voting='soft',
    weights=[1, 1, 1, 1, 2, 1, 1, 2],
    n_jobs=-1,
    verbose=2,
)

In [105]:
# Mean F1 of the eight-model ensemble over the `skf` folds.
scores = cross_validate(voting_clf_4, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9499831497740564


In [106]:
# Refit the ensemble on all training rows and write submission48.csv.
pred_and_sub(classifier=voting_clf_4, sub_no=48)

[Voting] .................... (2 of 8) Processing xgb_2, total= 2.6min
[Voting] .................... (1 of 8) Processing xgb_1, total= 2.9min
[Voting] .................... (3 of 8) Processing xgb_3, total=  37.4s
[Voting] .................... (4 of 8) Processing xgb_4, total=  51.2s
[Voting] .................... (5 of 8) Processing xgb_5, total=  43.7s
[Voting] .................... (7 of 8) Processing xgb_7, total=  52.8s
[Voting] .................... (8 of 8) Processing xgb_8, total= 1.4min
[Voting] .................... (6 of 8) Processing xgb_6, total= 2.8min
[Voting] .................... (2 of 8) Processing xgb_2, total= 2.5min
[Voting] .................... (1 of 8) Processing xgb_1, total= 2.8min
[Voting] .................... (3 of 8) Processing xgb_3, total=  36.5s
[Voting] .................... (4 of 8) Processing xgb_4, total=  51.5s
[Voting] .................... (5 of 8) Processing xgb_5, total=  41.9s
[Voting] .................... (7 of 8) Processing xgb_7, total=  52.8s
[Votin



[Voting] .................... (2 of 8) Processing xgb_2, total= 2.6min
[Voting] .................... (1 of 8) Processing xgb_1, total= 2.9min
[Voting] .................... (3 of 8) Processing xgb_3, total=  36.0s
[Voting] .................... (4 of 8) Processing xgb_4, total=  52.1s
[Voting] .................... (5 of 8) Processing xgb_5, total=  43.8s
[Voting] .................... (7 of 8) Processing xgb_7, total=  53.4s
[Voting] .................... (8 of 8) Processing xgb_8, total= 1.4min
[Voting] .................... (6 of 8) Processing xgb_6, total= 2.8min
[Voting] .................... (2 of 8) Processing xgb_2, total= 2.5min
[Voting] .................... (1 of 8) Processing xgb_1, total= 2.8min
[Voting] .................... (3 of 8) Processing xgb_3, total=  36.8s
[Voting] .................... (4 of 8) Processing xgb_4, total=  51.6s
[Voting] .................... (5 of 8) Processing xgb_5, total=  43.9s
[Voting] .................... (7 of 8) Processing xgb_7, total=  52.4s
[Votin