In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import VotingClassifier
from tqdm import tqdm
import matplotlib.pyplot as plt
from hyperopt import fmin, STATUS_OK, tpe, Trials, hp, space_eval
from boruta import BorutaPy

In [2]:
# Mol2vec embedding tables (train / test).
train_mv = pd.read_csv(Path('/kaggle/input/mol2vec/train_molvec_data.csv'))
test_mv = pd.read_csv(Path('/kaggle/input/mol2vec/test_molvec_data.csv'))

# Molecular-descriptor tables (train / test).
train_md = pd.read_csv(Path('/kaggle/input/molecular-descriptors/train_molecular_data.csv'))
test_md = pd.read_csv(Path('/kaggle/input/molecular-descriptors/test_molecular_data.csv'))

# Remove six training rows by index label, then renumber the index from 0.
# NOTE(review): the same rows are not removed from train_mv here; presumably that
# file already omits them -- confirm, since the two frames are later joined by index.
train_md = (
    train_md
    .drop(index=[10135, 26306, 42332, 47225, 62942, 72002])
    .reset_index(drop=True)
)

In [4]:
# Replace missing descriptor values with 0 in both splits.
train_md = train_md.fillna(0)
test_md = test_md.fillna(0)

In [5]:
# Lift pandas' row-display cap (None = no limit) so displayed frames show every row.
pd.set_option('display.max_rows',None)

In [10]:
# Strip the id and target columns from the mol2vec table
# (presumably because train_md already carries them -- confirm).
train_mv = train_mv.drop(columns=['Assay Id', 'Expected'])

In [11]:
# Preview the mol2vec training features after 'Assay Id' and 'Expected' were dropped.
train_mv.head()

Unnamed: 0,MV0,MV1,MV2,MV3,MV4,MV5,MV6,MV7,MV8,MV9,...,MV290,MV291,MV292,MV293,MV294,MV295,MV296,MV297,MV298,MV299
0,0.591556,0.436794,-2.616474,5.393537,0.979889,1.760543,-8.516331,0.634881,8.672752,4.849902,...,-2.0925,3.835511,10.87436,2.934474,-2.489406,-2.592273,-4.072971,-1.010019,-6.477407,0.750421
1,-1.387351,-0.650103,-1.308442,-0.534801,4.614061,0.907463,-6.503389,-1.544066,2.970018,-3.843607,...,-0.547428,3.197913,5.954277,1.360025,-3.836096,-1.468281,0.407472,-4.784484,-6.204209,-3.584467
2,0.057016,-5.191211,-0.658686,-0.64294,10.475833,4.291732,-19.384228,-1.689491,4.949928,-8.926942,...,-1.191639,2.139539,16.158161,0.729103,-9.222128,-5.543819,2.537033,-14.175414,-11.747466,-5.981764
3,1.559935,-3.572063,-1.491086,3.473112,-0.838103,2.379557,-7.592897,-1.499726,5.498289,3.850152,...,0.228761,7.029467,4.817788,-2.289021,-4.723467,-1.145345,-1.12672,-2.132176,-6.25475,-0.356743
4,-0.060041,0.155101,0.073155,0.180541,0.421257,-0.185311,-0.51644,0.324106,0.047974,0.58056,...,0.127916,0.04258,-0.797013,-0.323537,0.43283,-0.738765,-0.222924,0.272511,-0.600159,0.432861


In [13]:
# Strip the id column from the mol2vec test table, then preview what is left.
test_mv = test_mv.drop(columns=['Assay Id'])
test_mv.head()

Unnamed: 0,MV0,MV1,MV2,MV3,MV4,MV5,MV6,MV7,MV8,MV9,...,MV290,MV291,MV292,MV293,MV294,MV295,MV296,MV297,MV298,MV299
0,0.5942,1.089279,-1.363318,2.377262,1.464866,0.371191,-6.059753,0.101892,3.868654,1.486833,...,-0.93885,2.852365,4.955368,-0.641309,-3.365838,-1.881012,-2.443057,-2.199804,-6.663648,-2.183817
1,2.071164,-5.903744,-1.911758,5.981367,2.007313,-0.135413,-11.257625,0.037847,8.810589,3.97497,...,1.290284,8.588354,12.169148,-2.063678,-9.252829,-3.912638,-6.905687,-2.09555,-12.356855,-4.284906
2,3.513798,-10.650757,-2.338746,11.947106,-7.020576,-1.278094,-20.097008,3.868305,14.147479,7.462498,...,-2.64843,21.444168,15.451662,-2.483602,-18.042824,-7.744145,-12.428687,-7.628102,-21.449327,-3.476709
3,3.318751,-1.376666,-3.21667,-0.402242,0.33608,0.124505,-4.069031,0.83907,-0.885285,1.046237,...,-0.575894,3.989857,-0.158101,-1.802653,-4.841865,-2.714195,-3.040305,-3.154216,-4.1821,-0.138847
4,-1.795259,-6.074874,-5.024946,1.376537,-0.354517,-10.956933,-12.585599,3.02255,4.726954,-1.108699,...,-2.153061,15.212841,5.397811,-2.250856,-13.275173,-15.389323,-9.747753,-6.150456,-18.290964,1.317636


In [14]:
# pd.concat(axis=1) outer-joins on the row index, so frames whose indexes differ
# are NOT rejected: the result is silently padded with NaN rows. Rows were dropped
# from train_md only, so verify the alignment explicitly before merging.
assert train_md.index.equals(train_mv.index), (
    f"train index mismatch: train_md has {len(train_md)} rows, train_mv has {len(train_mv)}"
)
assert test_md.index.equals(test_mv.index), (
    f"test index mismatch: test_md has {len(test_md)} rows, test_mv has {len(test_mv)}"
)

# Side-by-side merge: descriptor columns first, mol2vec columns after.
train_df = pd.concat([train_md, train_mv], axis=1)
test_df = pd.concat([test_md, test_mv], axis=1)

In [10]:
# Separate the target column from the feature matrix.
train_y = train_df['Expected']
train_X = train_df.drop(columns=['Expected'])

In [34]:
# Shared CV splitter: 5 shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

In [12]:
# Integer-encode the target; train_yT is what the Boruta fit below is trained against.
lab = LabelEncoder()
train_yT = lab.fit_transform(train_y)

In [13]:
# Base estimator handed to BorutaPy below; otherwise default hyperparameters.
xgb = XGBClassifier(n_jobs = -1, tree_method='gpu_hist')

In [14]:
# Boruta feature selection wrapped around the XGBoost estimator:
# at most 100 iterations, fixed seed, progress printed per iteration (verbose=2).
feat_select = BorutaPy(estimator=xgb, n_estimators='auto', verbose=2, random_state=10, max_iter=100)

In [15]:
# Run the selection on the raw NumPy values of train_X against the label-encoded target.
feat_select.fit(train_X.to_numpy(), train_yT)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	509
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	509
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	509
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	509
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	509
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	509
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	509
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	111
Rejected: 	398
Iteration: 	9 / 100
Confirmed: 	5
Tentative: 	106
Rejected: 	398
Iteration: 	10 / 100
Confirmed: 	5
Tentative: 	106
Rejected: 	398
Iteration: 	11 / 100
Confirmed: 	5
Tentative: 	106
Rejected: 	398
Iteration: 	12 / 100
Confirmed: 	5
Tentative: 	96
Rejected: 	408
Iteration: 	13 / 100
Confirmed: 	6
Tentative: 	95
Rejected: 	408
Iteration: 	14 / 100
Confirmed: 	6
Tentative: 	95
Rejected: 	408
Iteration: 	15 / 100
Confirmed: 	6
Tentative: 	95
Rejected: 	408
Iteration: 	16 / 100
Confirmed: 	7
Te

BorutaPy(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                 callbacks=None, colsample_bylevel=1,
                                 colsample_bynode=1, colsample_bytree=1,
                                 early_stopping_rounds=None,
                                 enable_categorical=False, eval_metric=None,
                                 gamma=0, gpu_id=0, grow_policy='depthwise',
                                 importance_type=None,
                                 interaction_constraints='',
                                 learning_rate=0.300000012, max_bin=256,
                                 max_cat_to_onehot=4, max_delta_step=0,
                                 max_depth=6, max_leaves=0, min_child_weight=1,
                                 missing=nan, monotone_constraints='()',
                                 n_estimators=214, n_jobs=-1,
                                 num_parallel_tree=1, predictor='auto',
                           

In [16]:
# One Boruta rank per column of train_X, in column order
# (per BorutaPy's docs: rank 1 = confirmed, rank 2 = tentative).
feat_select.ranking_

array([  1, 126, 156, 413,  76, 184, 105,   1,   1, 369,  59,   1,   1,
        29,   2, 214,  93, 169,   1,   1, 262,   1,   1, 103, 215, 111,
        24, 159,  21,  11,   1,   2,   1,   1, 274,  36,   2, 135,  10,
       192,   1,   3, 332, 176, 227, 359, 265, 324,  48, 205, 343,   1,
       330,   1,   1,   1, 128, 190, 238, 282, 224, 100, 174, 307, 347,
       113,   1,   1, 413, 246, 335, 275, 349,   1, 235, 144, 339,   1,
       288, 360, 312, 413,   1, 291, 354, 413, 284, 328, 249, 316, 269,
        73,   1, 126, 261,   2,  15, 219,  50,   1, 163, 270, 280,   1,
        91, 380,  18, 301,   1, 367, 376,  21, 368, 370,  26, 363,  27,
        20,   7, 413, 413, 364,   1,   1,   3,  25, 376, 380, 342,   3,
         9,   3,   1,   2,   2, 361,   1, 413, 380, 257, 357, 375, 413,
       371, 388, 413, 413, 388, 413,   6, 388, 336, 413,   1, 355, 413,
       413, 413, 413, 413,   1, 413, 413, 388,   2,   1, 413, 373,   1,
       413, 413, 380, 413, 413, 413, 380, 388, 413, 413, 365, 41

In [17]:
# Pair every train_X column name with its Boruta rank in a two-column table.
combined_dict = dict(
    features=train_X.columns.to_list(),
    rankings=feat_select.ranking_,
)
feature_ranks = pd.DataFrame(combined_dict)

In [18]:
# Write the feature/rank table to the working directory (without the index column).
feature_ranks.to_csv('md_mv_feature_ranks.csv', index=False)

In [19]:
# Display the feature/rank table in train_X column order.
feature_ranks

Unnamed: 0,features,rankings
0,Assay Id,1
1,MaxEStateIndex,126
2,MinEStateIndex,156
3,MaxAbsEStateIndex,413
4,MinAbsEStateIndex,76
5,qed,184
6,MolWt,105
7,HeavyAtomMolWt,1
8,ExactMolWt,1
9,NumValenceElectrons,369


# **Import ranked features**

In [21]:
# Load the rankings from a Kaggle input dataset instead of re-running Boruta.
# NOTE(review): this is not the path written by the to_csv cell above -- presumably
# the same file re-uploaded as the 'selected-features' dataset; confirm.
feature_ranks = pd.read_csv('/kaggle/input/selected-features/md_mv_feature_ranks.csv')

In [22]:
# Order the table by rank (best first) and renumber the rows from 0.
feature_ranks = feature_ranks.sort_values('rankings').reset_index(drop=True)

In [23]:
# Display the rank-sorted table; rank-1 features now come first.
feature_ranks

Unnamed: 0,features,rankings
0,Assay Id,1
1,fr_phenol,1
2,fr_nitrile,1
3,fr_halogen,1
4,fr_ether,1
5,fr_bicyclic,1
6,fr_aniline,1
7,fr_C_S,1
8,fr_COO,1
9,MolMR,1


In [30]:
# Keep only features with Boruta rank 1 or 2 (BorutaPy: confirmed and tentative).
rank_mask = feature_ranks['rankings'] <= 2
selected_features = feature_ranks.loc[rank_mask, 'features']

train_fs = train_df.loc[:, selected_features]
test_fs = test_df.loc[:, selected_features]
train_y = train_df['Expected']

In [32]:
# Integer-encode the target. `le` is kept around because pred_and_sub uses
# le.inverse_transform to map predictions back to the original labels.
le = LabelEncoder()
train_yT = le.fit_transform(train_y)

In [35]:
# Fixed 25% stratified holdout. Inside the objective it serves only as the
# early-stopping eval_set; the score itself comes from CV on X_train.
X_train, X_val, y_train, y_val = train_test_split(
    train_fs, train_yT, test_size=0.25, stratify=train_yT, random_state=10
)

# Settings shared by every candidate model.
standard_params = dict(
    n_jobs=-1,
    tree_method='gpu_hist',
    booster='gbtree',
    max_bin=256,
)


def objective(space):
    """Score one sampled hyperparameter set for hyperopt.

    An XGBClassifier is built from ``space`` plus ``standard_params`` and
    cross-validated on ``X_train`` / ``y_train`` using ``skf``. ``X_val`` /
    ``y_val`` are passed as the ``eval_set`` of every fold fit, which is what the
    10-round early stopping (metric "error") monitors.

    Returns a hyperopt result dict whose loss is the negated mean F1, so that
    minimising the loss maximises F1.
    """
    candidate = XGBClassifier(
        **space, **standard_params, eval_metric="error", early_stopping_rounds=10
    )
    fold_f1 = cross_val_score(
        estimator=candidate,
        X=X_train,
        y=y_train,
        cv=skf,
        scoring='f1',
        fit_params={'verbose': False, 'eval_set': [[X_val, y_val]]},
    )
    return {'loss': -fold_f1.mean(), 'status': STATUS_OK}


# Search space handed to fmin. Disabled dimensions are kept as comments.
hyper_space = {
    'n_estimators': hp.randint('n_estimators', 100, 1000),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.randint('max_depth', 3, 18),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
#     'gamma': hp.uniform ('gamma', 0.01,9),
#     'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
#     'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1)
}

# 100 TPE evaluations; `trials` keeps the per-evaluation history.
trials = Trials()
best_params = fmin(
    fn=objective,
    space=hyper_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

100%|██████████| 100/100 [21:51<00:00, 13.12s/trial, best loss: -0.946215410669135]


In [36]:
# Best point returned by fmin (hyperparameter name -> value).
best_params

{'colsample_bytree': 0.7808480741997409,
 'learning_rate': 0.11033727506031693,
 'max_depth': 17,
 'min_child_weight': 4.0,
 'n_estimators': 272,
 'subsample': 0.9139935952095666}

In [38]:
# Keyword arguments shared by every XGBClassifier built below.
params = dict(tree_method='gpu_hist', n_jobs=-1)

In [78]:
# Single-model baseline: mean F1 over the shared stratified folds.
xgb_x = XGBClassifier(n_estimators=650, max_depth=6, learning_rate=0.2, **params)
scores = cross_validate(xgb_x, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9487240317546259


In [79]:
# Twelve XGBoost variants with hand-picked hyperparameters; each one also
# receives the shared `params`.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(
    n_estimators=700, learning_rate=0.13,
    subsample=0.96, colsample_bytree=0.63, **params,
)
xgb_3 = XGBClassifier(
    n_estimators=450, max_depth=8, learning_rate=0.13,
    subsample=0.96, colsample_bytree=0.63, **params,
)
xgb_4 = XGBClassifier(
    n_estimators=800, max_depth=15, learning_rate=0.09,
    subsample=0.90, colsample_bytree=0.68,
    min_child_weight=0.0, gamma=1.3, **params,
)
xgb_5 = XGBClassifier(
    n_estimators=800, max_depth=12, learning_rate=0.09,
    subsample=0.90, colsample_bytree=0.68,
    min_child_weight=0.0, gamma=1.3, **params,
)
xgb_6 = XGBClassifier(
    n_estimators=600, max_depth=10, learning_rate=0.11,
    subsample=0.91, colsample_bytree=0.78, **params,
)
xgb_7 = XGBClassifier(
    n_estimators=700, max_depth=15, learning_rate=0.14,
    subsample=0.91, colsample_bytree=0.78, gamma=1.3, **params,
)
xgb_8 = XGBClassifier(
    n_estimators=600, max_depth=8, learning_rate=0.2,
    subsample=0.99, colsample_bytree=0.78, **params,
)
xgb_9 = XGBClassifier(n_estimators=700, max_depth=8, learning_rate=0.14, **params)
xgb_10 = XGBClassifier(n_estimators=200, max_depth=8, learning_rate=0.2, **params)
xgb_11 = XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.13, **params)
xgb_12 = XGBClassifier(n_estimators=650, max_depth=6, learning_rate=0.2, **params)

# (name, estimator) pairs named 'xgb_1' ... 'xgb_12', in that order.
classifier_list = [
    (f'xgb_{position}', member)
    for position, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6,
         xgb_7, xgb_8, xgb_9, xgb_10, xgb_11, xgb_12),
        start=1,
    )
]

# Soft vote; xgb_4, xgb_5 and xgb_10 carry double weight.
voting_clf_1 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [80]:
# Mean F1 of the first ensemble over the shared stratified folds.
scores = cross_validate(voting_clf_1, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

[Voting] ................... (1 of 12) Processing xgb_1, total=  17.7s
[Voting] ................... (2 of 12) Processing xgb_2, total=  22.5s
[Voting] ................... (3 of 12) Processing xgb_3, total=  35.3s
[Voting] ................... (4 of 12) Processing xgb_4, total=  44.1s
[Voting] ................... (5 of 12) Processing xgb_5, total=  35.6s
[Voting] ................... (7 of 12) Processing xgb_7, total=  31.2s
[Voting] ................... (6 of 12) Processing xgb_6, total= 1.5min
[Voting] ................... (8 of 12) Processing xgb_8, total=  44.6s
[Voting] ................. (10 of 12) Processing xgb_10, total=  17.6s
[Voting] ................... (9 of 12) Processing xgb_9, total=  54.9s
[Voting] ................. (11 of 12) Processing xgb_11, total=  35.2s
[Voting] ................. (12 of 12) Processing xgb_12, total=  12.8s
[Voting] ................... (1 of 12) Processing xgb_1, total=  16.1s
[Voting] ................... (2 of 12) Processing xgb_2, total=  21.7s
[Votin



[Voting] ................... (1 of 12) Processing xgb_1, total=  17.5s
[Voting] ................... (2 of 12) Processing xgb_2, total=  22.3s
[Voting] ................... (3 of 12) Processing xgb_3, total=  34.5s
[Voting] ................... (4 of 12) Processing xgb_4, total=  44.3s
[Voting] ................... (5 of 12) Processing xgb_5, total=  36.5s
[Voting] ................... (7 of 12) Processing xgb_7, total=  31.1s
[Voting] ................... (6 of 12) Processing xgb_6, total= 1.6min
[Voting] ................... (8 of 12) Processing xgb_8, total=  44.6s
[Voting] ................. (10 of 12) Processing xgb_10, total=  17.8s
[Voting] ................... (9 of 12) Processing xgb_9, total=  54.7s
[Voting] ................. (11 of 12) Processing xgb_11, total=  36.1s
[Voting] ................. (12 of 12) Processing xgb_12, total=  11.9s
[Voting] ................... (1 of 12) Processing xgb_1, total=  16.6s
[Voting] ................... (2 of 12) Processing xgb_2, total=  22.5s
[Votin

In [85]:
def pred_and_sub(classifier, sub_no):
    """Refit ``classifier`` on the full training set and write a submission file.

    Uses the notebook globals ``train_fs``, ``train_yT``, ``test_fs`` and ``le``
    as they are bound at call time, so the feature subset in effect is whatever
    ``train_fs`` / ``test_fs`` currently hold.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    sub_no : value interpolated into the output name ``submission{sub_no}.csv``.

    Both the encoded and the decoded predictions are printed. Nothing is returned.
    """
    classifier.fit(X=train_fs, y=train_yT)

    encoded_preds = classifier.predict(test_fs)
    print(encoded_preds)

    # Map the integer classes back to the original target labels.
    decoded_preds = le.inverse_transform(encoded_preds)
    print(decoded_preds)

    submission = pd.read_csv("/kaggle/input/initial-data/test_II.csv")
    submission['Predicted'] = decoded_preds
    # Assumes the test file holds a single column, which becomes 'Id'.
    submission.columns = ['Id', 'Predicted']
    submission.to_csv(f'submission{sub_no}.csv', index=False)

In [87]:
# Refit the first ensemble on the current train_fs (rank <= 2 features)
# and write submission49.csv.
pred_and_sub(voting_clf_1, 49)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
[Voting] ................... (1 of 12) Processing xgb_1, total=  12.9s
[Voting] ................... (3 of 12) Processing xgb_3, total=  17.8s
[Voting] ................... (5 of 12) Processing xgb_5, total=  22.2s
[Voting] ................... (7 of 12) Processing xgb_7, total=  17.6s
[Voting] ................... (8 of 12) Processing xgb_8, total=  23.5s
[Voting] ................. (10 of 12) Processing xgb_10, total=   9.6s
[Voting] ................. (11 of 12) Processing xgb_11, total=  18.9s
[Voting] ................... (1 of 12) Processing xgb_1, total=   8.8s
[Voting] ................... (3 of 12) Processing xgb_3, total=  18.5s
[Voting] ................... (5 of 12) Processing xgb_5, total=  22.7s
[Voting] ................... (7 of 12) Processing xgb_7, total=  17.5s
[Voting] ................... (8 of 12) Processing xgb_8, total=  23.4s
[Voting] ................. (10 of 12) Processing xgb_10, total=   9.7s
[Voting] ................. (11 of 12) Pro

In [89]:
# Widen the feature subset to every feature with Boruta rank <= 50.
rank_mask = feature_ranks['rankings'] <= 50
selected_features = feature_ranks.loc[rank_mask, 'features']

train_fs = train_df.loc[:, selected_features]
test_fs = test_df.loc[:, selected_features]

In [90]:
# Twelve XGBoost variants with hand-picked hyperparameters; each one also
# receives the shared `params`.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(
    n_estimators=700, learning_rate=0.13,
    subsample=0.96, colsample_bytree=0.63, **params,
)
xgb_3 = XGBClassifier(
    n_estimators=450, max_depth=8, learning_rate=0.13,
    subsample=0.96, colsample_bytree=0.63, **params,
)
xgb_4 = XGBClassifier(
    n_estimators=800, max_depth=15, learning_rate=0.09,
    subsample=0.90, colsample_bytree=0.68,
    min_child_weight=0.0, gamma=1.3, **params,
)
xgb_5 = XGBClassifier(
    n_estimators=800, max_depth=12, learning_rate=0.09,
    subsample=0.90, colsample_bytree=0.68,
    min_child_weight=0.0, gamma=1.3, **params,
)
xgb_6 = XGBClassifier(
    n_estimators=600, max_depth=10, learning_rate=0.11,
    subsample=0.91, colsample_bytree=0.78, **params,
)
xgb_7 = XGBClassifier(
    n_estimators=700, max_depth=15, learning_rate=0.14,
    subsample=0.91, colsample_bytree=0.78, gamma=1.3, **params,
)
xgb_8 = XGBClassifier(
    n_estimators=600, max_depth=8, learning_rate=0.2,
    subsample=0.99, colsample_bytree=0.78, **params,
)
xgb_9 = XGBClassifier(n_estimators=700, max_depth=8, learning_rate=0.14, **params)
xgb_10 = XGBClassifier(n_estimators=200, max_depth=8, learning_rate=0.2, **params)
xgb_11 = XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.13, **params)
xgb_12 = XGBClassifier(n_estimators=650, max_depth=6, learning_rate=0.2, **params)

# (name, estimator) pairs named 'xgb_1' ... 'xgb_12', in that order.
classifier_list = [
    (f'xgb_{position}', member)
    for position, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6,
         xgb_7, xgb_8, xgb_9, xgb_10, xgb_11, xgb_12),
        start=1,
    )
]

# Soft vote; xgb_4, xgb_5 and xgb_10 carry double weight.
voting_clf_2 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [91]:
# Mean F1 of the second ensemble over the shared stratified folds.
scores = cross_validate(voting_clf_2, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.950481231355743


In [92]:
# Refit the second ensemble on the current train_fs (rank <= 50 features)
# and write submission50.csv.
pred_and_sub(voting_clf_2, 50)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [93]:
# Widen the feature subset to every feature with Boruta rank <= 149.
rank_mask = feature_ranks['rankings'] <= 149
selected_features = feature_ranks.loc[rank_mask, 'features']

train_fs = train_df.loc[:, selected_features]
test_fs = test_df.loc[:, selected_features]

In [94]:
# Twelve XGBoost variants with hand-picked hyperparameters; each one also
# receives the shared `params`.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(
    n_estimators=700, learning_rate=0.13,
    subsample=0.96, colsample_bytree=0.63, **params,
)
xgb_3 = XGBClassifier(
    n_estimators=450, max_depth=8, learning_rate=0.13,
    subsample=0.96, colsample_bytree=0.63, **params,
)
xgb_4 = XGBClassifier(
    n_estimators=800, max_depth=15, learning_rate=0.09,
    subsample=0.90, colsample_bytree=0.68,
    min_child_weight=0.0, gamma=1.3, **params,
)
xgb_5 = XGBClassifier(
    n_estimators=800, max_depth=12, learning_rate=0.09,
    subsample=0.90, colsample_bytree=0.68,
    min_child_weight=0.0, gamma=1.3, **params,
)
xgb_6 = XGBClassifier(
    n_estimators=600, max_depth=10, learning_rate=0.11,
    subsample=0.91, colsample_bytree=0.78, **params,
)
xgb_7 = XGBClassifier(
    n_estimators=700, max_depth=15, learning_rate=0.14,
    subsample=0.91, colsample_bytree=0.78, gamma=1.3, **params,
)
xgb_8 = XGBClassifier(
    n_estimators=600, max_depth=8, learning_rate=0.2,
    subsample=0.99, colsample_bytree=0.78, **params,
)
xgb_9 = XGBClassifier(n_estimators=700, max_depth=8, learning_rate=0.14, **params)
xgb_10 = XGBClassifier(n_estimators=200, max_depth=8, learning_rate=0.2, **params)
xgb_11 = XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.13, **params)
xgb_12 = XGBClassifier(n_estimators=650, max_depth=6, learning_rate=0.2, **params)

# (name, estimator) pairs named 'xgb_1' ... 'xgb_12', in that order.
classifier_list = [
    (f'xgb_{position}', member)
    for position, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6,
         xgb_7, xgb_8, xgb_9, xgb_10, xgb_11, xgb_12),
        start=1,
    )
]

# Soft vote with no weights passed, so every member counts equally.
# Disabled alternative: weights=[1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1]
voting_clf_3 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    n_jobs=-1,
    verbose=2,
)

[Voting] ................... (1 of 12) Processing xgb_1, total=  25.5s
[Voting] ................... (2 of 12) Processing xgb_2, total=  31.3s
[Voting] ................... (3 of 12) Processing xgb_3, total=  39.3s
[Voting] ................... (4 of 12) Processing xgb_4, total=  54.8s
[Voting] ................... (5 of 12) Processing xgb_5, total=  48.2s
[Voting] ................... (7 of 12) Processing xgb_7, total=  39.3s
[Voting] ................... (6 of 12) Processing xgb_6, total= 1.9min
[Voting] ................... (8 of 12) Processing xgb_8, total=  52.1s
[Voting] ................. (10 of 12) Processing xgb_10, total=  21.6s
[Voting] ................... (9 of 12) Processing xgb_9, total= 1.1min
[Voting] ................. (11 of 12) Processing xgb_11, total=  41.9s
[Voting] ................. (12 of 12) Processing xgb_12, total=  16.5s
[Voting] ................... (1 of 12) Processing xgb_1, total=  20.9s
[Voting] ................... (2 of 12) Processing xgb_2, total=  26.5s
[Votin

In [95]:
# Mean F1 of the third (unweighted) ensemble over the shared stratified folds.
scores = cross_validate(voting_clf_3, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9505704602730575
[Voting] ................... (1 of 12) Processing xgb_1, total=  33.1s
[Voting] ................... (2 of 12) Processing xgb_2, total=  40.4s
[Voting] ................... (3 of 12) Processing xgb_3, total=  52.4s
[Voting] ................... (4 of 12) Processing xgb_4, total= 1.1min
[Voting] ................... (5 of 12) Processing xgb_5, total= 1.0min
[Voting] ................... (7 of 12) Processing xgb_7, total=  50.0s
[Voting] ................... (6 of 12) Processing xgb_6, total= 2.5min
[Voting] ................... (8 of 12) Processing xgb_8, total= 1.2min
[Voting] ................. (10 of 12) Processing xgb_10, total=  31.1s
[Voting] ................... (9 of 12) Processing xgb_9, total= 1.5min
[Voting] ................. (11 of 12) Processing xgb_11, total=  57.4s
[Voting] ................. (12 of 12) Processing xgb_12, total=  16.6s
[Voting] ................... (1 of 12) Processing xgb_1, total=  29.5s
[Voting] ................... (2 of 12) Processing xgb_2, t

In [96]:
# Submit voting_clf_3: it is the ensemble that was built and cross-validated for
# the rank <= 149 feature subset currently held in train_fs / test_fs. The cell
# previously passed voting_clf_2, which left voting_clf_3 unused and shipped a
# model whose CV score on these features was never measured.
pred_and_sub(voting_clf_3, 51)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]
[Voting] ................... (2 of 12) Processing xgb_2, total=  22.9s
[Voting] ................... (4 of 12) Processing xgb_4, total=  39.9s
[Voting] ................... (6 of 12) Processing xgb_6, total= 1.3min
[Voting] ................. (10 of 12) Processing xgb_10, total=  16.5s
[Voting] ................. (11 of 12) Processing xgb_11, total=  32.6s
[Voting] ................... (1 of 12) Processing xgb_1, total=  25.4s
[Voting] ................... (2 of 12) Processing xgb_2, total=  31.0s
[Voting] ................... (3 of 12) Processing xgb_3, total=  39.4s
[Voting] ................... (4 of 12) Processing xgb_4, total=  54.3s
[Voting] ................... (5 of 12) Processing xgb_5, total=  48.3s
[Voting] ................... (7 of 12) Processing xgb_7, total=  39.4s
[Voting] ................... (6 of 12) Processing xgb_6, total= 1.8min
[Voting] ................... (8 of 12) Processing xgb_8, total=  51.8s
[Voting] ................. (10 of 12) Pro