In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from boruta import BorutaPy
from hyperopt import fmin, STATUS_OK, tpe, Trials, hp, space_eval

In [3]:
# Show every row when a Series/DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

In [2]:
# The input directory is absolute, so joining it onto the working directory
# (as the earlier spelling did) resolves to the same location.
base_path = Path('/kaggle/input/molecular-descriptors/')
train_df = pd.read_csv(base_path / 'train_molecular_data.csv')
test_df = pd.read_csv(base_path / 'test_molecular_data.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# errors='ignore' keeps this cell re-runnable: a second execution used to
# raise KeyError because the column had already been dropped in place.
train_df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
test_df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

In [4]:
# Same display setting as the earlier options cell: never truncate rows.
pd.options.display.max_rows = None

In [5]:
# Replace every missing value with 0 in both frames, then show the
# per-column null counts of the training frame as a sanity check.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)
train_df.isna().sum()

Assay Id                    0
MaxEStateIndex              0
MinEStateIndex              0
MaxAbsEStateIndex           0
MinAbsEStateIndex           0
qed                         0
MolWt                       0
HeavyAtomMolWt              0
ExactMolWt                  0
NumValenceElectrons         0
NumRadicalElectrons         0
MaxPartialCharge            0
MinPartialCharge            0
MaxAbsPartialCharge         0
MinAbsPartialCharge         0
FpDensityMorgan1            0
FpDensityMorgan2            0
FpDensityMorgan3            0
BCUT2D_MWHI                 0
BCUT2D_MWLOW                0
BCUT2D_CHGHI                0
BCUT2D_CHGLO                0
BCUT2D_LOGPHI               0
BCUT2D_LOGPLOW              0
BCUT2D_MRHI                 0
BCUT2D_MRLOW                0
BalabanJ                    0
BertzCT                     0
Chi0                        0
Chi0n                       0
Chi0v                       0
Chi1                        0
Chi1n                       0
Chi1v     

In [6]:
# Separate the 'Expected' target column from the feature matrix.
train_y = train_df['Expected']
train_X = train_df.drop(columns='Expected')

In [7]:
# Shared 5-fold stratified splitter (shuffled, fixed seed) used by every CV run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

In [8]:
# Fit the encoder on the raw target, then encode it; `lab` is kept so the
# predictions can be mapped back with inverse_transform later.
lab = LabelEncoder().fit(train_y)
train_yT = lab.transform(train_y)

In [10]:
# Default-parameter XGBoost model (all CPU cores); used as the Boruta estimator.
xgb = XGBClassifier(n_jobs = -1)

In [17]:
# Boruta feature selector wrapped around the XGBoost estimator.
feat_select = BorutaPy(
    estimator=xgb,
    n_estimators='auto',
    max_iter=100,
    random_state=10,
    verbose=2,
)

In [18]:
# Run Boruta on the full training matrix; the frame is converted to a NumPy
# array before being handed to the selector.
feat_select.fit(train_X.to_numpy(), train_yT)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	67
Rejected: 	142
Iteration: 	9 / 100
Confirmed: 	7
Tentative: 	60
Rejected: 	142
Iteration: 	10 / 100
Confirmed: 	7
Tentative: 	60
Rejected: 	142
Iteration: 	11 / 100
Confirmed: 	7
Tentative: 	60
Rejected: 	142
Iteration: 	12 / 100
Confirmed: 	8
Tentative: 	59
Rejected: 	142
Iteration: 	13 / 100
Confirmed: 	8
Tentative: 	59
Rejected: 	142
Iteration: 	14 / 100
Confirmed: 	8
Tentative: 	57
Rejected: 	144
Iteration: 	15 / 100
Confirmed: 	8
Tentative: 	57
Rejected: 	144
Iteration: 	16 / 100
Confirmed: 	9
Tentat

BorutaPy(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                 callbacks=None, colsample_bylevel=1,
                                 colsample_bynode=1, colsample_bytree=1,
                                 early_stopping_rounds=None,
                                 enable_categorical=False, eval_metric=None,
                                 gamma=0, gpu_id=-1, grow_policy='depthwise',
                                 importance_type=None,
                                 interaction_constraints='',
                                 learning_rate=0.300000012, max_bin=256,
                                 max_cat_to_onehot=4, max_delta_step=0,
                                 max_depth=6, max_leaves=0, min_child_weight=1,
                                 missing=nan, monotone_constraints='()',
                                 n_estimators=174, n_jobs=-1,
                                 num_parallel_tree=1, predictor='auto',
                          

In [20]:
# Per-feature Boruta rank, in the column order of train_X.
feat_select.ranking_

array([  1,  72,  65, 149,  88,  49,  62,   1,   1,   1,  10,  23,   1,
        19,  93,  40,  64,  49,  26,  18,  96,   1,   1,  34,  93,  31,
        75,   1,  20,   2,  36,  24,   1,   1,  85,  32,   2,  86,  66,
        55,  16,   1,  80,  78,  64,  68,  81,  61,  35,  91,  25,   1,
        59,   1,  38,  52,  89,  40,  90, 105, 109,   1,  42,  69,  56,
        73,   1,   1, 149,   9,  46,   2,  15,  44,  75,  44,  99,   1,
        58, 105,  99, 149,  29,  77,  97, 149, 102,  83,  87,  82,  79,
       100,  28,  93,  68,  22,  28,   1,  30,  21,  12,  61, 102,   1,
        41,   3,  14,  54,   1,   2, 121,   1, 116,   1, 112,  58, 107,
        51,   2, 118, 113,  85,   1,   1,  12, 114, 116,   1, 107, 111,
       119,   1,   1, 149,   1, 103,   1, 149,   2,   2,   1,  72, 128,
       121, 130, 149,  49,   1,   3,   6,  47,  70, 130,   1, 110, 149,
       134, 149, 130, 149,  54, 149, 149, 117,   1,  45, 124, 123,   1,
       149, 127,   3,   2, 138, 149,  33,  12, 149,   1, 107, 14

In [21]:
# Pair each training column name with its Boruta rank.
combined_dict = dict(
    features=train_X.columns.tolist(),
    rankings=feat_select.ranking_,
)
feature_ranks = pd.DataFrame(combined_dict)

In [25]:
# Order the table from best (lowest) rank to worst and renumber the rows.
feature_ranks = feature_ranks.sort_values('rankings').reset_index(drop=True)

In [26]:
# Display the rank-sorted feature table.
feature_ranks

Unnamed: 0,features,rankings
0,Assay Id,1
1,NumValenceElectrons,1
2,fr_thiophene,1
3,fr_pyridine,1
4,fr_phenol,1
5,fr_para_hydroxylation,1
6,fr_nitro_arom_nonortho,1
7,HeavyAtomMolWt,1
8,ExactMolWt,1
9,fr_C_S,1


In [27]:
# Persist the rankings so the Boruta run does not have to be repeated.
feature_ranks.to_csv('feature_ranks.csv', index=False)

In [31]:
# Shape of the training matrix after the selector's transform is applied.
feat_select.transform(train_X.to_numpy()).shape

(75383, 42)

In [42]:
# Training columns whose Boruta rank is 1 or 2.
train_fs = train_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].isin([1, 2]), 'features']]

In [43]:
# Preview the selected training features.
train_fs.head()

Unnamed: 0,Assay Id,NumValenceElectrons,fr_thiophene,fr_pyridine,fr_phenol,fr_para_hydroxylation,fr_nitro_arom_nonortho,HeavyAtomMolWt,ExactMolWt,fr_C_S,...,NumAromaticRings,Chi0n,Chi3n,fr_urea,fr_Imine,NumAliphaticHeterocycles,SlogP_VSA10,fr_imide,NumSaturatedCarbocycles,fr_NH0
0,1644,100,0,0,2,0,0,306.511,315.982463,0,...,2,9.724473,2.78017,0,0,0,0.0,0,0,0
1,2451,66,0,0,0,0,0,136.109,156.151415,0,...,0,7.857996,1.882392,0,0,0,0.0,0,0,0
2,1384,148,0,0,0,0,0,313.702,361.347528,0,...,0,17.5531,5.23399,0,0,0,0.0,0,0,1
3,16,90,0,1,0,0,0,245.585,255.052302,0,...,1,8.836687,2.284387,0,1,1,0.0,0,0,4
4,1856,8,0,0,0,0,0,149.894,149.894242,0,...,0,1.377964,0.0,0,0,0,0.0,0,0,0


In [44]:
# Same rank-1/rank-2 column subset, taken from the test frame.
test_fs = test_df.loc[:, feature_ranks.loc[feature_ranks['rankings'].isin([1, 2]), 'features']]

In [45]:
# Preview the selected test features.
test_fs.head()

Unnamed: 0,Assay Id,NumValenceElectrons,fr_thiophene,fr_pyridine,fr_phenol,fr_para_hydroxylation,fr_nitro_arom_nonortho,HeavyAtomMolWt,ExactMolWt,fr_C_S,...,NumAromaticRings,Chi0n,Chi3n,fr_urea,fr_Imine,NumAliphaticHeterocycles,SlogP_VSA10,fr_imide,NumSaturatedCarbocycles,fr_NH0
0,1682,66,0,0,1,0,0,148.12,164.120115,0,...,1,8.179264,1.961247,0,0,0,0.0,0,0,0
1,1656,152,0,1,0,0,0,414.316,431.05694,0,...,2,15.032383,3.255699,1,0,0,10.742876,0,0,3
2,36,254,0,0,0,2,0,655.944,695.250845,0,...,4,27.132362,11.100291,0,0,3,8.78083,0,0,4
3,1850,56,0,0,0,0,0,197.212,200.94981,0,...,0,6.065805,0.765269,0,0,1,0.0,0,0,1
4,30,168,0,0,0,0,0,380.27,418.271924,0,...,0,19.09635,7.347585,0,0,1,0.0,0,0,0


In [58]:
# Stratified 75/25 hold-out split of the selected features (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train_fs,
    train_yT,
    stratify=train_yT,
    test_size=0.25,
    random_state=10,
)

In [59]:
# XGBoost settings that stay fixed across the hyper-parameter search.
standard_params = dict(
    n_jobs=-1,
    tree_method='hist',
    booster='gbtree',
    max_bin=256,
)

In [60]:
def objective(space):
    """Hyperopt objective for the XGBoost search.

    Builds an ``XGBClassifier`` from the sampled ``space`` merged with
    ``standard_params``, scores it with ``cross_val_score`` on
    ``X_train``/``y_train`` using the ``skf`` folds and F1, and returns the
    negated mean score so that minimising the loss maximises F1.

    Early stopping (10 rounds, "error" metric) watches the fixed hold-out
    ``X_val``/``y_val`` pair that is passed to every fold's ``fit`` call.

    Returns:
        dict with ``loss`` (negative mean F1) and ``status`` (``STATUS_OK``).
    """
    model = XGBClassifier(
        early_stopping_rounds=10,
        eval_metric="error",
        **space,
        **standard_params,
    )
    mean_f1 = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='f1',
        cv=skf,
        fit_params={'verbose': False, 'eval_set': [[X_val, y_val]]},
    ).mean()
    return {'status': STATUS_OK, 'loss': -mean_f1}

In [61]:
# Hyperopt search space sampled by `fmin` and passed to `objective`.
# Each entry's label matches its dict key, which is also the XGBClassifier
# keyword it is unpacked into.
hyper_space = {
    'n_estimators': hp.randint('n_estimators', 100, 1000),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.randint('max_depth', 3, 18),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.uniform ('gamma', 0.01,9),
    # The two regularisation terms below are currently excluded from the search.
#     'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
#     'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1)
}

In [62]:
# 100 TPE evaluations of `objective`; `trials` keeps the per-evaluation history.
trials = Trials()
best_params = fmin(
    objective,
    hyper_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

100%|██████████| 100/100 [13:56<00:00,  8.37s/trial, best loss: -0.9462928090428784]


In [63]:
# Best hyper-parameters found by the search.
best_params

{'colsample_bytree': 0.9106373868351865,
 'gamma': 0.7973219106319955,
 'learning_rate': 0.16858908958364727,
 'max_depth': 15,
 'min_child_weight': 7.0,
 'n_estimators': 835,
 'subsample': 0.9358000788419324}

In [70]:
def pred_and_sub(classifier, sub_no, sample_path="/kaggle/input/initial-data/test_II.csv"):
    """Fit `classifier` on the full training set and write a submission CSV.

    The model is trained on the notebook globals ``train_fs``/``train_yT``,
    predicts ``test_fs``, and the encoded predictions are mapped back to the
    original labels with ``lab.inverse_transform``. Both the encoded and the
    decoded predictions are printed.

    Args:
        classifier: estimator exposing ``fit(X=..., y=...)`` and ``predict``.
        sub_no: suffix of the output file, written as ``submission{sub_no}.csv``
            in the working directory.
        sample_path: CSV whose first column supplies the ``Id`` values, one
            row per test sample. Defaults to the competition file.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in the sample file.
    """
    classifier.fit(X=train_fs, y=train_yT)
    pred_vals = classifier.predict(test_fs)
    print(pred_vals)
    new_pred = lab.inverse_transform(pred_vals)
    print(new_pred)

    sample_df = pd.read_csv(sample_path)
    if len(new_pred) != len(sample_df):
        raise ValueError(
            f"Got {len(new_pred)} predictions for {len(sample_df)} rows in {sample_path}"
        )
    # Take only the first column as the Id so extra columns in the sample
    # file no longer break the two-column output.
    final_df = pd.DataFrame({'Id': sample_df.iloc[:, 0], 'Predicted': new_pred})
    final_df.to_csv(f'submission{sub_no}.csv', index=False)

In [65]:
# Model built from the tuned hyper-parameters plus the fixed settings.
# NOTE(review): unlike `objective`, no early stopping is configured here.
classifier1 = XGBClassifier(**best_params, **standard_params)

In [71]:
# Train classifier1 on all selected training rows and write submission40.csv.
pred_and_sub(classifier1, 40)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [82]:
# Tuned parameters with max_depth and n_estimators set by hand,
# then scored with 5-fold F1 on the selected features.
params = dict(
    colsample_bytree=0.9106373868351865,
    gamma=0.7973219106319955,
    learning_rate=0.16858908958364727,
    max_depth=12,
    min_child_weight=7.0,
    n_estimators=600,
    subsample=0.9358000788419324,
)
classifier2 = XGBClassifier(**standard_params, **params)
scores = cross_validate(classifier2, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9487614082616261


In [84]:
# Train classifier2 on all selected training rows and write submission43.csv.
pred_and_sub(classifier2, 43)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


# **Final Attempts**

In [9]:
# Used after the GPU session was switched off: reload previously saved
# feature rankings instead of re-running Boruta.
# NOTE(review): presumably this is the 'feature_ranks_v3.csv' written by the
# to_csv cell further down, re-uploaded as a Kaggle dataset; confirm.
feature_ranks_2 = pd.read_csv('/kaggle/input/features-ranks-3/feature_ranks_v3.csv')

In [9]:
# GPU-histogram XGBoost model used as the estimator for the second Boruta run.
# Note: the name `xgb_2` is reassigned later to an ensemble member.
xgb_2 = XGBClassifier(n_jobs = -1, tree_method='gpu_hist')

In [10]:
# Second Boruta run, this time driven by the GPU-backed estimator.
feat_select_2 = BorutaPy(
    estimator=xgb_2,
    n_estimators='auto',
    max_iter=100,
    random_state=10,
    verbose=2,
)
feat_select_2.fit(train_X.to_numpy(), train_yT)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	209
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	124
Rejected: 	85
Iteration: 	9 / 100
Confirmed: 	6
Tentative: 	118
Rejected: 	85
Iteration: 	10 / 100
Confirmed: 	6
Tentative: 	118
Rejected: 	85
Iteration: 	11 / 100
Confirmed: 	6
Tentative: 	118
Rejected: 	85
Iteration: 	12 / 100
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	13 / 100
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	14 / 100
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	15 / 100
Confirmed: 	11
Tentative: 	93
Rejected: 	105
Iteration: 	16 / 100
Confirmed: 	11
T

BorutaPy(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                 callbacks=None, colsample_bylevel=1,
                                 colsample_bynode=1, colsample_bytree=1,
                                 early_stopping_rounds=None,
                                 enable_categorical=False, eval_metric=None,
                                 gamma=0, gpu_id=0, grow_policy='depthwise',
                                 importance_type=None,
                                 interaction_constraints='',
                                 learning_rate=0.300000012, max_bin=256,
                                 max_cat_to_onehot=4, max_delta_step=0,
                                 max_depth=6, max_leaves=0, min_child_weight=1,
                                 missing=nan, monotone_constraints='()',
                                 n_estimators=208, n_jobs=-1,
                                 num_parallel_tree=1, predictor='auto',
                           

In [11]:
# Pair each training column name with its rank from the second Boruta run.
combined_dict_2 = dict(
    features=train_X.columns.tolist(),
    rankings=feat_select_2.ranking_,
)
feature_ranks_2 = pd.DataFrame(combined_dict_2)

In [12]:
# Persist the second set of rankings so they can be reloaded without re-running Boruta.
feature_ranks_2.to_csv('feature_ranks_v3.csv', index=False)

In [11]:
# Order the table from best (lowest) rank to worst and renumber the rows.
feature_ranks_2 = feature_ranks_2.sort_values('rankings').reset_index(drop=True)

In [12]:
# Display the rank-sorted table from the second Boruta run.
feature_ranks_2

Unnamed: 0,features,rankings
0,Assay Id,1
1,NumAliphaticCarbocycles,1
2,VSA_EState9,1
3,VSA_EState5,1
4,VSA_EState4,1
5,VSA_EState3,1
6,VSA_EState2,1
7,NumAromaticCarbocycles,1
8,VSA_EState10,1
9,SlogP_VSA2,1


In [16]:
# Keep every column ranked 100 or better, identically for train and test.
train_fs = train_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 100, 'features']]
test_fs = test_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 100, 'features']]

In [14]:
# Settings shared by every ensemble member below.
params = dict(tree_method='gpu_hist', n_jobs=-1)

In [20]:
# Six XGBoost variants on the shared `params`, combined by weighted soft
# voting (the second member counts double).
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(n_estimators=500, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_3 = XGBClassifier(n_estimators=450, max_depth=12, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_4 = XGBClassifier(n_estimators=650, max_depth=10, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_5 = XGBClassifier(n_estimators=500, max_depth=10, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_6 = XGBClassifier(n_estimators=800, colsample_bytree=0.7, learning_rate=0.13, **params)

classifier_list = [
    (f'xgb_{idx}', member)
    for idx, member in enumerate((xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6), start=1)
]
voting_clf = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 2, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [23]:
# Mean 5-fold F1 of the six-member ensemble.
scores = cross_validate(voting_clf, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9502591919339796


In [15]:
def pred_and_sub(classifier, sub_no, sample_path="/kaggle/input/initial-data/test_II.csv"):
    """Fit `classifier` on the full training set and write a submission CSV.

    The model is trained on the notebook globals ``train_fs``/``train_yT``,
    predicts ``test_fs``, and the encoded predictions are mapped back to the
    original labels with ``lab.inverse_transform``. Both the encoded and the
    decoded predictions are printed.

    Args:
        classifier: estimator exposing ``fit(X=..., y=...)`` and ``predict``.
        sub_no: suffix of the output file, written as ``submission{sub_no}.csv``
            in the working directory.
        sample_path: CSV whose first column supplies the ``Id`` values, one
            row per test sample. Defaults to the competition file.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in the sample file.
    """
    classifier.fit(X=train_fs, y=train_yT)
    pred_vals = classifier.predict(test_fs)
    print(pred_vals)
    new_pred = lab.inverse_transform(pred_vals)
    print(new_pred)

    sample_df = pd.read_csv(sample_path)
    if len(new_pred) != len(sample_df):
        raise ValueError(
            f"Got {len(new_pred)} predictions for {len(sample_df)} rows in {sample_path}"
        )
    # Take only the first column as the Id so extra columns in the sample
    # file no longer break the two-column output.
    final_df = pd.DataFrame({'Id': sample_df.iloc[:, 0], 'Predicted': new_pred})
    final_df.to_csv(f'submission{sub_no}.csv', index=False)

In [25]:
# Refit the six-member ensemble on all training rows and write submission52.csv.
pred_and_sub(voting_clf, 52)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [26]:
# Twelve XGBoost variants on the shared `params`; members 3 and 5 carry
# double weight in the soft vote.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(n_estimators=700, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_3 = XGBClassifier(n_estimators=450, max_depth=12, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_4 = XGBClassifier(n_estimators=800, max_depth=15, colsample_bytree=0.68, learning_rate=0.09, subsample=0.90, min_child_weight=0.0, gamma=1.3, **params)
xgb_5 = XGBClassifier(n_estimators=800, max_depth=12, colsample_bytree=0.68, learning_rate=0.09, subsample=0.90, min_child_weight=0.0, gamma=1.3, **params)
xgb_6 = XGBClassifier(n_estimators=700, max_depth=10, colsample_bytree=0.68, learning_rate=0.2, subsample=0.90, min_child_weight=0.0, gamma=0.3, **params)
xgb_7 = XGBClassifier(n_estimators=700, max_depth=15, colsample_bytree=0.53, learning_rate=0.14, subsample=0.99, min_child_weight=7.0, **params)
xgb_8 = XGBClassifier(n_estimators=600, max_depth=10, colsample_bytree=0.63, learning_rate=0.14, subsample=0.99, **params)
xgb_9 = XGBClassifier(n_estimators=700, max_depth=8, learning_rate=0.14, **params)
xgb_10 = XGBClassifier(n_estimators=600, max_depth=10, learning_rate=0.2, **params)
xgb_11 = XGBClassifier(n_estimators=700, max_depth=12, colsample_bytree=0.7, learning_rate=0.14, subsample=0.99, **params)
xgb_12 = XGBClassifier(n_estimators=650, max_depth=10, learning_rate=0.2, **params)

classifier_list = [
    (f'xgb_{idx}', member)
    for idx, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8, xgb_9, xgb_10, xgb_11, xgb_12),
        start=1,
    )
]
voting_clf_2 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [27]:
# Mean 5-fold F1 of the twelve-member ensemble.
scores = cross_validate(voting_clf_2, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

[Voting] .................... (2 of 6) Processing xgb_2, total=  25.3s
[Voting] .................... (1 of 6) Processing xgb_1, total=  26.1s
[Voting] .................... (4 of 6) Processing xgb_4, total= 1.8min
[Voting] .................... (3 of 6) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 6) Processing xgb_6, total=  33.4s
[Voting] .................... (5 of 6) Processing xgb_5, total= 1.1min
[Voting] .................... (2 of 6) Processing xgb_2, total=  20.3s
[Voting] .................... (1 of 6) Processing xgb_1, total=  21.0s
[Voting] .................... (4 of 6) Processing xgb_4, total= 1.8min
[Voting] .................... (3 of 6) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 6) Processing xgb_6, total=  32.9s
[Voting] .................... (5 of 6) Processing xgb_5, total= 1.0min
[Voting] .................... (1 of 6) Processing xgb_1, total=  12.0s
[Voting] .................... (3 of 6) Processing xgb_3, total= 1.1min
[Votin



[Voting] .................... (2 of 6) Processing xgb_2, total=  25.1s
[Voting] .................... (1 of 6) Processing xgb_1, total=  26.1s
[Voting] .................... (4 of 6) Processing xgb_4, total= 1.8min
[Voting] .................... (3 of 6) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 6) Processing xgb_6, total=  33.0s
[Voting] .................... (5 of 6) Processing xgb_5, total= 1.1min
[Voting] .................... (2 of 6) Processing xgb_2, total=  20.3s
[Voting] .................... (1 of 6) Processing xgb_1, total=  21.4s
[Voting] .................... (4 of 6) Processing xgb_4, total= 1.7min
[Voting] .................... (3 of 6) Processing xgb_3, total= 2.0min
[Voting] .................... (6 of 6) Processing xgb_6, total=  32.8s
[Voting] .................... (5 of 6) Processing xgb_5, total= 1.1min
[Voting] .................... (1 of 6) Processing xgb_1, total=   9.5s
[Voting] .................... (2 of 6) Processing xgb_2, total=   9.6s
[Votin

In [28]:
# Refit voting_clf_2 on all training rows and write submission53.csv.
pred_and_sub(voting_clf_2, 53)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
[Voting] ................... (2 of 12) Processing xgb_2, total=  17.8s
[Voting] ................... (4 of 12) Processing xgb_4, total=  33.6s
[Voting] ................... (5 of 12) Processing xgb_5, total=  26.0s
[Voting] ................... (7 of 12) Processing xgb_7, total= 1.2min
[Voting] ................. (10 of 12) Processing xgb_10, total=  52.5s
[Voting] ................. (12 of 12) Processing xgb_12, total=  56.1s
[Voting] ................... (1 of 12) Processing xgb_1, total=  25.2s
[Voting] ................... (2 of 12) Processing xgb_2, total=  32.5s
[Voting] ................... (4 of 12) Processing xgb_4, total=  57.2s
[Voting] ................... (5 of 12) Processing xgb_5, total=  45.4s
[Voting] ................... (3 of 12) Processing xgb_3, total= 2.0min
[Voting] ................... (6 of 12) Processing xgb_6, total=  54.7s
[Voting] ................... (8 of 12) Processing xgb_8, total= 1.5min
[Voting] ................... (7 of 12) Pr

In [30]:
# Single-model baseline scored with the same 5-fold F1 protocol.
xgb_x = XGBClassifier(n_estimators=700, max_depth=15, learning_rate=0.14, **params)
scores = cross_validate(xgb_x, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9477144129358237


In [31]:
# Eight XGBoost variants on the shared `params`; members 3 and 5 carry
# double weight in the soft vote.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(n_estimators=700, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_3 = XGBClassifier(n_estimators=650, max_depth=12, colsample_bytree=0.63, learning_rate=0.1, subsample=0.96, **params)
xgb_4 = XGBClassifier(n_estimators=800, max_depth=15, colsample_bytree=0.68, learning_rate=0.09, subsample=0.90, min_child_weight=0.0, gamma=1.3, **params)
xgb_5 = XGBClassifier(n_estimators=400, max_depth=8, learning_rate=0.2, **params)
xgb_6 = XGBClassifier(n_estimators=600, max_depth=10, colsample_bytree=0.68, learning_rate=0.1, min_child_weight=0.0, gamma=0.3, **params)
xgb_7 = XGBClassifier(n_estimators=700, max_depth=10, learning_rate=0.18, **params)
xgb_8 = XGBClassifier(n_estimators=600, max_depth=8, colsample_bytree=0.9, learning_rate=0.2, subsample=0.99, **params)

classifier_list = [
    (f'xgb_{idx}', member)
    for idx, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8),
        start=1,
    )
]
voting_clf_3 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 1, 2, 1, 2, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [34]:
# Mean 5-fold F1 of the eight-member ensemble.
scores = cross_validate(voting_clf_3, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9505393843204203


In [35]:
# Refit voting_clf_3 on all training rows and write submission54.csv.
pred_and_sub(voting_clf_3, 54)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
[Voting] .................... (1 of 8) Processing xgb_1, total=  26.7s
[Voting] .................... (2 of 8) Processing xgb_2, total=  31.6s
[Voting] .................... (4 of 8) Processing xgb_4, total=  57.4s
[Voting] .................... (5 of 8) Processing xgb_5, total=  40.2s
[Voting] .................... (6 of 8) Processing xgb_6, total=  44.9s
[Voting] .................... (3 of 8) Processing xgb_3, total= 2.9min
[Voting] .................... (8 of 8) Processing xgb_8, total=  56.7s
[Voting] .................... (7 of 8) Processing xgb_7, total= 1.6min
[Voting] .................... (1 of 8) Processing xgb_1, total=  21.9s
[Voting] .................... (2 of 8) Processing xgb_2, total=  27.5s
[Voting] .................... (4 of 8) Processing xgb_4, total=  58.1s
[Voting] .................... (5 of 8) Processing xgb_5, total=  39.1s
[Voting] .................... (6 of 8) Processing xgb_6, total=  45.1s
[Voting] .................... (3 of 8) Pr

# **After GPU closes**

In [13]:
# Keep only the columns ranked 1 or 2, identically for train and test.
train_fs = train_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 2, 'features']]
test_fs = test_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 2, 'features']]

In [16]:
# Ten XGBoost variants on the shared `params`, all weighted equally in the soft vote.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(n_estimators=700, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_3 = XGBClassifier(n_estimators=650, max_depth=12, colsample_bytree=0.63, learning_rate=0.1, subsample=0.96, **params)
xgb_4 = XGBClassifier(n_estimators=800, max_depth=15, colsample_bytree=0.68, learning_rate=0.09, subsample=0.90, min_child_weight=0.0, gamma=1.3, **params)
xgb_5 = XGBClassifier(n_estimators=400, max_depth=8, learning_rate=0.2, **params)
xgb_6 = XGBClassifier(n_estimators=600, max_depth=10, colsample_bytree=0.68, learning_rate=0.1, min_child_weight=0.0, gamma=0.3, **params)
xgb_7 = XGBClassifier(n_estimators=700, max_depth=10, learning_rate=0.18, **params)
xgb_8 = XGBClassifier(n_estimators=600, max_depth=8, colsample_bytree=0.9, learning_rate=0.2, subsample=0.99, **params)
xgb_9 = XGBClassifier(n_estimators=900, max_depth=8, learning_rate=0.2, **params)
xgb_10 = XGBClassifier(n_estimators=600, max_depth=12, learning_rate=0.2, gamma=0.01, **params)

classifier_list = [
    (f'xgb_{idx}', member)
    for idx, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8, xgb_9, xgb_10),
        start=1,
    )
]
voting_clf_4 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [17]:
# Mean 5-fold F1 of the equally weighted ten-member ensemble.
scores = cross_validate(voting_clf_4, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9501694538430012


In [18]:
# Refit voting_clf_4 on all training rows and write submission55.csv.
pred_and_sub(voting_clf_4, 55)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [19]:
# Keep every column ranked 49 or better, identically for train and test.
train_fs = train_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 49, 'features']]
test_fs = test_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 49, 'features']]

In [30]:
# Single-model check on the current feature subset (5-fold F1).
xgb_x = XGBClassifier(n_estimators=600, max_depth=12, learning_rate=0.2, gamma=0.01, **params)
scores = cross_validate(xgb_x, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9480898291638511


In [31]:
# Ten XGBoost variants on the shared `params`; members 5 and 6 carry
# double weight in the soft vote.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(n_estimators=700, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_3 = XGBClassifier(n_estimators=650, max_depth=12, colsample_bytree=0.63, learning_rate=0.1, subsample=0.96, **params)
xgb_4 = XGBClassifier(n_estimators=800, max_depth=15, colsample_bytree=0.68, learning_rate=0.09, subsample=0.90, min_child_weight=0.0, gamma=1.3, **params)
xgb_5 = XGBClassifier(n_estimators=400, max_depth=8, learning_rate=0.2, **params)
xgb_6 = XGBClassifier(n_estimators=600, max_depth=10, colsample_bytree=0.68, learning_rate=0.1, min_child_weight=0.0, gamma=0.3, **params)
xgb_7 = XGBClassifier(n_estimators=700, max_depth=10, learning_rate=0.18, **params)
xgb_8 = XGBClassifier(n_estimators=600, max_depth=8, colsample_bytree=0.9, learning_rate=0.2, subsample=0.99, **params)
xgb_9 = XGBClassifier(n_estimators=900, max_depth=8, learning_rate=0.2, **params)
xgb_10 = XGBClassifier(n_estimators=600, max_depth=12, learning_rate=0.2, gamma=0.01, **params)

classifier_list = [
    (f'xgb_{idx}', member)
    for idx, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8, xgb_9, xgb_10),
        start=1,
    )
]
voting_clf_5 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 1, 1, 1, 2, 2, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

In [32]:
# Mean 5-fold F1 of voting_clf_5 on the current feature subset.
scores = cross_validate(voting_clf_5, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

[Voting] ................... (1 of 10) Processing xgb_1, total=  23.1s
[Voting] ................... (2 of 10) Processing xgb_2, total=  28.9s
[Voting] ................... (4 of 10) Processing xgb_4, total=  41.0s
[Voting] ................... (5 of 10) Processing xgb_5, total=  29.4s
[Voting] ................... (6 of 10) Processing xgb_6, total=  31.7s
[Voting] ................... (3 of 10) Processing xgb_3, total= 2.3min
[Voting] ................... (8 of 10) Processing xgb_8, total=  42.8s
[Voting] ................... (7 of 10) Processing xgb_7, total= 1.4min
[Voting] ................... (9 of 10) Processing xgb_9, total= 1.0min
[Voting] ................. (10 of 10) Processing xgb_10, total=  59.6s
[Voting] ................... (1 of 10) Processing xgb_1, total=  15.4s
[Voting] ................... (2 of 10) Processing xgb_2, total=  20.6s
[Voting] ................... (4 of 10) Processing xgb_4, total=  40.5s
[Voting] ................... (5 of 10) Processing xgb_5, total=  28.9s
[Votin



[Voting] ................... (1 of 10) Processing xgb_1, total=  19.7s
[Voting] ................... (2 of 10) Processing xgb_2, total=  25.3s
[Voting] ................... (4 of 10) Processing xgb_4, total=  50.4s
[Voting] ................... (5 of 10) Processing xgb_5, total=  34.5s
[Voting] ................... (6 of 10) Processing xgb_6, total=  38.2s
[Voting] ................... (3 of 10) Processing xgb_3, total= 2.6min
[Voting] ................... (8 of 10) Processing xgb_8, total=  50.4s
[Voting] ................... (7 of 10) Processing xgb_7, total= 1.6min
[Voting] ................... (9 of 10) Processing xgb_9, total= 1.2min
[Voting] ................. (10 of 10) Processing xgb_10, total= 1.1min
[Voting] ................... (1 of 10) Processing xgb_1, total=  12.4s
[Voting] ................... (2 of 10) Processing xgb_2, total=  18.1s
[Voting] ................... (4 of 10) Processing xgb_4, total=  48.2s
[Voting] ................... (5 of 10) Processing xgb_5, total=  35.4s
[Votin

In [33]:
# Refit voting_clf_5 on all training rows and write submission56.csv.
pred_and_sub(voting_clf_5, 56)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [36]:
# Keep every column ranked 90 or better, identically for train and test.
train_fs = train_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 90, 'features']]
test_fs = test_df.loc[:, feature_ranks_2.loc[feature_ranks_2['rankings'] <= 90, 'features']]

In [39]:
# Twelve XGBoost variants on the shared `params`; members 5 and 6 carry
# double weight in the soft vote.
xgb_1 = XGBClassifier(n_estimators=500, **params)
xgb_2 = XGBClassifier(n_estimators=700, colsample_bytree=0.63, learning_rate=0.13, subsample=0.96, **params)
xgb_3 = XGBClassifier(n_estimators=650, max_depth=12, colsample_bytree=0.63, learning_rate=0.1, subsample=0.96, **params)
xgb_4 = XGBClassifier(n_estimators=800, max_depth=15, colsample_bytree=0.68, learning_rate=0.09, subsample=0.90, min_child_weight=0.0, gamma=1.3, **params)
xgb_5 = XGBClassifier(n_estimators=400, max_depth=8, learning_rate=0.2, **params)
xgb_6 = XGBClassifier(n_estimators=600, max_depth=10, colsample_bytree=0.68, learning_rate=0.1, min_child_weight=0.0, gamma=0.3, **params)
xgb_7 = XGBClassifier(n_estimators=700, max_depth=10, learning_rate=0.18, **params)
xgb_8 = XGBClassifier(n_estimators=600, max_depth=8, colsample_bytree=0.9, learning_rate=0.2, subsample=0.99, **params)
xgb_9 = XGBClassifier(n_estimators=900, max_depth=8, learning_rate=0.2, **params)
xgb_10 = XGBClassifier(n_estimators=600, max_depth=12, learning_rate=0.2, gamma=0.01, **params)
xgb_11 = XGBClassifier(n_estimators=600, **params)
xgb_12 = XGBClassifier(n_estimators=800, **params)
classifier_list = [
    (f'xgb_{idx}', member)
    for idx, member in enumerate(
        (xgb_1, xgb_2, xgb_3, xgb_4, xgb_5, xgb_6, xgb_7, xgb_8, xgb_9, xgb_10, xgb_11, xgb_12),
        start=1,
    )
]
voting_clf_6 = VotingClassifier(
    estimators=classifier_list,
    voting='soft',
    weights=[1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1],
    n_jobs=-1,
    verbose=2,
)

[Voting] ................... (1 of 10) Processing xgb_1, total=  22.3s
[Voting] ................... (2 of 10) Processing xgb_2, total=  28.9s
[Voting] ................... (4 of 10) Processing xgb_4, total=  49.2s
[Voting] ................... (5 of 10) Processing xgb_5, total=  34.8s
[Voting] ................... (6 of 10) Processing xgb_6, total=  40.4s
[Voting] ................... (3 of 10) Processing xgb_3, total= 2.6min
[Voting] ................... (8 of 10) Processing xgb_8, total=  50.3s
[Voting] ................... (7 of 10) Processing xgb_7, total= 1.6min
[Voting] ................... (9 of 10) Processing xgb_9, total= 1.2min
[Voting] ................. (10 of 10) Processing xgb_10, total= 1.0min
[Voting] ................... (1 of 10) Processing xgb_1, total=  10.1s
[Voting] ................... (3 of 10) Processing xgb_3, total= 1.4min
[Voting] ................... (8 of 10) Processing xgb_8, total=  27.9s
[Voting] ................... (9 of 10) Processing xgb_9, total=  40.4s
[Votin

In [40]:
# Mean 5-fold F1 of voting_clf_6 on the current feature subset.
scores = cross_validate(voting_clf_6, train_fs, train_yT, scoring='f1', cv=skf, n_jobs=-1)
print(scores['test_score'].mean())

0.9502696237532712


In [41]:
# Refit voting_clf_6 on all training rows and write submission57.csv.
pred_and_sub(voting_clf_6, 57)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
[Voting] ................... (1 of 12) Processing xgb_1, total=  27.3s
[Voting] ................... (2 of 12) Processing xgb_2, total=  31.7s
[Voting] ................... (4 of 12) Processing xgb_4, total=  56.1s
[Voting] ................... (5 of 12) Processing xgb_5, total=  39.1s
[Voting] ................... (6 of 12) Processing xgb_6, total=  41.2s
[Voting] ................... (3 of 12) Processing xgb_3, total= 2.7min
[Voting] ................... (8 of 12) Processing xgb_8, total=  55.9s
[Voting] ................... (7 of 12) Processing xgb_7, total= 1.8min
[Voting] ................... (9 of 12) Processing xgb_9, total= 1.3min
[Voting] ................. (11 of 12) Processing xgb_11, total=  27.8s
[Voting] ................. (10 of 12) Processing xgb_10, total= 1.5min
[Voting] ................. (12 of 12) Processing xgb_12, total=  25.1s
[Voting] ................... (1 of 12) Processing xgb_1, total=  19.6s
[Voting] ................... (2 of 12) Pr