In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, DataStructs, MACCSkeys
import pandas as pd
from pathlib import Path
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt
from hyperopt import fmin, STATUS_OK, tpe, Trials, hp, space_eval

In [13]:
# Dataset directory, resolved relative to the current working directory.
base_path = Path.cwd() / 'content/datasets/'
# Training table (features plus the 'Expected' target) and the test feature table.
train_df = pd.read_csv(base_path / 'train_molecular_data.csv')
test_df = pd.read_csv(base_path / 'test_molecular_data.csv')

In [19]:
# Separate the target column from the feature columns.
train_y = train_df['Expected']
train_X = train_df.drop(columns=['Expected'])

In [20]:
# Shared 5-fold stratified splitter; shuffling is seeded so the folds are repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=10,
)

In [21]:
# Directory that receives the generated submission CSV files.
submission_path = Path().cwd().joinpath('content/submissions/')
# DataFrame.to_csv does not create missing parent directories, so make sure the
# folder exists before any later cell writes into it (no-op when already present).
submission_path.mkdir(parents=True, exist_ok=True)

In [22]:
# Encode the target as integer class codes; `lab` is kept so predictions can be
# decoded back to the original labels later.
lab = LabelEncoder()
train_yT = lab.fit_transform(train_y)
# Stratified 75/25 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train_X,
    train_yT,
    test_size=0.25,
    stratify=train_yT,
    random_state=10,
)

In [33]:
# XGBoost settings shared by every classifier built in this notebook;
# they are not part of the hyperparameter search.
standard_params = dict(
    n_jobs=4,
    tree_method='hist',
    booster='gbtree',
    max_bin=256,
)

In [34]:
def objective(space):
    """Hyperopt objective: cross-validated F1 of an XGBoost classifier.

    Parameters
    ----------
    space : dict
        One sampled hyperparameter assignment; it is unpacked into
        ``XGBClassifier`` together with ``standard_params``.

    Returns
    -------
    dict
        ``{'loss': -mean_f1, 'status': STATUS_OK}``. The mean F1 over the
        ``skf`` folds of ``X_train``/``y_train`` is negated so that a lower
        loss corresponds to a higher F1.

    Notes
    -----
    Every fold's fit receives the same hold-out pair ``X_val``/``y_val`` as
    ``eval_set``; with ``early_stopping_rounds=10`` that is the data the
    "error" metric is monitored on.
    """
    model = XGBClassifier(
        **space,
        **standard_params,
        eval_metric="error",
        early_stopping_rounds=10,
    )

    # NOTE(review): newer scikit-learn releases rename `fit_params` to `params`;
    # confirm against the installed version before upgrading.
    fold_scores = cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        cv=skf,
        fit_params={'verbose': False, 'eval_set': [[X_val, y_val]]},
        scoring='f1',
    )

    return {'loss': -fold_scores.mean(), 'status': STATUS_OK}
    

In [35]:
# Search space explored by hyperopt; each key is passed to XGBClassifier by name.
hyper_space = dict(
    n_estimators=hp.randint('n_estimators', 100, 1000),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    max_depth=hp.randint('max_depth', 3, 18),
    subsample=hp.uniform('subsample', 0.5, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    gamma=hp.uniform('gamma', 0.01, 9),
    # Regularisation terms are currently left out of the search:
    #     'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
    #     'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    # quniform with q=1 yields whole-number floats (e.g. 7.0).
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
)

In [36]:
# Fresh trial log for this search run.
trials = Trials()
# Run 100 TPE evaluations of `objective` over `hyper_space`.
# The search is seeded (same seed value as the CV splitter and the hold-out
# split) so that a Restart & Run All samples the same sequence of trials.
# NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator for `rstate`;
# older releases expect np.random.RandomState — confirm the installed version.
best_params = fmin(
    fn=objective,
    space=hyper_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=np.random.default_rng(10),
)

100%|██████| 100/100 [17:29<00:00, 10.50s/trial, best loss: -0.9468525595651057]


In [37]:
# Display the best hyperparameter assignment returned by fmin.
best_params

{'colsample_bytree': 0.6317150680991094,
 'gamma': 0.01504712763596594,
 'learning_rate': 0.13102033630514603,
 'max_depth': 16,
 'min_child_weight': 7.0,
 'n_estimators': 946,
 'subsample': 0.9619442565997574}

In [38]:
# Build a classifier from the searched hyperparameters plus the shared settings,
# then fit it on the complete training set (no hold-out).
classifier = XGBClassifier(**best_params, **standard_params)
classifier.fit(train_X, train_yT)

In [39]:
# Predict encoded class codes for the test features and echo them.
pred_vals = classifier.predict(test_df)
print(pred_vals)
# Decode the integer codes back to the original label values.
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
# test_II.csv must contribute exactly one column: after 'Predicted' is added,
# the frame is relabelled with the two names below.
final_df = pd.read_csv(base_path / 'test_II.csv')
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path / 'submission30.csv', index=False)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [42]:
def pred_and_sub(classifier, sub_no):
    """Fit `classifier` on the full training set and write a submission CSV.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit`` and ``predict``; it is fitted in place on
        ``train_X`` / ``train_yT``.
    sub_no
        Value interpolated into the output file name ``submission{sub_no}.csv``.

    Side effects
    ------------
    Prints the encoded and the decoded predictions for ``test_df`` and writes
    the submission file under ``submission_path``. Returns nothing.
    """
    classifier.fit(X=train_X, y=train_yT)

    encoded = classifier.predict(test_df)
    print(encoded)

    # Map integer class codes back to the original labels.
    decoded = lab.inverse_transform(encoded)
    print(decoded)

    # test_II.csv must contribute exactly one column: after 'Predicted' is
    # added, the frame is relabelled with the two names below.
    submission = pd.read_csv(base_path / 'test_II.csv')
    submission['Predicted'] = decoded
    submission.columns = ['Id', 'Predicted']

    out_file = submission_path / f'submission{sub_no}.csv'
    submission.to_csv(out_file, index=False)

In [41]:
# Variant of the tuned best_params with n_estimators lowered from 946 to 600.
params = dict(
    colsample_bytree=0.6317150680991094,
    gamma=0.01504712763596594,
    learning_rate=0.13102033630514603,
    max_depth=16,
    min_child_weight=7.0,
    n_estimators=600,
    subsample=0.9619442565997574,
)
classifier2 = XGBClassifier(**params, **standard_params)
# Mean F1 across the shared stratified folds of the full training set.
scores = cross_validate(
    estimator=classifier2,
    X=train_X,
    y=train_yT,
    cv=skf,
    scoring='f1',
    n_jobs=4,
)
print(scores['test_score'].mean())

0.9478050551602809


In [43]:
# Refit classifier2 on all training data and write submission31.csv.
pred_and_sub(classifier=classifier2, sub_no=31)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [44]:
# Variant of the tuned best_params with max_depth=12 and n_estimators=600.
params = dict(
    colsample_bytree=0.6317150680991094,
    gamma=0.01504712763596594,
    learning_rate=0.13102033630514603,
    max_depth=12,
    min_child_weight=7.0,
    n_estimators=600,
    subsample=0.9619442565997574,
)
classifier3 = XGBClassifier(**params, **standard_params)
# Mean F1 across the shared stratified folds of the full training set.
scores = cross_validate(
    estimator=classifier3,
    X=train_X,
    y=train_yT,
    cv=skf,
    scoring='f1',
    n_jobs=4,
)
print(scores['test_score'].mean())

0.9484920955965365


In [45]:
# Refit classifier3 on all training data and write submission32.csv.
pred_and_sub(classifier=classifier3, sub_no=32)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [46]:
# Variant of the tuned best_params with max_depth=10 and n_estimators=600.
params = dict(
    colsample_bytree=0.6317150680991094,
    gamma=0.01504712763596594,
    learning_rate=0.13102033630514603,
    max_depth=10,
    min_child_weight=7.0,
    n_estimators=600,
    subsample=0.9619442565997574,
)
classifier4 = XGBClassifier(**params, **standard_params)
# Mean F1 across the shared stratified folds of the full training set.
scores = cross_validate(
    estimator=classifier4,
    X=train_X,
    y=train_yT,
    cv=skf,
    scoring='f1',
    n_jobs=4,
)
print(scores['test_score'].mean())

0.9491106314801445


In [47]:
# Refit classifier4 on all training data and write submission33.csv.
pred_and_sub(classifier=classifier4, sub_no=33)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [48]:
# Variant of the tuned best_params with max_depth=10 and n_estimators=650.
params = dict(
    colsample_bytree=0.6317150680991094,
    gamma=0.01504712763596594,
    learning_rate=0.13102033630514603,
    max_depth=10,
    min_child_weight=7.0,
    n_estimators=650,
    subsample=0.9619442565997574,
)
classifier5 = XGBClassifier(**params, **standard_params)
# Mean F1 across the shared stratified folds of the full training set.
scores = cross_validate(
    estimator=classifier5,
    X=train_X,
    y=train_yT,
    cv=skf,
    scoring='f1',
    n_jobs=4,
)
print(scores['test_score'].mean())

0.948928407677293


In [49]:
# Refit classifier5 on all training data and write submission34.csv.
pred_and_sub(classifier=classifier5, sub_no=34)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [59]:
# Variant of the tuned best_params with n_estimators lowered from 946 to 700.
params = dict(
    colsample_bytree=0.6317150680991094,
    gamma=0.01504712763596594,
    learning_rate=0.13102033630514603,
    max_depth=16,
    min_child_weight=7.0,
    n_estimators=700,
    subsample=0.9619442565997574,
)
classifier6 = XGBClassifier(**params, **standard_params)
# Mean F1 across the shared stratified folds of the full training set.
scores = cross_validate(
    estimator=classifier6,
    X=train_X,
    y=train_yT,
    cv=skf,
    scoring='f1',
    n_jobs=4,
)
print(scores['test_score'].mean())

0.9475872633906605


In [56]:
# Refit classifier6 on all training data and write submission35.csv.
# NOTE(review): this cell's execution count (56) is lower than that of the cell
# defining classifier6 (59), so the saved submission35.csv may come from an
# earlier classifier6 — rerun top to bottom to confirm.
pred_and_sub(classifier=classifier6, sub_no=35)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [60]:
# Variant of the tuned best_params with a lower learning_rate (~0.111) and
# n_estimators=600.
params = dict(
    colsample_bytree=0.6317150680991094,
    gamma=0.01504712763596594,
    learning_rate=0.11102033630514603,
    max_depth=16,
    min_child_weight=7.0,
    n_estimators=600,
    subsample=0.9619442565997574,
)
classifier7 = XGBClassifier(**params, **standard_params)
# Mean F1 across the shared stratified folds of the full training set.
scores = cross_validate(
    estimator=classifier7,
    X=train_X,
    y=train_yT,
    cv=skf,
    scoring='f1',
    n_jobs=4,
)
print(scores['test_score'].mean())

0.9482267522828923


In [61]:
# Refit classifier7 on all training data and write submission36.csv.
pred_and_sub(classifier=classifier7, sub_no=36)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [62]:
# Variant of the tuned best_params with learning_rate ~0.101, max_depth=12 and
# n_estimators=660.
params = dict(
    colsample_bytree=0.6317150680991094,
    gamma=0.01504712763596594,
    learning_rate=0.10102033630514603,
    max_depth=12,
    min_child_weight=7.0,
    n_estimators=660,
    subsample=0.9619442565997574,
)
classifier8 = XGBClassifier(**params, **standard_params)
# Mean F1 across the shared stratified folds of the full training set.
scores = cross_validate(
    estimator=classifier8,
    X=train_X,
    y=train_yT,
    cv=skf,
    scoring='f1',
    n_jobs=4,
)
print(scores['test_score'].mean())

0.9488939346948907


In [63]:
# Refit classifier8 on all training data and write submission37.csv.
pred_and_sub(classifier=classifier8, sub_no=37)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]
