In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, DataStructs, MACCSkeys
import pandas as pd
from pathlib import Path
import numpy as np
from xgboost import XGBClassifier, plot_importance
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import matplotlib.pyplot as plt
from hyperopt import fmin, STATUS_OK, tpe, Trials, hp, space_eval

In [2]:
base_path = Path().cwd().joinpath('content/datasets/')
train_df = pd.read_csv(base_path.joinpath('train_II.csv'))
test_df = pd.read_csv(base_path.joinpath('test_II.csv'))

In [3]:
train_df[["Chemical Id", "Assay Id"]] = train_df.Id.str.split(
    ";", expand=True
)
train_df.drop("Id", axis=1, inplace=True)
train_df = train_df[["Assay Id", "Chemical Id", "Expected"]]

test_df[["Chemical Id", "Assay Id"]] = test_df.x.str.split(";", expand=True)
test_df.drop("x", axis=1, inplace=True)
test_df = test_df[["Assay Id", "Chemical Id"]]

In [4]:
def generate_macckey_fingerprints(data_frame, is_train=True):
    
    data_frame['molecules'] = data_frame['Chemical Id'].apply(lambda smile: Chem.MolFromSmiles(smile))
    
    if is_train:
        data_frame.dropna(axis=0, inplace=True)
        data_frame.reset_index(drop=True, inplace=True)
    
    macc_keys = np.array([np.array(MACCSkeys.GenMACCSKeys(mol)) for mol in tqdm(data_frame['molecules'])])
    
    COLUMNS = [f'FP{i}' for i in range(0, 167)]
    
    data_frame.drop(['Chemical Id', 'molecules'], axis=1, inplace=True)
    
    macc_keys_df = pd.DataFrame(data=macc_keys, columns=COLUMNS)
    
    return pd.concat([data_frame, macc_keys_df], axis=1)

In [5]:
train_fp = generate_macckey_fingerprints(train_df)

[16:20:20] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:20:21] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:20:22] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:20:22] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:20:22] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:20:23] Explicit valence for atom # 1 Si, 8, is greater than permitted
100%|███████████████████████████████████| 75377/75377 [00:54<00:00, 1386.89it/s]


In [6]:
train_fp.head()

Unnamed: 0,Assay Id,Expected,FP0,FP1,FP2,FP3,FP4,FP5,FP6,FP7,...,FP157,FP158,FP159,FP160,FP161,FP162,FP163,FP164,FP165,FP166
0,1644,2,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
1,2451,2,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,1384,2,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,0,0,0,1
3,16,2,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
4,1856,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
train_fp['Assay Id'] = pd.to_numeric(train_fp['Assay Id'])
train_fp.to_csv('train_macc_keys', index=False)

In [8]:
test_fp = generate_macckey_fingerprints(test_df, is_train=False)

100%|███████████████████████████████████| 10994/10994 [00:07<00:00, 1391.34it/s]


In [9]:
test_fp.head()

Unnamed: 0,Assay Id,FP0,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,...,FP157,FP158,FP159,FP160,FP161,FP162,FP163,FP164,FP165,FP166
0,1682,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,1,1,1,0
1,1656,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
2,36,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
3,1850,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,1
4,30,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,1,1,1,0


In [10]:
test_fp['Assay Id'] = pd.to_numeric(test_fp['Assay Id'])
test_fp.to_csv('test_macc_keys', index=False)

In [11]:
train_y = train_fp.loc[:, 'Expected']
train_X = train_fp.drop(['Expected'], axis=1)

In [12]:
skf = StratifiedKFold(random_state=10, n_splits=5, shuffle=True)

In [27]:
xgb = XGBClassifier(tree_method='hist', max_bin=256, n_estimators=650, learning_rate=0.1, objective='binary:logistic', max_depth=8, subsample=0.9)
lab = LabelEncoder()
y_data = lab.fit_transform(y_train)
scores = cross_validate(estimator=xgb, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=4)
print(np.mean(scores['test_score']))

0.9474935932738584


In [13]:
submission_path = Path().cwd().joinpath('content/submissions/')

In [30]:
model = xgb.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission24.csv'), index=False)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [33]:
xgb2 = XGBClassifier(tree_method='hist', max_bin=256, n_estimators=650, learning_rate=0.2, objective='binary:logistic', max_depth=9, gamma=0.01)
lab = LabelEncoder()
y_data = lab.fit_transform(y_train)
scores = cross_validate(estimator=xgb2, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=4)
print(np.mean(scores['test_score']))

0.9458057289065025


In [34]:
model = xgb2.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission25.csv'), index=False)

[1 1 1 ... 1 0 1]
[2 2 2 ... 2 1 2]


In [35]:
xgb3 = XGBClassifier(tree_method='hist', max_bin=256, n_estimators=850, learning_rate=0.2, objective='binary:logistic', max_depth=9, gamma=0.01)
lab = LabelEncoder()
y_data = lab.fit_transform(y_train)
scores = cross_validate(estimator=xgb3, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=4)
print(np.mean(scores['test_score']))

0.9457993503455786


In [37]:
model = xgb3.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission26.csv'), index=False)

[1 1 1 ... 1 0 1]
[2 2 2 ... 2 1 2]


In [38]:
xgb4 = XGBClassifier(tree_method='hist', max_bin=256, n_estimators=450, learning_rate=0.2, objective='binary:logistic', max_depth=8, gamma=0.01)
lab = LabelEncoder()
y_data = lab.fit_transform(y_train)
scores = cross_validate(estimator=xgb4, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=4)
print(np.mean(scores['test_score']))

0.9465642785956575


In [39]:
model = xgb4.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission27.csv'), index=False)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [14]:
space = {
    'n_estimators': hp.randint('n_estimators', 100, 1000),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.randint('max_depth', 3, 18),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
}

In [15]:
lab = LabelEncoder()
train_yT = lab.fit_transform(train_y)
X_train, X_val, y_train, y_val = train_test_split(train_X, train_yT, test_size=0.25, random_state=10, stratify=train_yT)

In [20]:
def objective(space):
    xgb_cl = XGBClassifier(**space, eval_metric="error",
                           early_stopping_rounds=10,
                            objective='binary:logistic', n_jobs=4, tree_method='hist')
    
    fit_params={'verbose': False, 'eval_set': [[X_val, y_val]]}
    
    score = cross_val_score(estimator=xgb_cl, X=X_train, y=y_train, cv=skf, fit_params=fit_params, scoring='f1').mean()

    return {'loss': -score, 'status':STATUS_OK}
    

In [21]:
trials = Trials()
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

100%|██████| 100/100 [12:16<00:00,  7.36s/trial, best loss: -0.9445503235639684]


In [22]:
best_params

{'colsample_bytree': 0.6579661398347049,
 'learning_rate': 0.17392073226596919,
 'max_depth': 12,
 'n_estimators': 258,
 'subsample': 0.9841888805154393}

In [23]:
space_eval(space, best_params)

{'colsample_bytree': 0.6579661398347049,
 'learning_rate': 0.17392073226596919,
 'max_depth': 12,
 'n_estimators': 258,
 'subsample': 0.9841888805154393}

In [31]:
eval_set = [(X_val, y_val)]
classifier = XGBClassifier(**best_params, booster="gbtree", n_jobs=4, tree_method='hist')
classifier.fit(X=train_X, y=train_yT)

In [26]:
classifier.best_iteration

70

In [28]:
f1_score(y_val, classifier.predict(X_val))

0.9460575908337102

In [32]:
pred_vals = classifier.predict(test_fp)
print(pred_vals)
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission28.csv'), index=False)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [33]:
classifier2 = XGBClassifier(**best_params, booster="gbtree", n_jobs=4, tree_method='hist', eval_metric="error", early_stopping_rounds=10)
classifier2.fit(X=X_train, y=y_train, eval_set=[(X_val, y_val)])

[0]	validation_0-error:0.12306
[1]	validation_0-error:0.11679
[2]	validation_0-error:0.11754
[3]	validation_0-error:0.11441
[4]	validation_0-error:0.11244
[5]	validation_0-error:0.11350
[6]	validation_0-error:0.11181
[7]	validation_0-error:0.10921
[8]	validation_0-error:0.11043
[9]	validation_0-error:0.10915
[10]	validation_0-error:0.10740
[11]	validation_0-error:0.10687
[12]	validation_0-error:0.10586
[13]	validation_0-error:0.10496
[14]	validation_0-error:0.10517
[15]	validation_0-error:0.10486
[16]	validation_0-error:0.10432
[17]	validation_0-error:0.10390
[18]	validation_0-error:0.10358
[19]	validation_0-error:0.10257
[20]	validation_0-error:0.10252
[21]	validation_0-error:0.10199
[22]	validation_0-error:0.10231
[23]	validation_0-error:0.10183
[24]	validation_0-error:0.10167
[25]	validation_0-error:0.10119
[26]	validation_0-error:0.10125
[27]	validation_0-error:0.10125
[28]	validation_0-error:0.10098
[29]	validation_0-error:0.10098
[30]	validation_0-error:0.10072
[31]	validation_0-

In [38]:
classifier2.best_iteration

65

In [42]:
classifier3 = XGBClassifier(**best_params, booster="gbtree", n_jobs=4, tree_method='hist', eval_metric="error", early_stopping_rounds=20)
classifier3.fit(X=X_train, y=y_train, eval_set=[(X_val, y_val)])

[0]	validation_0-error:0.12306
[1]	validation_0-error:0.11679
[2]	validation_0-error:0.11754
[3]	validation_0-error:0.11441
[4]	validation_0-error:0.11244
[5]	validation_0-error:0.11350
[6]	validation_0-error:0.11181
[7]	validation_0-error:0.10921
[8]	validation_0-error:0.11043
[9]	validation_0-error:0.10915
[10]	validation_0-error:0.10740
[11]	validation_0-error:0.10687
[12]	validation_0-error:0.10586
[13]	validation_0-error:0.10496
[14]	validation_0-error:0.10517
[15]	validation_0-error:0.10486
[16]	validation_0-error:0.10432
[17]	validation_0-error:0.10390
[18]	validation_0-error:0.10358
[19]	validation_0-error:0.10257
[20]	validation_0-error:0.10252
[21]	validation_0-error:0.10199
[22]	validation_0-error:0.10231
[23]	validation_0-error:0.10183
[24]	validation_0-error:0.10167
[25]	validation_0-error:0.10119
[26]	validation_0-error:0.10125
[27]	validation_0-error:0.10125
[28]	validation_0-error:0.10098
[29]	validation_0-error:0.10098
[30]	validation_0-error:0.10072
[31]	validation_0-

In [43]:
classifier3.best_iteration

65

In [44]:
f1_score(y_val, classifier3.predict(X_val))

0.9456210646823126

In [47]:
pred_vals = classifier3.predict(test_fp)
print(pred_vals)
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = new_pred
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission29.csv'), index=False)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
