In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, DataStructs
import pandas as pd
from pathlib import Path
import numpy as np
from xgboost import XGBClassifier, plot_importance
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_validate
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# Dataset directory, resolved relative to the current working directory.
base_path = Path.cwd() / 'content' / 'datasets'
# Load the raw train and test tables (train first, then test).
train_df, test_df = (
    pd.read_csv(base_path / file_name) for file_name in ('train_II.csv', 'test_II.csv')
)

In [3]:
# The raw "Id" field packs two values separated by ";": the first part becomes
# "Chemical Id" and the second part becomes "Assay Id".
train_df[["Chemical Id", "Assay Id"]] = train_df["Id"].str.split(";", expand=True)
# Discard the packed field and keep only the columns used downstream, in this order.
train_df = train_df.drop(columns="Id")[["Assay Id", "Chemical Id", "Expected"]]

In [10]:
# Preview the first rows of the reshaped training frame.
train_df.head()

Unnamed: 0,Assay Id,Chemical Id,Expected
0,1644,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O,2
1,2451,CCCCCCCCC(=O)C,2
2,1384,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],2
3,16,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl,2
4,1856,[Na+].[I-],2


In [4]:
def generate_morgan_fingerprints(data_frame, is_train = True, radius = 2, n_bits = 1024):
    """Replace the ``'Chemical Id'`` SMILES column with Morgan fingerprint bit columns.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a ``'Chemical Id'`` column holding SMILES strings. The
        frame passed in is left unmodified, so the calling cell can be re-run.
    is_train : bool, default True
        When True, rows that have a missing value in any column, or whose
        SMILES RDKit cannot parse, are dropped and the index is reset.
        When False every row is kept so the output stays aligned with the
        input; rows whose SMILES cannot be parsed get an all-zero fingerprint
        and a ``RuntimeWarning`` reports how many there were.
    radius : int, default 2
        Morgan fingerprint radius passed to RDKit.
    n_bits : int, default 1024
        Length of the folded bit vector, i.e. the number of ``FP*`` columns.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns (``'Chemical Id'`` removed) followed by
        the integer columns ``FP0`` ... ``FP{n_bits - 1}``.
    """
    import warnings  # local import keeps this cell self-contained

    # MolFromSmiles returns None when it cannot parse a string; non-string
    # entries (e.g. NaN) are treated the same way instead of raising.
    molecules = [
        Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        for smiles in data_frame['Chemical Id']
    ]

    # `drop` returns a new frame, so the caller's data is never touched.
    frame = data_frame.drop(columns=['Chemical Id'])

    if is_train:
        # Drop rows with any missing value or without a parsed molecule
        # (e.g. the silicon compounds RDKit rejects for their valence).
        parsed = np.array([mol is not None for mol in molecules], dtype=bool)
        keep = frame.notna().all(axis=1).to_numpy() & parsed
        frame = frame.loc[keep].reset_index(drop=True)
        molecules = [mol for mol, kept in zip(molecules, keep) if kept]

    # One row per molecule; rows of unparseable molecules stay all zero.
    # NOTE(review): recent RDKit releases deprecate GetMorganFingerprintAsBitVect
    # in favour of rdFingerprintGenerator; consider migrating once the minimum
    # supported RDKit version is settled.
    bit_matrix = np.zeros((len(molecules), n_bits), dtype=np.int64)
    unparsed = 0
    for row, mol in enumerate(tqdm(molecules)):
        if mol is None:
            unparsed += 1
            continue
        bit_matrix[row] = np.array(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        )

    if unparsed:
        warnings.warn(
            f"{unparsed} SMILES string(s) could not be parsed by RDKit; "
            "their fingerprint rows are all zeros.",
            RuntimeWarning,
        )

    columns = [f'FP{i}' for i in range(n_bits)]
    # Reuse the frame's index so the column-wise concat lines rows up even
    # when the input does not carry a default RangeIndex.
    finger_prints = pd.DataFrame(bit_matrix, columns=columns, index=frame.index)

    morgan_fp_data = pd.concat([frame, finger_prints], axis=1)

    return morgan_fp_data

In [4]:
# Featurise the training set. `is_train` defaults to True, so rows whose SMILES
# RDKit cannot parse are dropped from the result.

train_fp = generate_morgan_fingerprints(train_df)

[16:47:21] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:47:21] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:47:22] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:47:22] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:47:23] Explicit valence for atom # 1 Si, 8, is greater than permitted
[16:47:23] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [7]:
# Persist the training features as CSV in the working directory (row index omitted).
train_fp.to_csv('train_morgan_fp.csv', index=False)

In [8]:
# The raw "x" field packs two values separated by ";": the first part becomes
# "Chemical Id" and the second part becomes "Assay Id".
test_df[["Chemical Id", "Assay Id"]] = test_df["x"].str.split(";", expand=True)
# Discard the packed field and keep only the columns used downstream, in this order.
test_df = test_df.drop(columns="x")[["Assay Id", "Chemical Id"]]

In [9]:
# Preview the first rows of the reshaped test frame.
test_df.head()

Unnamed: 0,Assay Id,Chemical Id
0,1682,CC1=CC(=C(C=C1)C(C)(C)C)O
1,1656,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,36,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,1850,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+]
4,30,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [10]:
# Featurise the test set; is_train=False so that no rows are dropped.
test_fp = generate_morgan_fingerprints(test_df, is_train=False)

100%|████████████████████████████████████| 10994/10994 [00:12<00:00, 859.40it/s]


In [12]:
# Preview the first rows of the test feature matrix.
test_fp.head()

Unnamed: 0,Assay Id,FP0,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,FP10,FP11,FP12,FP13,FP14,FP15,FP16,FP17,FP18,FP19,FP20,FP21,FP22,FP23,FP24,FP25,FP26,FP27,FP28,FP29,FP30,FP31,FP32,FP33,FP34,FP35,FP36,FP37,FP38,...,FP984,FP985,FP986,FP987,FP988,FP989,FP990,FP991,FP992,FP993,FP994,FP995,FP996,FP997,FP998,FP999,FP1000,FP1001,FP1002,FP1003,FP1004,FP1005,FP1006,FP1007,FP1008,FP1009,FP1010,FP1011,FP1012,FP1013,FP1014,FP1015,FP1016,FP1017,FP1018,FP1019,FP1020,FP1021,FP1022,FP1023
0,1682,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1656,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,1850,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,30,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0


In [13]:
# Count missing values per column of the test features.
test_fp.isna().sum()

Assay Id    0
FP0         0
FP1         0
FP2         0
FP3         0
           ..
FP1019      0
FP1020      0
FP1021      0
FP1022      0
FP1023      0
Length: 1025, dtype: int64

In [14]:
# Persist the test features as CSV in the working directory (row index omitted).
test_fp.to_csv('test_morgan_fp.csv', index=False)

In [15]:
# "Assay Id" was produced by a string split, so convert it to a numeric column
# before it is handed to the model.
test_fp['Assay Id'] = pd.to_numeric(test_fp['Assay Id'])

In [16]:
# Target vector: the "Expected" column.
y_train = train_fp['Expected']
# Feature matrix: every other column, with "Assay Id" converted from text to numbers.
X_train = train_fp.drop(columns=['Expected'])
X_train['Assay Id'] = pd.to_numeric(X_train['Assay Id'])

In [17]:
# Inspect the dtype of every feature column.
X_train.dtypes

Assay Id    int64
FP0         int64
FP1         int64
FP2         int64
FP3         int64
            ...  
FP1019      int64
FP1020      int64
FP1021      int64
FP1022      int64
FP1023      int64
Length: 1025, dtype: object

In [18]:
# Shared 5-fold stratified splitter; shuffling is seeded so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)

In [67]:
# Baseline: histogram-based XGBoost with only the listed settings overridden.
xgb2 = XGBClassifier(
    tree_method='hist',
    max_bin=256,
    n_estimators=800,
    learning_rate=0.2,
)
# Encode the raw class labels as integers for XGBoost.
lab = LabelEncoder().fit(y_train)
y_data = lab.transform(y_train)
# Mean F1 over the folds produced by `skf`.
# NOTE(review): scoring='f1' treats encoded label 1 as the positive class;
# confirm that this is the class the target metric is meant to score.
scores = cross_validate(xgb2, X_train, y_data, scoring='f1', cv=skf, n_jobs=4)
print(scores['test_score'].mean())

In [19]:
# Directory that receives the generated submission CSV files.
submission_path = Path().cwd().joinpath('content/submissions/')
# Create it up front so the later `to_csv` calls do not fail when the
# directory is missing (e.g. on a fresh checkout).
submission_path.mkdir(parents=True, exist_ok=True)

In [72]:
# Refit xgb2 on the complete training set and predict the test features.
model = xgb2.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
# Map the encoded predictions back to the original class labels.
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
# Re-read the raw test file, attach the predictions, rename the columns to the
# submission header and write the file.
final_df = pd.read_csv(base_path / 'test_II.csv').assign(Predicted=new_pred)
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path / 'submission17.csv', index=False)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [51]:
# Regularised variant: deeper trees with row and column subsampling.
xgb3 = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    max_bin=255,
    n_estimators=600,
    learning_rate=0.2,
    max_depth=10,
    gamma=0.65,
    reg_alpha=0.9,
    subsample=0.8,
    colsample_bytree=0.7,
)
# Encode the raw class labels as integers for XGBoost.
lab = LabelEncoder().fit(y_train)
y_data = lab.transform(y_train)
# Cross-validation is currently disabled for this configuration:
# scores = cross_validate(estimator=xgb3, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=4)
# print(np.mean(scores['test_score']))

In [52]:
# Refit xgb3 on the complete training set and predict the test features.
model = xgb3.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
# Map the encoded predictions back to the original class labels.
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
# Re-read the raw test file, attach the predictions, rename the columns to the
# submission header and write the file.
final_df = pd.read_csv(base_path / 'test_II.csv').assign(Predicted=new_pred)
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path / 'submission18.csv', index=False)

[1 1 1 ... 1 1 0]
[2 2 2 ... 2 2 1]


In [56]:
# Shallower variant with per-level column subsampling added.
xgb4 = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    max_bin=255,
    n_estimators=500,
    learning_rate=0.2,
    max_depth=6,
    gamma=0.65,
    reg_alpha=0.9,
    subsample=0.8,
    colsample_bytree=0.9,
    colsample_bylevel=0.8,
)
# Encode the raw class labels as integers for XGBoost.
lab = LabelEncoder().fit(y_train)
y_data = lab.transform(y_train)
# Cross-validation is currently disabled for this configuration:
# scores = cross_validate(estimator=xgb4, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=4)
# print(np.mean(scores['test_score']))

In [58]:
# Refit xgb4 on the complete training set and predict the test features.
model = xgb4.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
# Map the encoded predictions back to the original class labels.
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
# Re-read the raw test file, attach the predictions, rename the columns to the
# submission header and write the file.
final_df = pd.read_csv(base_path / 'test_II.csv').assign(Predicted=new_pred)
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path / 'submission19.csv', index=False)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [65]:
# Larger variant: more trees, depth 10, lighter subsampling.
xgb5 = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    max_bin=255,
    n_estimators=800,
    learning_rate=0.2,
    max_depth=10,
    gamma=0.65,
    reg_alpha=0.9,
    subsample=0.9,
    colsample_bytree=0.9,
    colsample_bylevel=0.9,
)
# Encode the raw class labels as integers for XGBoost.
lab = LabelEncoder().fit(y_train)
y_data = lab.transform(y_train)
# Mean F1 over the folds produced by `skf`.
scores = cross_validate(xgb5, X_train, y_data, scoring='f1', cv=skf, n_jobs=4)
print(scores['test_score'].mean())

0.9476024023121834


In [64]:
# Refit xgb5 on the complete training set and predict the test features.
model = xgb5.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
# Map the encoded predictions back to the original class labels.
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
# Re-read the raw test file, attach the predictions, rename the columns to the
# submission header and write the file.
final_df = pd.read_csv(base_path / 'test_II.csv').assign(Predicted=new_pred)
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path / 'submission20.csv', index=False)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]


In [72]:
# Deepest variant (depth 12) with an explicit L2 penalty.
xgb6 = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    max_bin=255,
    n_estimators=600,
    learning_rate=0.2,
    max_depth=12,
    gamma=0.65,
    reg_alpha=0.9,
    reg_lambda=0.89,
    subsample=0.9,
    colsample_bytree=0.9,
    colsample_bylevel=0.9,
)
# Encode the raw class labels as integers for XGBoost.
lab = LabelEncoder().fit(y_train)
y_data = lab.transform(y_train)
# Cross-validation is currently disabled for this configuration:
# scores = cross_validate(estimator=xgb6, X=X_train, y=y_data,cv=skf, scoring='f1', n_jobs=4)
# print(np.mean(scores['test_score']))

0.9477352859260233


In [73]:
# Refit xgb6 on the complete training set and predict the test features.
model = xgb6.fit(X_train, y_data)
pred_vals = model.predict(test_fp)
print(pred_vals)
# Map the encoded predictions back to the original class labels.
new_pred = lab.inverse_transform(pred_vals)
print(new_pred)
# Re-read the raw test file, attach the predictions, rename the columns to the
# submission header and write the file.
final_df = pd.read_csv(base_path / 'test_II.csv').assign(Predicted=new_pred)
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path / 'submission21.csv', index=False)

[1 1 1 ... 1 1 1]
[2 2 2 ... 2 2 2]
