# Imports

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Read In Data

In [3]:
# Load the toxicity dataset from CSV; the path is relative to the notebook's
# working directory.
data_path = '../../DATA/filled_toxicity_df.csv'
df = pd.read_csv(data_path)

# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Index rows by molecule ID.
# Rebinding (instead of `inplace=True`) and guarding on the index name keeps
# this cell safe to re-run: with the in-place version a second execution
# raised a KeyError because 'mol_id' was no longer a column.
if df.index.name != 'mol_id':
    df = df.set_index('mol_id')

df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Inspect the column names available after 'mol_id' became the index.
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [6]:
# Build a class-balanced dataset for the SR-ARE label by undersampling the
# negative class (label 0) down to the size of the positive class (label 1).
subset_1 = df[df['SR-ARE'] == 1]

# Derive the sample size from the data instead of hard-coding 1404, so the
# balancing stays correct if the input file changes.
# NOTE(review): 1404 is presumed to have been the positive-class count —
# confirm against the dataset.
subset_0 = df[df['SR-ARE'] == 0].sample(n=len(subset_1), random_state=42)

balanced_df = pd.concat([subset_0, subset_1])

# Descriptor columns used as model inputs (a subset of df.columns).
features_df = balanced_df[['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity']]

# Kept as a one-column DataFrame because later cells index it by column name
# (y_train['SR-ARE']).
target_df = balanced_df[['SR-ARE']]

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced data for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.33,
    random_state=42,
)

In [9]:
def _add_engineered_features(X):
    """Return a copy of ``X`` with three derived descriptor columns appended.

    Added columns (in this order):
      * ``TPSA_LogP``    - product of ``TPSA`` and ``LogP``.
      * ``MW_per_HBD``   - ``MolecularWeight`` divided by ``HBDonors``.
      * ``LogP_div_HBA`` - ``LogP`` divided by ``HBAcceptors``.

    A small epsilon (1e-6) is added to each denominator so rows with a zero
    count do not divide by zero. The input frame is not modified.
    """
    return X.assign(
        TPSA_LogP=X['TPSA'] * X['LogP'],
        MW_per_HBD=X['MolecularWeight'] / (X['HBDonors'] + 1e-6),
        LogP_div_HBA=X['LogP'] / (X['HBAcceptors'] + 1e-6),
    )


# Apply the same transformation to both splits. Note that `X_val_fe` is built
# from the held-out `X_test` split.
X_train_fe = _add_engineered_features(X_train)
X_val_fe = _add_engineered_features(X_test)

In [11]:
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Resampling step. Inside an imblearn Pipeline the sampler is applied only
# while fitting, so each CV training fold is resampled and the validation
# folds / test data are left untouched.
smote_enn = SMOTEENN(random_state=42)

# 1-D integer label vector; estimators expect this shape rather than the
# one-column DataFrame held in `y_train`.
y_train_numeric = y_train['SR-ARE'].astype(int)

# Ratio of negatives to positives in the training labels.
n_positive = y_train_numeric.sum()
imbalance_ratio = (len(y_train_numeric) - n_positive) / n_positive

# NOTE(review): scale_pos_weight is derived from the labels *before*
# resampling, while SMOTEENN also rebalances each training fold — confirm that
# applying both corrections together is intended.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=imbalance_ratio,  # imbalance handling
    random_state=42
)

pipeline = Pipeline([
    ('smote_enn', smote_enn),
    ('clf', xgb_clf)
])

# Hyper-parameters searched for the XGBoost step (2 * 3 * 2 * 2 = 24 candidates).
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.1],
    'clf__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    verbose=2,
    n_jobs=1
)

# Fit on the 1-D label vector prepared above (previously it was computed but
# the column-vector DataFrame `y_train` was passed instead).
grid_search.fit(X_train_fe, y_train_numeric)

print("Best parameters:", grid_search.best_params_)

# Predictions on the held-out split for the evaluation cell below.
y_pred = grid_search.predict(X_val_fe)

y_proba = grid_search.predict_proba(X_val_fe)[:, 1]  # for AUC

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rat

[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100, clf__subsample=0.8; to

In [13]:
from sklearn.metrics import classification_report, roc_auc_score

# Positive-class probabilities on the held-out split, used for the ranking
# metric; the hard predictions in `y_pred` come from the grid-search cell.
y_proba = grid_search.predict_proba(X_val_fe)[:, 1]
auc_roc = roc_auc_score(y_test, y_proba)

print("AUC-ROC:", auc_roc)
print(classification_report(y_test, y_pred))

AUC-ROC: 0.7398222797348626
              precision    recall  f1-score   support

           0       0.69      0.70      0.69       463
           1       0.69      0.68      0.69       464

    accuracy                           0.69       927
   macro avg       0.69      0.69      0.69       927
weighted avg       0.69      0.69      0.69       927



In [14]:
# Keep a handle to the best pipeline found by the grid search so it can be
# serialized in the next cell.
model = grid_search.best_estimator_

In [15]:
import pickle
import os

os.makedirs('../../Models/SR-ARE/', exist_ok=True)

# Persist the model trained with scale_pos_weight = imbalance ratio under its
# own file name. It was previously written to 'smoteen_enggfeatures_xgboost.pkl',
# the same path the later scale_pos_weight=1 experiment saves to, so this model
# was silently overwritten and lost.
with open('../../Models/SR-ARE/smoteen_enggfeatures_xgboost_imbalance_weighted.pkl', 'wb') as file:
    pickle.dump(model, file)

# Using XGBoost Pipeline with SMOTEENN and `scale_pos_weight = 1`

In [16]:
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Resampling step. Inside an imblearn Pipeline the sampler is applied only
# while fitting, so each CV training fold is resampled and the validation
# folds / test data are left untouched.
smote_enn = SMOTEENN(random_state=42)

# 1-D integer label vector; estimators expect this shape rather than the
# one-column DataFrame held in `y_train`.
y_train_numeric = y_train['SR-ARE'].astype(int)

# In this experiment class imbalance is handled by SMOTEENN alone:
# scale_pos_weight is left at 1, so no imbalance ratio is computed here.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=1,
    random_state=42
)

pipeline = Pipeline([
    ('smote_enn', smote_enn),
    ('clf', xgb_clf)
])

# Hyper-parameters searched for the XGBoost step (2 * 3 * 2 * 2 = 24 candidates).
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.1],
    'clf__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    verbose=2,
    n_jobs=1
)

# Fit on the 1-D label vector prepared above (previously it was computed but
# the column-vector DataFrame `y_train` was passed instead).
grid_search.fit(X_train_fe, y_train_numeric)

print("Best parameters:", grid_search.best_params_)

# Predictions on the held-out split for the evaluation cell below.
y_pred = grid_search.predict(X_val_fe)

y_proba = grid_search.predict_proba(X_val_fe)[:, 1]  # for AUC

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.2s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rat

[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.2s
[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100, clf__subsample=0.8; to

In [17]:
from sklearn.metrics import classification_report, roc_auc_score

# Score the scale_pos_weight=1 search on the held-out split: probability of
# the positive class for AUC, hard predictions (`y_pred`) for the report.
y_proba = grid_search.predict_proba(X_val_fe)[:, 1]
auc_roc = roc_auc_score(y_test, y_proba)

print("AUC-ROC:", auc_roc)
print(classification_report(y_test, y_pred))

AUC-ROC: 0.7492761785953675
              precision    recall  f1-score   support

           0       0.70      0.71      0.71       463
           1       0.71      0.70      0.71       464

    accuracy                           0.71       927
   macro avg       0.71      0.71      0.71       927
weighted avg       0.71      0.71      0.71       927



In [18]:
# Rebind `model` to the best pipeline from the scale_pos_weight=1 grid search;
# this is the object serialized in the next cell.
model = grid_search.best_estimator_

In [19]:
import os
import pickle

# Output location for the serialized SR-ARE pipeline.
model_dir = '../../Models/SR-ARE/'
model_path = '../../Models/SR-ARE/smoteen_enggfeatures_xgboost.pkl'

# Create the directory tree if needed, then pickle the fitted pipeline.
os.makedirs(model_dir, exist_ok=True)
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)