# Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Data Read-In

In [2]:
# Load the toxicity table; the path is relative to the notebook's working directory.
df = pd.read_csv('../../DATA/filled_toxicity_df.csv')

# Preview the first rows to sanity-check what was read.
df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Use the molecule identifier as the row index.
# The guard keeps this cell re-runnable: once 'mol_id' has become the index it
# is no longer a column, so an unconditional set_index('mol_id') would raise
# KeyError on a second execution. Rebinding `df` (instead of inplace=True)
# avoids mutating the frame displayed by the previous cell.
if df.index.name != 'mol_id':
    df = df.set_index('mol_id')

df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Show the full list of column names (the head() preview elides the middle columns).
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [5]:
# Count rows per label value in the NR-AR-LBD column to see how imbalanced the classes are.
df['NR-AR-LBD'].value_counts()

0    7560
1     271
Name: NR-AR-LBD, dtype: int64

In [6]:
# Build a roughly class-balanced working set for the NR-AR-LBD label:
# a seeded random draw of 312 label-0 rows plus every label-1 row.
label = df['NR-AR-LBD']
subset_0 = df.loc[label == 0].sample(n=312, random_state=42)
subset_1 = df.loc[label == 1]
balanced_df = pd.concat([subset_0, subset_1])

# Descriptor columns used as model inputs.
feature_columns = [
    'MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
    'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
    'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
    'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
]
features_df = balanced_df[feature_columns]

# Keep the target as a single-column DataFrame; later cells select it by column name.
target_df = balanced_df[['NR-AR-LBD']]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced sample for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.33,
    random_state=42,
)

### `scale_pos_weight` = `(len(y_train_numeric) - sum(y_train_numeric)) / sum(y_train_numeric)` (negatives / positives)

In [10]:
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Step 1: Resampler (seeded so repeated runs resample identically)
smote_enn = SMOTEENN(random_state=42)

# Flatten the single-column target frame into a 1-D integer Series. The same
# Series feeds both the class ratio and the fit call, so the estimators
# receive a 1-D y instead of an (n, 1) column vector.
y_train_numeric = y_train['NR-AR-LBD'].astype(int)

# scale_pos_weight = negatives / positives on the training labels (labels are 0/1,
# so the sum of the labels is the positive count).
n_positive = int(y_train_numeric.sum())
n_negative = len(y_train_numeric) - n_positive
if n_positive == 0:
    raise ValueError("No positive samples in y_train; cannot compute scale_pos_weight.")
imbalance_ratio = n_negative / n_positive

# Step 2: Classifier
# NOTE(review): the ratio is measured before resampling, while SMOTEENN changes the
# class mix of the data the classifier is actually fitted on - confirm that
# combining both imbalance corrections is intended.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=imbalance_ratio,  # imbalance handling
    random_state=42
)

# Step 3: Pipeline (resampler first, classifier last)
pipeline = Pipeline([
    ('smote_enn', smote_enn),
    ('clf', xgb_clf)
])

# Step 4: Hyperparameter tuning (2 * 3 * 2 * 2 = 24 candidates, 5 folds each)
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.1],
    'clf__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    verbose=2,
    n_jobs=1
)

# Step 5: Fit the model on the 1-D label Series
grid_search.fit(X_train, y_train_numeric)

# Step 6: Predictions and evaluation inputs for the next cell
y_pred = grid_search.predict(X_test)
y_proba = grid_search.predict_proba(X_test)[:, 1]  # for AUC

print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rat

[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100, clf__subsample=0.8; to

In [11]:
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_auc_score

# Held-out evaluation: AUC from the second predict_proba column (label 1 for
# 0/1 labels), then the per-class report from the hard predictions in `y_pred`,
# which the preceding training cell produced.
y_proba = grid_search.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_proba)
print("AUC-ROC:", auc_roc)
report = classification_report(y_test, y_pred)
print(report)

AUC-ROC: 0.8257972913936218
              precision    recall  f1-score   support

           0       0.80      0.73      0.77       109
           1       0.69      0.76      0.72        84

    accuracy                           0.75       193
   macro avg       0.74      0.75      0.74       193
weighted avg       0.75      0.75      0.75       193



In [12]:
# Keep a handle on the best pipeline found by the grid search above; it is pickled below.
model = grid_search.best_estimator_

## Dumping into a .pkl File

In [14]:
import pickle
import os

# Make sure the output folder exists, then serialize the pipeline held in `model`.
os.makedirs('../../Models/NR-AR-LBD/', exist_ok=True)

with open('../../Models/NR-AR-LBD/xgboost_smoteen_rs_spwformula.pkl', mode='wb') as pkl_handle:
    pickle.dump(model, pkl_handle)

### `scale_pos_weight` = 1

In [15]:
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Step 1: Resampler (seeded so repeated runs resample identically)
smote_enn = SMOTEENN(random_state=42)

# Flatten the single-column target frame into a 1-D integer Series so the
# estimators receive a 1-D y instead of an (n, 1) column vector.
y_train_numeric = y_train['NR-AR-LBD'].astype(int)

# Step 2: Classifier
# scale_pos_weight is fixed at 1 in this variant, so no class ratio is computed
# here; rebalancing is left to the SMOTEENN step.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=1,  # no extra weight on the positive class
    random_state=42
)

# Step 3: Pipeline (resampler first, classifier last)
pipeline = Pipeline([
    ('smote_enn', smote_enn),
    ('clf', xgb_clf)
])

# Step 4: Hyperparameter tuning (2 * 3 * 2 * 2 = 24 candidates, 5 folds each)
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.1],
    'clf__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    verbose=2,
    n_jobs=1
)

# Step 5: Fit the model on the 1-D label Series
grid_search.fit(X_train, y_train_numeric)

# Step 6: Predictions and evaluation inputs for the next cell
y_pred = grid_search.predict(X_test)
y_proba = grid_search.predict_proba(X_test)[:, 1]  # for AUC

print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, clf__subsample=1.0; total time=   0.0s
[CV] END clf__learning_rat

[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=0.8; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=3, clf__n_estimators=200, clf__subsample=1.0; total time=   0.1s
[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100, clf__subsample=0.8; total time=   0.0s
[CV] END clf__learning_rate=0.1, clf__max_depth=5, clf__n_estimators=100, clf__subsample=0.8; to

In [16]:
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_auc_score

# Held-out evaluation: AUC from the second predict_proba column (label 1 for
# 0/1 labels), then the per-class report from the hard predictions in `y_pred`,
# which the preceding training cell produced.
y_proba = grid_search.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_proba)
print("AUC-ROC:", auc_roc)
report = classification_report(y_test, y_pred)
print(report)

AUC-ROC: 0.8262341633901267
              precision    recall  f1-score   support

           0       0.80      0.73      0.77       109
           1       0.69      0.76      0.72        84

    accuracy                           0.75       193
   macro avg       0.74      0.75      0.74       193
weighted avg       0.75      0.75      0.75       193



## Dumping into a .pkl File

In [17]:
import pickle
import os

os.makedirs('../../Models/NR-AR-LBD/', exist_ok=True)

# Rebind `model` to the best pipeline of the most recent (scale_pos_weight=1)
# grid search before dumping. Without this, `model` still points at the best
# estimator of the earlier formula-weighted search, and that earlier pipeline
# would be written under the spw1 file name.
model = grid_search.best_estimator_

with open('../../Models/NR-AR-LBD/xgboost_smoteen_rs_spw1.pkl', 'wb') as file:
    pickle.dump(model, file)