# Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Data Read-In

In [3]:
df = pd.read_csv('../../DATA/filled_toxicity_df.csv')

df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.set_index('mol_id', inplace=True)

df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [6]:
df['NR-AR-LBD'].value_counts()

0    7560
1     271
Name: NR-AR-LBD, dtype: int64

In [12]:
subset_0 = df[df['NR-AR-LBD'] == 0].sample(n=312, random_state=42)

subset_1 = df[df['NR-AR-LBD'] == 1]

balanced_df = pd.concat([subset_0, subset_1])

features_df = balanced_df[['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity']]

target_df = balanced_df[['NR-AR-LBD']]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(features_df, target_df, test_size=0.33, random_state=42)

In [14]:
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

smote_enn = SMOTEENN(random_state=42)

pipeline = Pipeline([
    ('smote_enn', smote_enn),
    ('clf', OneVsRestClassifier(LogisticRegression(class_weight='balanced', max_iter=10000)))
])

param_grid = {
    'clf__estimator__C': [0.1, 1, 10],
    'clf__estimator__solver': ['liblinear', 'lbfgs']
}

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',   # Can be changed to 'f1_macro' or other metrics
    cv=5,
    n_jobs=1,
    verbose=2
)

# Fit grid search on training data
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Predict on test data
y_pred = grid_search.predict(X_test)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END clf__estimator__C=0.1, clf__estimator__solver=liblinear; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=liblinear; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=liblinear; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=liblinear; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=liblinear; total time=   0.0s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=lbfgs; total time=   0.3s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=lbfgs; total time=   0.2s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=lbfgs; total time=   0.3s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=lbfgs; total time=   0.4s
[CV] END clf__estimator__C=0.1, clf__estimator__solver=lbfgs; total time=   0.3s
[CV] END clf__estimator__C=1, clf__estimator__solver=liblinear; total time=   0.0s
[CV] END clf__estimator__C=

In [15]:
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_auc_score

y_proba = grid_search.predict_proba(X_test)[:, 1]  
print("AUC-ROC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

AUC-ROC: 0.8088685015290519
              precision    recall  f1-score   support

           0       0.77      0.84      0.81       109
           1       0.77      0.68      0.72        84

    accuracy                           0.77       193
   macro avg       0.77      0.76      0.76       193
weighted avg       0.77      0.77      0.77       193



In [16]:
model = grid_search.best_estimator_

## Dumping into a .pkl File

In [18]:
import pickle
import os

os.makedirs('../../Models/NR-AR-LBD/', exist_ok=True)

with open('../../Models/NR-AR-LBD/ovr_rsampling_smoteen_.pkl', 'wb') as file:
    pickle.dump(model, file)