# Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Read-In Data

In [2]:
df = pd.read_csv('../../DATA/filled_toxicity_df.csv')

df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df.set_index('mol_id', inplace=True)

df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [5]:
subset_0 = df[df['NR-PPAR-gamma'] == 0].sample(n=884, random_state=42)

subset_1 = df[df['NR-PPAR-gamma'] == 1]

balanced_df = pd.concat([subset_0, subset_1])

features_df = balanced_df[['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity']]

target_df = balanced_df[['NR-PPAR-gamma']]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_df, target_df, test_size=0.33, random_state=42)

# Stacking Classifier

### with ```LogisticRegression```, ```RandomForestClassifier``` and ```XGBoost```

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

X_train, X_val, y_train, y_val = train_test_split(
    features_df, target_df.values.ravel(),
    test_size=0.2, stratify=target_df, random_state=42
)

logreg = LogisticRegression(max_iter=10000, random_state=42)
logreg_params = {
    'C': [0.1, 1, 10]
}
logreg_grid = GridSearchCV(logreg, logreg_params, cv=5, scoring='roc_auc', n_jobs=1)
logreg_grid.fit(X_train, y_train)
best_logreg = logreg_grid.best_estimator_
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10]
}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='roc_auc', n_jobs=1)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1]
}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, scoring='roc_auc', n_jobs=1)
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_
voting_clf = VotingClassifier(
    estimators=[
        ('lr', best_logreg),
        ('rf', best_rf),
        ('xgb', best_xgb)
    ],
    voting='soft' 
)
voting_clf.fit(X_train, y_train)

for name, model in voting_clf.named_estimators_.items():
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"{name.upper()} AUC: {auc:.4f}")

y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"Ensemble AUC: {ensemble_auc:.4f}")

LR AUC: 0.7069
RF AUC: 0.7301
XGB AUC: 0.7056
Ensemble AUC: 0.7347


In [8]:
from sklearn.metrics import classification_report

print("\n--- Classification Reports ---\n")

for name, model in voting_clf.named_estimators_.items():
    y_pred = model.predict(X_val)
    print(f"{name.upper()} Classification Report:")
    print(classification_report(y_val, y_pred))
    print('-' * 60)

# Ensemble model
ensemble_pred = voting_clf.predict(X_val)
print("ENSEMBLE Classification Report:")
print(classification_report(y_val, ensemble_pred))


--- Classification Reports ---

LR Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.97      0.88       177
           1       0.45      0.11      0.18        44

    accuracy                           0.80       221
   macro avg       0.63      0.54      0.53       221
weighted avg       0.74      0.80      0.74       221

------------------------------------------------------------
RF Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89       177
           1       0.57      0.18      0.28        44

    accuracy                           0.81       221
   macro avg       0.70      0.57      0.58       221
weighted avg       0.78      0.81      0.77       221

------------------------------------------------------------
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.95      0.89       177
   

### With Grid Search and Hyperparameter Tuning

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.combine import SMOTEENN

# 1. Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    features_df, target_df.values.ravel(),
    test_size=0.2, stratify=target_df, random_state=42
)

# 2. Apply SMOTEEN on training data only
smoteen = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smoteen.fit_resample(X_train, y_train)

# 3. Logistic Regression + GridSearchCV
logreg = LogisticRegression(max_iter=10000, random_state=42)
logreg_params = {'C': [0.1, 1, 10]}
logreg_grid = GridSearchCV(logreg, logreg_params, cv=5, scoring='roc_auc', n_jobs=1)
logreg_grid.fit(X_train_resampled, y_train_resampled)
best_logreg = logreg_grid.best_estimator_

# 4. Random Forest + GridSearchCV
rf = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='roc_auc', n_jobs=1)
rf_grid.fit(X_train_resampled, y_train_resampled)
best_rf = rf_grid.best_estimator_

# 5. XGBoost + GridSearchCV
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, scoring='roc_auc', n_jobs=1)
xgb_grid.fit(X_train_resampled, y_train_resampled)
best_xgb = xgb_grid.best_estimator_

# 6. Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('lr', best_logreg),
        ('rf', best_rf),
        ('xgb', best_xgb)
    ],
    voting='soft'
)
voting_clf.fit(X_train_resampled, y_train_resampled)

# 7. AUC + Classification Report for individual models
for name, model in voting_clf.named_estimators_.items():
    y_proba = model.predict_proba(X_val)[:, 1]
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_proba)
    print(f"\n{name.upper()} AUC: {auc:.4f}")
    print(f"{name.upper()} Classification Report:\n{classification_report(y_val, y_pred)}")

# 8. Ensemble AUC + Classification Report
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
y_pred_ensemble = voting_clf.predict(X_val)
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nEnsemble AUC: {ensemble_auc:.4f}")
print(f"Ensemble Classification Report:\n{classification_report(y_val, y_pred_ensemble)}")


LR AUC: 0.7138
LR Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.58      0.71       177
           1       0.31      0.77      0.45        44

    accuracy                           0.62       221
   macro avg       0.61      0.68      0.58       221
weighted avg       0.79      0.62      0.66       221


RF AUC: 0.7269
RF Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.71      0.79       177
           1       0.35      0.61      0.44        44

    accuracy                           0.69       221
   macro avg       0.61      0.66      0.62       221
weighted avg       0.77      0.69      0.72       221


XGB AUC: 0.7485
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.73      0.80       177
           1       0.37      0.64      0.47        44

    accuracy                           0.71       221


In [11]:
import pickle

model = voting_clf 

with open('../../Models/NR-PPAR-gamma/voting_lr_rf_xgb.pkl', 'wb') as file:
    pickle.dump(model, file)

### Stacking Classifier added with: SVM, Gradient Boosting, Extra Trees, and KNN

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (
    GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, classification_report

import joblib

# --- Define models and hyperparameters ---
model_configs = {
    'svc': {
        'model': SVC(probability=True, random_state=42),
        'params': {'C': [0.1, 1], 'kernel': ['linear', 'rbf']}  # Reduced for speed
    },
    'gb': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {'n_estimators': [100], 'learning_rate': [0.1]}
    },
    'et': {
        'model': ExtraTreesClassifier(random_state=42),
        'params': {'n_estimators': [100], 'max_depth': [None]}
    },
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5]}
    }
}

# --- Train models in loop using parallel grid search ---
best_models = {}

for name, config in model_configs.items():
    print(f"{name.upper()} Training started")
    grid = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=5,
        scoring='roc_auc',
        n_jobs=1  # Full parallel grid search
    )
    grid.fit(X_train_resampled, y_train_resampled)
    best_models[name] = grid.best_estimator_
    print(f"{name.upper()} Training ended")

# --- Add existing models ---
best_models['lr'] = best_logreg
best_models['rf'] = best_rf
best_models['xgb'] = best_xgb

# --- Soft Voting Ensemble ---
voting_clf = VotingClassifier(
    estimators=[(name, model) for name, model in best_models.items()],
    voting='soft',
    n_jobs=1  # Parallel prediction across models
)
voting_clf.fit(X_train_resampled, y_train_resampled)

# --- Evaluation loop ---
for name, model in voting_clf.named_estimators_.items():
    try:
        y_proba = model.predict_proba(X_val)[:, 1]
    except AttributeError:
        print(f"{name.upper()} does not support predict_proba. Skipping AUC.")
        y_proba = None

    y_pred = model.predict(X_val)
    if y_proba is not None:
        auc = roc_auc_score(y_val, y_proba)
        print(f"\n{name.upper()} AUC: {auc:.4f}")
    print(f"{name.upper()} Classification Report:\n{classification_report(y_val, y_pred)}")

# --- Ensemble Evaluation ---
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
y_pred_ensemble = voting_clf.predict(X_val)
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nEnsemble AUC: {ensemble_auc:.4f}")
print(f"Ensemble Classification Report:\n{classification_report(y_val, y_pred_ensemble)}")

SVC Training started
SVC Training ended
GB Training started
GB Training ended
ET Training started
ET Training ended
KNN Training started
KNN Training ended

SVC AUC: 0.6990
SVC Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.55      0.68       177
           1       0.30      0.77      0.43        44

    accuracy                           0.59       221
   macro avg       0.60      0.66      0.56       221
weighted avg       0.79      0.59      0.63       221


GB AUC: 0.7390
GB Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.70      0.78       177
           1       0.34      0.61      0.44        44

    accuracy                           0.68       221
   macro avg       0.61      0.66      0.61       221
weighted avg       0.77      0.68      0.71       221


ET AUC: 0.7352
ET Classification Report:
              precision    recall  f1-score   support

       

In [15]:
model = voting_clf 

with open('../../Models/NR-PPAR-gamma/voting_lr_rf_xgb_svc_gb_Et_knn.pkl', 'wb') as file:
    pickle.dump(model, file)