# Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Read-In Data

In [2]:
# Read the toxicity table from disk and preview the first rows.
# (Path is relative to the notebook's working directory.)
csv_path = '../../DATA/filled_toxicity_df.csv'
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Use the molecule identifier as the row index.
# The guard plus the non-in-place assignment make this cell safe to re-run:
# the original `set_index(..., inplace=True)` raises KeyError on a second
# execution because 'mol_id' is no longer a column once it is the index.
if df.index.name != 'mol_id':
    df = df.set_index('mol_id')

df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Show the column labels that remain once 'mol_id' has been moved into the index.
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [5]:
# Build a class-balanced table for the 'NR-ER-LBD' label by undersampling the
# negative class, then separate the descriptor features from the label.
TARGET_COL = 'NR-ER-LBD'

# Descriptor columns used as model inputs (a subset of the columns in `df`).
FEATURE_COLS = ['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity']

# Keep every positive row.
subset_1 = df[df[TARGET_COL] == 1]

# Draw as many negatives as there are positives rather than a hard-coded 378,
# so the classes stay balanced if the input file changes; cap at the number of
# negatives available so `.sample` cannot raise.
# NOTE(review): 378 was presumably the positive count when this was written -
# confirm; if it was not, the sampled rows differ from the original run.
negatives = df[df[TARGET_COL] == 0]
subset_0 = negatives.sample(n=min(len(subset_1), len(negatives)), random_state=42)

balanced_df = pd.concat([subset_0, subset_1])

features_df = balanced_df[FEATURE_COLS]

# Kept as a single-column DataFrame; later cells flatten it with `.values.ravel()`.
target_df = balanced_df[[TARGET_COL]]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced table, stratified on the label so both classes
# keep the same proportion in each part (consistent with the later 80/20 splits).
# NOTE(review): X_test / y_test are not used by any later cell shown in this
# notebook, and X_train / y_train are re-bound by the 80/20 split in the next
# code cell - this split appears to be superseded; confirm before relying on it
# as a held-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    features_df, target_df, test_size=0.33, stratify=target_df, random_state=42
)

# Stacking Classifier

### Soft-voting ensemble of `LogisticRegression`, `RandomForestClassifier` and `XGBoost`

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Stratified 80/20 split of the balanced table; the label is flattened to 1-D.
X_train, X_val, y_train, y_val = train_test_split(
    features_df,
    target_df.values.ravel(),
    test_size=0.2,
    stratify=target_df,
    random_state=42,
)


def _tune(estimator, param_grid):
    """Run a 5-fold, ROC-AUC-scored grid search on the training split.

    Returns the fitted GridSearchCV object.
    """
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='roc_auc', n_jobs=1)
    search.fit(X_train, y_train)
    return search


# Logistic regression: tune the inverse regularisation strength.
logreg = LogisticRegression(max_iter=10000, random_state=42)
logreg_params = {'C': [0.1, 1, 10]}
logreg_grid = _tune(logreg, logreg_params)
best_logreg = logreg_grid.best_estimator_

# Random forest: tune forest size and tree depth.
rf = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
rf_grid = _tune(rf, rf_params)
best_rf = rf_grid.best_estimator_

# XGBoost: tune number of boosting rounds and learning rate.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]}
xgb_grid = _tune(xgb, xgb_params)
best_xgb = xgb_grid.best_estimator_

# Soft voting averages the predicted class probabilities of the tuned models.
voting_clf = VotingClassifier(
    estimators=[('lr', best_logreg), ('rf', best_rf), ('xgb', best_xgb)],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

# Validation ROC-AUC of each fitted member ...
for name, model in voting_clf.named_estimators_.items():
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"{name.upper()} AUC: {auc:.4f}")

# ... and of the ensemble as a whole.
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"Ensemble AUC: {ensemble_auc:.4f}")

LR AUC: 0.7202
RF AUC: 0.8131
XGB AUC: 0.7905
Ensemble AUC: 0.8063


In [8]:
from sklearn.metrics import classification_report

print("\n--- Classification Reports ---\n")

# One report per fitted member of the voting ensemble, each followed by a rule.
rule = '-' * 60
for name, model in voting_clf.named_estimators_.items():
    y_pred = model.predict(X_val)
    print(
        f"{name.upper()} Classification Report:",
        classification_report(y_val, y_pred),
        rule,
        sep='\n',
    )

# Report for the ensemble itself (no trailing rule).
ensemble_pred = voting_clf.predict(X_val)
print(
    "ENSEMBLE Classification Report:",
    classification_report(y_val, ensemble_pred),
    sep='\n',
)


--- Classification Reports ---

LR Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.68      0.68        76
           1       0.68      0.67      0.68        76

    accuracy                           0.68       152
   macro avg       0.68      0.68      0.68       152
weighted avg       0.68      0.68      0.68       152

------------------------------------------------------------
RF Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.72      0.73        76
           1       0.73      0.74      0.73        76

    accuracy                           0.73       152
   macro avg       0.73      0.73      0.73       152
weighted avg       0.73      0.73      0.73       152

------------------------------------------------------------
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        76
   

### With SMOTEENN resampling and GridSearchCV

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.combine import SMOTEENN

# --- Split: stratified 80/20, label flattened to 1-D -------------------------
X_train, X_val, y_train, y_val = train_test_split(
    features_df,
    target_df.values.ravel(),
    test_size=0.2,
    stratify=target_df,
    random_state=42,
)

# --- Resample: SMOTEENN is applied to the training split only ----------------
# NOTE(review): the grid searches below cross-validate on the already-resampled
# data, so each CV fold was produced with knowledge of the other folds. Putting
# the sampler and the estimator in an imblearn Pipeline would keep resampling
# inside each fold - confirm whether the CV scores here are trusted.
smoteen = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smoteen.fit_resample(X_train, y_train)

# --- Base estimators and their search grids ----------------------------------
logreg = LogisticRegression(max_iter=10000, random_state=42)
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

logreg_params = {'C': [0.1, 1, 10]}
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
xgb_params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]}

# --- Tune each estimator with 5-fold, ROC-AUC-scored grid search -------------
searches = []
for base_estimator, grid in ((logreg, logreg_params), (rf, rf_params), (xgb, xgb_params)):
    search = GridSearchCV(base_estimator, grid, cv=5, scoring='roc_auc', n_jobs=1)
    search.fit(X_train_resampled, y_train_resampled)
    searches.append(search)

logreg_grid, rf_grid, xgb_grid = searches
best_logreg = logreg_grid.best_estimator_
best_rf = rf_grid.best_estimator_
best_xgb = xgb_grid.best_estimator_

# --- Soft-voting ensemble of the tuned estimators ----------------------------
members = [('lr', best_logreg), ('rf', best_rf), ('xgb', best_xgb)]
voting_clf = VotingClassifier(estimators=members, voting='soft')
voting_clf.fit(X_train_resampled, y_train_resampled)

# --- Validation metrics for every fitted member ------------------------------
for name, model in voting_clf.named_estimators_.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"\n{name.upper()} AUC: {auc:.4f}")
    print(f"{name.upper()} Classification Report:\n{classification_report(y_val, y_pred)}")

# --- Validation metrics for the ensemble -------------------------------------
y_pred_ensemble = voting_clf.predict(X_val)
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nEnsemble AUC: {ensemble_auc:.4f}")
print(f"Ensemble Classification Report:\n{classification_report(y_val, y_pred_ensemble)}")


LR AUC: 0.7820
LR Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.70      0.72        76
           1       0.72      0.76      0.74        76

    accuracy                           0.73       152
   macro avg       0.73      0.73      0.73       152
weighted avg       0.73      0.73      0.73       152


RF AUC: 0.7868
RF Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.61      0.68        76
           1       0.67      0.82      0.74        76

    accuracy                           0.71       152
   macro avg       0.72      0.71      0.71       152
weighted avg       0.72      0.71      0.71       152


XGB AUC: 0.7402
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.67      0.66        76
           1       0.66      0.64      0.65        76

    accuracy                           0.66       152


In [10]:
# Keep a handle on the current voting ensemble under the name `best_model`.
best_model = voting_clf

In [11]:
import pickle

# Persist the LR + RF + XGB soft-voting ensemble.
# The ensemble is `best_model`. The name `model` is only the leftover loop
# variable from the per-estimator evaluation loop, i.e. the last base estimator
# (XGBoost), so dumping it would store a single classifier under a filename
# that promises the whole voter.
with open('../../Models/NR-ER-LBD/voter_xgb_lr_rf.pkl', 'wb') as file:
    pickle.dump(best_model, file)

### Voting Classifier extended with SVM, Gradient Boosting, Extra Trees, and KNN

In [16]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

# Four additional base estimators and the grids searched for each.
# NOTE(review): no feature scaling is applied in the cells shown; SVC and KNN
# are distance/margin based, so confirm unscaled descriptors are intended.
svc = SVC(probability=True, random_state=42)  # probability=True enables predict_proba
gb = GradientBoostingClassifier(random_state=42)
et = ExtraTreesClassifier(random_state=42)
knn = KNeighborsClassifier()

svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
gb_params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]}
et_params = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
knn_params = {'n_neighbors': [3, 5, 7]}

# 5-fold, ROC-AUC-scored grid search on the resampled training data, run in the
# order SVC, GB, ET, KNN (`fit` returns the fitted search object).
svc_grid, gb_grid, et_grid, knn_grid = [
    GridSearchCV(candidate, grid, cv=5, scoring='roc_auc', n_jobs=1).fit(
        X_train_resampled, y_train_resampled
    )
    for candidate, grid in (
        (svc, svc_params),
        (gb, gb_params),
        (et, et_params),
        (knn, knn_params),
    )
]

best_svc = svc_grid.best_estimator_
best_gb = gb_grid.best_estimator_
best_et = et_grid.best_estimator_
best_knn = knn_grid.best_estimator_

# Soft-voting ensemble over the three earlier tuned models plus the four new ones.
all_members = [
    ('lr', best_logreg),
    ('rf', best_rf),
    ('xgb', best_xgb),
    ('svc', best_svc),
    ('gb', best_gb),
    ('et', best_et),
    ('knn', best_knn),
]
voting_clf = VotingClassifier(estimators=all_members, voting='soft', n_jobs=1)
voting_clf.fit(X_train_resampled, y_train_resampled)

# Validation ROC-AUC and classification report for every fitted member.
for name, model in voting_clf.named_estimators_.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"\n{name.upper()} AUC: {auc:.4f}")
    print(f"{name.upper()} Classification Report:\n{classification_report(y_val, y_pred)}")

# Same metrics for the ensemble.
y_pred_ensemble = voting_clf.predict(X_val)
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nEnsemble AUC: {ensemble_auc:.4f}")
print(f"Ensemble Classification Report:\n{classification_report(y_val, y_pred_ensemble)}")


LR AUC: 0.7820
LR Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.70      0.72        76
           1       0.72      0.76      0.74        76

    accuracy                           0.73       152
   macro avg       0.73      0.73      0.73       152
weighted avg       0.73      0.73      0.73       152


RF AUC: 0.7868
RF Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.61      0.68        76
           1       0.67      0.82      0.74        76

    accuracy                           0.71       152
   macro avg       0.72      0.71      0.71       152
weighted avg       0.72      0.71      0.71       152


XGB AUC: 0.7402
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.67      0.66        76
           1       0.66      0.64      0.65        76

    accuracy                           0.66       152


In [17]:
import pickle

# Persist the seven-member soft-voting ensemble.
# `voting_clf` is the ensemble. The name `model` is only the leftover loop
# variable from the per-estimator evaluation loop, i.e. the last base estimator
# (KNN), so dumping it would store a single classifier under a filename that
# promises the whole voter.
with open('../../Models/NR-ER-LBD/voter_xgb_lr_rf_svm_gb_knn_et.pkl', 'wb') as file:
    pickle.dump(voting_clf, file)

### Comparative Analysis - Successively Adding Models to the Voting Ensemble

In [18]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Tuned estimators in the order they are added to the ensemble.
models = [
    ('lr', best_logreg),
    ('rf', best_rf),
    ('xgb', best_xgb),
    ('svc', best_svc),
    ('gb', best_gb),
    ('et', best_et),
    ('knn', best_knn),
]

ensemble_results = []

# Fit a soft-voting ensemble on the first i models, for i = 1 .. len(models),
# and record its validation metrics. `voting_clf` is re-bound on every pass, so
# after the loop it refers to the ensemble containing all listed models.
for i, _ in enumerate(models, start=1):
    selected_models = models[:i]

    voting_clf = VotingClassifier(estimators=selected_models, voting='soft')
    voting_clf.fit(X_train_resampled, y_train_resampled)

    y_proba = voting_clf.predict_proba(X_val)[:, 1]
    y_pred = voting_clf.predict(X_val)

    auc = roc_auc_score(y_val, y_proba)
    report = classification_report(y_val, y_pred, output_dict=True)
    macro_avg, weighted_avg, positive = report['macro avg'], report['weighted avg'], report['1']

    result = {
        'n_models': i,
        'model_names': [label for label, _estimator in selected_models],
        'auc': round(auc, 4),
        'macro_f1': round(macro_avg['f1-score'], 4),
        'weighted_f1': round(weighted_avg['f1-score'], 4),
        'class1_recall': round(positive['recall'], 4),
        'class1_precision': round(positive['precision'], 4),
    }
    ensemble_results.append(result)

# One row per ensemble size.
results_df = pd.DataFrame(ensemble_results)
print(results_df)

   n_models                      model_names     auc  macro_f1  weighted_f1  \
0         1                             [lr]  0.7820    0.7300       0.7300   
1         2                         [lr, rf]  0.8177    0.7331       0.7331   
2         3                    [lr, rf, xgb]  0.7964    0.7225       0.7225   
3         4               [lr, rf, xgb, svc]  0.7948    0.7002       0.7002   
4         5           [lr, rf, xgb, svc, gb]  0.7898    0.7220       0.7220   
5         6       [lr, rf, xgb, svc, gb, et]  0.7985    0.7283       0.7283   
6         7  [lr, rf, xgb, svc, gb, et, knn]  0.7917    0.7206       0.7206   

   class1_recall  class1_precision  
0         0.7632            0.7160  
1         0.8553            0.6915  
2         0.7895            0.6977  
3         0.8158            0.6667  
4         0.8026            0.6932  
5         0.8158            0.6966  
6         0.8289            0.6848  


In [19]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd

# Level-0 models: the tuned estimators (KNN is not included here).
base_learners = [
    ('lr', best_logreg),
    ('rf', best_rf),
    ('xgb', best_xgb),
    ('svc', best_svc),
    ('gb', best_gb),
    ('et', best_et),
]

# Level-1 model that combines the base learners' predicted probabilities.
meta_learner = LogisticRegression(max_iter=10000, random_state=42)

stacking_options = dict(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',
    n_jobs=1,
    passthrough=False,  # meta-learner sees base predictions only, not raw features
)
stacking_clf = StackingClassifier(**stacking_options)
stacking_clf.fit(X_train_resampled, y_train_resampled)

# Score on the validation split.
y_pred_stack = stacking_clf.predict(X_val)
y_pred_proba_stack = stacking_clf.predict_proba(X_val)[:, 1]
stack_auc = roc_auc_score(y_val, y_pred_proba_stack)

print(f"\nStackingClassifier AUC: {stack_auc:.4f}")
print(
    "StackingClassifier Classification Report:",
    classification_report(y_val, y_pred_stack),
    sep='\n',
)


StackingClassifier AUC: 0.7798
StackingClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.70      0.73        76
           1       0.72      0.78      0.75        76

    accuracy                           0.74       152
   macro avg       0.74      0.74      0.74       152
weighted avg       0.74      0.74      0.74       152



In [20]:
# Persist the fitted stacking classifier.
# NOTE(review): the filename starts with "voter_" although the object written
# is a StackingClassifier - confirm the name is what downstream loaders expect.
model = stacking_clf

with open('../../Models/NR-ER-LBD/voter_xgb_lr_rf_svm_gb_et.pkl', 'wb') as out_handle:
    pickle.dump(model, out_handle)

### Stacking with Extra Trees as the Only Base Learner (Logistic Regression Meta-Learner)

In [21]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd

# Single level-0 model: the tuned Extra Trees classifier.
base_learners = [('et', best_et)]

# Logistic regression fitted on the base learner's predicted probabilities.
meta_learner = LogisticRegression(max_iter=10000, random_state=42)

stacking_clf = StackingClassifier(
    estimators=base_learners, final_estimator=meta_learner,
    cv=5, stack_method='predict_proba',
    n_jobs=1, passthrough=False,
)

stacking_clf.fit(X_train_resampled, y_train_resampled)

# Validation-set probabilities for the positive class, and hard predictions.
val_proba = stacking_clf.predict_proba(X_val)
y_pred_proba_stack = val_proba[:, 1]
y_pred_stack = stacking_clf.predict(X_val)

# ROC-AUC followed by the per-class report.
stack_auc = roc_auc_score(y_val, y_pred_proba_stack)
stack_report = classification_report(y_val, y_pred_stack)
print(f"\nStackingClassifier AUC: {stack_auc:.4f}")
print("StackingClassifier Classification Report:")
print(stack_report)


StackingClassifier AUC: 0.8130
StackingClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.59      0.68        76
           1       0.68      0.86      0.76        76

    accuracy                           0.72       152
   macro avg       0.74      0.72      0.72       152
weighted avg       0.74      0.72      0.72       152



In [22]:
# Alias the current stacking classifier as `model`.
# NOTE(review): the following cell begins with this same assignment, so this
# cell looks redundant - confirm it can be removed.
model = stacking_clf

In [23]:
# Persist the fitted stacking classifier.
# NOTE(review): the filename starts with "voter_" although the object written
# is a StackingClassifier - confirm the name is what downstream loaders expect.
model = stacking_clf

with open('../../Models/NR-ER-LBD/voter_lr_et.pkl', 'wb') as out_handle:
    pickle.dump(model, out_handle)