# Imports 

In [36]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Read-In Data

In [37]:
# Load the toxicity table from CSV (path is relative to this notebook's
# directory) and preview the first rows.
toxicity_csv = '../../DATA/filled_toxicity_df.csv'
df = pd.read_csv(toxicity_csv)

df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Use the molecule identifier as the row index.
# Reassigning (rather than inplace=True) and guarding on the current index
# name keeps this cell idempotent: once 'mol_id' has been moved out of the
# columns, re-running an unguarded set_index('mol_id') raises KeyError.
if df.index.name != 'mol_id':
    df = df.set_index('mol_id')

df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Display all column names: the feature columns (MolecularWeight ... PhenolicGroups)
# followed by the NR-* / SR-* columns, one of which (NR-Aromatase) is used as the target below.
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [40]:
# Build a class-balanced frame for the NR-Aromatase label: keep every
# positive row and draw an equally sized random sample of the negatives.
subset_1 = df[df['NR-Aromatase'] == 1]

# Size the negative sample from the positive class instead of a hard-coded
# 411, so the two classes stay balanced if the underlying CSV changes; the
# min() guard also avoids a ValueError from .sample() if there are fewer
# negatives than positives.
# NOTE(review): presumably 411 was the positive-class count for the current
# CSV — confirm, otherwise the captured outputs below will shift.
negatives = df[df['NR-Aromatase'] == 0]
n_negatives = min(len(subset_1), len(negatives))
subset_0 = negatives.sample(n=n_negatives, random_state=42)

# Negatives first, then positives (rows are not shuffled here; the later
# train/validation split is what randomises the order).
balanced_df = pd.concat([subset_0, subset_1])

# Model inputs: 16 of the 22 non-label columns. AromaticProportion,
# PositiveCharges, NegativeCharges, Heteroatoms, HalogenCount and
# PhenolicGroups are not selected.
feature_columns = [
    'MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
    'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
    'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
    'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
]
features_df = balanced_df[feature_columns]

# Kept as a single-column DataFrame; later cells flatten it with .values.ravel().
target_df = balanced_df[['NR-Aromatase']]

In [41]:
from sklearn.model_selection import train_test_split

# Initial 67/33 hold-out split (no stratification).
# NOTE(review): X_train / y_train are rebound by the stratified 80/20 split in
# the next cell, and X_test / y_test are not referenced again in the visible
# notebook — this split looks vestigial; confirm before removing it.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.33,
    random_state=42,
)

## Logistic Regression + Random Forest + XGBoost Classifier

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Stratified 80/20 train/validation split with the labels flattened to 1-D.
# This rebinds X_train / y_train from the earlier 67/33 split.
X_train, X_val, y_train, y_val = train_test_split(
    features_df,
    target_df.values.ravel(),
    test_size=0.2,
    stratify=target_df,
    random_state=42,
)

# --- Logistic regression: grid over C, 5-fold CV scored by ROC AUC ---
logreg = LogisticRegression(max_iter=10000, random_state=42)
logreg_params = {'C': [0.1, 1, 10]}
logreg_grid = GridSearchCV(
    estimator=logreg, param_grid=logreg_params, cv=5, scoring='roc_auc', n_jobs=1
)
logreg_grid.fit(X_train, y_train)
best_logreg = logreg_grid.best_estimator_

# --- Random forest: grid over number of trees and depth cap ---
rf = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
rf_grid = GridSearchCV(
    estimator=rf, param_grid=rf_params, cv=5, scoring='roc_auc', n_jobs=1
)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

# --- XGBoost: grid over number of boosting rounds and learning rate ---
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]}
xgb_grid = GridSearchCV(
    estimator=xgb, param_grid=xgb_params, cv=5, scoring='roc_auc', n_jobs=1
)
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_

# --- Soft-voting ensemble over the three tuned models ---
ensemble_members = [('lr', best_logreg), ('rf', best_rf), ('xgb', best_xgb)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train, y_train)

# Validation ROC AUC of each fitted ensemble member (positive-class column
# of predict_proba) ...
for name, model in voting_clf.named_estimators_.items():
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"{name.upper()} AUC: {auc:.4f}")

# ... and of the ensemble as a whole.
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"Ensemble AUC: {ensemble_auc:.4f}")

LR AUC: 0.7216
RF AUC: 0.7766
XGB AUC: 0.7509
Ensemble AUC: 0.7580


In [43]:
from sklearn.metrics import classification_report

print("\n--- Classification Reports ---\n")

# One report per fitted ensemble member, each followed by a 60-dash rule.
separator = '-' * 60
for name, model in voting_clf.named_estimators_.items():
    y_pred = model.predict(X_val)
    print(
        f"{name.upper()} Classification Report:",
        classification_report(y_val, y_pred),
        separator,
        sep='\n',
    )

# Report for the voting ensemble itself.
ensemble_pred = voting_clf.predict(X_val)
print(
    "ENSEMBLE Classification Report:",
    classification_report(y_val, ensemble_pred),
    sep='\n',
)


--- Classification Reports ---

LR Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.64      0.63        83
           1       0.63      0.62      0.63        82

    accuracy                           0.63       165
   macro avg       0.63      0.63      0.63       165
weighted avg       0.63      0.63      0.63       165

------------------------------------------------------------
RF Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.64      0.69        83
           1       0.68      0.78      0.73        82

    accuracy                           0.71       165
   macro avg       0.71      0.71      0.71       165
weighted avg       0.71      0.71      0.71       165

------------------------------------------------------------
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.65      0.65        83
   

In [44]:
# Alias the fitted LR + RF + XGB soft-voting ensemble; the next cell pickles best_model.
best_model = voting_clf

In [45]:
import pickle
import os

# Output directory for the NR-Aromatase voting ensembles; created on demand
# so the write below does not fail on a fresh checkout.
model_dir = '../../Models/NR-Aromatase/Voting_Classifiers'
model_path = '../../Models/NR-Aromatase/Voting_Classifiers/lr_rf_xgboost.pkl'
os.makedirs(model_dir, exist_ok=True)

# Serialise the selected model with pickle (binary mode).
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

## Logistic Regression + Random Forest + XGBoost Classifier - with GridSearch and SMOTEENN

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.combine import SMOTEENN

# 1. Stratified 80/20 train/validation split (same arguments and seed as the
#    previous section).
X_train, X_val, y_train, y_val = train_test_split(
    features_df,
    target_df.values.ravel(),
    test_size=0.2,
    stratify=target_df,
    random_state=42,
)

# 2. Resample the training portion only with SMOTEENN; the validation set is
#    left untouched.
# NOTE(review): the 5-fold grid searches below run on data that was resampled
# before being split into folds, so their CV scores are computed on SMOTEENN
# output. Putting the sampler and estimator in an imblearn.pipeline.Pipeline
# would resample inside each fold instead — consider for a follow-up.
smoteen = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smoteen.fit_resample(X_train, y_train)

# 3. Logistic regression grid search
logreg = LogisticRegression(max_iter=10000, random_state=42)
logreg_params = {'C': [0.1, 1, 10]}
logreg_grid = GridSearchCV(logreg, logreg_params, cv=5, scoring='roc_auc', n_jobs=1)
best_logreg = logreg_grid.fit(X_train_resampled, y_train_resampled).best_estimator_

# 4. Random forest grid search
rf = RandomForestClassifier(random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='roc_auc', n_jobs=1)
best_rf = rf_grid.fit(X_train_resampled, y_train_resampled).best_estimator_

# 5. XGBoost grid search
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, scoring='roc_auc', n_jobs=1)
best_xgb = xgb_grid.fit(X_train_resampled, y_train_resampled).best_estimator_

# 6. Soft-voting ensemble over the three tuned models, fitted on the
#    resampled training data
ensemble_members = [('lr', best_logreg), ('rf', best_rf), ('xgb', best_xgb)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train_resampled, y_train_resampled)

# 7. Validation AUC and classification report for each ensemble member
for name, model in voting_clf.named_estimators_.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"\n{name.upper()} AUC: {auc:.4f}")
    print(f"{name.upper()} Classification Report:\n{classification_report(y_val, y_pred)}")

# 8. Validation AUC and classification report for the ensemble
y_pred_ensemble = voting_clf.predict(X_val)
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nEnsemble AUC: {ensemble_auc:.4f}")
print(f"Ensemble Classification Report:\n{classification_report(y_val, y_pred_ensemble)}")


LR AUC: 0.7748
LR Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.65      0.69        83
           1       0.68      0.77      0.72        82

    accuracy                           0.71       165
   macro avg       0.71      0.71      0.71       165
weighted avg       0.71      0.71      0.71       165


RF AUC: 0.7276
RF Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.54      0.63        83
           1       0.64      0.83      0.72        82

    accuracy                           0.68       165
   macro avg       0.70      0.69      0.68       165
weighted avg       0.70      0.68      0.68       165


XGB AUC: 0.7141
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.55      0.64        83
           1       0.64      0.82      0.72        82

    accuracy                           0.68       165


In [47]:
# Alias the SMOTEENN-trained LR + RF + XGB voting ensemble and pickle it next
# to the earlier model (the directory was created by the first save cell).
best_model = voting_clf

model_path = '../../Models/NR-Aromatase/Voting_Classifiers/lr_rf_xgboost_gridsearch_smoteen.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

## Addition of Models: SVC, GBC, ExtraTrees and KNN

In [48]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier

# All four searches below reuse X_train_resampled / y_train_resampled from the
# SMOTEENN section and the same 5-fold, ROC-AUC-scored grid-search settings.

# --- Support Vector Machine (probability=True so it exposes predict_proba) ---
svc = SVC(probability=True, random_state=42)
svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc_grid = GridSearchCV(
    estimator=svc, param_grid=svc_params, cv=5, scoring='roc_auc', n_jobs=1
)
svc_grid.fit(X_train_resampled, y_train_resampled)
best_svc = svc_grid.best_estimator_

# --- Gradient Boosting Classifier ---
gb = GradientBoostingClassifier(random_state=42)
gb_params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1]}
gb_grid = GridSearchCV(
    estimator=gb, param_grid=gb_params, cv=5, scoring='roc_auc', n_jobs=1
)
gb_grid.fit(X_train_resampled, y_train_resampled)
best_gb = gb_grid.best_estimator_

# --- Extra Trees Classifier ---
et = ExtraTreesClassifier(random_state=42)
et_params = {'n_estimators': [100, 200], 'max_depth': [None, 10]}
et_grid = GridSearchCV(
    estimator=et, param_grid=et_params, cv=5, scoring='roc_auc', n_jobs=1
)
et_grid.fit(X_train_resampled, y_train_resampled)
best_et = et_grid.best_estimator_

# --- K-Nearest Neighbors ---
knn = KNeighborsClassifier()
knn_params = {'n_neighbors': [3, 5, 7]}
knn_grid = GridSearchCV(
    estimator=knn, param_grid=knn_params, cv=5, scoring='roc_auc', n_jobs=1
)
knn_grid.fit(X_train_resampled, y_train_resampled)
best_knn = knn_grid.best_estimator_

# --- Final voting ensemble: the three earlier tuned models plus the four above ---
all_members = [
    ('lr', best_logreg),
    ('rf', best_rf),
    ('xgb', best_xgb),
    ('svc', best_svc),
    ('gb', best_gb),
    ('et', best_et),
    ('knn', best_knn),
]
voting_clf = VotingClassifier(estimators=all_members, voting='soft', n_jobs=1)
voting_clf.fit(X_train_resampled, y_train_resampled)

# --- Evaluation on the untouched validation split ---
for name, model in voting_clf.named_estimators_.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_proba)
    print(f"\n{name.upper()} AUC: {auc:.4f}")
    print(f"{name.upper()} Classification Report:\n{classification_report(y_val, y_pred)}")

y_pred_ensemble = voting_clf.predict(X_val)
y_pred_proba = voting_clf.predict_proba(X_val)[:, 1]
ensemble_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nEnsemble AUC: {ensemble_auc:.4f}")
print(f"Ensemble Classification Report:\n{classification_report(y_val, y_pred_ensemble)}")


LR AUC: 0.7748
LR Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.65      0.69        83
           1       0.68      0.77      0.72        82

    accuracy                           0.71       165
   macro avg       0.71      0.71      0.71       165
weighted avg       0.71      0.71      0.71       165


RF AUC: 0.7276
RF Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.54      0.63        83
           1       0.64      0.83      0.72        82

    accuracy                           0.68       165
   macro avg       0.70      0.69      0.68       165
weighted avg       0.70      0.68      0.68       165


XGB AUC: 0.7141
XGB Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.55      0.64        83
           1       0.64      0.82      0.72        82

    accuracy                           0.68       165


In [49]:
# Alias the seven-model soft-voting ensemble and pickle it alongside the
# earlier models.
best_model = voting_clf

model_path = '../../Models/NR-Aromatase/Voting_Classifiers/lr_rf_xgboost_knn_et_svc_gb.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

## Incremental Addition to Find Best Stack

In [50]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Candidate members in the order they are added to the ensemble.
models = [
    ('lr', best_logreg),
    ('rf', best_rf),
    ('xgb', best_xgb),
    ('svc', best_svc),
    ('gb', best_gb),
    ('et', best_et),
    ('knn', best_knn),
]

ensemble_results = []

# Fit a soft-voting ensemble on the first i candidates (i = 1..len(models))
# and record its validation metrics.
# NOTE: this loop rebinds the notebook-level name voting_clf; after the loop
# it refers to the last ensemble built (all seven members).
for i in range(1, len(models) + 1):
    selected_models = models[:i]

    voting_clf = VotingClassifier(estimators=selected_models, voting='soft')
    voting_clf.fit(X_train_resampled, y_train_resampled)

    y_proba = voting_clf.predict_proba(X_val)[:, 1]
    y_pred = voting_clf.predict(X_val)

    auc = roc_auc_score(y_val, y_proba)
    report = classification_report(y_val, y_pred, output_dict=True)
    positive_class = report['1']  # per-class metrics for label 1

    result = dict(
        n_models=i,
        model_names=[pair[0] for pair in selected_models],
        auc=round(auc, 4),
        macro_f1=round(report['macro avg']['f1-score'], 4),
        weighted_f1=round(report['weighted avg']['f1-score'], 4),
        class1_recall=round(positive_class['recall'], 4),
        class1_precision=round(positive_class['precision'], 4),
    )
    ensemble_results.append(result)

# One row per ensemble size.
results_df = pd.DataFrame(ensemble_results)
print(results_df)

   n_models                      model_names     auc  macro_f1  weighted_f1  \
0         1                             [lr]  0.7748    0.7082       0.7081   
1         2                         [lr, rf]  0.7515    0.6863       0.6861   
2         3                    [lr, rf, xgb]  0.7465    0.7033       0.7031   
3         4               [lr, rf, xgb, svc]  0.7492    0.7033       0.7031   
4         5           [lr, rf, xgb, svc, gb]  0.7427    0.6797       0.6794   
5         6       [lr, rf, xgb, svc, gb, et]  0.7458    0.6853       0.6851   
6         7  [lr, rf, xgb, svc, gb, et, knn]  0.7476    0.7090       0.7087   

   class1_recall  class1_precision  
0         0.7683            0.6848  
1         0.8171            0.6505  
2         0.8537            0.6604  
3         0.8537            0.6604  
4         0.8171            0.6442  
5         0.8293            0.6476  
6         0.8659            0.6636  


## Final Stacking Classifier

In [51]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd

# Base layer: only the tuned logistic regression (the single-model row had
# the highest AUC in the incremental table above).
base_learners = [('lr', best_logreg)]

# Meta layer: a fresh logistic regression fitted on the base learner's
# cross-validated predict_proba output (passthrough=False, so the raw
# features are not forwarded to it).
meta_learner = LogisticRegression(max_iter=10000, random_state=42)

stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    stack_method='predict_proba',
    passthrough=False,
    cv=5,
    n_jobs=1,
)
stacking_clf.fit(X_train_resampled, y_train_resampled)

# Score on the validation split.
y_pred_stack = stacking_clf.predict(X_val)
y_pred_proba_stack = stacking_clf.predict_proba(X_val)[:, 1]

stack_auc = roc_auc_score(y_val, y_pred_proba_stack)
print(f"\nStackingClassifier AUC: {stack_auc:.4f}")
print(
    "StackingClassifier Classification Report:",
    classification_report(y_val, y_pred_stack),
    sep='\n',
)


StackingClassifier AUC: 0.7748
StackingClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.65      0.69        83
           1       0.68      0.77      0.72        82

    accuracy                           0.71       165
   macro avg       0.71      0.71      0.71       165
weighted avg       0.71      0.71      0.71       165



In [52]:
# Persist the stacking model fitted in the previous cell.
# This must be stacking_clf, not voting_clf: at this point voting_clf is the
# last ensemble left over from the incremental-addition loop, so pickling it
# would store a voting classifier under the "final_stacking_classifier" name.
best_model = stacking_clf

with open('../../Models/NR-Aromatase/Voting_Classifiers/final_stacking_classifier.pkl', 'wb') as file:
    pickle.dump(best_model, file)