# Imports

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Data Read-In

In [4]:
df = pd.read_csv('../../DATA/filled_toxicity_df.csv')

In [5]:
df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df.set_index('mol_id', inplace=True)

In [7]:
df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

# Using Random Sampling

## ML Preprocessing

### Feature | Target Split

In [9]:
subset_0 = df[df['NR-AR'] == 0].sample(n=312, random_state=42)

subset_1 = df[df['NR-AR'] == 1]

balanced_df = pd.concat([subset_0, subset_1])

In [10]:
features_df = balanced_df[['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity']]

target_df = balanced_df[['NR-AR']]

In [11]:
target_df.value_counts()

NR-AR
0        312
1        312
dtype: int64

### Train | Test Split

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_df, target_df, test_size=0.33, random_state=42)

In [16]:
y_train.head()

Unnamed: 0_level_0,NR-AR
mol_id,Unnamed: 1_level_1
TOX27098,0
TOX25493,0
TOX27766,1
TOX12376,0
TOX2901,1


# Voting Classifier

In [18]:
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report

y_train_numeric = y_train["NR-AR"].astype(int)

# Step 1: Define base models
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight = (len(y_train_numeric) - sum(y_train_numeric)) / sum(y_train_numeric),
    random_state=42
)

lgbm = LGBMClassifier(
    class_weight='balanced',
    random_state=42
)

ann = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42
)

# Step 2: Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb),
        ('lgbm', lgbm),
        ('ann', ann)
    ],
    voting='soft'  # use soft voting for probabilities
)

# Step 3: Full pipeline
pipeline = Pipeline([
    ('smote_enn', SMOTEENN(random_state=42)),
    ('scaler', StandardScaler()),
    ('voting', voting_clf)
])

# Step 4: Fit
pipeline.fit(X_train, y_train)

# Step 5: Predict and Evaluate
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Number of positive: 89, number of negative: 71
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 425
[LightGBM] [Info] Number of data points in the train set: 160, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [19]:
print("AUC-ROC:", roc_auc_score(y_test, y_proba))
print("\n Classification Report:")
print(classification_report(y_test, y_pred))

AUC-ROC: 0.8064516129032258

 Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       113
           1       0.71      0.73      0.72        93

    accuracy                           0.74       206
   macro avg       0.74      0.74      0.74       206
weighted avg       0.74      0.74      0.74       206



## Dumping into a .pkl File

In [23]:
import pickle
import os

os.makedirs('../../Models/NR-AR/', exist_ok=True)

with open('../../Models/NR-AR/voting_xgb_lgbm_ann.pkl', 'wb') as file:
    pickle.dump(pipeline, file)