# Imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Read-In Data

In [2]:
# Load the pre-processed toxicity table; the path is relative to this notebook.
toxicity_csv = '../../DATA/filled_toxicity_df.csv'
df = pd.read_csv(toxicity_csv)

# Preview the first rows to sanity-check the columns that were read.
df.head()

Unnamed: 0,mol_id,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,...,1,0,0,0,0,1,0,0,0,0
1,TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,...,0,0,0,0,0,0,0,0,0,0
3,TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Use the molecule identifier as the row index.
# Re-assigning (instead of inplace=True) and guarding on the current index name
# keeps this cell re-runnable: once 'mol_id' has been moved into the index it is
# no longer a column, so a second unguarded set_index('mol_id') raises KeyError.
if df.index.name != 'mol_id':
    df = df.set_index('mol_id')

df.head()

Unnamed: 0_level_0,MolecularWeight,LogP,TPSA,HBDonors,HBAcceptors,RotatableBonds,FractionCSP3,HeavyAtoms,RingCount,AromaticProportion,...,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
mol_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TOX3021,258.324,1.3424,82.28,1.0,5.0,3.0,0.222222,16.0,2.0,0.5625,...,1,0,0,0,0,1,0,0,0,0
TOX3020,204.229,1.2994,49.41,1.0,2.0,2.0,0.272727,15.0,2.0,0.4,...,0,0,0,0,0,0,0,0,0,0
TOX3024,288.475,5.0903,20.23,1.0,1.0,1.0,0.9,21.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
TOX3027,276.424,3.75244,32.34,1.0,2.0,7.0,0.588235,20.0,1.0,0.3,...,0,0,0,0,0,0,0,0,0,0
TOX20800,206.027,-0.9922,135.29,5.0,3.0,2.0,1.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# List the column labels that remain after 'mol_id' was moved into the index.
df.columns

Index(['MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
       'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount',
       'AromaticProportion', 'LogS_ESOL', 'PositiveCharges', 'NegativeCharges',
       'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
       'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
       'Heteroatoms', 'HalogenCount', 'PhenolicGroups', 'NR-AR', 'NR-AR-LBD',
       'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma',
       'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'],
      dtype='object')

In [5]:
# Undersample the NR-ER negatives and keep every NR-ER positive.
# NOTE(review): 884 is presumably the number of NR-ER positive rows — confirm
# against the data; if it differs, balanced_df is not actually 1:1.
is_negative = df['NR-ER'] == 0
is_positive = df['NR-ER'] == 1

subset_0 = df.loc[is_negative].sample(n=884, random_state=42)
subset_1 = df.loc[is_positive]

# Negatives first, then positives (row order is not shuffled here).
balanced_df = pd.concat([subset_0, subset_1])

# Descriptor columns used as model inputs.
feature_columns = [
    'MolecularWeight', 'LogP', 'TPSA', 'HBDonors', 'HBAcceptors',
    'RotatableBonds', 'FractionCSP3', 'HeavyAtoms', 'RingCount', 'LogS_ESOL',
    'FormalCharge', 'AromaticRings', 'AromaticHeterocycles',
    'AliphaticRings', 'MolecularComplexity', 'MolarRefractivity',
]
features_df = balanced_df[feature_columns]

# Keep the label as a one-column DataFrame (double brackets), not a Series.
target_df = balanced_df[['NR-ER']]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the balanced rows with a fixed seed.
# These four names are re-bound by the stratified split in the ANN cell further down.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.33,
    random_state=42,
)

# ANN + SMOTEENN

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

# --- 1. Convert data to numpy and split ---
# float32 throughout, so the arrays can be turned into float32 torch tensors below.
X = features_df.to_numpy(dtype=np.float32)
y = target_df.to_numpy(dtype=np.float32).ravel()

# Stratify on the label so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- 2. Standardize features ---
# Fit on the training partition only; the test partition reuses those statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- 3. Convert to torch tensors ---
# Targets become column vectors of shape (n_samples, 1).
X_train_tensor, X_test_tensor = torch.tensor(X_train), torch.tensor(X_test)
y_train_tensor = torch.tensor(y_train).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test).reshape(-1, 1)

# --- 4. Define the ANN model ---
class ANNModel(nn.Module):
    """Feed-forward binary classifier with hidden widths 128 -> 64 -> 32.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3). The head
    is a single Linear unit followed by a sigmoid, so forward() returns one value
    in [0, 1] per input row.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = [input_dim, 128, 64, 32]
        layers = []
        # Pair each width with the next one to get (in, out) for every hidden stage.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
        layers += [nn.Linear(32, 1), nn.Sigmoid()]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the layer stack and return the sigmoid outputs."""
        return self.network(x)

# Seed PyTorch so weight initialisation and dropout masks are repeatable,
# in line with the random_state=42 used for the data split.
torch.manual_seed(42)

model = ANNModel(X_train.shape[1])

# --- 5. Loss and Optimizer ---
# ANNModel ends in a sigmoid, so binary cross-entropy on probabilities applies.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- 6. Training loop ---
# Full-batch training: each epoch is one forward/backward pass over the whole
# training tensor.
epochs = 2000
model.train()  # training mode: dropout active, BatchNorm uses batch statistics
for epoch in range(epochs):
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# --- 7. Evaluation ---
model.eval()  # inference mode: dropout off, BatchNorm uses running statistics
with torch.no_grad():
    y_pred_proba = model(X_test_tensor).numpy()
    # Threshold the predicted probabilities at 0.5 to get hard labels.
    y_pred_label = (y_pred_proba >= 0.5).astype(int)

print("\nClassification Report:\n", classification_report(y_test, y_pred_label))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba))

import torch.nn as nn

class ToxMLP(nn.Module):
    """Deeper feed-forward binary classifier: 512 -> 256 -> 128 -> 64 -> 32 -> 1.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, with dropout
    probabilities 0.4, 0.4, 0.3, 0.3 and 0.2 from the widest stage to the
    narrowest. The head is a single Linear unit followed by a sigmoid, so
    forward() already returns values in [0, 1] rather than raw logits.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # (output width, dropout probability) for each hidden stage, in order.
        hidden_plan = [(512, 0.4), (256, 0.4), (128, 0.3), (64, 0.3), (32, 0.2)]

        stack = []
        in_width = input_dim
        for out_width, drop_p in hidden_plan:
            stack.extend([
                nn.Linear(in_width, out_width),
                nn.BatchNorm1d(out_width),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ])
            in_width = out_width
        stack.extend([nn.Linear(in_width, 1), nn.Sigmoid()])

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run x through the layer stack and return the sigmoid outputs."""
        return self.network(x)

# Seed PyTorch so weight initialisation and dropout masks are repeatable,
# in line with the random_state=42 used for the data split.
torch.manual_seed(42)

model = ToxMLP(X_train.shape[1])

# --- 5. Loss and Optimizer ---
# ToxMLP already ends in nn.Sigmoid, so its outputs are probabilities.
# nn.BCEWithLogitsLoss applies a sigmoid internally and would squash those
# outputs a second time, so the loss would not match the probabilities that are
# thresholded below. nn.BCELoss is the criterion that expects probabilities.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- 6. Training loop ---
# Full-batch training: each epoch is one forward/backward pass over the whole
# training tensor.
epochs = 100
model.train()  # training mode: dropout active, BatchNorm uses batch statistics
for epoch in range(epochs):
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# --- 7. Evaluation ---
model.eval()  # inference mode: dropout off, BatchNorm uses running statistics
with torch.no_grad():
    y_pred_proba = model(X_test_tensor).numpy()
    # Threshold the predicted probabilities at 0.5 to get hard labels.
    y_pred_label = (y_pred_proba >= 0.5).astype(int)

print("\nClassification Report:\n", classification_report(y_test, y_pred_label))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba))

Epoch [10/2000], Loss: 0.6769
Epoch [20/2000], Loss: 0.6523
Epoch [30/2000], Loss: 0.6409
Epoch [40/2000], Loss: 0.6374
Epoch [50/2000], Loss: 0.6223
Epoch [60/2000], Loss: 0.6192
Epoch [70/2000], Loss: 0.6084
Epoch [80/2000], Loss: 0.6044
Epoch [90/2000], Loss: 0.6055
Epoch [100/2000], Loss: 0.6039
Epoch [110/2000], Loss: 0.5889
Epoch [120/2000], Loss: 0.5839
Epoch [130/2000], Loss: 0.5909
Epoch [140/2000], Loss: 0.5863
Epoch [150/2000], Loss: 0.5737
Epoch [160/2000], Loss: 0.5725
Epoch [170/2000], Loss: 0.5707
Epoch [180/2000], Loss: 0.5568
Epoch [190/2000], Loss: 0.5625
Epoch [200/2000], Loss: 0.5571
Epoch [210/2000], Loss: 0.5476
Epoch [220/2000], Loss: 0.5462
Epoch [230/2000], Loss: 0.5311
Epoch [240/2000], Loss: 0.5385
Epoch [250/2000], Loss: 0.5476
Epoch [260/2000], Loss: 0.5258
Epoch [270/2000], Loss: 0.5359
Epoch [280/2000], Loss: 0.5363
Epoch [290/2000], Loss: 0.5193
Epoch [300/2000], Loss: 0.5317
Epoch [310/2000], Loss: 0.5128
Epoch [320/2000], Loss: 0.5206
Epoch [330/2000],

In [9]:
import joblib

# Output locations for the trained artefacts (relative to the working directory).
MODEL_PATH = 'tox_mlp_model_weights.pth'
SCALER_PATH = 'standard_scaler.joblib'

# 1. Persist only the learned parameters (the state_dict), not the module object.
trained_weights = model.state_dict()
torch.save(trained_weights, MODEL_PATH)
print(f"PyTorch model weights saved to: {MODEL_PATH}")

# 2. Persist the fitted StandardScaler alongside the weights: new inputs have to
#    be transformed with the statistics learned from the training split.
joblib.dump(scaler, SCALER_PATH)
print(f"StandardScaler saved to: {SCALER_PATH}")

print("\nModel and scaler saved successfully!")

PyTorch model weights saved to: tox_mlp_model_weights.pth
StandardScaler saved to: standard_scaler.joblib

Model and scaler saved successfully!
