<a href="https://colab.research.google.com/github/shreekar2005/Cheminformatics_DC/blob/main/DC2_parallel_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
# Mount Drive at /content/drive. The data-loading cell reads its input files
# from the MyDrive/DATASET folder underneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
import numpy as np
import pandas as pd
from scipy import io
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

# Load data
# Drive is mounted at the absolute path /content/drive, so anchor every dataset
# path there instead of relying on the kernel's current working directory.
DATA_DIR = '/content/drive/MyDrive/DATASET'

y_tr = pd.read_csv(f'{DATA_DIR}/tox21_labels_train.csv.gz', index_col=0, compression="gzip")
y_te = pd.read_csv(f'{DATA_DIR}/tox21_labels_test.csv.gz', index_col=0, compression="gzip")
x_tr_dense = pd.read_csv(f'{DATA_DIR}/tox21_dense_train.csv.gz', index_col=0, compression="gzip").values
x_te_dense = pd.read_csv(f'{DATA_DIR}/tox21_dense_test.csv.gz', index_col=0, compression="gzip").values
x_tr_sparse = io.mmread(f'{DATA_DIR}/tox21_sparse_train.mtx.gz').tocsc()
x_te_sparse = io.mmread(f'{DATA_DIR}/tox21_sparse_test.mtx.gz').tocsc()

# Fail early, with a readable message, if the pieces that get stacked
# side by side (and later masked with the label columns) do not line up.
assert x_tr_dense.shape[0] == x_tr_sparse.shape[0] == len(y_tr), \
    "train dense/sparse/label row counts differ"
assert x_te_dense.shape[0] == x_te_sparse.shape[0] == len(y_te), \
    "test dense/sparse/label row counts differ"
assert x_tr_sparse.shape[1] == x_te_sparse.shape[1], \
    "train and test sparse matrices have different column counts"
# NOTE(review): rows are assumed to be in the same order across the label,
# dense and sparse files; only the counts are checked here.

# Filter sparse features: keep columns that are > 0 in more than
# MIN_SPARSE_FREQ of the *training* rows; the same columns are used for test.
MIN_SPARSE_FREQ = 0.05
# np.asarray flattens the result whether .mean() hands back np.matrix or ndarray.
sparse_col_freq = np.asarray((x_tr_sparse > 0).mean(0)).ravel()
sparse_col_idx = sparse_col_freq > MIN_SPARSE_FREQ
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])

# Normalize: statistics are fitted on the training set only and re-used for test.
scaler = StandardScaler()
x_tr = scaler.fit_transform(x_tr)
x_te = scaler.transform(x_te)

# Convert to torch tensors
X_train_full = torch.tensor(x_tr, dtype=torch.float32)
X_test_full = torch.tensor(x_te, dtype=torch.float32)

In [28]:
# Define simple binary classifier NN
class BinaryNN(nn.Module):
    """Small feed-forward network that emits one raw logit per input row.

    Layer stack: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Linear(.., 1).
    No sigmoid is applied in ``forward``; the output is a logit.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Widths of the two hidden layers, as a 2-item sequence.
            Defaults to ``(128, 64)``.
        dropout: Dropout probability applied after the first hidden
            activation. Defaults to ``0.3``.

    Raises:
        ValueError: If ``hidden_dims`` does not contain exactly two widths.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64), dropout=0.3):
        super().__init__()
        if len(hidden_dims) != 2:
            raise ValueError(
                f"hidden_dims must contain exactly two widths, got {len(hidden_dims)}"
            )
        first_width, second_width = hidden_dims
        # Layer order is fixed so state_dict keys (net.0, net.3, net.5) stay
        # the same regardless of the chosen widths.
        self.net = nn.Sequential(
            nn.Linear(input_dim, first_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(first_width, second_width),
            nn.ReLU(),
            nn.Linear(second_width, 1)
        )

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for ``x`` of shape ``(batch, input_dim)``."""
        return self.net(x)

In [29]:
models = [None] * len(y_tr.columns)  # models[i] is the network trained on y_tr.columns[i]

# Training configuration (hoisted out of the loop: none of it depends on the assay)
SEED = 42             # base seed; assay i is seeded with SEED + i
LEARNING_RATE = 1e-3
LOG_EVERY = 5         # print the loss every LOG_EVERY epochs
epochs = 20
criterion = nn.BCEWithLogitsLoss()  # expects raw logits, which is what BinaryNN returns

# Training loop for each assay: one independent binary classifier per label column
for i, target in enumerate(y_tr.columns):
    print(f"\n{'='*60}\nTraining NN for assay {i+1}: {target}")

    # Keep only the rows whose label for this assay is a finite number
    train_mask = np.isfinite(y_tr[target].values)
    X_train = X_train_full[train_mask]
    Y_train = torch.tensor(y_tr[target].values[train_mask].reshape(-1, 1), dtype=torch.float32)

    # Seed before the model is built so that weight initialisation and the
    # dropout masks are reproducible across kernel restarts, and so that each
    # assay's result does not depend on how many assays were trained before it.
    torch.manual_seed(SEED + i)

    # Initialize model and optimizer
    model = BinaryNN(X_train.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Train model: every epoch is one full-batch gradient step
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, Y_train)
        loss.backward()
        optimizer.step()
        if (epoch+1) % LOG_EVERY == 0:
            print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item():.4f}")

    # Store model after training
    models[i] = model



Training NN for assay 1: NR.AhR
Epoch 5/20 - Loss: 0.3716
Epoch 10/20 - Loss: 0.2992
Epoch 15/20 - Loss: 0.2618
Epoch 20/20 - Loss: 0.2377

Training NN for assay 2: NR.AR
Epoch 5/20 - Loss: 0.2447
Epoch 10/20 - Loss: 0.1381
Epoch 15/20 - Loss: 0.1347
Epoch 20/20 - Loss: 0.1284

Training NN for assay 3: NR.AR.LBD
Epoch 5/20 - Loss: 0.3912
Epoch 10/20 - Loss: 0.1579
Epoch 15/20 - Loss: 0.1247
Epoch 20/20 - Loss: 0.1217

Training NN for assay 4: NR.Aromatase
Epoch 5/20 - Loss: 0.4132
Epoch 10/20 - Loss: 0.2053
Epoch 15/20 - Loss: 0.1823
Epoch 20/20 - Loss: 0.1722

Training NN for assay 5: NR.ER
Epoch 5/20 - Loss: 0.3855
Epoch 10/20 - Loss: 0.3571
Epoch 15/20 - Loss: 0.3231
Epoch 20/20 - Loss: 0.3007

Training NN for assay 6: NR.ER.LBD
Epoch 5/20 - Loss: 0.3286
Epoch 10/20 - Loss: 0.1962
Epoch 15/20 - Loss: 0.1915
Epoch 20/20 - Loss: 0.1712

Training NN for assay 7: NR.PPAR.gamma
Epoch 5/20 - Loss: 0.3677
Epoch 10/20 - Loss: 0.1637
Epoch 15/20 - Loss: 0.1483
Epoch 20/20 - Loss: 0.1464

Tr

In [30]:
# Evaluation Loop for each assay
threshold = 0.5  # Default threshold; try adjusting this value for better results.

print("\n" + "-"*60)
print("Final Classification Report for Selected Assays:")
print("-"*60)

for i, target in enumerate(y_te.columns):
    print(f"\nAssay {i+1}: {target}")

    # models[i] was trained on y_tr.columns[i]; refuse to score a test column
    # with a model that was fitted for a different assay.
    if i >= len(models) or y_tr.columns[i] != target:
        raise ValueError(
            f"Test column {i} ({target!r}) has no model trained on the same assay."
        )

    # Get valid test rows (finite label for this assay)
    test_mask = np.isfinite(y_te[target].values)
    X_test = X_test_full[test_mask]
    Y_test = y_te[target][test_mask].values
    # Labels are treated as binary 0/1 below (the confusion matrix is unpacked
    # into four cells); integer labels keep both sklearn calls on one dtype.
    y_true = Y_test.astype(int)

    model = models[i]
    model.eval()  # disables dropout for inference

    with torch.no_grad():
        # squeeze(1) drops only the output dimension, so a single-row test set
        # still yields a 1-D array instead of a 0-d scalar.
        logits = model(X_test).squeeze(1)
        probs = torch.sigmoid(logits).numpy()
        preds = (probs >= threshold).astype(int)

    # Check if there are positive predictions or labels
    if np.sum(preds) == 0 and np.sum(Y_test) == 0:
        print("Only negative samples — skipping evaluation.")
        continue
    elif np.sum(preds) == 0:
        print("⚠️ No positive predictions — skipping classification report.")
        print("Consider adjusting the threshold or using class weighting.")
        # This count used to sit in a trailing branch that could never run,
        # because this branch has already skipped to the next assay.
        print(f"Positive samples in the test set: {int(np.sum(y_true))}")
        continue
    elif np.sum(Y_test) == 0:
        print("⚠️ No positive labels — skipping classification report.")
        continue

    # Classification report
    report = classification_report(y_true, preds, output_dict=True)
    print(f"macro avg  precision: {report['macro avg']['precision']:.6f} | "
          f"recall: {report['macro avg']['recall']:.6f} | "
          f"f1-score: {report['macro avg']['f1-score']:.6f} | "
          f"support: {report['macro avg']['support']}")
    print(f"weighted avg precision: {report['weighted avg']['precision']:.6f} | "
          f"recall: {report['weighted avg']['recall']:.6f} | "
          f"f1-score: {report['weighted avg']['f1-score']:.6f} | "
          f"support: {report['weighted avg']['support']}")

    # Confusion matrix; labels=[0, 1] forces a 2x2 result even when one class
    # is absent, so the four-way unpack below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, preds, labels=[0, 1]).ravel()
    print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")


------------------------------------------------------------
Final Classification Report for Selected Assays:
------------------------------------------------------------

Assay 1: NR.AhR
macro avg  precision: 0.724744 | recall: 0.732673 | f1-score: 0.728598 | support: 610.0
weighted avg precision: 0.885688 | recall: 0.883607 | f1-score: 0.884615 | support: 610.0
Confusion Matrix: TN=500, FP=37, FN=34, TP=39

Assay 2: NR.AR
macro avg  precision: 0.573851 | recall: 0.537311 | f1-score: 0.548623 | support: 586.0
weighted avg precision: 0.964358 | recall: 0.972696 | f1-score: 0.968217 | support: 586.0
Confusion Matrix: TN=569, FP=5, FN=11, TP=1

Assay 3: NR.AR.LBD
macro avg  precision: 0.493080 | recall: 0.496516 | f1-score: 0.494792 | support: 582.0
weighted avg precision: 0.972604 | recall: 0.979381 | f1-score: 0.975981 | support: 582.0
Confusion Matrix: TN=570, FP=4, FN=8, TP=0

Assay 4: NR.Aromatase
⚠️ No positive predictions — skipping classification report.
Consider adjusting the t