<a href="https://colab.research.google.com/github/shreekar2005/Cheminformatics_DC/blob/main/DC2_parallel_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the data-loading cell below
# reads its input files through this mount.
_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from scipy import io
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

# Load data
# Label tables: the training/evaluation cells iterate over their columns and
# skip entries that are not finite, so missing labels are expected here.
y_tr = pd.read_csv('drive/MyDrive/DATASET/tox21_labels_train.csv.gz', index_col=0, compression="gzip")
y_te = pd.read_csv('drive/MyDrive/DATASET/tox21_labels_test.csv.gz', index_col=0, compression="gzip")
# Dense feature tables, converted to plain NumPy arrays via `.values`.
x_tr_dense = pd.read_csv('drive/MyDrive/DATASET/tox21_dense_train.csv.gz', index_col=0, compression="gzip").values
x_te_dense = pd.read_csv('drive/MyDrive/DATASET/tox21_dense_test.csv.gz', index_col=0, compression="gzip").values
# Sparse feature matrices in Matrix Market format, converted to CSC so that
# column selection below is efficient.
x_tr_sparse = io.mmread('drive/MyDrive/DATASET/tox21_sparse_train.mtx.gz').tocsc()
x_te_sparse = io.mmread('drive/MyDrive/DATASET/tox21_sparse_test.mtx.gz').tocsc()

# Filter sparse features: keep only the columns that are non-zero in more than
# 5% of the *training* rows; the same column mask is then applied to the test set.
# np.asarray(...) is used instead of the np.matrix-only `.A` attribute so the
# mask is built correctly whether `mean(0)` returns an np.matrix (sparse
# matrix input) or a plain ndarray (sparse array input).
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0) > 0.05).ravel()
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])

# Normalize: statistics are fitted on the training features only and reused
# for the test features, so no test information leaks into the scaling.
scaler = StandardScaler()
x_tr = scaler.fit_transform(x_tr)
x_te = scaler.transform(x_te)

# Convert to torch tensors (float32) for the networks defined below.
X_train_full = torch.tensor(x_tr, dtype=torch.float32)
X_test_full = torch.tensor(x_te, dtype=torch.float32)

In [4]:
# Define simple binary classifier NN
class BinaryNN(nn.Module):
    """Small feed-forward network that emits one raw logit per input row.

    Layer stack, in order:
    Linear(input_dim, 128) -> ReLU -> Dropout(0.3) -> Linear(128, 64) -> ReLU
    -> Linear(64, 1).

    No sigmoid is applied inside the module; callers receive logits.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        # Registered under the attribute name `net`, so parameter names keep
        # the `net.<index>.*` form.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the resulting logits."""
        logits = self.net(x)
        return logits

In [5]:
# Train one independent BinaryNN per label column of y_tr.
# Weight initialisation and dropout are random, so a fixed base seed is used:
# without it every run of this cell produces different models and therefore
# different evaluation numbers.
SEED = 42

models = [None] * len(y_tr.columns)  # models[i] is the network trained on y_tr.columns[i]

# Loop-invariant settings, created once instead of once per assay.
criterion = nn.BCEWithLogitsLoss()  # expects raw logits; BinaryNN applies no sigmoid
epochs = 20

# Training loop for each assay
for i, target in enumerate(y_tr.columns):
    print(f"\n{'='*60}\nTraining NN for assay {i+1}: {target}")

    # Per-assay seed: each model is reproducible on its own, independent of
    # how many assays were trained before it.
    torch.manual_seed(SEED + i)

    # Get valid rows for training (rows whose label for this assay is finite)
    train_mask = np.isfinite(y_tr[target].values)
    X_train = X_train_full[train_mask]
    # Column vector of shape (n_rows, 1) to match the model's (n_rows, 1) output.
    Y_train = torch.tensor(y_tr[target][train_mask].values.reshape(-1, 1), dtype=torch.float32)
    print(Y_train.shape)

    # Initialize model and optimizer
    model = BinaryNN(X_train.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Train model: full-batch gradient descent, one optimizer step per epoch.
    model.train()  # enables dropout; nothing in the loop switches the mode back
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, Y_train)
        loss.backward()
        optimizer.step()
        if (epoch+1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item():.4f}")

    # Store model after training
    models[i] = model



Training NN for assay 1: NR.AhR
torch.Size([8441, 1])
Epoch 5/20 - Loss: 0.3824
Epoch 10/20 - Loss: 0.2991
Epoch 15/20 - Loss: 0.2588
Epoch 20/20 - Loss: 0.2378

Training NN for assay 2: NR.AR
torch.Size([9654, 1])
Epoch 5/20 - Loss: 0.2651
Epoch 10/20 - Loss: 0.1419
Epoch 15/20 - Loss: 0.1378
Epoch 20/20 - Loss: 0.1306

Training NN for assay 3: NR.AR.LBD
torch.Size([8852, 1])
Epoch 5/20 - Loss: 0.1968
Epoch 10/20 - Loss: 0.1229
Epoch 15/20 - Loss: 0.1208
Epoch 20/20 - Loss: 0.1087

Training NN for assay 4: NR.Aromatase
torch.Size([7440, 1])
Epoch 5/20 - Loss: 0.3527
Epoch 10/20 - Loss: 0.1966
Epoch 15/20 - Loss: 0.1795
Epoch 20/20 - Loss: 0.1671

Training NN for assay 5: NR.ER
torch.Size([7962, 1])
Epoch 5/20 - Loss: 0.3804
Epoch 10/20 - Loss: 0.3489
Epoch 15/20 - Loss: 0.3200
Epoch 20/20 - Loss: 0.2969

Training NN for assay 6: NR.ER.LBD
torch.Size([9040, 1])
Epoch 5/20 - Loss: 0.2986
Epoch 10/20 - Loss: 0.1924
Epoch 15/20 - Loss: 0.1879
Epoch 20/20 - Loss: 0.1704

Training NN for a

In [6]:
from tabulate import tabulate

# Evaluation Loop for each assay
# For every label column of y_te: score the labelled test rows with the model
# stored at the same position in `models`, threshold the sigmoid probabilities,
# and report class-1 precision / recall / F1 together with the confusion-matrix
# counts. Macro (unweighted) and positive-support-weighted averages across
# assays are printed at the end.
threshold = 0.5

results_table = []
macro_precisions = []
macro_recalls = []
macro_f1s = []
weighted_precisions = []
weighted_recalls = []
weighted_f1s = []
total_support = 0

print("\n" + "-"*60)
print("Final Classification Report for Selected Assays (Class 1 only):")
print("-"*60)

for i, target in enumerate(y_te.columns):
    # Get valid test rows (rows whose label for this assay is finite)
    test_mask = np.isfinite(y_te[target].values)
    X_test = X_test_full[test_mask]
    Y_test = y_te[target][test_mask].values

    # NOTE(review): models are matched to assays by position, which assumes
    # y_te.columns is in the same order as the columns the models were
    # trained on -- confirm for any new dataset.
    model = models[i]
    model.eval()  # disable dropout for inference

    with torch.no_grad():
        # squeeze(1) drops only the output-feature axis, so a single-row test
        # set still yields a 1-D vector rather than a 0-d scalar.
        logits = model(X_test).squeeze(1)
        probs = torch.sigmoid(logits).numpy()
        preds = (probs >= threshold).astype(int)

    # Confusion matrix
    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in
    # both the labels and the predictions; without it the result is 1x1 and
    # the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(Y_test, preds, labels=[0, 1]).ravel()
    support_1 = fn + tp                  # number of true class-1 rows
    support_total = tn + fp + fn + tp    # all evaluated rows for this assay

    # Metrics for class 1 (defined as 0.0 when the denominator is zero)
    precision_1 = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall_1 = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1_1 = (2 * precision_1 * recall_1) / (precision_1 + recall_1) if (precision_1 + recall_1) > 0 else 0.0

    # The "Support" column shows all evaluated rows (support_total), whereas
    # the weighted averages below weight by the class-1 count (support_1).
    results_table.append([
        f"Assay {i+1}: {target}", f"{precision_1:.4f}", f"{recall_1:.4f}", f"{f1_1:.4f}", support_total,
        f"{tn}", f"{fp}", f"{fn}", f"{tp}"
    ])

    # Accumulate macro and weighted scores
    macro_precisions.append(precision_1)
    macro_recalls.append(recall_1)
    macro_f1s.append(f1_1)

    weighted_precisions.append(precision_1 * support_1)
    weighted_recalls.append(recall_1 * support_1)
    weighted_f1s.append(f1_1 * support_1)
    total_support += support_1

# Print results in table
headers = ["Assay", "Precision", "Recall", "F1-Score", "Support", "TN", "FP", "FN", "TP"]
print(tabulate(results_table, headers=headers, tablefmt="fancy_grid"))

# Weighted averages: guard against a test set with no positive rows at all,
# using the same 0.0 convention as the per-assay metrics above.
if total_support > 0:
    weighted_avg_precision = np.sum(weighted_precisions) / total_support
    weighted_avg_recall = np.sum(weighted_recalls) / total_support
    weighted_avg_f1 = np.sum(weighted_f1s) / total_support
else:
    weighted_avg_precision = weighted_avg_recall = weighted_avg_f1 = 0.0

# Print final averages
summary_table = [[
    "Macro Avg", f"{np.mean(macro_precisions):.6f}", f"{np.mean(macro_recalls):.6f}",
    f"{np.mean(macro_f1s):.6f}"
], [
    "Weighted Avg", f"{weighted_avg_precision:.6f}",
    f"{weighted_avg_recall:.6f}",
    f"{weighted_avg_f1:.6f}"
]]
print("\n" + tabulate(summary_table, headers=["Metric", "Precision", "Recall", "F1-Score"], tablefmt="fancy_grid"))



------------------------------------------------------------
Final Classification Report for Selected Assays (Class 1 only):
------------------------------------------------------------
╒════════════════════════╤═════════════╤══════════╤════════════╤═══════════╤══════╤══════╤══════╤══════╕
│ Assay                  │   Precision │   Recall │   F1-Score │   Support │   TN │   FP │   FN │   TP │
╞════════════════════════╪═════════════╪══════════╪════════════╪═══════════╪══════╪══════╪══════╪══════╡
│ Assay 1: NR.AhR        │      0.5238 │   0.6027 │     0.5605 │       610 │  497 │   40 │   29 │   44 │
├────────────────────────┼─────────────┼──────────┼────────────┼───────────┼──────┼──────┼──────┼──────┤
│ Assay 2: NR.AR         │      0.125  │   0.0833 │     0.1    │       586 │  567 │    7 │   11 │    1 │
├────────────────────────┼─────────────┼──────────┼────────────┼───────────┼──────┼──────┼──────┼──────┤
│ Assay 3: NR.AR.LBD     │      0.1429 │   0.125  │     0.1333 │       582 │  