# **Data input 3D**

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix



In [3]:
# Load the 3D descriptor table from a CSV located relative to the working directory.
# NOTE(review): the file name suggests the features are already scaled and augmented — confirm.
df1 = pd.read_csv("RDKit-3D_scal_aug_data.csv")

In [4]:
# Preview the first rows to sanity-check the loaded columns.
df1.head()

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF,values
0,-0.416267,-0.260116,1.345511,-0.602951,-1.931896,0.585006,0.157481,-1.340883,1
1,-0.25662,-0.175355,0.052397,-0.147606,-0.548179,-0.046893,0.233452,-0.067527,1
2,0.885918,0.014085,2.078184,-1.046947,0.087002,-0.138292,2.132503,2.278375,1
3,-0.174179,-0.106656,-0.30693,-0.348407,-0.117882,-0.080889,-0.94596,-1.007047,1
4,-0.082943,-0.000443,-0.475669,0.449613,0.191616,-0.093591,-0.16423,0.28157,1


In [5]:
# Feature matrix = every column except the label; label vector = the "values" column
target_column = "values"
X = df1.drop(columns=[target_column]).to_numpy()
y = df1[target_column].to_numpy()

# Stratified hold-out split: 80% train / 20% test, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# PyTorch tensors: float32 features, labels reshaped to an (n, 1) float32 column
# so they line up with the single-unit output of the networks below
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [6]:
# Dimensions of the feature matrix: (number of rows, number of feature columns).
X.shape

(22000, 8)

# **NN 3D relu**

In [None]:
# Feed-forward binary classifier over molecular descriptors
class MolecularNN(nn.Module):
    """Fully connected network: input_dim -> 256 -> 128 -> 64 -> 1.

    Every hidden layer is Linear -> BatchNorm1d -> ReLU. Dropout (p=0.3) is
    applied after the first and second hidden layers only. The final linear
    unit is passed through a sigmoid, so the output lies in [0, 1] and can be
    fed directly to ``nn.BCELoss``.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Hidden layer 1
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        # Hidden layer 2
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        # Hidden layer 3
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        # Output layer: one unit for the binary target
        self.fc4 = nn.Linear(64, 1)
        # Parameter-free modules shared by all layers
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_dim) tensor to sigmoid outputs of shape (batch, 1)."""
        hidden = self.dropout(self.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(self.relu(self.bn2(self.fc2(hidden))))
        # No dropout after the last hidden layer
        hidden = self.relu(self.bn3(self.fc3(hidden)))
        return self.sigmoid(self.fc4(hidden))

# Seed PyTorch so weight initialisation and dropout masks are repeatable between runs
torch.manual_seed(42)

# Initialize model
input_dim = X_train.shape[1]  # Number of molecular descriptors
model = MolecularNN(input_dim)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary cross-entropy on the sigmoid probabilities the model outputs
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # weight_decay = L2 penalty

# Training loop: full-batch updates (the entire training tensor is one batch per epoch)
epochs = 500
model.train()  # Nothing in the loop leaves training mode, so set it once
for epoch in range(epochs):
    optimizer.zero_grad()  # Clear gradients accumulated by the previous step
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)
    loss.backward()  # Backpropagation
    optimizer.step()

    # Report the training loss every 5 epochs
    if (epoch+1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

In [None]:
# Score the test set with the network in inference mode and gradients disabled
model.eval()
with torch.no_grad():
    y_pred_test = model(X_test_tensor)
    # Threshold the predicted probabilities at 0.5 to get hard 0/1 labels
    y_pred_labels = y_pred_test.ge(0.5).float()

# scikit-learn metrics operate on NumPy arrays rather than tensors
y_test_np, y_pred_np = y_test_tensor.numpy(), y_pred_labels.numpy()

# Label-based metrics use the thresholded predictions;
# ROC AUC is computed from the raw probabilities instead
accuracy = accuracy_score(y_test_np, y_pred_np)
precision = precision_score(y_test_np, y_pred_np)
recall = recall_score(y_test_np, y_pred_np)
f1 = f1_score(y_test_np, y_pred_np)
roc_auc = roc_auc_score(y_test_np, y_pred_test.numpy())
conf_matrix = confusion_matrix(y_test_np, y_pred_np)

# Report every scalar metric with four decimals, then the confusion matrix
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


# **Optimized NN 3D leaky relu**

In [None]:
class ImprovedMolecularNN(nn.Module):
    """Deeper fully connected classifier: input_dim -> 512 -> 256 -> 128 -> 64 -> 1.

    Every hidden layer is Linear -> BatchNorm1d -> LeakyReLU(0.1). Dropout
    (p=0.4) follows the first three hidden layers but not the fourth. The
    single output unit is passed through a sigmoid, so the result lies in
    [0, 1] and can be fed directly to ``nn.BCELoss``.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Hidden layer 1
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        # Hidden layer 2
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        # Hidden layer 3
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        # Hidden layer 4
        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)
        # Output layer: one unit for the binary target
        self.fc5 = nn.Linear(64, 1)

        # Parameter-free modules shared by all layers
        self.leaky_relu = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(0.4)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_dim) tensor to sigmoid outputs of shape (batch, 1)."""
        hidden = self.dropout(self.leaky_relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(self.leaky_relu(self.bn2(self.fc2(hidden))))
        hidden = self.dropout(self.leaky_relu(self.bn3(self.fc3(hidden))))
        # No dropout after the last hidden layer
        hidden = self.leaky_relu(self.bn4(self.fc4(hidden)))
        return self.sigmoid(self.fc5(hidden))

# Seed PyTorch so weight initialisation and dropout masks are repeatable between runs
torch.manual_seed(42)

# Initialize model
input_dim = X_train.shape[1]  # Number of molecular descriptors
model = ImprovedMolecularNN(input_dim)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary cross-entropy on the sigmoid probabilities the model outputs
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # weight_decay = L2 penalty

# Training loop: full-batch updates (the entire training tensor is one batch per epoch)
epochs = 500
model.train()  # Nothing in the loop leaves training mode, so set it once
for epoch in range(epochs):
    optimizer.zero_grad()  # Clear gradients accumulated by the previous step
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)
    loss.backward()  # Backpropagation
    optimizer.step()

    # Report the training loss every 5 epochs
    if (epoch+1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

In [None]:
# Inference pass over the test set (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    y_pred_test = model(X_test_tensor)
    # Probabilities of at least 0.5 become label 1.0, everything else 0.0
    y_pred_labels = y_pred_test.ge(0.5).float()

# Hand NumPy arrays to scikit-learn
y_test_np, y_pred_np = y_test_tensor.numpy(), y_pred_labels.numpy()

# Hard-label metrics, plus ROC AUC on the unthresholded probabilities
accuracy = accuracy_score(y_test_np, y_pred_np)
precision = precision_score(y_test_np, y_pred_np)
recall = recall_score(y_test_np, y_pred_np)
f1 = f1_score(y_test_np, y_pred_np)
roc_auc = roc_auc_score(y_test_np, y_pred_test.numpy())
conf_matrix = confusion_matrix(y_test_np, y_pred_np)

# Print each scalar metric with four decimals, followed by the confusion matrix
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

# **Early stopping 3D features**

In [11]:
from sklearn.model_selection import train_test_split

# Load and preprocess data
X = df1.drop(columns=["values"]).values  # Features (molecular descriptors)
y = df1["values"].values  # Target variable (binary classification)

# Split data into train (70%), validation (15%), and test (15%) sets.
# The split is done BEFORE scaling so that the scaler never sees validation or
# test rows; fitting it on the full dataset would leak held-out statistics
# into the training features.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Normalize features: learn mean/std from the training split only, then apply
# the same transform to the validation and test splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix under the training-set scaling, kept so that any later cell
# referring to X_scaled keeps working
X_scaled = scaler.transform(X)

# Convert to PyTorch tensors (labels as (n, 1) float columns to match the model output)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)


In [None]:
# Training loop with Early Stopping
#
# Build a fresh network, loss and optimizer for this run. Re-using a `model`
# object that an earlier cell already trained on a different train/test split
# would mean it may have seen rows that now belong to the validation or test
# set, which would make the early-stopping signal and the final test metrics
# optimistic.
torch.manual_seed(42)  # Repeatable weight initialisation and dropout masks
input_dim = X_train.shape[1]  # Number of molecular descriptors
model = ImprovedMolecularNN(input_dim)
criterion = nn.BCELoss()  # Binary cross-entropy on sigmoid probabilities
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # weight_decay = L2 penalty

epochs = 500
best_loss = float('inf')
counter = 0  # Consecutive epochs without a sufficient validation improvement
patience = 50  # Stop training if validation loss doesn't improve for 50 epochs
min_delta = 0.001  # Smallest drop in validation loss that counts as an improvement
best_model = None  # Snapshot of the weights at the best validation loss so far

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass (full batch)
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)

    # Backpropagation
    loss.backward()
    optimizer.step()

    # Validation phase: eval mode disables dropout and uses batch-norm running statistics
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_tensor)
        val_loss = criterion(val_pred, y_val_tensor)

    # Check early stopping condition
    if val_loss.item() < best_loss - min_delta:
        best_loss = val_loss.item()
        counter = 0  # Reset counter
        # state_dict() hands back references to the live parameter tensors, which
        # the optimizer keeps updating in place. Clone them so the snapshot really
        # holds the weights of this epoch rather than those of the last epoch run.
        best_model = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        counter += 1  # Increase counter if no improvement

    if counter >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

    # Print loss every 5 epochs
    if (epoch+1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

# Load the best model before testing. best_model stays None only if the
# validation loss never improved (e.g. it was NaN from the first epoch); in
# that case the current weights are left as they are.
if best_model is not None:
    model.load_state_dict(best_model)


In [None]:
# Final check on the held-out test set (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    y_pred_test = model(X_test_tensor)
    # Probabilities of at least 0.5 become label 1.0, everything else 0.0
    y_pred_labels = y_pred_test.ge(0.5).float()

# NumPy views of the targets and hard predictions for scikit-learn
y_test_np, y_pred_np = y_test_tensor.numpy(), y_pred_labels.numpy()

# Hard-label metrics, plus ROC AUC on the unthresholded probabilities
accuracy = accuracy_score(y_test_np, y_pred_np)
precision = precision_score(y_test_np, y_pred_np)
recall = recall_score(y_test_np, y_pred_np)
f1 = f1_score(y_test_np, y_pred_np)
roc_auc = roc_auc_score(y_test_np, y_pred_test.numpy())
conf_matrix = confusion_matrix(y_test_np, y_pred_np)

# Banner, then each scalar metric with four decimals, then the confusion matrix
print("\n--- Final Model Evaluation on Test Set ---")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve for the most recent test-set evaluation
# (uses y_test_np and y_pred_test produced by the evaluation cell)
fpr, tpr, thresholds = roc_curve(y_test_np, y_pred_test.numpy())
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
# Model curve, with the area under it shown in the legend
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal as the reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


In [None]:
# prompt: draw confusion matrix

import seaborn as sns

# Annotated heatmap of the confusion matrix: rows are true labels, columns are predictions
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",  # integer counts in each cell
    cmap="Blues",
    xticklabels=["Predicted 0", "Predicted 1"],
    yticklabels=["Actual 0", "Actual 1"],
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()


In [16]:
# Save the trained model's weights (state_dict only, not the module object itself)
model_path = "3D rdki best nn_model.pth"
torch.save(model.state_dict(), model_path)
# Print the path that was actually written so the log matches the file on disk
print(f"Best model saved as '{model_path}'")


Best model saved as 'best_molecular_nn_model.pth'
