In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load the drug dataset from the local CSV file 'Drug.csv' into a DataFrame
df = pd.read_csv('Drug.csv')
# Print a summary of the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269547 entries, 0 to 269546
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Drug name                269547 non-null  object 
 1   Drug ID                  269547 non-null  int64  
 2   Drug target              266795 non-null  object 
 3   Target Pathway           269129 non-null  object 
 4   Feature Name             269547 non-null  object 
 5   n_feature_pos            269547 non-null  int64  
 6   n_feature_neg            269547 non-null  int64  
 7   ic50_effect_size         269547 non-null  float64
 8   log_ic50_mean_pos        269547 non-null  float64
 9   log_ic50_mean_neg        269547 non-null  float64
 10  log_max_conc_tested      0 non-null       float64
 11  log_max_conc_tested_2    0 non-null       float64
 12  feature_ic50_t_pval      269547 non-null  float64
 13  feature_delta_mean_ic50  269547 non-null  float64
 14  feat

In [None]:
# Step 1: Drop unnecessary columns
# The two 'log_max_conc_tested*' columns contain no non-null values (see the
# df.info() output of the loading cell); 'Drug name' and 'Drug ID' are removed
# as well.
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises a KeyError.
df = df.drop(
    columns=['log_max_conc_tested', 'log_max_conc_tested_2', 'Drug name', 'Drug ID'],
    errors='ignore',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269547 entries, 0 to 269546
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Drug target              266795 non-null  object 
 1   Target Pathway           269129 non-null  object 
 2   Feature Name             269547 non-null  object 
 3   n_feature_pos            269547 non-null  int64  
 4   n_feature_neg            269547 non-null  int64  
 5   ic50_effect_size         269547 non-null  float64
 6   log_ic50_mean_pos        269547 non-null  float64
 7   log_ic50_mean_neg        269547 non-null  float64
 8   feature_ic50_t_pval      269547 non-null  float64
 9   feature_delta_mean_ic50  269547 non-null  float64
 10  feature_pos_ic50_var     269547 non-null  float64
 11  feature_neg_ic50_var     269547 non-null  float64
 12  feature_pval             269547 non-null  float64
 13  tissue_pval              269547 non-null  float64
 14  msi_

In [None]:
# Step 2: Handle missing values
# Only the two categorical columns below have nulls in the df.info() summary.
# The filled Series is assigned back to its column: calling
# df[col].fillna(..., inplace=True) acts on an intermediate object, triggers a
# FutureWarning, and stops modifying df in pandas 3.0.

# Missing drug targets get an explicit placeholder category
df['Drug target'] = df['Drug target'].fillna('Unknown')

# Missing pathways are replaced by the most frequent pathway
most_common_pathway = df['Target Pathway'].mode()[0]
df['Target Pathway'] = df['Target Pathway'].fillna(most_common_pathway)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269547 entries, 0 to 269546
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Drug target              269547 non-null  object 
 1   Target Pathway           269547 non-null  object 
 2   Feature Name             269547 non-null  object 
 3   n_feature_pos            269547 non-null  int64  
 4   n_feature_neg            269547 non-null  int64  
 5   ic50_effect_size         269547 non-null  float64
 6   log_ic50_mean_pos        269547 non-null  float64
 7   log_ic50_mean_neg        269547 non-null  float64
 8   feature_ic50_t_pval      269547 non-null  float64
 9   feature_delta_mean_ic50  269547 non-null  float64
 10  feature_pos_ic50_var     269547 non-null  float64
 11  feature_neg_ic50_var     269547 non-null  float64
 12  feature_pval             269547 non-null  float64
 13  tissue_pval              269547 non-null  float64
 14  msi_

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Drug target'].fillna('Unknown', inplace=True)  # Placeholder for categorical
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Target Pathway'].fillna(df['Target Pathway'].mode()[0], inplace=True)  # Most frequent value


In [None]:
# Step 3: Standardise the numeric columns
# Columns that are passed through StandardScaler
numerical_cols = [
    'n_feature_pos',
    'n_feature_neg',
    'ic50_effect_size',
    'log_ic50_mean_pos',
    'log_ic50_mean_neg',
    'feature_ic50_t_pval',
    'feature_delta_mean_ic50',
    'feature_pos_ic50_var',
    'feature_neg_ic50_var',
    'feature_pval',
    'tissue_pval',
    'msi_pval',
    'fdr',
]

# NOTE(review): the scaler is fit on every row, and the list includes
# 'ic50_effect_size', which the training cell later uses as the target and
# splits into train/validation sets. Confirm that scaling the target with
# full-dataset statistics before the split is intended.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

df

Unnamed: 0,Drug target,Target Pathway,Feature Name,n_feature_pos,n_feature_neg,ic50_effect_size,log_ic50_mean_pos,log_ic50_mean_neg,feature_ic50_t_pval,feature_delta_mean_ic50,feature_pos_ic50_var,feature_neg_ic50_var,feature_pval,tissue_pval,msi_pval,fdr,Tissue Type,Screening Set
0,EGFR,EGFR signaling,ABCB1_mut,-0.506583,-2.407918,0.772921,0.372236,0.218921,-0.914995,0.795469,-1.031314,-0.438545,1.766138,-0.062739,-0.589931,0.739481,PANCANCER,GDSC1
1,EGFR,EGFR signaling,ABL2_mut,-0.577753,-2.384875,0.701235,0.366858,0.221107,-1.324133,0.756749,-2.112038,-0.442029,1.134197,-0.062739,-0.590025,0.688297,PANCANCER,GDSC1
2,EGFR,EGFR signaling,ACACA_mut,-0.577753,-2.384875,0.132321,0.306001,0.221580,-1.354791,0.442111,-2.242740,-0.441088,1.424208,-0.062739,-0.589959,0.696975,PANCANCER,GDSC1
3,EGFR,EGFR signaling,ACVR2A_mut,-0.506583,-2.407918,-0.596694,0.227343,0.221576,1.087323,0.038568,-0.364884,-0.444933,0.149466,-0.062739,-0.590614,0.688297,PANCANCER,GDSC1
4,EGFR,EGFR signaling,AFF4_mut,-0.559961,-2.390636,-1.103030,0.164341,0.222694,1.738445,-0.290356,-1.806841,-0.437513,-0.068808,-0.062739,-0.590850,0.688297,PANCANCER,GDSC1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269542,"EP300, CBP",Chromatin histone acetylation,cnaPANCAN421,-0.488790,0.092294,0.957372,0.915063,1.116478,-0.520547,-0.988108,-1.002413,-1.483178,-1.298915,-0.062739,-0.379773,-0.705522,PANCANCER,GDSC1
269543,"EP300, CBP",Chromatin histone acetylation,cnaPANCAN422,-0.488790,0.092294,0.242181,0.962381,1.116012,0.300837,-0.742968,-0.761841,-1.485139,-0.888855,-0.062739,-0.378627,-0.336968,PANCANCER,GDSC1
269544,"EP300, CBP",Chromatin histone acetylation,cnaPANCAN423,-0.488790,0.092294,0.242181,0.962381,1.116012,0.300837,-0.742968,-0.761841,-1.485139,-0.888855,-0.062739,-0.378627,-0.336968,PANCANCER,GDSC1
269545,"EP300, CBP",Chromatin histone acetylation,cnaPANCAN424,-0.364242,0.051968,-0.375909,1.003769,1.116049,0.417739,-0.530812,-1.009930,-1.482263,-0.837289,-0.062739,-0.378538,-0.336968,PANCANCER,GDSC1


In [None]:
# Step 4: Encode categorical variables as integer (0/1) indicator columns

import pandas as pd

# Categorical columns that are expanded into one indicator column per level
categorical_cols = ['Drug target', 'Feature Name', 'Tissue Type', 'Screening Set', 'Target Pathway']

# One-hot encode, dropping the first level of every column.
# dtype=int makes get_dummies emit integer indicators directly, so no second
# astype(int) pass over all generated columns is needed.
data = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

# Names of the generated indicator columns; get_dummies prefixes each with its
# source column name. Kept as a variable in case later cells refer to it.
one_hot_cols = data.columns[data.columns.str.startswith(tuple(categorical_cols))]
data

Unnamed: 0,n_feature_pos,n_feature_neg,ic50_effect_size,log_ic50_mean_pos,log_ic50_mean_neg,feature_ic50_t_pval,feature_delta_mean_ic50,feature_pos_ic50_var,feature_neg_ic50_var,feature_pval,...,Target Pathway_Metabolism,Target Pathway_Mitosis,Target Pathway_Other,"Target Pathway_Other, kinases",Target Pathway_PI3K/MTOR signaling,Target Pathway_Protein stability and degradation,Target Pathway_RTK signaling,Target Pathway_Unclassified,Target Pathway_WNT signaling,Target Pathway_p53 pathway
0,-0.506583,-2.407918,0.772921,0.372236,0.218921,-0.914995,0.795469,-1.031314,-0.438545,1.766138,...,0,0,0,0,0,0,0,0,0,0
1,-0.577753,-2.384875,0.701235,0.366858,0.221107,-1.324133,0.756749,-2.112038,-0.442029,1.134197,...,0,0,0,0,0,0,0,0,0,0
2,-0.577753,-2.384875,0.132321,0.306001,0.221580,-1.354791,0.442111,-2.242740,-0.441088,1.424208,...,0,0,0,0,0,0,0,0,0,0
3,-0.506583,-2.407918,-0.596694,0.227343,0.221576,1.087323,0.038568,-0.364884,-0.444933,0.149466,...,0,0,0,0,0,0,0,0,0,0
4,-0.559961,-2.390636,-1.103030,0.164341,0.222694,1.738445,-0.290356,-1.806841,-0.437513,-0.068808,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269542,-0.488790,0.092294,0.957372,0.915063,1.116478,-0.520547,-0.988108,-1.002413,-1.483178,-1.298915,...,0,0,0,0,0,0,0,0,0,0
269543,-0.488790,0.092294,0.242181,0.962381,1.116012,0.300837,-0.742968,-0.761841,-1.485139,-0.888855,...,0,0,0,0,0,0,0,0,0,0
269544,-0.488790,0.092294,0.242181,0.962381,1.116012,0.300837,-0.742968,-0.761841,-1.485139,-0.888855,...,0,0,0,0,0,0,0,0,0,0
269545,-0.364242,0.051968,-0.375909,1.003769,1.116049,0.417739,-0.530812,-1.009930,-1.482263,-0.837289,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler


# # Step 2: Split into features and target
# X = data.drop(columns=['ic50_effect_size']).values
# y = data['ic50_effect_size'].values

# # Step 3: Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # Step 4: Scale the features
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

# # Convert to PyTorch tensors
# X_train_tensor = torch.FloatTensor(X_train)
# y_train_tensor = torch.FloatTensor(y_train).view(-1, 1)  # Reshape for a single output
# X_val_tensor = torch.FloatTensor(X_val)
# y_val_tensor = torch.FloatTensor(y_val).view(-1, 1)

# # Create DataLoader
# train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
# train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# # Step 5: Define the ANN model

# class ANNModel(nn.Module):
#     def _init_(self, input_size, dropout_rate=0.2):
#         super(ANNModel, self)._init_()
#         self.fc1 = nn.Linear(input_size, 128)  # First layer
#         self.dropout1 = nn.Dropout(dropout_rate)  # Dropout layer after the first layer
#         self.fc2 = nn.Linear(128, 64)           # Second layer
#         self.dropout2 = nn.Dropout(dropout_rate)  # Dropout layer after the second layer
#         self.fc3 = nn.Linear(64, 1)             # Output layer
#         self.relu = nn.ReLU()                    # Activation function

#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         x = self.dropout1(x)  # Apply dropout after the first layer
#         x = self.relu(self.fc2(x))
#         x = self.dropout2(x)  # Apply dropout after the second layer
#         x = self.fc3(x)  # No activation for the output layer for regression
#         return x


# # Initialize the model
# model = ANNModel(input_size=X_train.shape[1])

# # Step 6: Define loss function and optimizer
# criterion = nn.MSELoss()  # Mean Squared Error for regression
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Step 7: Train the model
# num_epochs = 100
# for epoch in range(num_epochs):
#     model.train()  # Set the model to training mode
#     for batch_X, batch_y in train_loader:
#         optimizer.zero_grad()  # Zero the gradients
#         outputs = model(batch_X)  # Forward pass
#         loss = criterion(outputs, batch_y)  # Calculate loss
#         loss.backward()  # Backward pass
#         optimizer.step()  # Update weights

#     # Calculate training loss
#     model.eval()  # Set the model to evaluation mode
#     with torch.no_grad():
#         train_loss = criterion(model(X_train_tensor), y_train_tensor)
#         val_loss = criterion(model(X_val_tensor), y_val_tensor)

#     print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')

# # Step 8: Evaluate the model
# with torch.no_grad():
#     model.eval()
#     val_predictions = model(X_val_tensor)
#     val_predictions = val_predictions.numpy()

# # Calculate accuracy (R² score for regression)
# from sklearn.metrics import r2_score
# accuracy = r2_score(y_val, val_predictions)
# print(f'Validation R² Score: {accuracy:.4f}')




import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import os

# Seed torch's global random number generator
seed = 100
torch.manual_seed(seed)

# Step 2: Separate the regression target from the predictors.
# 'data' is the one-hot encoded frame produced by the encoding cell.
y = data['ic50_effect_size'].to_numpy()
X = data.drop(columns=['ic50_effect_size']).to_numpy()

# Step 3: Hold out 20% of the rows for validation, with a fixed split seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Standardise the features using statistics from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Convert to float32 tensors; targets are reshaped into a single column
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)

# Mini-batch loader over the training pairs: 64 rows per batch, reshuffled on each pass
train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)

# Step 5: Define the ANN model
class ANNModel(nn.Module):
    """Fully connected regressor: input -> 128 -> 64 -> 1.

    ReLU followed by dropout is applied after each of the two hidden layers;
    the output layer has no activation.

    Args:
        input_size: Number of input features per sample.
        dropout_rate: Drop probability shared by both dropout layers.
    """

    def __init__(self, input_size, dropout_rate=0.2):
        super().__init__()
        # Hidden block 1
        self.fc1 = nn.Linear(input_size, 128)
        self.dropout1 = nn.Dropout(dropout_rate)
        # Hidden block 2
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(dropout_rate)
        # Single-unit regression head
        self.fc3 = nn.Linear(64, 1)
        # One ReLU module reused by both hidden blocks
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both hidden blocks and return the raw output of ``fc3``."""
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Initialize the model with one input unit per feature column
model = ANNModel(input_size=X_train.shape[1])

# Step 6: Define loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 7: Train the model
num_epochs = 81
best_val_loss = float('inf')  # Lowest validation loss seen so far
best_epoch = None  # 1-based epoch of the saved checkpoint; None until one is written
checkpoint_path = "best_model.pth"  # Path to save the model

for epoch in range(num_epochs):
    model.train()  # Training mode: dropout active
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()  # Clear gradients left over from the previous step
        outputs = model(batch_X)  # Forward pass
        loss = criterion(outputs, batch_y)  # Batch MSE
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

    # Full-set training and validation loss with dropout disabled
    model.eval()
    with torch.no_grad():
        train_loss = criterion(model(X_train_tensor), y_train_tensor)
        val_loss = criterion(model(X_val_tensor), y_val_tensor)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')

    # Checkpointing: save the model if validation loss decreases
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch + 1
        torch.save(model.state_dict(), checkpoint_path)  # Save the model
        print(f'Saved model with improved validation loss: {val_loss.item():.4f} at epoch {epoch+1}')

# Step 8: Evaluate the model
# Restore the best checkpoint first; otherwise the score would describe the
# last-epoch weights and the saved checkpoint would never be used.
# The best_epoch guard avoids loading a stale file from an earlier run when
# this run never saved one.
if best_epoch is not None:
    model.load_state_dict(torch.load(checkpoint_path))

model.eval()
with torch.no_grad():
    val_predictions = model(X_val_tensor)
    val_predictions = val_predictions.numpy()

# R² score on the validation set (stored under the original variable name)
accuracy = r2_score(y_val, val_predictions)
print(f'Validation R² Score: {accuracy:.4f}')


Epoch [1/81], Train Loss: 0.0684, Validation Loss: 0.0741
Saved model with improved validation loss: 0.0741 at epoch 1
Epoch [2/81], Train Loss: 0.0411, Validation Loss: 0.0455
Saved model with improved validation loss: 0.0455 at epoch 2
Epoch [3/81], Train Loss: 0.0374, Validation Loss: 0.0410
Saved model with improved validation loss: 0.0410 at epoch 3
Epoch [4/81], Train Loss: 0.0338, Validation Loss: 0.0376
Saved model with improved validation loss: 0.0376 at epoch 4
Epoch [5/81], Train Loss: 0.0238, Validation Loss: 0.0268
Saved model with improved validation loss: 0.0268 at epoch 5
Epoch [6/81], Train Loss: 0.0233, Validation Loss: 0.0262
Saved model with improved validation loss: 0.0262 at epoch 6
Epoch [7/81], Train Loss: 0.0217, Validation Loss: 0.0246
Saved model with improved validation loss: 0.0246 at epoch 7
Epoch [8/81], Train Loss: 0.0180, Validation Loss: 0.0209
Saved model with improved validation loss: 0.0209 at epoch 8
Epoch [9/81], Train Loss: 0.0194, Validation Los