In [2]:
!pip install deepchem

Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem)
  Downloading rdkit-2024.9.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading rdkit-2024.9.6-cp310-cp310-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit, deepchem
Successfully installed deepchem-2.8.0 rdkit-2024.9.6


**Random Forest**

In [38]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

In [39]:
# Load dataset
file_path = "/kaggle/input/chembl-dataset/chembl_5thresh.csv"
data = pd.read_csv(file_path)


In [40]:
# Filter rows where 'CHEMBL1829' is not missing
df_target = data[data['CHEMBL1829'].notna()][['smiles', 'CHEMBL1829']].reset_index(drop=True)

In [41]:
# Create a binary label: 1 if CHEMBL1829 >= 5 (active), else 0 (inactive)
df_target['label'] = (df_target['CHEMBL1829'] >= 5).astype(int)
print("Class Distribution:\n", df_target['label'].value_counts())

Class Distribution:
 label
1    370
0     38
Name: count, dtype: int64


In [42]:
# Generate molecular descriptors
descriptor_list = [desc[0] for desc in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_list)

In [43]:
# Compute descriptors and handle invalid SMILES
X_ml = []
invalid_smiles = []

for i, s in enumerate(df_target['smiles']):
    mol = Chem.MolFromSmiles(s)
    if mol:
        X_ml.append(calculator.CalcDescriptors(mol))
    else:
        invalid_smiles.append(i)

In [44]:
# Remove rows with invalid SMILES
if invalid_smiles:
    print(f"Removing {len(invalid_smiles)} invalid SMILES.")
    df_target.drop(index=invalid_smiles, inplace=True)

In [45]:
# Convert descriptors to a NumPy array
X_ml = np.array(X_ml)
y = df_target['label'].values

In [46]:
print(f"Descriptor matrix shape: {X_ml.shape}")

Descriptor matrix shape: (408, 217)


In [47]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_ml, y, test_size=0.2, random_state=42)

In [48]:
# Initialize RandomForest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


In [49]:
# Train model
rf_model.fit(X_train, y_train)

In [50]:
# Make predictions
y_pred = rf_model.predict(X_test)

In [64]:
# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.926829268292683
Test Accuracy: 92.68%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.14      0.25         7
           1       0.93      1.00      0.96        75

    accuracy                           0.93        82
   macro avg       0.96      0.57      0.61        82
weighted avg       0.93      0.93      0.90        82


Confusion Matrix:
 [[ 1  6]
 [ 0 75]]


In [65]:
# Feature importance
importances = pd.Series(rf_model.feature_importances_, index=descriptor_list)
print("\nFeature Importances:\n", importances.sort_values(ascending=False))

# Function to check binding affinity
def check_binding_affinity(smiles_input):
    mol = Chem.MolFromSmiles(smiles_input)
    if not mol:
        return "Invalid SMILES."

    # Generate descriptors for the input molecule
    input_descriptor = np.array(calculator.CalcDescriptors(mol)).reshape(1, -1)

    # Predict binding affinity (1: Active, 0: Inactive)
    prediction = rf_model.predict(input_descriptor)[0]
    probability = rf_model.predict_proba(input_descriptor)[0][1]

    if prediction == 1:
        return f"Binding Affinity: ACTIVE (Confidence: {probability:.2f})"
    else:
        return f"Binding Affinity: INACTIVE (Confidence: {probability:.2f})"

# Example usage
smiles_example = "COc1ccccc1N2CCN(CCCCCNC(=O)c3nnn(Cc4ccccc4)c3C)CC2"  # Replace with a valid SMILES
print(check_binding_affinity(smiles_example))



Feature Importances:
 BCUT2D_MWLOW    0.020945
EState_VSA3     0.019783
qed             0.019567
BCUT2D_MRLOW    0.018869
PEOE_VSA7       0.017890
                  ...   
fr_guanido      0.000000
fr_hdrzine      0.000000
fr_hdrzone      0.000000
fr_imide        0.000000
fr_urea         0.000000
Length: 217, dtype: float64
Binding Affinity: ACTIVE (Confidence: 0.75)


In [68]:
# Create a function to check the binding affinity for all molecules
def check_binding_affinity_all(df):
    # List to store results
    results = []

    for i, s in enumerate(df['smiles']):
        mol = Chem.MolFromSmiles(s)
        if mol:
            # Generate descriptors for the molecule
            input_descriptor = np.array(calculator.CalcDescriptors(mol)).reshape(1, -1)

            # Predict binding affinity (1: Active, 0: Inactive)
            prediction = rf_model.predict(input_descriptor)[0]
            probability = rf_model.predict_proba(input_descriptor)[0][1]

            # Store the result
            binding_affinity = "ACTIVE" if prediction == 1 else "INACTIVE"
            confidence = probability
            results.append([s, binding_affinity, confidence])
        else:
            results.append([s, "Invalid SMILES", None])

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results, columns=["smiles", "binding_affinity", "confidence"])

    return results_df

# Check binding affinity for all molecules
results_df = check_binding_affinity_all(df_target)

# Save the results to a CSV file
output_file_path = r"/kaggle/working/rf_binding_affinity_results.csv"
results_df.to_csv(output_file_path, index=False)

print(f"Binding affinity results saved to {output_file_path}")


Binding affinity results saved to /kaggle/working/rf_binding_affinity_results.csv


**Support Vector Machine**

In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

In [4]:
# oad dataset
file_path = "/kaggle/input/chembl-dataset/chembl_5thresh.csv"
data = pd.read_csv(file_path)

In [5]:
df_target = data[data['CHEMBL1829'].notna()][['smiles', 'CHEMBL1829']].reset_index(drop=True)
df_target['label'] = (df_target['CHEMBL1829'] >= 5).astype(int)

print("Class Distribution:\n", df_target['label'].value_counts())

Class Distribution:
 label
1    370
0     38
Name: count, dtype: int64


In [6]:
# Generate molecular descriptors
descriptor_list = [desc[0] for desc in Descriptors._descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_list)

X_ml = []
invalid_smiles = []
for i, s in enumerate(df_target['smiles']):
    mol = Chem.MolFromSmiles(s)
    if mol:
        X_ml.append(calculator.CalcDescriptors(mol))
    else:
        invalid_smiles.append(i)

if invalid_smiles:
    print(f"Removing {len(invalid_smiles)} invalid SMILES.")
    df_target.drop(index=invalid_smiles, inplace=True)

X_ml = np.array(X_ml)
y = df_target['label'].values

print(f"Descriptor matrix shape: {X_ml.shape}")

Descriptor matrix shape: (408, 217)


In [7]:
# Feature scaling
scaler = StandardScaler()
X_ml = scaler.fit_transform(X_ml)


In [8]:
# Addressing class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_ml, y = smote.fit_resample(X_ml, y)

In [9]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_ml, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# Hyperparameter tuning
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
svm_model = SVC(probability=True, class_weight='balanced', random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(svm_model, param_grid, scoring='f1_weighted', cv=cv, verbose=2)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.2s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   0.1s
[CV] END ...................C=0.1, gamma=auto, k

In [11]:
# Evaluate model
y_pred = best_model.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.9797297297297297
Test Accuracy: 97.97%

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        74
           1       0.99      0.97      0.98        74

    accuracy                           0.98       148
   macro avg       0.98      0.98      0.98       148
weighted avg       0.98      0.98      0.98       148


Confusion Matrix:
 [[73  1]
 [ 2 72]]


In [12]:
# Function to check binding affinity
def check_binding_affinity(smiles_input):
    mol = Chem.MolFromSmiles(smiles_input)
    if not mol:
        return "Invalid SMILES."

    input_descriptor = np.array(calculator.CalcDescriptors(mol)).reshape(1, -1)
    input_descriptor = scaler.transform(input_descriptor)

    prediction = best_model.predict(input_descriptor)[0]
    probability = best_model.predict_proba(input_descriptor)[0][1]

    if prediction == 1:
        return f"Binding Affinity: ACTIVE (Confidence: {probability:.2f})"
    else:
        return f"Binding Affinity: INACTIVE (Confidence: {probability:.2f})"


In [13]:
# Save predictions for all molecules
def check_binding_affinity_all(df):
    results = []
    for i, s in enumerate(df['smiles']):
        mol = Chem.MolFromSmiles(s)
        if mol:
            input_descriptor = np.array(calculator.CalcDescriptors(mol)).reshape(1, -1)
            input_descriptor = scaler.transform(input_descriptor)

            prediction = best_model.predict(input_descriptor)[0]
            probability = best_model.predict_proba(input_descriptor)[0][1]

            binding_affinity = "ACTIVE" if prediction == 1 else "INACTIVE"
            results.append([s, binding_affinity, probability])
        else:
            results.append([s, "Invalid SMILES", None])

    results_df = pd.DataFrame(results, columns=["smiles", "binding_affinity", "confidence"])
    return results_df

results_df = check_binding_affinity_all(df_target)
output_file_path = "/kaggle/working/binding_affinity_results_svm_improved.csv"
results_df.to_csv(output_file_path, index=False)
print(f"Binding affinity results saved to {output_file_path}")

Binding affinity results saved to /kaggle/working/binding_affinity_results_svm_improved.csv


**Graph Neural Network**

In [94]:
!pip install torch torchvision torchaudio



In [95]:
!pip install torch-geometric



In [96]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from torch.nn import functional as F
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import rdchem

In [99]:
# Load dataset
file_path = "/kaggle/input/chembl-dataset/chembl_5thresh.csv"
data = pd.read_csv(file_path)

In [100]:
# Filter rows where 'CHEMBL1829' is not missing
df_target = data[data['CHEMBL1829'].notna()][['smiles', 'CHEMBL1829']].reset_index(drop=True)

In [101]:
# Create a binary label: 1 if CHEMBL1829 >= 5 (active), else 0 (inactive)
df_target['label'] = (df_target['CHEMBL1829'] >= 5).astype(int)
print("Class Distribution:\n", df_target['label'].value_counts())

Class Distribution:
 label
1    370
0     38
Name: count, dtype: int64


In [103]:
#Helper function to convert SMILES to Graph
def mol_to_graph(smiles, label):
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None
    
    # Atom features (atomic number)
    x = torch.tensor([atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.float).view(-1, 1)

    # Edge features (bonds)
    edge_index = []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_index.append((start, end))
        edge_index.append((end, start))

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # Graph data object
    return Data(x=x, edge_index=edge_index, y=torch.tensor([label], dtype=torch.float))



In [104]:
# Create graph dataset
graphs = [mol_to_graph(smiles, label) for smiles, label in zip(df_target['smiles'], df_target['label'])]
graphs = [g for g in graphs if g is not None]

print(f"Valid Graphs: {len(graphs)}")

Valid Graphs: 408


In [105]:
# Split dataset
train_data, test_data = train_test_split(graphs, test_size=0.2, random_state=42)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [117]:
class GNNClassifier(nn.Module):
    def __init__(self, in_channels, hidden_dim=128, out_channels=1, dropout=0.4):
        super(GNNClassifier, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)  # BatchNorm after the first GCN layer

        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)  # BatchNorm after the second GCN layer

        self.conv3 = GCNConv(hidden_dim, hidden_dim)
        self.bn3 = nn.BatchNorm1d(hidden_dim)  # BatchNorm after the third GCN layer

        self.fc = nn.Linear(hidden_dim, out_channels)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        x = F.relu(self.bn1(self.conv1(x, edge_index)))
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = F.relu(self.bn2(self.conv2(x, edge_index)))
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = F.relu(self.bn3(self.conv3(x, edge_index)))
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = global_mean_pool(x, batch)  # Global pooling

        x = self.fc(x)
        return torch.sigmoid(x).view(-1)


In [118]:
# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNNClassifier(in_channels=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [119]:
# Train GNN Model
def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

In [120]:
# Evaluate Model
def evaluate(loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)
            y_true.extend(data.y.cpu().numpy())
            y_pred.extend((out.cpu().numpy() >= 0.5).astype(int))
    return y_true, y_pred

In [121]:
# Training Loop
for epoch in range(200):
    train_loss = train()
    print(f"Epoch {epoch + 1}, Loss: {train_loss:.4f}")

Epoch 1, Loss: 0.6269
Epoch 2, Loss: 0.4571
Epoch 3, Loss: 0.3626
Epoch 4, Loss: 0.3576
Epoch 5, Loss: 0.3235
Epoch 6, Loss: 0.3235
Epoch 7, Loss: 0.3327
Epoch 8, Loss: 0.3152
Epoch 9, Loss: 0.3206
Epoch 10, Loss: 0.2829
Epoch 11, Loss: 0.3117
Epoch 12, Loss: 0.2845
Epoch 13, Loss: 0.2768
Epoch 14, Loss: 0.3000
Epoch 15, Loss: 0.2788
Epoch 16, Loss: 0.2796
Epoch 17, Loss: 0.2757
Epoch 18, Loss: 0.3791
Epoch 19, Loss: 0.2773
Epoch 20, Loss: 0.2735
Epoch 21, Loss: 0.2777
Epoch 22, Loss: 0.2966
Epoch 23, Loss: 0.2786
Epoch 24, Loss: 0.2808
Epoch 25, Loss: 0.3449
Epoch 26, Loss: 0.2810
Epoch 27, Loss: 0.2807
Epoch 28, Loss: 0.2724
Epoch 29, Loss: 0.2809
Epoch 30, Loss: 0.2745
Epoch 31, Loss: 0.2664
Epoch 32, Loss: 0.2942
Epoch 33, Loss: 0.2753
Epoch 34, Loss: 0.2763
Epoch 35, Loss: 0.3035
Epoch 36, Loss: 0.3167
Epoch 37, Loss: 0.2697
Epoch 38, Loss: 0.3031
Epoch 39, Loss: 0.2790
Epoch 40, Loss: 0.2661
Epoch 41, Loss: 0.2740
Epoch 42, Loss: 0.2677
Epoch 43, Loss: 0.2713
Epoch 44, Loss: 0.30

In [122]:
# Evaluate model
y_true, y_pred = evaluate(test_loader)

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))

Accuracy: 0.926829268292683

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.14      0.25         7
         1.0       0.93      1.00      0.96        75

    accuracy                           0.93        82
   macro avg       0.96      0.57      0.61        82
weighted avg       0.93      0.93      0.90        82


Confusion Matrix:
 [[ 1  6]
 [ 0 75]]


In [None]:
# Predict and Save Results
def predict_and_save(df, output_file):
    results = []
    for smiles in df['smiles']:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            graph = mol_to_graph(smiles, 0)
            graph = graph.to(device)
            model.eval()
            with torch.no_grad():
                pred = model(graph).item()
                label = "ACTIVE" if pred >= 0.5 else "INACTIVE"
                results.append([smiles, label, pred])
        else:
            results.append([smiles, "Invalid SMILES", None])

    # Save to CSV
    results_df = pd.DataFrame(results, columns=["smiles", "binding_affinity", "confidence"])
    results_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Predict for entire dataset and save results
output_file_path = r"/kaggle/working/binding_affinity_results_gnn.csv"
predict_and_save(df_target, output_file_path)