In [1]:
%%capture
!pip install rdkit
!pip install torch_geometric

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from math import sqrt

from rdkit import Chem
from rdkit.Chem import AllChem, Draw, PandasTools, Descriptors

import torch.nn.functional as F
from torch.utils.data import random_split, SubsetRandomSampler

from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import to_networkx
from torch_geometric.nn import AttentiveFP

import os
import random
from collections import Counter

import torch

from torch_geometric.data import InMemoryDataset, download_url, extract_gz
from torch_geometric.utils import from_smiles
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold

In [3]:
def seed_set(seed=50):
    """Apply one seed to every random number source used in this notebook.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy's
    global generator, and PyTorch's CPU and CUDA generators.

    Args:
        seed: Integer seed shared by all generators (default 50).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each of these callables accepts the seed as its only argument.
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

In [4]:
# Seed all RNGs once, before any data is shuffled or model weights are created.
seed_set()

In [5]:
# Table of molecules; the code below reads its 'smiles' and 'exp' columns.
df_final = pd.read_csv('Lipophilicity_final.csv')


def _smiles_to_graph(smiles, target):
    """Build one graph from a SMILES string and attach its regression target.

    Node features are cast to float and the target is stored as a (1, 1)
    float tensor in ``y``.
    """
    graph = from_smiles(smiles)
    graph.x = graph.x.float()
    graph.y = torch.tensor(target, dtype=torch.float).view(1, -1)
    return graph


# One graph per row, in the same order as the rows of df_final.
graph_list = [
    _smiles_to_graph(smiles, target)
    for smiles, target in zip(df_final['smiles'], df_final['exp'])
]


## **Define Model**

In [6]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# NOTE(review): in_channels=9 and edge_dim=3 presumably match the node/edge
# feature widths produced by from_smiles -- confirm against the graphs.
model = AttentiveFP(
    in_channels=9,
    hidden_channels=64,
    out_channels=1,
    edge_dim=3,
    num_layers=4,
    num_timesteps=2,
    dropout=0.2,
)
model = model.to(device)

# Adam with lr = 10^-2.5 (about 3.16e-3) and weight decay = 1e-5.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=10 ** -2.5,
    weight_decay=10 ** -5,
)

## **Train and Test Functions**

In [7]:
def train():
    """Run one optimisation pass over ``train_loader`` and return the epoch RMSE.

    Uses the notebook-level globals ``model``, ``optimizer``, ``device`` and
    ``train_loader``.

    Returns:
        float: Square root of the sample-weighted mean of the per-batch MSE
        losses seen during this pass.
    """
    # test() and eval() put the model into eval mode; switch back explicitly,
    # otherwise dropout is disabled for every epoch after the first.
    model.train()
    total_loss = total_samples = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.edge_attr, data.batch)
        loss = F.mse_loss(out, data.y)
        loss.backward()
        optimizer.step()
        # The loss is a batch mean, so weight it by the batch size before summing.
        total_loss += float(loss) * data.num_graphs
        total_samples += data.num_graphs
    return sqrt(total_loss / total_samples)


@torch.no_grad()
def test(loader):
    mse = []
    model.eval()
    for data in loader:
        data = data.to(device)
        out = model(data.x, data.edge_index, data.edge_attr,data.batch)
        l = F.mse_loss(out, data.y, reduction='none').cpu()
        mse.append(l)
    rmse = float(torch.cat(mse, dim=0).mean().sqrt())
    return rmse

@torch.no_grad()
def eval(loader, fold=None):
    """Compute and print the R2 score of the global ``model`` on ``loader``.

    Note: this function shadows Python's built-in ``eval`` in the notebook.

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``edge_attr``, ``batch``, ``y``, ``smiles`` and ``to(device)``.
        fold: Optional fold label used in the printed message. When omitted,
            the notebook-level ``fold`` loop variable is used if it exists.

    Returns:
        The R2 score between the targets and the predictions.
    """
    if fold is None:
        # Backward compatible with callers that rely on the global loop variable.
        fold = globals().get('fold')

    output = []
    smi = []
    model.eval()
    for data in loader:
        data = data.to(device)
        out = model(data.x, data.edge_index, data.edge_attr, data.batch)
        concatenated_data = torch.cat((out, data.y.view(-1, 1)), dim=1)
        # Move to the CPU here: pandas/sklearn cannot consume CUDA tensors.
        output.append(concatenated_data.cpu())
        smi.append(data.smiles)

    # Stack the per-batch (pred, actual) pairs along the sample dimension.
    stacked_output = torch.cat(output, dim=0).numpy()
    stacked_smiles = np.concatenate(smi)
    results = pd.concat(
        [pd.DataFrame(stacked_output, columns=['pred', 'actual']),
         pd.DataFrame(stacked_smiles, columns=['smiles'])],
        axis=1,
    )
    r2 = r2_score(results['actual'], results['pred'])
    # The score is reported per fold (the original message said "epoch").
    label = f"fold {fold}" if fold is not None else "this loader"
    print(f"The R2 score for {label} is {r2}")
    return r2

## **Cross-Validation**

In [11]:
# Sanity check: look at the first of the five shuffled splits only.
kfold = KFold(n_splits=5, shuffle=True, random_state=50)
fold, (train_ids, test_ids) = next(enumerate(kfold.split(graph_list)))
print(train_ids)
print(len(train_ids))
print(len(test_ids))

[   0    1    2 ... 4189 4190 4191]
3353
839


In [None]:
# Inside a notebook __name__ is always '__main__', so this guard always runs.
# It only matters if the notebook is exported to a .py file and imported.
if __name__ == '__main__':

  # Configuration options
  k_folds = 5
  epochs = 75

  # Per-fold R2 scores
  results = []

  # Checkpoints go to Colab's /content when it exists, else the working
  # directory, so a run outside Colab does not crash after training.
  save_dir = '/content' if os.path.isdir('/content') else os.getcwd()

  # Define the K-fold Cross Validator
  kfold = KFold(n_splits=k_folds, shuffle=True, random_state=50)

  # Start print
  print('--------------------------------')

  # K-fold Cross Validation model evaluation
  for fold, (train_ids, test_ids) in enumerate(kfold.split(graph_list)):

    # Print
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = SubsetRandomSampler(train_ids)
    test_subsampler = SubsetRandomSampler(test_ids)

    # train() reads `train_loader` as a global, so the name must not change.
    train_loader = DataLoader(graph_list, batch_size=32, sampler=train_subsampler)
    test_loader = DataLoader(graph_list, batch_size=32, sampler=test_subsampler)

    # Start every fold from scratch: fresh weights AND a fresh optimizer.
    # Resetting only the model would carry Adam's moment estimates from the
    # previous fold into this one.
    model.reset_parameters()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=optimizer.defaults['lr'],
                                 weight_decay=optimizer.defaults['weight_decay'])

    for epoch in range(epochs):
      train_rmse = train()
      test_rmse = test(test_loader)
      if epoch % 15 == 0:
        print(f'Epoch: {epoch:03d}, Train Loss: {train_rmse:.4f} '
              f'Test Loss: {test_rmse:.4f}')

    # Process is complete.
    print('Training process has finished. Saving trained model.')

    # Print about testing
    print('Starting testing')

    # Saving the model
    save_path = os.path.join(save_dir, f'model-fold-{fold}.pth')
    torch.save(model.state_dict(), save_path)
    results.append(eval(test_loader))

  # Print fold results (inside the guard, next to the variables it uses)
  print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
  print('--------------------------------')
  print(f'Average test {k_folds} fold cross-val r2: {sum(results)/len(results)}')