# GNN project using HIV dataset

## Get Data and Create Features

In [26]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
import deepchem as dc
import torch
import torch_geometric
import numpy as np
from torch_geometric.loader import DataLoader
from torch.utils.data import Dataset, random_split
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
import deepchem as dc
import torch
import torch_geometric
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split

""""
# read dataset

dataset = pd.read_csv("C:\\Users\\rezav\\OneDrive\\Desktop\\data\\HIV_train.csv")
"C:\Users\rezav\OneDrive\Desktop\data"
# convert smiles string to RdKit format and extract node features and edge index using deepchem lib
G = {
    "node_features" : [],
    "edge_index" : [],
    "label" : []
}


for i in range(1000):
    mol = Chem.MolFromSmiles(dataset.iloc[i]["smiles"])
    featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
    f = featurizer._featurize(mol)
    G["node_features"].append(f.node_features)
    G["edge_index"].append(f.edge_index)
    G["label"].append(dataset.iloc[i]["HIV_active"])

graph_df = pd.DataFrame(data=G)

graph_df.head() """

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 103-104: truncated \UXXXXXXXX escape (234719960.py, line 33)

## under-sampling

In [27]:
root = "C:\\Users\\rezav\\OneDrive\\Desktop\\data\\"

dataset = pd.read_csv(root + "HIV.csv")

#under-sampling
        
positive = dataset[dataset['HIV_active']==1]
negative = dataset[dataset['HIV_active']==0]
negative = negative.sample(n=len(positive), random_state=101)
dataset = pd.concat([positive,negative],axis=0)
        
#split to train, test
        
train, test = train_test_split(dataset , test_size=0.3, random_state=42, shuffle = True)

train.to_csv(root + "raw\\HIV_train.csv", index = False)
test.to_csv(root + "raw\\HIV_test.csv", index = False)

## Dataset

In [29]:
import os.path as osp
import os

import torch
from torch_geometric.data import Dataset, download_url, Data


class MyOwnDataset(Dataset):
    def __init__(self, root, test, filename, transform=None, pre_transform=None, pre_filter=None):
        self.test = test
        self.filename = filename
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        return self.filename

    @property
    def processed_file_names(self):
        return "no.pt"

    def download(self):
        # Download to `self.raw_dir`.
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        
        idx = 0
        
        for index, x in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            
            #get smiles code and call featurizer
            
            mol = Chem.MolFromSmiles(x["smiles"])
            featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
            f = featurizer._featurize(mol)
            
            # get graph properties
            
            node_features = torch.tensor(f.node_features)
            edge_index = torch.tensor(f.edge_index)
            edge_features = torch.tensor(f.edge_features)
            label = torch.tensor(np.asarray([x["HIV_active"]]), dtype=torch.int64)
        
            data = Data(x = node_features,
                        edge_index = edge_index,
                        edge_attr = edge_features,
                        y = label,
                        smiles = x["smiles"])

            if self.test:
                torch.save(data, osp.join(self.processed_dir, f'data_test_{idx}.pt'))
            else:
                torch.save(data, osp.join(self.processed_dir, f'data_{idx}.pt'))
            idx += 1
            
    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        if self.test:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_{idx}.pt'))
        return data

## Dataloader

In [32]:
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
import numpy as np
from tqdm import tqdm
from torch_geometric.data import Data
   

#define dataset

train_ds = MyOwnDataset(root, False, "HIV_train.csv")
test_ds = MyOwnDataset(root, True, "HIV_test.csv")

# define data loader

train_dataloader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=32, shuffle=True)



Processing...
100%|██████████████████████████████████████████████████████████████████████████████| 1789/1789 [00:24<00:00, 72.83it/s]
Done!
Processing...
100%|████████████████████████████████████████████████████████████████████████████████| 767/767 [00:11<00:00, 68.47it/s]
Done!


## Model

In [40]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, GATv2Conv, TransformerConv
from torch_geometric.nn import global_mean_pool


class GNN(torch.nn.Module):
    def __init__(self, input_size, hidden_channels, conv, conv_params={}):
        super(GNN, self).__init__()
        torch.manual_seed(12345)
        
        self.conv1 = conv(
            30, 128)
        
        self.conv2 = conv(
            128, 256)
        
        self.conv3 = conv(
            256, 128)
        
        self.conv4 = conv(
            128, 64)
        
        self.lin1 = Linear(64, 16)
        self.lin2 = Linear(16, 1)
    
    def forward(self, x, edge_index, batch = None,  edge_col = None):
        
        # Node embedding 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        x = x.relu()
        x = self.conv4(x, edge_index)
        x = x.relu()
        # Readout layer
        batch = torch.zeros(x.shape[0],dtype=int) if batch is None else batch
        x = global_mean_pool(x, batch)
        
        # Final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin1(x)
        x = x.relu()
        x = self.lin2(x)
    
        return x
    
model = GNN(30, 16, GATConv)
print(model)

GNN(
  (conv1): GATConv(30, 128, heads=1)
  (conv2): GATConv(128, 256, heads=1)
  (conv3): GATConv(256, 128, heads=1)
  (conv4): GATConv(128, 64, heads=1)
  (lin1): Linear(in_features=64, out_features=16, bias=True)
  (lin2): Linear(in_features=16, out_features=1, bias=True)
)


## train

In [41]:

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

train_losses = []
train_accs = []

counter = 0

for data in train_dataloader:
    prd = model(data.x.float(), data.edge_index)
    optimizer.zero_grad()
    print(prd)
    print(data.x)
    print(data.y)
    loss = criterion(prd, data.y)
    loss.backward()
    optimizer.step()
    loss = loss.item()
    preds = prd.argmax(dim=1)
    correct += (preds == Y).sum().item()
    acc = 0
    for x in range(len(preds)):
        if preds[x] == y[x]:
            acc += 1
    acc /= len(y)
    train_losses.append(loss)
    train_accs.append(acc)



tensor([[-0.2721]], grad_fn=<AddmmBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
tensor([0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
        0, 1, 1, 0, 1, 1, 1, 1])


ValueError: Expected input batch_size (1) to match target batch_size (32).

In [None]:
import matplotlib.pyplot as plt

plt.title("train loss")
plt.plot(train_losses)
plt.show()
plt.title("train accuracy")
plt.plot(train_accs)
plt.show()


## Test

In [None]:
from sklearn.metrics import accuracy_score

real = []
prd = []


for data in test_dataloader:
    preds = []
    for features, target in data:
        out = model(features[0], features[1])
        preds.append(out)
        real.append(target)
    preds = np.argmax(prd, axis=1)
    prd = prd + preds
    acc = 0
    for x in range(len(preds)):
        if preds[x] == real[x]:
            acc += 1
    acc /= len(real)
    accuracy += acc
    train_accs.append(acc)
    
print("accuracy: " + accuracy_score(real, prd))