<a href="https://colab.research.google.com/github/pranavirohit/cancer-drug-discovery/blob/main/model_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
# Mount Google Drive at /content/drive; the CSV files read and written below
# live under /content/drive/MyDrive/Data.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
from numpy import vstack

## PyTorch

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor

In [None]:
from torch.utils.data import DataLoader
from torch.nn import ReLU
from torch.nn import Sigmoid

In [None]:
from torch.utils.data import TensorDataset

## Sklearn

In [None]:
import sklearn as sk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## TensorFlow

In [None]:
from tensorflow import keras
from keras import layers as L

# Preparing Input Data

In [None]:
# Load the source table (chembl_all_1_2.csv) from Drive.
# NOTE(review): the saved output shows that this call emitted a warning —
# presumably pandas' mixed-dtype warning; confirm and pass explicit dtypes if so.
chembl_all_1_2 = pd.read_csv('/content/drive/MyDrive/Data/chembl_all_1_2.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## SMILES Code

In [None]:
# SMILES vocabulary: a character's position in this list is its one-hot column index.
# Index 0 is the blank character ' '.
SMILES_CHARS = list(
    ' '
    '#%()+-./'
    '0123456789'
    '=@'
    'ABCFHIKLMNOP'
    'RSTVXZ'
    '[\\]'
    'abcegilnoprs'
    'tu'
)



*   An encoder takes a readable entity (such as the SMILES code) and transforms it into code, such as an array
*   A decoder does the inverse: it transforms code back into a readable entity
> Both are defined as functions to be applied later



In [None]:
# Defining an encoder and decoder.
# Lookup tables between a SMILES character and its position in SMILES_CHARS.
smi2index = {char: idx for idx, char in enumerate(SMILES_CHARS)}  # character -> column index
index2smi = dict(enumerate(SMILES_CHARS))                         # column index -> character

def smiles_encoder(smiles, maxlen=120, dtype=np.float64):
    """One-hot encode a SMILES string.

    Parameters
    ----------
    smiles : str
        The SMILES string; every character must be present in ``SMILES_CHARS``.
    maxlen : int, default 120
        Number of rows in the output. Strings shorter than ``maxlen`` leave the
        remaining rows all-zero.
    dtype : numpy dtype, default ``np.float64``
        Element type of the returned array. The default matches the historical
        output; a smaller type such as ``np.float32`` or ``np.uint8`` reduces
        memory when many molecules are encoded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(maxlen, len(SMILES_CHARS))`` where row ``i`` has a
        single 1 in the column of character ``smiles[i]``.

    Raises
    ------
    IndexError
        If ``smiles`` is longer than ``maxlen``.
    KeyError
        If ``smiles`` contains a character that is not in ``SMILES_CHARS``.
    """
    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ), dtype=dtype )

    # Same exception types as plain indexing would raise, but with a message
    # that identifies the offending string / character.
    if len( smiles ) > maxlen:
        raise IndexError(
            "SMILES string of length {} does not fit in maxlen={}: {!r}".format(
                len( smiles ), maxlen, smiles ) )
    try:
        # Explicit integer dtype so that an empty string yields a valid (empty) index.
        columns = np.array( [ smi2index[c] for c in smiles ], dtype=np.intp )
    except KeyError as err:
        raise KeyError(
            "character {!r} in SMILES {!r} is not in SMILES_CHARS".format(
                err.args[0], smiles ) ) from None

    # Row i gets a 1 in the column of the i-th character.
    X[ np.arange( len( columns ) ), columns ] = 1
    return X

def smiles_decoder( X ):
    """Turn a one-hot matrix back into a string.

    For every row of ``X`` the column holding the largest value is looked up in
    ``index2smi``; the resulting characters are concatenated in row order and
    returned as one string.
    """
    best_columns = X.argmax( axis=-1 )
    return ''.join( index2smi[ col ] for col in best_columns )

In [None]:
# Subset dataframe just for now so things compute faster.
# The negative class is down-sampled to at most 10000 rows. A fixed random_state makes
# the sample (and therefore every downstream result) reproducible across kernel
# restarts, and capping n at the number of available rows avoids a ValueError when
# fewer than 10000 negative rows exist.
non_cancer = chembl_all_1_2[chembl_all_1_2.cancer_status == False]
cas1 = non_cancer.sample(n=min(10000, len(non_cancer)), random_state=0)
# All positive rows are kept.
cas2 = chembl_all_1_2[chembl_all_1_2.cancer_status == True]

In [None]:
# Join the two subsets (cas1: sampled cancer_status == False rows, cas2: all
# cancer_status == True rows) into the dataset used for machine learning.
# The original row index of each subset is kept as-is.
chembl_all_1_2_smiles = pd.concat([cas1, cas2])

In [None]:
# Turn the literal string 'nan' into a real missing value, then drop every row
# whose SMILES is missing.
chembl_all_1_2_smiles['Smiles'] = chembl_all_1_2_smiles['Smiles'].replace('nan', np.nan)
chembl_all_1_2_smiles = chembl_all_1_2_smiles.dropna(subset=['Smiles'])

In [None]:
# Keep only the label and the SMILES string, then display (rows, columns).
chembl_all_1_2_smiles = chembl_all_1_2_smiles.loc[:, ['cancer_status', 'Smiles']]
chembl_all_1_2_smiles.shape

(10987, 2)

In [None]:
# Save the prepared table to Drive; index = False leaves the row index out of the CSV.
chembl_all_1_2_smiles.to_csv('/content/drive/MyDrive/Data/chembl_all_1_2_smiles.csv', index = False)

In [None]:
# Sanity check: encode the two-character SMILES "CN"; the result has one
# one-hot row per character (maxlen = 2 rows in total).
print(smiles_encoder("CN", maxlen =  2))

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]]


In [None]:
# Reload the prepared (cancer_status, Smiles) table from the CSV saved on Drive.
chembl_all_1_2_smiles = pd.read_csv('/content/drive/MyDrive/Data/chembl_all_1_2_smiles.csv')

In [None]:
# Length of the longest SMILES string (after str() conversion); used as the
# number of rows for every encoded molecule.
max_length = max(len(str(smi)) for smi in chembl_all_1_2_smiles.Smiles)


*   Changing the SMILES code into a numpy array, then transforming it into a flat list instead of a two-dimensional array
> Flat lists are easier to input into neural network, although it is possible to input the 2-dimensional array instead of a list directly

*   Flat lists do not contain any nested lists, such as below (no lists within a list)

```
my_list = [1, 2, 3, [4, 5], 6]
```




In [None]:
# Creating input dataset for neural network.
# Each SMILES string is one-hot encoded to a (max_length, len(SMILES_CHARS)) array and
# flattened to a 1-D vector, so the result is a Series holding one vector per molecule.
# The encoder allocates with np.zeros and no explicit dtype, i.e. float64 elements.
chembl_all_1_2_smiles_flat = chembl_all_1_2_smiles['Smiles'].apply(lambda x: smiles_encoder(x, maxlen = max_length).flatten())

## Tensor Input

In [None]:
# Split test and train dataset here

In [None]:
# Stack the per-molecule vectors into ONE float32 NumPy array before handing them to
# PyTorch: building a tensor directly from a Python list of ndarrays is extremely slow
# (PyTorch emits a UserWarning about exactly this). float32 is the dtype that
# torch.Tensor(...) produced before, so downstream code sees the same tensor.
tensor_x = torch.from_numpy(np.array(chembl_all_1_2_smiles_flat.tolist(), dtype = np.float32))
# Labels as a float32 vector, one entry per row (False -> 0.0, True -> 1.0).
tensor_y = torch.from_numpy(chembl_all_1_2_smiles['cancer_status'].to_numpy(dtype = np.float32))

  tensor_x = torch.Tensor(chembl_all_1_2_smiles_flat.tolist())


In [None]:
# Store the labels as 32-bit integers.
tensor_y = tensor_y.to(torch.int32)

In [None]:
# Convert the PyTorch feature / label tensors into TensorFlow tensors for the
# Keras model below. `x` holds the features and `y` the labels.
import tensorflow as tf
x = tf.convert_to_tensor(tensor_x)
y = tf.convert_to_tensor(tensor_y)

In [None]:
# Inspect the feature tensor (abbreviated values, shape and dtype).
print(x)

tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(10987, 82768), dtype=float32)


In [None]:
# Inspect the label tensor (abbreviated values, shape and dtype).
print(y)

tf.Tensor([0 0 0 ... 1 1 1], shape=(10987,), dtype=int32)


In [None]:
# tensor_x_shape = tensor_x.shape
# print(tensor_x_shape)
# The PyTorch tensors have been converted into TensorFlow tensors, so the shape of
# the TensorFlow feature tensor is inspected instead: (number of rows, vector length).
tx_shape = x.shape
print(tx_shape)

(10987, 82768)


In [None]:
# tensor_y_shape = tensor_y.shape
# print(tensor_y_shape)
# The PyTorch tensors have been converted into TensorFlow tensors, so the shape of
# the TensorFlow label tensor is inspected instead: one label per row.
ty_shape = y.shape
print(ty_shape)

(10987,)


In [None]:
# my_dataset = TensorDataset(tensor_x, tensor_y) # create your datset
# my_dataloader = DataLoader(my_dataset) # create your dataloader

### Training and Test Datasets

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Model Building

## TensorFlow

In [None]:
# Start from an empty Keras Sequential model; layers are appended in a later cell.
# NOTE: the name `model` is rebound to PyTorch models further down the notebook.
model = keras.models.Sequential()

In [None]:
# model.add(L.Input((82768,)))

In [None]:
# Fully connected stack: four ReLU hidden layers of decreasing width, followed by a
# single sigmoid unit for the binary output.
for hidden_units in (256, 128, 36, 16):
    model.add(L.Dense(hidden_units, activation = "relu"))
model.add(L.Dense(1, activation = "sigmoid"))

In [None]:
# Adam optimizer with binary cross-entropy loss.
# NOTE(review): no `metrics` are requested here, so Keras should track only the loss
# during fit/evaluate — confirm before reading any evaluate() result as accuracy.
model.compile(optimizer = "adam", loss = "binary_crossentropy")

In [None]:
# Train on the full x / y tensors (no validation data is passed) in mini-batches of 10.
# NOTE(review): the saved output shows this run was interrupted by hand during
# epoch 15 of the 1000 requested epochs.
model.fit(x,y, epochs = 1000, batch_size = 10)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
 179/1099 [===>..........................] - ETA: 30s - loss: 0.0028

KeyboardInterrupt: ignored

In [None]:
# The model was compiled without `metrics`, so `model.evaluate` returns the binary
# cross-entropy LOSS, not an accuracy. It is kept for its loss report only.
eval_loss = model.evaluate(x, y)

# Compute accuracy explicitly: threshold the sigmoid outputs at 0.5 and compare the
# resulting 0/1 predictions with the labels.
# NOTE: x / y are the same data the model was trained on, so this is training
# accuracy, not an estimate of performance on unseen molecules.
predicted_labels = (np.asarray(model.predict(x)).ravel() >= 0.5).astype("int32")
accuracy = float(np.mean(predicted_labels == np.asarray(y).ravel()))
print("Model accuracy: %.2f" %(accuracy * 100))

In [None]:
# Predicted probability for every row of x; each prediction is indexed with [0] and
# rounded to the nearest integer before printing (one list entry per row).
# The comprehension variable `x` is local to the comprehension, so the feature
# tensor `x` is not overwritten.
predictions = model.predict(x)
print([round(x[0]) for x in predictions])

## PyTorch

In [None]:
# Defining the convolutional neural network.
class CNN(nn.Module):
    """1-D convolutional binary classifier.

    Five ``Conv1d`` layers (kernel size 3, padding 1, so the sequence length is
    preserved), each followed by ReLU, then dropout, global max pooling over the
    sequence axis, and a linear + sigmoid output layer.

    Parameters
    ----------
    input_size : int
        Number of input channels of the first convolution.
    output_size : int
        Number of output units (1 for a single probability).
    hidden_sizes : sequence of int
        Output channels of the five convolutions; at least five entries are
        required and the first five are used.
    dropout : float, default 0.5
        Dropout probability applied after the last convolution.

    Input / output
    --------------
    ``forward`` expects a tensor of shape ``(batch, input_size, length)`` and
    returns a tensor of shape ``(batch, output_size)`` with values in [0, 1].
    """

    def __init__(self, input_size, output_size, hidden_sizes, dropout = 0.5):
        super(CNN, self).__init__()

        # Defining five layers.
        self.conv1 = nn.Conv1d(input_size, hidden_sizes[0], kernel_size = 3, padding = 1)
        self.conv2 = nn.Conv1d(hidden_sizes[0], hidden_sizes[1], kernel_size = 3, padding = 1)
        self.conv3 = nn.Conv1d(hidden_sizes[1], hidden_sizes[2], kernel_size = 3, padding = 1)
        self.conv4 = nn.Conv1d(hidden_sizes[2], hidden_sizes[3], kernel_size = 3, padding = 1)
        self.conv5 = nn.Conv1d(hidden_sizes[3], hidden_sizes[4], kernel_size = 3, padding = 1)

        # Defining the activation function.
        self.relu = nn.ReLU() # ReLU passes positive values through and zeroes out negative ones.

        # Defining the dropout layer, ensures model is less prone to overfitting.
        self.dropout = nn.Dropout(dropout)

        # Global max pooling over the sequence axis: (batch, channels, length) ->
        # (batch, channels, 1). Without it the flattened tensor has channels * length
        # features and no longer matches the linear layer for any length other than 1.
        # The layer has no parameters, so existing state_dicts stay loadable.
        self.pool = nn.AdaptiveMaxPool1d(1)

        # Defining the output layer; its input width is the channel count of conv5.
        self.fc = nn.Sequential(
            nn.Linear(hidden_sizes[4], output_size),
            nn.Sigmoid() # Squashes the output into the range [0, 1].
        )

    def forward(self, x):
        """Return the sigmoid output for a batch shaped (batch, input_size, length)."""
        # Applying the activation function to each layer.
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = self.relu(self.conv4(x))
        x = self.relu(self.conv5(x))

        # Applying the dropout layer, collapsing the sequence axis and flattening
        # to (batch, channels).
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)

        # Applying the output layer, there is currently only one return value.
        x = self.fc(x)

        return x



*   Describe model (# of input layers, etc.)
*   List item



In [None]:
# One input channel, five convolutions with 10 channels each, one output unit.
# Rebinds `model`, which previously referred to the Keras model.
model = CNN(
    input_size = 1,
    output_size = 1,
    hidden_sizes = [10] * 5,
    dropout = 0.5,
)

In [None]:
# Wrap the feature / label tensors in a Dataset. `random_split` operates on datasets
# (it indexes individual samples), not on DataLoaders, and `my_dataloader` is not
# defined by any active cell of this notebook.
my_dataset = TensorDataset(tensor_x, tensor_y)

# Split the dataset to create a test set (will only use when model is final)
train_all, test_final = torch.utils.data.random_split(my_dataset, [0.8, 0.2], generator=torch.Generator().manual_seed(0))

# Sometimes, you might want to know how your model performs at the moment, so you can split again
# Splitting the datasets to assess how model is performing at that exact moment
train, test = torch.utils.data.random_split(train_all, [0.8, 0.2], generator=torch.Generator().manual_seed(0))

In [None]:
# Serve the training split in shuffled mini-batches of 64 samples.
train_dataloader = DataLoader(train, batch_size=64, shuffle=True)

In [None]:
# Number of samples in the `test` split.
len(test)

In [None]:
# Define the loss and the optimizer
criterion =  torch.nn.BCELoss() # MSELoss: mean squared loss for regression, BCELoss: Binary cross-entropy loss for binary classification, CrossEntropyLoss: Categorical cross-entropy loss for multi-class classification
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)

# Loop through the training epochs and make mini-batches for the stochastic gradient descent
for epoch in range(10): # usually, > 100, but likely around 1000x
  print(epoch)
  # enumerate mini batches
  for i, (inputs, targets) in enumerate(train_dataloader):
    #print(i, inputs, targets)
    # Conv1d expects (batch, channels, length); the flattened feature vectors arrive
    # as (batch, length), so add the single input channel the model was built with.
    if inputs.dim() == 2:
      inputs = inputs.unsqueeze(1)
    # clear the gradients
    optimizer.zero_grad()
    # compute the model output
    yhat = model(inputs)
    # BCELoss needs floating-point targets with the same shape as the predictions;
    # the labels are stored as an int32 vector of shape (batch,).
    targets = targets.to(yhat.dtype).view_as(yhat)
    # calculate loss
    loss = criterion(yhat, targets)
    # credit assignment
    loss.backward()
    # update model weights
    optimizer.step()

In [None]:
import torch.nn as nn
import torch.nn.functional as F

# Every PyTorch model subclasses torch.nn.Module
class GarmentClassifier(nn.Module):
    """Small 2-D convolutional classifier with 10 output scores.

    Two (5x5 convolution -> ReLU -> 2x2 max pool) stages are followed by three
    fully connected layers. The flatten step assumes the second pooling stage
    yields 16 feature maps of 4x4 values.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the 10 raw class scores for each sample in ``x``."""
        # Convolutional stages: conv -> ReLU -> pool.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten to one row of 16 * 4 * 4 features per sample.
        x = x.view(-1, 16 * 4 * 4)
        # Hidden fully connected layers with ReLU; the last layer stays linear.
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)


model = GarmentClassifier()

In [None]:
# Multi-class cross-entropy loss used by the training cells below.
loss_fn = nn.CrossEntropyLoss()

# Loss functions work on batches, so a dummy batch of 4 samples is built here:
# one row of 10 scores per sample (the model's confidence in each class) ...
dummy_outputs = torch.rand(4, 10)
# ... and the index of the correct class for each of the 4 samples.
dummy_labels = torch.tensor([1, 5, 3, 7])

print(dummy_outputs, dummy_labels, sep='\n')

loss = loss_fn(dummy_outputs, dummy_labels)
print('Total loss for this batch: {}'.format(loss.item()))

In [None]:
# Optimizers specified in the torch.optim package
# SGD with momentum over the parameters of the current `model`; this rebinds the
# `optimizer` name used by the earlier training cell.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [None]:
def train_one_epoch(epoch_index, loader=None):
    """Run one optimisation pass over the training data.

    Uses the module-level ``model``, ``optimizer`` and ``loss_fn``.

    Parameters
    ----------
    epoch_index : int
        Index of the current epoch. Kept for interface compatibility; it is not
        used in the computation.
    loader : iterable of (inputs, labels), optional
        Source of training batches. Defaults to the module-level ``train``
        object. Pass a ``DataLoader`` to train on mini-batches.

    Returns
    -------
    float
        Average loss of the most recent complete window of 1000 batches. If the
        epoch has fewer than 1000 batches, the average over all batches seen
        (0.0 when there were none).
    """
    if loader is None:
        loader = train

    running_loss = 0.   # loss accumulated since the last report
    last_loss = 0.
    epoch_loss = 0.     # loss accumulated over the whole epoch
    batches_seen = 0

    # enumerate() is used so that the batch index is available for
    # intra-epoch reporting.
    for i, data in enumerate(loader):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss
        batches_seen += 1
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.

    # Short epochs never reach a 1000-batch report; fall back to the epoch average
    # instead of returning the initial 0.0.
    if 0 < batches_seen < 1000:
        last_loss = epoch_loss / batches_seen

    return last_loss

In [None]:
from numpy.core.arrayprint import printoptions
# Initializing in a separate cell so we can easily add more epochs to the same run
from datetime import datetime

epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

# Identifies this run in the names of the saved checkpoints.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# NOTE(review): `validation_loader` is not defined in any visible cell of this
# notebook; it must be created (an iterable of (inputs, labels) batches from a
# held-out split) before this cell can run.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number)

    # Switch to evaluation mode for reporting
    model.train(False)

    running_vloss = 0.0
    validation_batches = 0
    # No gradients are needed for validation; disabling autograd avoids building
    # (and keeping alive) a graph for every validation batch.
    with torch.no_grad():
        for i, vdata in enumerate(validation_loader):
            vinputs, vlabels = vdata
            voutputs = model(vinputs)
            vloss = loss_fn(voutputs, vlabels)
            # Accumulate a plain float rather than a tensor.
            running_vloss += vloss.item()
            validation_batches += 1

    # Count the batches explicitly so that an empty loader cannot reuse a stale `i`.
    avg_vloss = running_vloss / max(validation_batches, 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    print('Training', avg_loss, 'Validation', avg_vloss, epoch_number + 1)

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

In [None]:
# Train the model
# NOTE(review): `my_dataloader` is only assigned in a commented-out cell earlier in
# the notebook, so on a fresh kernel this expression raises NameError — confirm
# whether this cell is still needed.

my_dataloader

In [None]:
# model = MLP(17)

# # Define the optimizer
# criterion =  torch.nn.BCELoss() # MSELoss: mean squared loss for regression, BCELoss: Binary cross-entropy loss for binary classification, CrossEntropyLoss: Categorical cross-entropy loss for multi-class classification
# optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)

# # Loop through the training epochs and make mini-batches for the stochastic gradient descent
# for epoch in range(10): # usually, > 100, but likely around 1000x
#   print(i)
#   # enumerate mini batches
#   for i, (inputs, targets) in enumerate(train_dl):
#     #print(i, inputs, targets)
#     # clear the gradients
#     optimizer.zero_grad()
#     # compute the model output
#     yhat = model(inputs)
#     # calculate loss
#     loss = criterion(yhat, targets)
#     # credit assignment
#     loss.backward()
#     # update model weights
#     optimizer.step()