# About

A simple CNN binary classification model for predicting the presence of the Stereochemical Layer.  

The Stereochemical Layer is represented by the sequential */t/m/s* tags in an InChI string. If a molecule contains one of the bonds shown in the image below, its InChI must contain the */t* tag.

Here, I trained a binary classifier using PyTorch.

To train the model, I created a balanced dataset in which 50% of the samples carry the */t* tag (i.e. contain stereo bonds) and 50% do not.


![](https://natefsi.weebly.com/uploads/5/4/8/7/54874551/944076.png?493)
 Image from: https://natefsi.weebly.com/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms

In [None]:
# Notebook-wide hyper-parameters
CFG = dict(
    batch=20,  # mini-batch size passed to every DataLoader in this notebook
    epoch=30,  # number of training epochs; increase for better accuracy
)

In [None]:
#Define dataset
class ImageDataset(Dataset):
    """Labelled molecule images, loaded lazily from disk.

    Each item is an ``(image, label)`` pair. The image file is read as a
    single-channel (2-D) array, replicated to 3 channels, scaled to [0, 1],
    normalized with the ImageNet mean/std and resized to ``size x size``.

    Args:
        filenames: Sequence of image paths relative to ``root``.
        y_vals: Sequence of labels, aligned with ``filenames``.
        size: Edge length (pixels) of the square output image.
        root: Prefix prepended to every filename. Defaults to the Kaggle
            input directory used by this notebook.

    NOTE: the normalization result used to be discarded, so earlier runs fed
    un-normalized [0, 1] images to the network. Weights trained that way
    should be retrained with this dataset.
    """

    def __init__(self, filenames, y_vals, size, root="../input/bms-molecular-translation/"):
        self.filenames = filenames
        self.y_vals = y_vals
        self.size = size
        self.root = root
        # Build the transforms once instead of once per item.
        self.resize = transforms.Resize((size, size))
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                              std=[0.229, 0.224, 0.225])

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        # The context manager guarantees the file handle is released.
        with Image.open(self.root + self.filenames[idx]) as img:
            pixels = np.array(img)

        # (H, W) -> (1, H, W) -> (3, H, W), scaled to [0, 1].
        image = torch.FloatTensor(pixels)[None, :, :].repeat((3, 1, 1)) / 255.
        # Normalize is NOT in-place by default: its return value must be kept.
        image = self.normalize(image)

        X = self.resize(image)
        y = self.y_vals[idx]

        return X, y

In [None]:
#Read data and split
featured_data = pd.read_csv('../input/featured-train/featured_data.csv')
# Partition the rows by the binary "/t" label (0 = tag absent, 1 = tag present)
t_label = featured_data["/t"]
featured_data_0 = featured_data[t_label == 0]
featured_data_1 = featured_data[t_label == 1]
featured_data.head()

In [None]:
#Define data loaders (5000 for train) (2240 for test) (balance the classes)
# Equal-sized positional slices from each class keep both splits balanced.
featured_data_train = pd.concat(
    [featured_data_0.iloc[:2500], featured_data_1.iloc[:2500]],
    axis=0,
)
featured_data_test = pd.concat(
    [featured_data_0.iloc[395000:396120], featured_data_1.iloc[395000:396120]],
    axis=0,
)

train_data = ImageDataset(
    filenames=featured_data_train["path"].tolist(),
    y_vals=featured_data_train["/t"].tolist(),
    size=224,
)
train_loader = DataLoader(train_data, batch_size=CFG["batch"], shuffle=True, num_workers=0)

test_data = ImageDataset(
    filenames=featured_data_test["path"].tolist(),
    y_vals=featured_data_test["/t"].tolist(),
    size=224,
)
test_loader = DataLoader(test_data, batch_size=CFG["batch"], shuffle=True, num_workers=0)

In [None]:
#Define Network
class Net1(nn.Module):
    """Small CNN that outputs the probability of the positive class.

    Architecture: three ``conv3x3 -> leaky_relu -> maxpool(2)`` stages
    (3 -> 32 -> 64 -> 128 channels) followed by a four-layer fully connected
    head (512 -> 256 -> 32 -> 1) with dropout (p=0.2) and a final sigmoid.

    Input:  float tensor of shape ``(N, 3, 224, 224)``. The three 2x poolings
            reduce 224 to 28, which is what ``fc1`` is sized for.
    Output: float tensor of shape ``(N, 1)`` with values in [0, 1].
    """

    def __init__(self):
        super().__init__()
        # convolutional layers
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        # max pooling layer (shared by all three stages)
        self.pool = nn.MaxPool2d(2, 2)
        # FC network
        self.fc1 = nn.Linear(128 * 28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 32)
        self.fc4 = nn.Linear(32, 1)
        # Dropout
        self.dropout_fc = nn.Dropout(0.2)

    def forward(self, x):
        # sequence of convolutional and max pooling layers
        x = self.pool(F.leaky_relu(self.conv1(x)))
        x = self.pool(F.leaky_relu(self.conv2(x)))
        x = self.pool(F.leaky_relu(self.conv3(x)))
        # Flatten everything except the batch dimension. Unlike
        # view(-1, 128*28*28), this cannot silently merge samples when the
        # spatial size is wrong -- fc1 raises a shape error instead.
        x = torch.flatten(x, 1)
        x = F.leaky_relu(self.fc1(x))
        x = self.dropout_fc(x)
        x = F.leaky_relu(self.fc2(x))
        x = F.leaky_relu(self.fc3(x))
        x = self.dropout_fc(x)
        # torch.sigmoid replaces the legacy F.sigmoid alias
        return torch.sigmoid(self.fc4(x))

In [None]:
#Test function
def test_loop(dataloader, model, device, loss_fn):
    
    total_loss=0
    total = 0
    total_correct=0
    with torch.no_grad():
        for data in dataloader:
            # images, labels = data
            images, labels = data[0].to(device), data[1].to(device)
            outputs = model(images)
            total += 1
            
            #Get Loss
            total_loss +=loss_fn(outputs, labels.resize(20,1).to(torch.float32)).item()
            
            #Count correct predictions
            for i, predict in enumerate(outputs):
                if (predict.item() >= 0.5) and (data[1][i].item()==1):
                    total_correct += 1       
                elif (predict.item() < 0.5) and (data[1][i].item()==0):
                    total_correct += 1
                # print(predict.item() ," - ", data[1][i].item())
    
    print('Test loss:', str(total_loss / total)[0:5], "\tAccuracy:", str(total_correct / ((total * CFG["batch"]) / 100))[0:5],)
    return total_loss / total

In [None]:
# Seed the RNG before building the network so its initial weights are reproducible
torch.manual_seed(4)
net = Net1()
# Use the first CUDA GPU when one is available, otherwise the CPU
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)
net.to(device)

In [None]:
#Define loss function and optimizer
import torch.optim as optim
# Binary cross-entropy computed on the probabilities produced by the network
criterion = nn.BCELoss()
# SGD with momentum over all parameters of the network
sgd_settings = {"lr": 0.03, "momentum": 0.9}
optimizer = optim.SGD(net.parameters(), **sgd_settings)

In [None]:
# Training loop (for memory problems select new training data in each epoch)
train_loss = []
test_loss = []

SAMPLES_PER_CLASS = 2500  # rows taken from each class for one epoch
LOG_EVERY = 40            # mini-batches between two log / evaluation points

for epoch in range(CFG["epoch"]):  # each epoch trains on a fresh slice of the data
    # Select the data part for this epoch and rebuild the loader
    start = epoch * SAMPLES_PER_CLASS
    end = start + SAMPLES_PER_CLASS
    featured_data_train = pd.concat([featured_data_0[start:end], featured_data_1[start:end]], axis=0)
    train_data = ImageDataset(filenames=featured_data_train["path"].tolist(),
                              y_vals=featured_data_train["/t"].tolist(),
                              size=224)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=CFG["batch"], shuffle=True, num_workers=0)

    net.train()  # make sure dropout is active while training
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):

        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        # Shape the targets from the actual batch size so a smaller final
        # batch does not break the loss computation.
        targets = labels.reshape(-1, 1).to(torch.float32)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % LOG_EVERY == LOG_EVERY - 1:    # log every LOG_EVERY mini-batches
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / LOG_EVERY))
            train_loss.append(running_loss / LOG_EVERY)

            # Evaluate with dropout disabled, then resume training mode.
            net.eval()
            test_loss.append(test_loop(test_loader, net, device, criterion))
            net.train()

            running_loss = 0.0

In [None]:
# Final evaluation on the test loader: switch the network to eval mode first
# (this disables its dropout layer), then put it back into training mode.
net.eval()
test_loop(test_loader,net,device,criterion)
net.train()

In [None]:
#Plot Loss
# One x position per logged evaluation point, numbered from 1
n_points = len(test_loss)
x = np.linspace(1, n_points, n_points)
curves = (
    (test_loss, '-g', 'test loss'),
    (train_loss, ':b', 'train loss'),
)
for values, line_style, legend_name in curves:
    plt.plot(x, values, line_style, label=legend_name)
plt.legend();

In [None]:
# Plot Images as a batch grid
def imshow(img):
    """Display a channel-first image tensor in a 20x20 inch figure.

    Args:
        img: Tensor of shape ``(C, H, W)``. It may live on any device and may
            be attached to an autograd graph; it is detached and copied to
            the CPU before conversion to NumPy.
    """
    # .numpy() only works on CPU tensors that do not require grad.
    npimg = img.detach().cpu().numpy()
    plt.figure(figsize=(20, 20))
    # matplotlib expects channel-last (H, W, C) arrays
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

torch.set_printoptions(precision=3)
#Get example batch predictions for comparison
GRID_COLS = 5  # images per row in the grid and values per printed row

dataiter = iter(test_loader)
# The iterator's .next() method no longer exists; use the next() builtin.
images, labels = next(dataiter)

# Predict with dropout disabled and without building an autograd graph,
# then restore whichever mode the network was in.
was_training = net.training
net.eval()
with torch.no_grad():
    outputs = net(images.to(device))
net.train(was_training)

imshow(torchvision.utils.make_grid(images, nrow=GRID_COLS))
print("Preds:")
print(outputs.reshape(-1, GRID_COLS))
print("True values:")
print(labels.reshape(-1, GRID_COLS))

In [None]:
#Define Test set class (with rotation info)
class ImageTest(Dataset):
    """Unlabelled molecule images with an optional 90-degree rotation.

    Each item is a single image tensor. The image file is read as a
    single-channel (2-D) array, replicated to 3 channels, scaled to [0, 1],
    rotated by 90 degrees when its ``rotated`` flag equals 1, normalized with
    the ImageNet mean/std and resized to ``size x size``.

    Args:
        filenames: Sequence of image paths relative to ``root``.
        rotated: Sequence of flags aligned with ``filenames``; a value of 1
            means the image is rotated once with ``torch.rot90``.
        size: Edge length (pixels) of the square output image.
        root: Prefix prepended to every filename. Defaults to the Kaggle
            input directory used by this notebook.

    NOTE: the normalization result used to be discarded, so earlier runs fed
    un-normalized [0, 1] images to the network.
    """

    def __init__(self, filenames, rotated, size, root="../input/bms-molecular-translation/"):
        self.filenames = filenames
        self.rotated = rotated
        self.size = size
        self.root = root
        # Build the transforms once instead of once per item.
        self.resize = transforms.Resize((size, size))
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                              std=[0.229, 0.224, 0.225])

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        # The context manager guarantees the file handle is released.
        with Image.open(self.root + self.filenames[idx]) as img:
            pixels = np.array(img)

        # (H, W) -> (1, H, W) -> (3, H, W), scaled to [0, 1].
        image = torch.FloatTensor(pixels)[None, :, :].repeat((3, 1, 1)) / 255.
        if self.rotated[idx] == 1:
            # Rotate in the spatial plane (dims 1 and 2), leaving channels alone.
            image = torch.rot90(image, k=1, dims=(1, 2))
        # Normalize is NOT in-place by default: its return value must be kept.
        image = self.normalize(image)

        X = self.resize(image)

        return X

In [None]:
#Test the original test data
featured_test_data = pd.read_csv('../input/featured-train/featured_test_data_rot.csv')
featured_test_sample = featured_test_data[:100]
external_test_data = ImageTest(filenames=featured_test_sample["path"].tolist(),
                               rotated=featured_test_sample["rot_flag"].tolist(),
                               size=224)
external_test_loader = torch.utils.data.DataLoader(external_test_data, batch_size=CFG["batch"], shuffle=False, num_workers=0)

EXTERNAL_GRID_COLS = 5  # images per row in the grid and values per printed row

torch.set_printoptions(precision=3)
dataiter = iter(external_test_loader)
# The iterator's .next() method no longer exists; use the next() builtin.
images = next(dataiter)

# Predict with dropout disabled and without building an autograd graph,
# then restore whichever mode the network was in.
was_training = net.training
net.eval()
with torch.no_grad():
    outputs = net(images.to(device))
net.train(was_training)

imshow(torchvision.utils.make_grid(images, nrow=EXTERNAL_GRID_COLS))
print("Preds:")
print(outputs.reshape(-1, EXTERNAL_GRID_COLS))

In [None]:
#Save model
#PATH = './t_pred_net.pth'
#torch.save(net.state_dict(), PATH)