# Identify different types of cell nuclei in a colon cancer sample
<hr>

## Overall Goal : 

Train a deep neural network which can take a 64x64 image with a cell nuclei at the centre and classify it into one of the following types:

1.Normal epithelial cells (shown orange).

2.Cancer epithelial cells (shown in red).

3.Immune Leukocyte cells (shown in green).

4.Connective fibroblast cells (shown in blue).

<br>
<br>

### File descriptions

train.csv - the training set containing references to the image files and their cell type

train.zip - zip file containing the training images

test.zip - zip file containing the test images

example.csv - a sample submission file in the correct format

<br>

### Approach :
I divided this notebook into three major parts. 

### Part 1
The first was file management. Here I split the images into training and testing datasets and put them in individual folders according to their labels. 
For the splitting into folders, I take ideas from the following tutorial (https://www.youtube.com/watch?v=cVXfqGy4CUo&t=606s). 

In [26]:
#Load libraries
import os
import numpy as np
import torch
import glob
import torch.nn as nn
from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.autograd import Variable
import torchvision
import pathlib
from pandas import read_csv
from numpy.random import RandomState

###### Subpart 1 

I load the CSV file and from there I extract a list of labels. 


In [27]:
map_file = read_csv('train.csv')
length = len(map_file)
print(length)
labels = set()
for i in range(len(map_file)):
    # convert spaced separated tags into an array of tags
    type = map_file['Type'][i].split(' ')
    # add tags to the set of known labels
    labels.update(type)
#print(labels)
labels = list(labels)
labels.sort()
classes = labels
classes

2190


['Cancer', 'Connective', 'Immune', 'Normal']

#####  Subpart 2
Now I separate the dataset into training and testing datasets

In [28]:
rng = RandomState()
train_labels = map_file.sample(frac=0.7, random_state=rng)
test_labels = map_file.loc[~map_file.index.isin(train_labels.index)]

###### Subpart 3
Create separate folders for train and test

In [29]:
from __future__ import print_function
import pandas as pd
import shutil
import os
import sys
from os.path import join


train_dir =r'train/train'
train_path = r"train_labeled"
if not os.path.exists(train_path):
    os.mkdir(train_path)

for filename, class_name in train_labels.values:
    # Create subdirectory with `class_name`
    if not os.path.exists(train_path + '/'+ str(class_name)):
        os.mkdir(train_path + '/'+ str(class_name))
    src_path = train_dir + '/'+ str(filename) + '.png'
    dst_path = train_path + '/'+ str(class_name) + '/' + str(filename) + '.png'
    try:
        shutil.copy(src_path, dst_path)
        print("sucessful")
    except:
        print('Error')

sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful


sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful


In [30]:
test_dir =r'train/train'
test_path = r"test_labeled"
if not os.path.exists(test_path):
    os.mkdir(test_path)

for filename, class_name in test_labels.values:
    # Create subdirectory with `class_name`
    if not os.path.exists(test_path + '/'+ str(class_name)):
        os.mkdir(test_path + '/'+ str(class_name))
    src_path = test_dir + '/'+ str(filename) + '.png'
    dst_path = test_path + '/'+ str(class_name) + '/' + str(filename) + '.png'
    try:
        shutil.copy(src_path, dst_path)
        print("sucessful")
    except:
        print('Error')

sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful
sucessful


### Disclaimer: The above two codes SHOULD NOT be run multiple times. If you have already divided the folders once, please do not repeat

### Part 2

Now we create the convolutional network and train it on the training dataset. I took inspiration from the following CNN tutorial online - https://www.youtube.com/watch?v=9OHlgDjaE2I, and the following website - https://towardsdatascience.com/improves-cnn-performance-by-applying-data-transformation-bf86b3f4cef4

In [31]:
use_cuda = True
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")
device

device(type='cuda')

###### Transform function 
In our Lab for day 3, the code transformed the dataset as they were being loaded, but I found that writing a function is more intuitive and many of the online tutorials did the same

In [32]:
transformer=transforms.Compose([
    transforms.Resize((64,64)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(), 
])

###### Loading the data

In [33]:
load_train=DataLoader(
    torchvision.datasets.ImageFolder(train_path,transform=transformer),
    batch_size=64, shuffle=True
)
load_test=DataLoader(
    torchvision.datasets.ImageFolder(test_path,transform=transformer),
    batch_size=32, shuffle=True
)

###### CNN Network

In [34]:
class ConvNet(nn.Module):
    def __init__(self,num_classes=4):
        super(ConvNet,self).__init__()
        
        self.conv1=nn.Conv2d(in_channels=3,out_channels=10,kernel_size=3,stride=1,padding=1)
        self.bn1=nn.BatchNorm2d(num_features=10)
        self.relu1=nn.ReLU()
        
        self.pool=nn.MaxPool2d(kernel_size=2)
        
        
        
        self.conv2=nn.Conv2d(in_channels=10,out_channels=24,kernel_size=3,stride=1,padding=1)
        self.relu2=nn.ReLU()
    
        
        
        
        self.conv3=nn.Conv2d(in_channels=24,out_channels=32,kernel_size=3,stride=1,padding=1)
        self.bn3=nn.BatchNorm2d(num_features=32)
        self.relu3=nn.ReLU()
    
        
        
        self.fc=nn.Linear(in_features=32 **3,out_features=num_classes)
        
        
        
        #Feed forwad function
        
    def forward(self,input):
        output=self.conv1(input)
        output=self.bn1(output)
        output=self.relu1(output)
            
        output=self.pool(output)
            
        output=self.conv2(output)
        output=self.relu2(output)
            
        output=self.conv3(output)
        output=self.bn3(output)
        output=self.relu3(output)
            
            
        
            
        output=output.view(-1,32**3)
            
            
        output=self.fc(output)
            
        return output

In [35]:
model=ConvNet(num_classes=4).to(device)

In [36]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [37]:
epoch_count = 100

In [38]:
train_count=len(glob.glob(train_path+'/**/*.png'))
test_count=len(glob.glob(test_path+'/**/*.png'))

In [39]:
print(train_count,test_count)

1533 657


In [40]:
#Model training and saving best model

best_accuracy=0.0

for epoch in range(epoch_count):
    
    #Evaluation and training on training dataset
    model.train()
    train_accuracy=0.0
    train_loss=0.0
    
    for i, (images,labels) in enumerate(load_train):
        if torch.cuda.is_available():
            images=Variable(images.cuda())
            labels=Variable(labels.cuda())
            
        optimizer.zero_grad()
        
        outputs=model(images)
        loss=loss_fn(outputs,labels)
        loss.backward()
        optimizer.step()
        
        
        train_loss+= loss.cpu().data*images.size(0)
        _,prediction=torch.max(outputs.data,1)
        
        train_accuracy+=int(torch.sum(prediction==labels.data))
        
    train_accuracy=train_accuracy/train_count
    train_loss=train_loss/train_count
    
    
    # Evaluation on testing dataset
    model.eval()
    
    test_accuracy=0.0
    for i, (images,labels) in enumerate(load_test):
        if torch.cuda.is_available():
            images=Variable(images.cuda())
            labels=Variable(labels.cuda())
            
        outputs=model(images)
        _,prediction=torch.max(outputs.data,1)
        test_accuracy+=int(torch.sum(prediction==labels.data))
    
    test_accuracy=test_accuracy/test_count
    
    
    print('Epoch: '+str(epoch)+' Train Loss: '+str(train_loss)+' Train Accuracy: '+str(train_accuracy)+' Test Accuracy: '+str(test_accuracy))
    
    #Save the best model
    if test_accuracy>best_accuracy:
        torch.save(model.state_dict(),'best_checkpoint.model')
        best_accuracy=test_accuracy

Epoch: 0 Train Loss: tensor(1.5387) Train Accuracy: 0.45596868884540115 Test Accuracy: 0.4611872146118721
Epoch: 1 Train Loss: tensor(0.9639) Train Accuracy: 0.6001304631441617 Test Accuracy: 0.4459665144596651
Epoch: 2 Train Loss: tensor(0.8324) Train Accuracy: 0.6797129810828441 Test Accuracy: 0.6940639269406392
Epoch: 3 Train Loss: tensor(0.7445) Train Accuracy: 0.7051532941943901 Test Accuracy: 0.6560121765601218
Epoch: 4 Train Loss: tensor(0.6821) Train Accuracy: 0.7188519243313763 Test Accuracy: 0.7427701674277016
Epoch: 5 Train Loss: tensor(0.6707) Train Accuracy: 0.7292889758643183 Test Accuracy: 0.7229832572298326
Epoch: 6 Train Loss: tensor(0.6177) Train Accuracy: 0.7397260273972602 Test Accuracy: 0.7229832572298326
Epoch: 7 Train Loss: tensor(0.6180) Train Accuracy: 0.7488584474885844 Test Accuracy: 0.7366818873668188
Epoch: 8 Train Loss: tensor(0.5563) Train Accuracy: 0.7860404435746902 Test Accuracy: 0.7321156773211568
Epoch: 9 Train Loss: tensor(0.5508) Train Accuracy: 0.

Epoch: 78 Train Loss: tensor(0.1490) Train Accuracy: 0.9791258969341161 Test Accuracy: 0.7884322678843226
Epoch: 79 Train Loss: tensor(0.1402) Train Accuracy: 0.9778212654924984 Test Accuracy: 0.7808219178082192
Epoch: 80 Train Loss: tensor(0.1358) Train Accuracy: 0.9817351598173516 Test Accuracy: 0.7534246575342466
Epoch: 81 Train Loss: tensor(0.1436) Train Accuracy: 0.9804305283757339 Test Accuracy: 0.7808219178082192
Epoch: 82 Train Loss: tensor(0.1340) Train Accuracy: 0.9804305283757339 Test Accuracy: 0.7747336377473364
Epoch: 83 Train Loss: tensor(0.1325) Train Accuracy: 0.9843444227005871 Test Accuracy: 0.7838660578386606
Epoch: 84 Train Loss: tensor(0.1311) Train Accuracy: 0.9843444227005871 Test Accuracy: 0.791476407914764
Epoch: 85 Train Loss: tensor(0.1320) Train Accuracy: 0.9836921069797782 Test Accuracy: 0.786910197869102
Epoch: 86 Train Loss: tensor(0.1283) Train Accuracy: 0.9856490541422048 Test Accuracy: 0.7701674277016742
Epoch: 87 Train Loss: tensor(0.1290) Train Accur

In [41]:
print("The best accuracy is " + str(best_accuracy*100) + " %")

The best accuracy is 79.75646879756468 %


In [42]:
pred_path = r'test/test'

In [43]:
checkpoint=torch.load('best_checkpoint.model')
model=ConvNet(num_classes=4)
model.load_state_dict(checkpoint)
model.eval()

ConvNet(
  (conv1): Conv2d(3, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU()
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(10, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu2): ReLU()
  (conv3): Conv2d(24, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU()
  (fc): Linear(in_features=32768, out_features=4, bias=True)
)

In [44]:
transformer_pred = transforms.Compose([
    transforms.Resize((64,64)),
    transforms.ToTensor() ])

In [45]:
from PIL import Image
def prediction(img_path,transformer_pred):
    
    image=Image.open(img_path)
    
    image_tensor=transformer_pred(image).float()
    
    
    image_tensor=image_tensor.unsqueeze_(0)
    
    if torch.cuda.is_available():
        image_tensor.cuda()
        
    input=Variable(image_tensor)
    
    
    output=model(input)
    
    index=output.data.numpy().argmax()
    
    pred=classes[index]
    
    return pred

In [46]:
images_path=glob.glob(pred_path+'/*.png')
print(images_path)

['test/test\\10001.png', 'test/test\\10002.png', 'test/test\\10003.png', 'test/test\\10004.png', 'test/test\\10005.png', 'test/test\\10006.png', 'test/test\\10007.png', 'test/test\\10008.png', 'test/test\\10009.png', 'test/test\\10010.png', 'test/test\\10011.png', 'test/test\\10012.png', 'test/test\\10013.png', 'test/test\\10014.png', 'test/test\\10015.png', 'test/test\\10016.png', 'test/test\\10017.png', 'test/test\\10018.png', 'test/test\\10019.png', 'test/test\\10020.png', 'test/test\\10021.png', 'test/test\\10022.png', 'test/test\\10023.png', 'test/test\\10024.png', 'test/test\\10025.png', 'test/test\\10026.png', 'test/test\\10027.png', 'test/test\\10028.png', 'test/test\\10029.png', 'test/test\\10030.png', 'test/test\\10031.png', 'test/test\\10032.png', 'test/test\\10033.png', 'test/test\\10034.png', 'test/test\\10035.png', 'test/test\\10036.png', 'test/test\\10037.png', 'test/test\\10038.png', 'test/test\\10039.png', 'test/test\\10040.png', 'test/test\\10041.png', 'test/test\\100

In [47]:
pred_dict={}
pred_dict['Id'] = 'Type'

for i in images_path:
    pred_dict[i[10:15]]=prediction(i,transformer)

In [48]:
pred_dict

{'Id': 'Type',
 '10001': 'Connective',
 '10002': 'Connective',
 '10003': 'Immune',
 '10004': 'Immune',
 '10005': 'Connective',
 '10006': 'Connective',
 '10007': 'Connective',
 '10008': 'Connective',
 '10009': 'Connective',
 '10010': 'Connective',
 '10011': 'Connective',
 '10012': 'Connective',
 '10013': 'Immune',
 '10014': 'Immune',
 '10015': 'Connective',
 '10016': 'Connective',
 '10017': 'Connective',
 '10018': 'Immune',
 '10019': 'Immune',
 '10020': 'Connective',
 '10021': 'Connective',
 '10022': 'Connective',
 '10023': 'Connective',
 '10024': 'Connective',
 '10025': 'Connective',
 '10026': 'Connective',
 '10027': 'Connective',
 '10028': 'Immune',
 '10029': 'Connective',
 '10030': 'Connective',
 '10031': 'Connective',
 '10032': 'Connective',
 '10033': 'Connective',
 '10034': 'Connective',
 '10035': 'Connective',
 '10036': 'Connective',
 '10037': 'Connective',
 '10038': 'Connective',
 '10039': 'Connective',
 '10040': 'Connective',
 '10041': 'Connective',
 '10042': 'Connective',
 '100

In [49]:
import csv
a_file = open("pred.csv", "w")

writer = csv.writer(a_file)
for key, value in pred_dict.items():
    writer.writerow([key, value])

a_file.close()