# Dog vs Cat Classification

### Extract all the image files from zip

In [None]:
# !unzip ../input/dogs-vs-cats-redux-kernels-edition/train.zip
# !unzip ../input/dogs-vs-cats-redux-kernels-edition/test.zip

### DataFrame preparation

In [None]:
#import all the tools we need

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms
from pytorch_lightning import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import random
import time
import os
import zipfile
from PIL import Image
import numpy as np

In [None]:
# Folders holding the extracted images
train_dir = './train'
test_dir = './test'

# One row per file found in each folder
train_df = pd.DataFrame(os.listdir(train_dir), columns=['filename'])
test_df = pd.DataFrame(os.listdir(test_dir), columns=['filename'])

# The first three characters of a training file name give its class;
# encode them as 1 (dog) and 0 (cat)
train_df['label'] = train_df['filename'].str.slice(stop=3).map({'dog': 1, 'cat': 0})

# Prefix every file name with its folder so the column holds usable paths
train_df['filename'] = train_df['filename'].map(lambda name: os.path.join(train_dir, name))
test_df['filename'] = test_df['filename'].map(lambda name: os.path.join(test_dir, name))

# Use only 2000 images for testing first, if the model is good,
# then change back to full dataset
#TRAIN_SAMPLES = 2000
TRAIN_SAMPLES = len(train_df)

# Randomly keep TRAIN_SAMPLES rows of the training frame
train_df = train_df.sample(n=TRAIN_SAMPLES)

# Hold out 12.5% of the rows as the validation frame
train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=42)

# Preview the first rows of the training frame
train_df.head()

In [None]:
# Report the number of rows in the training and validation frames
print('Training set images: {}, Validation set image: {}'.format(len(train_df), len(val_df)))

# Have a look at the files

In [None]:
def show_6_photos(dataframe):
    """Display six randomly sampled images from `dataframe`.

    Each image path is read from the `filename` column and drawn in its own
    3x3-inch figure.
    """
    for image_path in dataframe.sample(6)['filename'].tolist():
        pixels = plt.imread(image_path)
        plt.subplots(figsize=(3, 3))
        plt.imshow(pixels)
        plt.show()
  
show_6_photos(dataframe=train_df)

# Create Custom datasets

- Create two transformers: one for the training set, and another for the validation and test sets.

In [None]:
# Pre-processing pipelines, keyed by the split they are applied to.
# 'train' adds random cropping and flipping; 'val' (also used for the test
# set) resizes and centre-crops deterministically. Both end with the same
# tensor conversion and per-channel normalisation.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(size=224),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]),
    'val': transforms.Compose([
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]),
}

In [None]:
# Custom dataset, receive a dataframe with complete file path and label(not in test data)
class image_set(Dataset):
    """Dataset that loads images listed in a dataframe.

    Args:
        dataframe: frame whose first column holds the image path and, unless
            `test` is set, whose second column holds the label.
        transform: optional callable applied to the loaded PIL image.
        test: when true, items are returned without a label.
    """

    def __init__(self, dataframe, transform=None, test=False):
        self.dataframe = dataframe
        self.transform = transform
        self.test = test

    def __getitem__(self, index):
        """Return the image at row `index`, plus its label unless in test mode.

        The label is wrapped in a one-element numpy array.
        """
        path = self.dataframe.iloc[index, 0]
        # Force three channels so grayscale/RGBA/CMYK files do not break the
        # 3-channel normalisation or batch collation. `convert` loads the
        # pixel data, so the file can be closed right away.
        with Image.open(path) as img:
            image = img.convert('RGB')
        if self.transform:
            image = self.transform(image)
        if self.test:
            return image
        label = self.dataframe.iloc[index, 1]
        return image, np.array([label])

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return self.dataframe.shape[0]

In [None]:
#  Create dataset and dataloader

train_set = image_set(train_df, transform=data_transforms['train'])
val_set = image_set(val_df, transform=data_transforms['val'])
test_set = image_set(test_df, transform=data_transforms['val'], test=True)

BATCH_SIZE = 32

# Only the training loader shuffles. The validation and test loaders keep the
# dataframe row order, so outputs collected from them line up with the rows
# (and therefore the file names) they were computed from.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

In [None]:
# Setting up the GPU
# `is_available` must be called: the bare function object is always truthy,
# which would select 'cuda:0' even on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Main training function
def train_model(model, cost_function, optimizer, num_epochs=5):
    """Train `model` and evaluate it on the validation data after every epoch.

    Batches are read from the module-level `train_loader` and `val_loader`
    and moved to the module-level `device`.

    Args:
        model: network to optimise; called as `model(x)` on each batch.
        cost_function: callable `(outputs, targets) -> loss`. Targets are cast
            to the type of the outputs before the call.
        optimizer: optimiser stepped once per training batch.
        num_epochs: number of passes over `train_loader`.

    Returns:
        Tuple `(train_losses, val_losses, train_acc, val_acc)` of lists with
        one entry per epoch: the mean batch loss, and the value returned by
        the accuracy metric's `compute()`.
    """
    # Per-epoch history
    train_losses = []
    val_losses = []
    train_acc = []
    val_acc = []

    # Metrics objects, accumulated over an epoch and reset at its start
    train_acc_object = metrics.Accuracy(compute_on_step=False)
    val_acc_object = metrics.Accuracy(compute_on_step=False)

    for epoch in range(num_epochs):
        print('-'*20)
        print('Start training {}/{}'.format(epoch+1, num_epochs))
        print('-'*20)
        train_acc_object.reset()
        val_acc_object.reset()

        # Training pass
        model.train()
        epoch_losses = []
        for x, y in train_loader:
            # Clear the grad
            optimizer.zero_grad()

            # Put x and y to GPU and get predictions
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            targets = y.type_as(outputs)

            # Store the loss
            loss = cost_function(outputs, targets)
            epoch_losses.append(loss.item())

            # Compute and apply gradients
            loss.backward()
            optimizer.step()

            # Update the metric with tensors detached from the autograd graph
            train_acc_object(outputs.detach().cpu(), targets.cpu())

        # Validation pass: no gradients are needed, so skip graph construction
        model.eval()
        epoch_val_losses = []
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                targets = y.type_as(outputs)
                loss = cost_function(outputs, targets)
                epoch_val_losses.append(loss.item())
                val_acc_object(outputs.cpu(), targets.cpu())

        # Update the loss history
        mean_loss = np.mean(epoch_losses)
        mean_val_loss = np.mean(epoch_val_losses)
        train_losses.append(mean_loss)
        val_losses.append(mean_val_loss)

        # Update the accuracy history
        epoch_t_acc = train_acc_object.compute()
        epoch_v_acc = val_acc_object.compute()
        train_acc.append(epoch_t_acc)
        val_acc.append(epoch_v_acc)

        # Print the result
        print('loss:{:.3f}, acc:{:.3f}, val_loss:{:.3f}, val_acc:{:.3f}'.format(mean_loss,
                                                                                 epoch_t_acc,
                                                                                 mean_val_loss,
                                                                                 epoch_v_acc))

    print('Finish training.')
    return train_losses, val_losses, train_acc, val_acc

# Setting up model, cost function, optimizer and learning rate scheduler

Model
- I will use resnet18 as the base model and fine-tune it.

In [None]:
class net(nn.Module):
    """Binary classifier head stacked on top of a backbone network.

    The backbone passed as `resnet` must emit 1000 features per sample; they
    go through ReLU, a 1000->512 linear layer, ReLU, a 512->1 linear layer
    and finally a sigmoid.
    """

    def __init__(self, resnet):
        super().__init__()
        self.resnet = resnet
        self.linear1 = nn.Linear(in_features=1000, out_features=512)
        self.linear2 = nn.Linear(in_features=512, out_features=1)

    def forward(self, x):
        """Return one sigmoid-activated score per input sample."""
        backbone_out = F.relu(self.resnet(x))
        hidden = F.relu(self.linear1(backbone_out))
        score = self.linear2(hidden)
        return torch.sigmoid(score)

In [None]:
# Load the pretrained resnet18 backbone and freeze all of its parameters,
# so only the layers added on top of it are trained
res = models.resnet18(pretrained=True)
res.requires_grad_(False)

# Wrap the backbone with the single-output head (binary classification)
# and move the whole model to the selected device
model_final = net(resnet=res).to(device)

# Binary cross-entropy on the sigmoid outputs
cost_function = nn.BCELoss()

# Adam over the parameters that still require gradients
optimizer_ft = optim.Adam(
    list(filter(lambda p: p.requires_grad, model_final.parameters())),
    lr=0.01,
)

# learning rate scheduler (currently disabled)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=10, gamma=0.01)

# Number of training epochs
EPOCHS = 20

## Start training

In [None]:
# Run the training loop and keep the per-epoch loss and accuracy history
train_losses, val_losses, train_acc, val_acc = train_model(
    model=model_final,
    cost_function=cost_function,
    optimizer=optimizer_ft,
    num_epochs=EPOCHS,
)

# Plot the result

In [None]:
def plot_result(train_losses, val_losses, train_acc, val_acc):
    """Plot the loss curves (top panel) and accuracy curves (bottom panel)."""
    _, axes = plt.subplots(2, 1, figsize=(7, 6))
    loss_ax, acc_ax = axes

    # Top panel: training vs validation loss
    loss_ax.plot(train_losses, label='train_losses')
    loss_ax.plot(val_losses, label='val_losses')

    # Bottom panel: training vs validation accuracy
    acc_ax.plot(train_acc, label='train_acc', color='brown')
    acc_ax.plot(val_acc, label='val_acc', color='pink')

    for panel in (loss_ax, acc_ax):
        panel.legend()
    plt.show()

In [None]:
plot_result(train_losses=train_losses,
            val_losses=val_losses,
            train_acc=train_acc,
            val_acc=val_acc)

# Make prediction on test data

In [None]:
def predict_on_loader(test_loader, model):
    """Run `model` over every batch of `test_loader` and collect the outputs.

    Args:
        test_loader: iterable yielding input tensors (no labels).
        model: the module to evaluate. Inputs are moved to the device of its
            first parameter (CPU when it has no parameters).

    Returns:
        A numpy array with the model outputs of all batches concatenated
        along the first dimension, in the order the loader yielded them.
        An empty loader gives an empty array.
    """
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    # Inference must run in eval mode and without autograd; the previous
    # training/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    batch_outputs = []
    try:
        with torch.no_grad():
            for x in test_loader:
                batch_outputs.append(model(x.to(model_device)).cpu())
    finally:
        model.train(was_training)

    if not batch_outputs:
        return torch.tensor([]).numpy()
    #predictions = torch.where(predictions>0.5,1,0) # threshold = 0.5
    return torch.cat(batch_outputs).numpy()

In [None]:
predictions = predict_on_loader(test_loader,model_final)

In [None]:
predictions

In [None]:
submission = pd.read_csv('../input/dogs-vs-cats-redux-kernels-edition/sample_submission.csv')
submission.label = predictions
submission.to_csv('submission.csv')

# Thank you very much