In this notebook, we perform the steps below to understand the modeling challenges:
1. Create custom data loader. 
    
    This is required to read each image id from train.csv, load the corresponding image, and transform it. We also remap the class labels to a contiguous range starting at 0, because the raw landmark ids can be larger than the number of unique classes.

2. Custom function to visualize images of one batch
3. Function to create pre-trained model
4. Training loop

Due to the huge amount of data, it is difficult to complete training for even one epoch. We probably need some data cleaning to reduce the number of images being sent to training.

In [None]:
!pip install torchsummary

In [None]:
!ls -l /kaggle/working

In [None]:
import pandas as pd
import numpy as np
import random
import math

import os
import tqdm

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from IPython.display import display

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.models import vgg19

from torchsummary import summary

from pathlib import Path
import PIL
from PIL import Image





In [None]:
def seed_all(seed):
    """Seed every random number generator used in this notebook.

    Seeds PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, then switches cuDNN to deterministic mode so
    that repeated runs of an experiment are repeatable.

    Args:
        seed: Integer seed. ``None`` falls back to the default seed 10; any
            integer, including 0, is used as given.
    """
    # Only a missing seed triggers the fallback. The previous `if not seed`
    # check silently replaced the perfectly valid seed 0 with 10.
    if seed is None:
        seed = 10

    print("[ Using Seed : ", seed, " ]")

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # seeds every GPU, current device included
    np.random.seed(seed)
    random.seed(seed)
    # Trade cuDNN auto-tuning speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_all(100)

In [None]:
def seed_worker(worker_id):
    """Seed NumPy and ``random`` from the current PyTorch seed.

    Written to be passed as ``worker_init_fn`` to a ``DataLoader``.

    Args:
        worker_id: Worker index supplied by the caller; not used here.
    """
    # Reduce the PyTorch seed to 32 bits before handing it to the other generators.
    worker_seed = torch.initial_seed() % 2**32
    # The notebook imports numpy as `np`; the bare name `numpy` is undefined
    # and raised NameError as soon as this function was called.
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [None]:
# Locations of the competition data: image folders and the two CSV files.
data_dir = Path("../input/landmark-recognition-2021/")
train_dir = Path(data_dir, "train")
test_dir = Path(data_dir, "test")
train_file = Path(data_dir, "train.csv")
sub_file = Path(data_dir, "sample_submission.csv")

In [None]:
train_df = pd.read_csv(train_file)
sub_df = pd.read_csv(sub_file)

In [None]:
display(train_df.head())
display(sub_df.head())

In [None]:
!ls -l /kaggle/working/

In [None]:
## Raw landmark_id values are larger than the number of classes, which leads to an
## error while training the pytorch model, so re-index them in order of first appearance.
landmark_id_map = {
    raw_id: class_idx
    for class_idx, raw_id in enumerate(train_df['landmark_id'].unique())
}
train_df = train_df.assign(landmark_id=train_df['landmark_id'].map(landmark_id_map))

In [None]:
## Building a custom data loader to load the data in batches for pytorch

class LandMarkData(Dataset):
    """Dataset that loads images listed in a CSV file on demand.

    Every item is the list ``[image, label, image_id]``. An image is read from
    ``data_dir/<c0>/<c1>/<c2>/<image_id>.jpg`` where ``c0``, ``c1`` and ``c2``
    are the first three characters of the image id.
    """

    def __init__(self, data_file, data_dir, transform=None, data_type="train"):
        """
        data_file str: csv file whose first column is the image id and whose
            second column is its class
        data_dir str: directory where the images are present
        transform callable: optional transform applied to every loaded image
        data_type str: "train" re-indexes the ``landmark_id`` column,
            "test" leaves the second column untouched
        """

        self.data_file = pd.read_csv(data_file)
        ## landmark_id value > number of classes, this leads to error during training pytorch model
        if data_type == "train":
            self.landmark_id_map = {lid:i for i, lid in enumerate(self.data_file.landmark_id.unique())}
            self.data_file['landmark_id'] = self.data_file['landmark_id'].map(self.landmark_id_map)
        elif data_type == "test":
            print("Test data will not have landmarkd id, hence no mapping")
        self.data_dir = data_dir
        self.transform = transform

    def __len__(self):
        """Number of rows in the csv file."""
        return len(self.data_file)

    def __getitem__(self, idx):
        """Return ``[image, class, image_id]`` for row ``idx`` of the csv file."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_id = self.data_file.iloc[idx, 0]
        img_class = self.data_file.iloc[idx, 1]
        img_path = os.path.join(self.data_dir, img_id[0], img_id[1], img_id[2], f'{img_id}.jpg')
        # Decode inside a context manager so the file handle is released, and
        # force three channels so non-RGB files still fit the model input.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")
        if self.transform is not None:
            # Use the transform given to this instance. The bare name
            # `transform` only worked while a same-named global existed.
            img = self.transform(img)
        return [img, img_class, img_id]

In [None]:
## Basic preprocessing: take the central 224x224 region, then convert to a tensor.
## NOTE(review): there is no Resize before the crop and no Normalize step although
## the model below is pre-trained - confirm that this is intentional.

transform = transforms.Compose(
    [
        transforms.CenterCrop(224),
        transforms.ToTensor(),
    ]
)

In [None]:
train_data = LandMarkData(train_file, train_dir, transform, "train")
test_data = LandMarkData(sub_file, test_dir, transform, "test")

In [None]:
## Manually Checking if dataloader and transforms are getting applied or not.
## All images should be 224*224
print(f"Image Shape               || Image Class|| Image Id")
print("-"*60)
samples = train_df['id'].sample(10, random_state=100).index
for sample in samples:
    img_sample = train_data[sample]
    print(f"{img_sample[0].shape} || {img_sample[1]}      || {img_sample[2]}")

In [None]:
## Manually Checking if dataloader and transforms are getting applied or not.
## All images should be 224*224
print(f"Image Shape               || Image Class|| Image Id")
print("-"*60)
samples = sub_df['id'].sample(10, random_state=100).index
for sample in samples:
    img_sample = test_data[sample]
    print(f"{img_sample[0].shape} || {img_sample[1]} || {img_sample[2]}")

In [None]:
## Taking 20% as valid data
valid_size = 0.2
batch_size = 8

In [None]:
## Carve a validation subset out of the training indices. This is a plain random split;
# a better split or augmentation is needed because many landmarks have very few images.
num_train = len(train_data)
indices = [*range(num_train)]
np.random.seed(100)  # fixed seed so the same split is produced on every run
np.random.shuffle(indices)
split = int(num_train * valid_size)
valid_idx = indices[:split]
train_idx = indices[split:]
assert num_train == len(train_idx) + len(valid_idx)

In [None]:
# Each sampler draws from its own index list, so the train and validation
# loaders read different rows of the same train_data dataset.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)
# NOTE(review): with num_workers=0 the data is presumably loaded in the main process,
# in which case worker_init_fn=seed_worker is never invoked - confirm against the DataLoader docs.
train_loader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, num_workers=0, worker_init_fn=seed_worker)
valid_loader = DataLoader(train_data, batch_size=batch_size, sampler=valid_sampler, num_workers=0, worker_init_fn=seed_worker)
# No sampler is passed for the test set.
test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=0, worker_init_fn=seed_worker)

In [None]:
def plot_images(loader, num_images=5):
    """Plot a fixed-seed random selection of images from the first batch of ``loader``.

    Args:
        loader: Iterable yielding ``(images, label, img_id)`` batches, where
            ``images`` is a tensor laid out as (batch, channel, height, width).
        num_images: Number of images to show, five per row. Capped at the
            number of images in the batch.
    """
    images, label, img_id = next(iter(loader))
    # convert to numpy and transpose as (Batch Size, Height, Width, Channel) as needed by matplotlib
    images = images.numpy().transpose(0, 2, 3, 1)

    # A batch may hold fewer images than requested; sampling without
    # replacement would then raise ValueError, so cap the request.
    num_images = min(num_images, len(label))

    num_cols = 5
    num_rows = max(1, math.ceil(num_images / num_cols))

    # Fixed seed so the same images of a given batch are picked on every call.
    np.random.seed(100)
    indices = np.random.choice(range(len(label)), size=num_images, replace=False)

    width = 20
    height = 5*num_rows
    plt.figure(figsize=(width, height))
    for i, idx in enumerate(indices):
        plt.subplot(num_rows, num_cols, i + 1)
        plt.imshow(images[idx])
        plt.title(f'label: {label[idx]}\n img_id: {img_id[idx]}')
        plt.axis("off")
    plt.show()


In [None]:
#plotting one batch of images from train
plot_images(train_loader, batch_size)

In [None]:
#plotting one batch images from valid
plot_images(valid_loader, batch_size)

In [None]:
plot_images(test_loader, batch_size)

Plotting a batch from the test set shows that, out of 8 images, probably only 4 are landmarks; the rest are human faces and fish. This suggests that a lot of the images in the test data are out of sample. Hence, even before we predict on the test set, we need to run some similarity check and avoid predicting a landmark for out-of-sample images.

In [None]:
def get_pretrained_model(model_name=vgg19, num_class=10, use_gpu=False):
    """Build a pre-trained network whose last classifier layer is replaced.

    Args:
        model_name: Constructor called as ``model_name(pretrained=True)``; the
            result must expose ``features`` and an indexable ``classifier``.
        num_class: Number of outputs of the new final layer.
        use_gpu: Move the network to the GPU when True.

    Returns:
        The network with frozen ``features`` parameters and a new
        ``classifier[6]`` linear layer.
    """
    network = model_name(pretrained=True)

    # Freeze the feature extractor; only the classifier keeps training.
    for weight in network.features.parameters():
        weight.requires_grad = False

    # Replace the final layer with one sized for `num_class` outputs.
    head_inputs = network.classifier[6].in_features
    network.classifier[6] = nn.Linear(head_inputs, num_class)

    if use_gpu:
        network.cuda()
    return network

In [None]:
use_cuda = torch.cuda.is_available()

In [None]:
model = get_pretrained_model(vgg19, train_df.landmark_id.nunique(), use_cuda)

In [None]:
## Checking model summary using torchsummary
summary(model, (3,224,224))

In [None]:
# Using Vanilla CrossEntropyLoss, it would be better to give the weight for each class due to high imbalance
criteria = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
loaders = {'train': train_loader, 'valid': valid_loader, 'test': test_loader}

In [None]:
save_path = "/kaggle/working/"

In [None]:
from time import time

In [None]:
def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path, num_batch=1, verbose=False):
    """Train ``model`` on a capped number of batches per epoch and keep the best checkpoint.

    Args:
        n_epochs: Number of epochs to run.
        loaders: Dict whose ``'train'`` and ``'valid'`` entries yield
            ``(data, target, img_id)`` batches.
        model: Network to train; it is updated in place.
        optimizer: Optimizer stepping the parameters of ``model``.
        criterion: Loss called as ``criterion(output, target)``.
        use_cuda: Move every batch to the GPU when True.
        save_path: File the ``state_dict`` is written to whenever the
            validation loss improves.
        num_batch: Maximum number of batches used per phase in every epoch.
            At least one batch is processed, and a loader shorter than
            ``num_batch`` is simply used in full.
        verbose: Print the running loss after every batch.

    Returns:
        The same ``model`` object, holding the weights of the last epoch.
    """
    # initialize tracker for minimum validation loss
    valid_loss_min = float("inf")

    for epoch in range(1, n_epochs + 1):
        start_time = time()

        ###################
        # train the model #
        ###################
        model.train()
        train_loss = _run_phase(loaders['train'], model, criterion, use_cuda,
                                num_batch, verbose, "Train", optimizer)

        if use_cuda:
            torch.cuda.empty_cache()

        ######################
        # validate the model #
        ######################
        model.eval()
        # No gradients are needed for validation; skipping them saves memory.
        with torch.no_grad():
            valid_loss = _run_phase(loaders['valid'], model, criterion, use_cuda,
                                    num_batch, verbose, "Valid")

        time_taken = time() - start_time
        # print training/validation statistics
        print('Epoch: {} \t Time: {:.2f} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            time_taken,
            train_loss,
            valid_loss
            ))

        # save the model whenever the validation loss has decreased
        if valid_loss < valid_loss_min:
            if verbose:
                print(f"Valid loss reduced from {valid_loss_min :.6f} to {valid_loss :.6f}, saving model")
            valid_loss_min = valid_loss
            torch.save(model.state_dict(), save_path)

    return model


def _run_phase(loader, model, criterion, use_cuda, num_batch, verbose, tag, optimizer=None):
    """Run at most ``num_batch`` batches and return the mean loss per image.

    The model is updated when ``optimizer`` is given; otherwise the batches
    are only evaluated.
    """
    loss_sum = 0.0
    images_used = 0
    for batch_idx, (data, target, img_id) in enumerate(loader):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()

        if optimizer is not None:
            optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, target)
        if optimizer is not None:
            loss.backward()
            optimizer.step()

        # Weight the batch loss by the batch size and count the images actually
        # seen, so that batches of different sizes are averaged correctly.
        # Assumes `criterion` averages over the batch - TODO confirm for custom losses.
        loss_sum += loss.item() * data.size(0)
        images_used += data.size(0)
        if verbose:
            print(f"idx: {batch_idx} {tag} Loss:{loss_sum / images_used : .6f}")
        # Stop once exactly `num_batch` batches have been processed.
        if batch_idx + 1 >= num_batch:
            break
    # max(..., 1) keeps an empty loader from dividing by zero.
    return loss_sum / max(images_used, 1)

In [None]:
import tqdm

In [None]:
def predict(loaders, model, use_cuda, landmark_reverse_map):
    """Predict the most likely landmark and its confidence for every test image.

    Args:
        loaders: Dict whose ``'test'`` entry yields ``(data, _, img_id)`` batches.
        model: Trained network; softmax is taken over dim 1 of its output.
        use_cuda: Move every batch to the GPU when True.
        landmark_reverse_map: Mapping from predicted class index to the value
            written to the output.

    Returns:
        DataFrame with the columns ``id`` and ``landmarks``, where
        ``landmarks`` is the string ``"<mapped label> <confidence>"`` with the
        confidence rounded to 6 decimals.
    """
    # set the module to evaluation mode
    model.eval()
    sf = nn.Softmax(dim=1)
    img_id_list = []
    confidence_list = []
    label_list = []

    # Inference only: disable autograd so no graph is kept for each batch.
    with torch.no_grad():
        for data, _, img_id in tqdm.tqdm(loaders['test']):
            # move to GPU
            if use_cuda:
                data = data.cuda()
            # forward pass, then keep the highest probability and its class index
            probabilities = sf(model(data))
            confidence, label = torch.max(probabilities, dim=1)

            img_id_list.extend(list(img_id))
            confidence_list.extend(confidence.cpu().numpy().tolist())
            label_list.extend(label.cpu().numpy().tolist())

    predict_df = pd.DataFrame({'id': img_id_list,
                               'landmarks': label_list,
                               'conf': confidence_list})
    predict_df['landmarks'] = predict_df['landmarks'].map(landmark_reverse_map)
    predict_df['landmarks'] = predict_df['landmarks'].astype(str) +" " + predict_df['conf'].round(6).astype(str)

    # The confidence now lives inside the `landmarks` string.
    predict_df = predict_df.drop(columns="conf")
    return predict_df
        

        
        

In [None]:
# data is huge, so each epoch is capped at about 1000 batches (num_batch=1000) and we run 20 epochs, with an assumption that the model will see each class at least once

num_epochs = 20
model_transfer = train(num_epochs, loaders, model, optimizer, 
                      criteria, use_cuda, os.path.join(save_path,'model_transfer.pt'),num_batch=1000, verbose=False)

In [None]:
# Loading the best model
model.load_state_dict(torch.load(os.path.join(save_path,'model_transfer.pt')))

In [None]:
# Invert the dataset's id mapping: mapped class index -> original landmark id.
landmark_reverse_map = {
    class_idx: raw_id
    for raw_id, class_idx in train_data.landmark_id_map.items()
}

In [None]:
out = predict(loaders, model, use_cuda, landmark_reverse_map)

In [None]:
display(out.tail())

In [None]:
# index=False keeps the DataFrame's row index out of the file, so the
# submission holds only the `id` and `landmarks` columns.
out.to_csv("/kaggle/working/submission.csv", index=False)