# Bengali.AI Training Notebook

By Team RausNaus, consisting of Edwin Wenink, Freek van den Bergh and Jordy Naus

## About this notebook

- This notebook offers an adjusted (pretrained) ResNet-50 network with the ability to freeze layers
- Features: conversion to 224x224 RGB input, Cropping, Cutout, Adam optimizer, learning rate scheduler.
- If PRETRAINED=True, this notebook requires an internet connection (to download the ImageNet weights)
- Otherwise, set PRETRAINED=False and optionally set USE_LOCAL_MODEL=True to load the model weights located at MODEL_PATH
- Logging using "Weights and Biases" requires an API key
- This committed notebook performs only one training epoch for illustration purposes. Adjust in the "Training parameters" section.

## Global Variables

In [None]:
# Height and width (in pixels) used to reshape each flattened image row of the input data
IMG_HEIGHT = 137
IMG_WIDTH = 236

# Directory containing train.csv (the label file)
DATA_PATH = "../input/bengaliai-cv19/"
# Directory containing the train_image_data_<n>.feather image files
FEATHER_PATH = "../input/bengaliaicv19feather/"

# SET THESE BEFORE RUNNING

# If USE_LOCAL_MODEL, then the model weights at MODEL_PATH will be loaded into the network
USE_LOCAL_MODEL = False
MODEL_PATH = "/kaggle/input/pretrainednet/rausnaus_resnet50_1584356517.pth"
# If PRETRAINED, weights pretrained on ImageNet will be loaded (USE_LOCAL_MODEL overrides this)
PRETRAINED = True
# Logging uses Weights and Biases (https://www.wandb.com/)
# If MONITOR, provide your own API key in WANDB_ID
MONITOR = False
WANDB_ID = '[ID GOES HERE]'
# If TRAINING is False, the training loop at the end of the notebook is skipped
TRAINING = True
# The first FREEZE_FROM_LAYER top-level children of the ResNet-50 are frozen (0 freezes nothing).
# NOTE(review): with torchvision's ResNet-50, 9 presumably freezes the whole backbone so that only
# the newly added classifier head is trained — TODO confirm against the printed layer list.
FREEZE_FROM_LAYER = 0 # e.g. 9 to train only the added "classifier" layers on top of ResNet50

## Imports

All packages required to run this notebook

In [None]:
!pip install torchtoolbox 

import numpy as np
import pandas as pd 

import random
import gc
import wandb
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
from torchvision import models
from torchtoolbox.transform import Cutout
from sklearn import metrics as metric
import cv2

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Install Weights and Biases in current session and login (Freek: idk of die hash mijn log-in is of dat ie algemeen is)
if (MONITOR):
    !pip install --upgrade wandb
    !wandb login WANDB_ID
    # Initialize wand logging
    wandb.init(project="bengali-ai")

# Data

## Data Transformation Classes

In [None]:
class CustomCropResize(object):
    """
    Transform that crops a 2D grayscale image to its content, pads the crop
    to a square with zeros, zeroes out low intensities and resizes to 224x224.
    """

    def __call__(self, image):
        """
        Apply the full crop / pad / denoise / resize pipeline to one image.

        `image` is converted with np.array and must be 2D (crop_image unpacks
        a two-element shape). Returns the result of cv2.resize at 224x224.
        """
        image_array = np.array(image)
        cropped_image_array = self.crop_image(image_array)
        return cv2.resize(self.remove_low_values(cropped_image_array), (224,224))

    def __repr__(self):
        return self.__class__.__name__ + '()'

    def remove_low_values(self, img, threshold=20):
        """
        Return a copy of `img` in which every value below `threshold` is 0
        (intended to reduce noise).
        """
        return np.where(img < threshold, 0, img)

    def boundary_box(self, img, original):
        """
        Return (row_top, row_bottom, column_left, column_right): the indices of
        the first and last row and column of `img` containing a non-zero entry.
        All four indices are inclusive.

        `original` is unused; it is kept so existing callers keep working.
        Raises IndexError when `img` has no non-zero entry at all.
        Source: https://www.kaggle.com/iafoss/image-preprocessing-128x128
        """
        # For any row/column containing >=1 True values, np.any returns True
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        # Select indices of the first and last row/column that have a non-zero element
        row_top, row_bottom = np.where(rows)[0][[0, -1]]
        column_left, column_right = np.where(cols)[0][[0, -1]]
        return row_top, row_bottom, column_left, column_right

    def crop_image(self, image, threshold=40):
        """
        Crop `image` to the bounding box of pixels brighter than `threshold`
        and zero-pad the shorter side so that the result is square.

        A 5-pixel border is discarded first. When no pixel exceeds the
        threshold the border-trimmed image is padded without cropping.
        """
        image = image[5:-5, 5:-5]
        mask = image > threshold
        # Guard: boundary_box cannot index into an empty result for blank images
        if mask.any():
            row_top, row_bottom, column_left, column_right = self.boundary_box(mask, image)
            # The box indices are inclusive, so the slice end needs +1 to keep
            # the last content row and column
            image = image[row_top:row_bottom + 1, column_left:column_right + 1]
        n_rows, n_cols = image.shape
        diff = abs(n_rows - n_cols) // 2
        # When the size difference is odd, one side gets the extra pixel
        fix = (n_rows + n_cols) % 2
        if n_rows > n_cols:
            padded_image = np.pad(image, [(0, 0), (diff, diff + fix)], mode='constant')
        else:
            padded_image = np.pad(image, [(diff, diff + fix), (0, 0)], mode='constant')
        return padded_image

In [None]:
class ToRGBArray(object):
    """
    Turns a single-channel image of shape (H, W) into a three-channel image of
    shape (H, W, 3) by copying the input into every channel.
    """

    def __call__(self, image):
        # Stack three copies along a new leading axis: (H, W) -> (3, H, W)
        channels_first = np.repeat(np.expand_dims(image, axis=0), 3, axis=0)
        # Move the channel axis to the end: (3, H, W) -> (H, W, 3)
        return np.moveaxis(channels_first, source=0, destination=-1)

    def __repr__(self):
        return f"{type(self).__name__}()"

## Dataset Class

In [None]:
class BengaliDataset(Dataset):
    """
    Dataset over one feather file of training images joined with their labels.

    Each item is a tuple (root, vowel, cons, img), where the three labels are
    the first three columns left after dropping `image_id` and `img` is the
    inverted, intensity-normalised image (optionally passed through `transform`).
    """

    def __init__(self, file_nr, labels, transform=None):
        """
        Args:
            file_nr: number of the `train_image_data_<file_nr>.feather` file
                under FEATHER_PATH to load.
            labels: DataFrame with an `image_id` column followed by the label
                columns. Assumes exactly three label columns (root, vowel,
                consonant, in that order) — verify against the caller.
            transform: optional callable applied to every image.
        """
        self.transform = transform

        self.df = labels.merge(pd.read_feather(f'{FEATHER_PATH}train_image_data_{file_nr}.feather'), on="image_id")
        self.df = self.df.drop(['image_id'], axis=1)
        # Everything after the three label columns is pixel data, one flattened image per row
        self.data = self.df.iloc[:, 3:].values.reshape(-1, IMG_HEIGHT, IMG_WIDTH).astype(np.uint8)

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        root, vowel, cons = self.df.iloc[index, :3]
        # Invert the image (presumably so that strokes become bright on a dark background)
        inverted = 255 - self.data[index]
        # Rescale so that the brightest pixel of the *inverted* image is 255.
        # The scale factor must come from the inverted image: using the maximum
        # of the raw image does not normalise and can wrap around in uint8.
        peak = int(inverted.max())
        if peak > 0:
            img = np.rint(inverted * (255.0 / peak)).astype(np.uint8)
        else:
            # Completely blank image: nothing to rescale
            img = inverted
        if self.transform:
            img = self.transform(img)
        return root, vowel, cons, img

## Function for Creating Train/Validation DataLoaders

In [None]:
def get_train_val_loaders(dataset, batch_size=64, shuffle=True, val_percentage=0.1):
    """
    Split `dataset` into a training part and a validation part and wrap each
    in a DataLoader.

    The first `int(val_percentage * len(dataset))` indices (after an optional
    shuffle with NumPy's global RNG) form the validation part; the rest is used
    for training. Both loaders draw their indices via SubsetRandomSampler.

    Returns:
        (train_loader, val_loader)
    """
    n_samples = len(dataset)
    order = list(range(n_samples))
    if shuffle:
        np.random.shuffle(order)

    n_val = int(val_percentage * n_samples)
    val_part, train_part = order[:n_val], order[n_val:]

    def _loader_for(subset):
        # One loader per index subset, sharing the same underlying dataset
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=SubsetRandomSampler(subset),
        )

    return _loader_for(train_part), _loader_for(val_part)

# Model

## Model Class

In [None]:
class Net(nn.Module):
    """
    Wraps a backbone network (one exposing a final `fc` layer, e.g. ResNet-50),
    replaces that final layer by a small shared head and adds three separate
    output layers.

    forward() returns (vowel_preds, root_preds, cons_preds) with 11, 168 and 7
    scores per sample respectively.
    """

    def __init__(self, model):
        super().__init__()
        n_features = model.fc.in_features

        # Backbone: every child of the given model except its last one
        self.model = nn.Sequential(*tuple(model.children())[:-1])

        # Shared head, first stage
        self.bn1 = nn.BatchNorm1d(n_features)
        self.drop1 = nn.Dropout(0.25)
        self.lin1 = nn.Linear(n_features, 512)
        self.relu = nn.ReLU(inplace=False)

        # Shared head, second stage
        self.bn2 = nn.BatchNorm1d(512)
        self.drop2 = nn.Dropout(0.5)

        # One output layer per prediction target
        self.fc1 = nn.Linear(512, 11)
        self.fc2 = nn.Linear(512, 168)
        self.fc3 = nn.Linear(512, 7)

    def forward(self, x):
        # Backbone features, flattened to one vector per sample
        features = self.model(x)
        flat = features.view(features.size(0), -1)

        # Shared head
        hidden = self.relu(self.lin1(self.drop1(self.bn1(flat))))
        hidden = self.drop2(self.bn2(hidden))

        # Three outputs computed from the same hidden representation
        return self.fc1(hidden), self.fc2(hidden), self.fc3(hidden)

## Load model and move to GPU

In [None]:
# Build the ResNet-50 backbone. ImageNet weights are only requested when no
# local checkpoint is going to overwrite them (USE_LOCAL_MODEL overrides PRETRAINED).
if PRETRAINED and not USE_LOCAL_MODEL:
    model = models.resnet50(pretrained=True)
else:
    model = models.resnet50()

# Freeze the first FREEZE_FROM_LAYER top-level children of the backbone
for layer_count, child in enumerate(model.children()):
    if layer_count < FREEZE_FROM_LAYER:
        for name, param in child.named_parameters():
            param.requires_grad = False
            print(f"Child {name} frozen")

# Change the final layers of our model:
model = Net(model)

# The checkpoints written by train() are state dicts of the wrapped Net, so the
# weights can only be loaded once the wrapper exists. They are loaded onto the
# CPU here; moving the model to its device is a separate, later step.
if USE_LOCAL_MODEL:
    model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))

print('-'*50)

for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name} is selected for finetuning")

In [None]:
print("Updated model:")
print(model)

In [None]:
# Prefer the first CUDA device when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Move the model to the selected device
model = model.to(device)

# Training

## Validation Function

In [None]:
def compute_weighted_metric(metric, preds, labels, weights, **kwargs):
    """
    Evaluate `metric(labels[i], preds[i], **kwargs)` for the three prediction
    targets (i = 0, 1, 2) and return the average of the three scores weighted
    by `weights`.
    """
    scores = [metric(labels[target], preds[target], **kwargs) for target in (0, 1, 2)]
    return np.average(scores, weights=weights)

def validate(model, loss, scheduler, val_loader):
    """
    Evaluate `model` on `val_loader`, step the scheduler and optionally log.

    For every batch the three losses (vowel, root, consonant) are computed with
    `loss`; predictions are the arg-max of each output. After the loop the
    weighted accuracy and macro recall (weights 1, 2, 1 for vowel, root,
    consonant) are computed and `scheduler.step` is called with the sum of the
    three average losses. When MONITOR is set, the results are sent to wandb.

    Relies on the notebook-level names `device`, `metric`, `MONITOR` and `wandb`.
    Returns None.
    """
    # put model in evaluation mode
    model.eval()

    val_losses = []
    r_preds, r_labels = [], []
    v_preds, v_labels = [], []
    c_preds, c_labels = [], []

    start_time = time.time()

    with torch.no_grad():
        for batch_idx, (roots, vowels, cons, imgs) in enumerate(val_loader):
            # Each 50 batches print progress and how long those 50 batches took
            if batch_idx%50 == 0:
                print(f"Batch {batch_idx+1}/{len(val_loader)} in {time.time()-start_time} sec.")
                start_time = time.time()

            # Class-index targets are cast to int64, the dtype CrossEntropyLoss
            # expects; the labels are stored as uint8 upstream.
            roots = roots.to(device, dtype=torch.long)
            vowels = vowels.to(device, dtype=torch.long)
            cons = cons.to(device, dtype=torch.long)
            imgs = imgs.to(device, dtype=torch.float32)

            # get predictions
            vowel_pred, root_pred, consonant_pred = model(imgs)

            _, vowel_diacritic = torch.max(vowel_pred, 1)
            _, grapheme_root = torch.max(root_pred, 1)
            _, consonant_diacritic = torch.max(consonant_pred, 1)

            # compute validation loss
            vowel_loss = loss(vowel_pred, vowels)
            root_loss = loss(root_pred, roots)
            consonant_loss = loss(consonant_pred, cons)

            # Store plain Python floats: a list of (possibly CUDA) tensors
            # cannot be turned into a NumPy array afterwards.
            val_losses.append([vowel_loss.item(), root_loss.item(), consonant_loss.item()])

            # Keep track of predictions for the accuracy and recall computations
            v_preds.extend(vowel_diacritic.cpu().numpy());     v_labels.extend(vowels.cpu().numpy())
            r_preds.extend(grapheme_root.cpu().numpy());       r_labels.extend(roots.cpu().numpy())
            c_preds.extend(consonant_diacritic.cpu().numpy()); c_labels.extend(cons.cpu().numpy())

    # Nothing was validated: avoid dividing by zero and stepping the scheduler on no data
    if not val_losses:
        print("Validation loader is empty; skipping metrics and scheduler step.")
        return

    weighted_accuracy = compute_weighted_metric(metric.accuracy_score, [v_preds, r_preds, c_preds], [v_labels, r_labels, c_labels], [1,2,1])
    weighted_recall = compute_weighted_metric(metric.recall_score, [v_preds, r_preds, c_preds], [v_labels, r_labels, c_labels], [1,2,1], average="macro")

    # Average each of the three losses over all batches, then add them up
    avg_val_loss = np.array(val_losses).sum(axis=0)/len(val_losses)
    weighted_val_loss = float(np.sum(avg_val_loss))

    # The scheduler follows the validation loss
    scheduler.step(weighted_val_loss)

    if MONITOR:
        wandb.log({"Validation accuracy (weighted)":weighted_accuracy})
        wandb.log({"Validation recall (weighted)":weighted_recall})    
        wandb.log({"Validation vowel loss":avg_val_loss[0], "Validation root loss":avg_val_loss[1], "Validation consonant loss":avg_val_loss[2], "Validation weighted loss":weighted_val_loss })

## Training Function

In [None]:
def train(model, loss_function, optimizer, train_loader):
    """
    Train `model` for one pass over `train_loader` and save its weights.

    For every batch the vowel, root and consonant losses are computed with
    `loss_function`, summed, back-propagated and applied with `optimizer`.
    When MONITOR is set, the per-batch losses are sent to wandb. After the last
    batch the model's state dict is written to /kaggle/working/models/ with a
    timestamped file name.

    Relies on the notebook-level names `device`, `MONITOR` and `wandb`.
    Returns None.
    """
    model.train()

    start_time = time.time()

    # Train on batch
    for batch_idx, (roots, vowels, cons, imgs) in enumerate(train_loader):

        # Each 50 batches print progress and how long those 50 batches took
        if batch_idx%50 == 0:
            print(f"Batch {batch_idx+1}/{len(train_loader)} in {time.time()-start_time} sec.")
            start_time = time.time()

        # In PyTorch you have to manually reset gradients before each mini_batch
        optimizer.zero_grad()

        # Class-index targets are cast to int64, the dtype CrossEntropyLoss
        # expects; the labels are stored as uint8 upstream.
        roots = roots.to(device, dtype=torch.long)
        vowels = vowels.to(device, dtype=torch.long)
        cons = cons.to(device, dtype=torch.long)
        imgs = imgs.to(device, dtype=torch.float32)

        # Get predictions
        vowel_pred, root_pred, consonant_pred = model(imgs)

        # Compute loss
        vowel_loss = loss_function(vowel_pred, vowels)
        root_loss = loss_function(root_pred, roots)
        consonant_loss = loss_function(consonant_pred, cons)

        # Back-propagate the summed loss and update the weights
        weighted_total_loss = vowel_loss + root_loss + consonant_loss
        weighted_total_loss.backward()
        optimizer.step()

        # Visualise losses after every batch in Weights & Biases
        if MONITOR:
            # Log plain floats so the logger never holds on to graph tensors
            total = weighted_total_loss.item()
            wandb.log({"Train vowel loss":vowel_loss.item(), "Train root loss":root_loss.item(), "Train consonant loss":consonant_loss.item(), "Train total loss":total, "Train weighted loss":total/3})

    # Save model after training; make sure the target directory exists first
    os.makedirs('/kaggle/working/models', exist_ok=True)
    torch.save(model.state_dict(), f'/kaggle/working/models/rausnaus_resnet50_{int(time.time())}.pth')


## Training Parameters

In [None]:
# Hyperparameters
epochs = 1
learning_rate = 0.001
batch_size = 64

# Hand only the parameters that still require gradients to Adam, so frozen layers are never updated
trainable_parameters = (p for p in model.parameters() if p.requires_grad)
optimizer = torch.optim.Adam(trainable_parameters, lr=learning_rate)
# Multiply the learning rate by 0.1 once the monitored value has stopped decreasing for 5 steps
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=5,
    min_lr=1e-08,
)
loss = nn.CrossEntropyLoss()

# Preprocessing pipeline, applied to every image in the listed order
preprocess = transforms.Compose(
    [
        CustomCropResize(),  # Crop to content, pad to square, resize to 224x224
        ToRGBArray(),  # Replicate the single channel into three channels
        Cutout(),
        transforms.ToTensor(),  # Convert to a tensor in range [0,1]
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet statistics
    ]
)

In [None]:
# Create directory for saving models
!mkdir /kaggle/working/models/

In [None]:
# Monitor model with Weights & Biases
if (MONITOR):
    wandb.watch(model)

## Actual training

In [None]:
# Load the labels; the grapheme column is dropped and the three label columns
# are stored compactly as uint8.
train_df = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
train_df.drop(['grapheme'], axis=1, inplace=True)
train_df[['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']] = train_df[['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']].astype('uint8')

# If-statement for faster committing
if TRAINING:
    for epoch in range(epochs):
        # Show a one-based epoch counter so the last epoch reads "epochs/epochs"
        print(f"Epoch {epoch + 1}/{epochs}")
        # Visit the four data files in a fresh random order every epoch
        for datafile_index in random.sample([0,1,2,3],4):
            print(f"Loading feather file {datafile_index}")
            dataset = BengaliDataset(datafile_index, train_df, transform=preprocess)
            train_loader, val_loader = get_train_val_loaders(dataset, batch_size=batch_size, shuffle=True, val_percentage=0.1)

            print("Training")
            train(model, loss, optimizer, train_loader)
            print("Validation")
            validate(model, loss, scheduler, val_loader)

            # The loaders hold references to the dataset, so they have to be
            # dropped as well before the memory can actually be reclaimed.
            del dataset, train_loader, val_loader
            gc.collect()