# ⚡ Fast ensemble
-------
Optimizing your code is usually good practice, but it is unavoidable in a resource-limited environment like Kaggle's Code Competitions. In this notebook, I implement most of the tips [from this notebook](https://www.kaggle.com/pestipeti/optimization-tips-faster-train-faster-inference). Hopefully, by using these, you won't run out of time or memory.


I used this inference code, and my submission finished in 19 minutes.

**Notes**:
- The duration may vary depending on your model, number of folds, the actual workload on the Kaggle servers, etc.
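- I did not upload my trained weights (I don't want to create a high-scoring kernel), so the result is an average of 5 ResNet34 models without my trained weights (the code below builds them with `pretrained=False` and leaves the weight loading commented out).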
- Do not forget to enable GPU in your kernel. (I did not test this fork of my code with GPU. It should work.)

## Summary
- Use a script instead of a notebook\*
- Import only things you need
- Use logs instead of tqdm
- Cleanup after usage
- Load parquet files once
- Do not load data you don't need
- Check your dtypes
- Preprocess your images once
- Use CUDA for preprocessing (not yet implemented)
- Do not use albumentations (inference)
- Only use 3 channels if you really need it
- Process in batches
- Optimized TTA (not yet implemented)

\* *I use a notebook for demonstration only. You should copy the code into a script.*

## Explanation
You can find more details/explanation about these tricks in this notebook:

[https://www.kaggle.com/pestipeti/optimization-tips-faster-train-faster-inference](https://www.kaggle.com/pestipeti/optimization-tips-faster-train-faster-inference)

In [None]:
import cv2
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data.sampler import SequentialSampler
from torchvision.models import resnet34, densenet121

import gc

In [None]:
# ======================
# Params

# Inference batch size. It has not been tuned: if the GPU has memory to
# spare, raising it should make inference faster.
BATCH_SIZE = 96
# Number of DataLoader worker processes.
N_WORKERS = 4

# Height/width of the raw images stored in the parquet files, followed by
# the square output size and the padding used by the preprocessing.
HEIGHT, WIDTH = 137, 236
TARGET_SIZE, PADDING = 128, 8

# Normalisation statistics (scaled by 255 inside the model's forward pass).
# Replace these with your own values.
MEAN, STD = 0.0778441, 0.216016

# Competition data location -- you should keep this.
INPUT_PATH = '/kaggle/input/bengaliai-cv19'

# Dataset that holds your trained weights -- replace this with your own.
DATASET_PATH = '/kaggle/input/private-bengali-ai-model-weights'

In [None]:
NUM_ENSEMBLE = 5

# Checkpoint files from my experiments, one per fold (f0..f4). The weights
# themselves are not uploaded.
_FOLD_CHECKPOINTS = (
    '/exp-507--a--f0--resnet-34--swa-4.pt',
    '/exp-560--a--f1--resnet-34--swa-4.pt',
    '/exp-588--a--f2--resnet-34--swa-4.pt',
    '/exp-589--a--f3--resnet-34--swa-4.pt',
    '/exp-590--a--f4--resnet-34--swa-4.pt',
)

# One entry per ensemble member. This demo gives every member the same
# weight; feel free to change them, but make sure that, for each label, the
# weights sum to 1 across the ensemble.
ENSEMBLES = [
    {
        'model': 'resnet',
        'model_str': 'resnet-34',
        'model_state_file': DATASET_PATH + checkpoint,

        # Ensemble-item weights
        'w_grapheme': 1 / NUM_ENSEMBLE,
        'w_vowel': 1 / NUM_ENSEMBLE,
        'w_conso': 1 / NUM_ENSEMBLE,
    }
    for checkpoint in _FOLD_CHECKPOINTS
]

## Preprocessing
The preprocessing script below is from Iafoss' kernel:

[https://www.kaggle.com/iafoss/image-preprocessing-128x128](https://www.kaggle.com/iafoss/image-preprocessing-128x128)

In [None]:
def bbox(img):
    """Return the bounding box of the truthy entries of a 2-D array.

    Args:
        img: 2-D array (typically a boolean mask).

    Returns:
        ``(rmin, rmax, cmin, cmax)``: the first and last row index and the
        first and last column index that contain at least one truthy value.

    Raises:
        IndexError: if ``img`` contains no truthy value at all.
    """
    occupied_rows = np.nonzero(np.any(img, axis=1))[0]
    occupied_cols = np.nonzero(np.any(img, axis=0))[0]
    top, bottom = occupied_rows[0], occupied_rows[-1]
    left, right = occupied_cols[0], occupied_cols[-1]
    return top, bottom, left, right


def crop_resize(img0, size=TARGET_SIZE, pad=64):
    """Crop an image to its content, pad it to a square and resize it.

    Args:
        img0: 2-D image array. Pixels brighter than 80 are treated as
            content; the array is NOT modified.
        size: Side length of the square output image.
        pad: Extra margin added to the longer side of the crop before the
            crop is centred in a square.

    Returns:
        The ``size`` x ``size`` image produced by ``cv2.resize``.

    Raises:
        IndexError: propagated from ``bbox`` when no pixel exceeds the
            content threshold.
    """
    # Clamp against the actual image size instead of the global constants,
    # so the crop stays consistent for any input shape.
    height, width = img0.shape[:2]

    # crop a box around pixels larger than the threshold;
    # some images contain a line at the sides, hence the 5-pixel trim.
    # NOTE(review): the box is expressed in the coordinates of the trimmed
    # view but applied to the full image below. This is kept as is so the
    # output stays identical to the original preprocessing.
    ymin, ymax, xmin, xmax = bbox(img0[5:-5, 5:-5] > 80)

    # cropping may cut too much, so we need to add it back
    xmin = xmin - 13 if xmin > 13 else 0
    ymin = ymin - 10 if ymin > 10 else 0
    xmax = xmax + 13 if xmax < width - 13 else width
    ymax = ymax + 10 if ymax < height - 10 else height

    lx, ly = xmax - xmin, ymax - ymin
    ls = max(lx, ly) + pad

    # make sure that the aspect ratio is kept in rescaling
    img = np.pad(
        img0[ymin:ymax, xmin:xmax],
        [((ls - ly) // 2,), ((ls - lx) // 2,)],
        mode='constant',
    )

    # remove low intensity pixels as noise. np.pad returned a fresh array,
    # so doing this after padding leaves the caller's image untouched; the
    # zero padding is unaffected by the threshold, so the result is the same.
    img[img < 28] = 0

    return cv2.resize(img, (size, size))

In [None]:
# Competition metadata: the test rows and the submission template.
test_df = pd.read_csv(INPUT_PATH + '/test.csv')
submission_df = pd.read_csv(INPUT_PATH + '/sample_submission.csv')

## Loading data

In [None]:
class BengaliParquetDataset(Dataset):
    """Test images, preprocessed once at construction and kept in memory.

    The four ``test_image_data_{0..3}.parquet`` files under ``INPUT_PATH``
    are read one after another. Every image is inverted, rescaled so that
    its maximum becomes 255, cropped/resized with ``crop_resize`` and stored
    flattened in a single uint8 tensor.

    Args:
        num_samples: Total number of images in the parquet files. It sizes
            the preallocated buffer, so an ``IndexError`` is raised if the
            files contain more images than this.
    """

    def __init__(self, num_samples=1):

        self.num_samples = num_samples
        self.images = torch.zeros(num_samples, TARGET_SIZE * TARGET_SIZE, dtype=torch.uint8)
        img_id = 0

        # Read only the pixel columns, which are named '0' .. 'HEIGHT*WIDTH-1'.
        pixel_columns = [str(x) for x in range(HEIGHT * WIDTH)]

        for i in range(4):
            datafile = INPUT_PATH + '/test_image_data_{}.parquet'.format(i)
            parq = pq.read_pandas(datafile, columns=pixel_columns).to_pandas()
            parq = 255 - parq.values.reshape(-1, HEIGHT, WIDTH).astype(np.uint8)

            # Not enough memory to do this using a large batch
            # parq = (parq * (255.0 / parq.max(axis=(1,2), keepdims=True))).astype(np.uint8)

            for image in parq:
                image = (image * (255.0 / image.max())).astype(np.uint8)
                self.images[img_id, ...] = torch.from_numpy(
                    crop_resize(image, size=TARGET_SIZE, pad=PADDING).reshape(-1).astype(np.uint8)
                )
                img_id += 1

            # Free this file's pixels before the next file is read, so two
            # files are never held in memory at the same time.
            del parq

    def __len__(self):
        """Return the number of image slots (``num_samples``)."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, idx)``.

        ``image`` is a uint8 tensor of shape
        ``(1, TARGET_SIZE, TARGET_SIZE)``; ``idx`` is passed through
        unchanged.
        """
        img = self.images[idx].view(1, TARGET_SIZE, TARGET_SIZE)

        return img, idx

In [None]:
# The dataset is sized to a third of the rows of test.csv.
# NOTE(review): presumably test.csv holds one row per (image, component)
# pair, i.e. three rows per image -- confirm.
bengali_dataset = BengaliParquetDataset(num_samples=len(test_df) // 3)

## Models

In [None]:
class BengaliResnets(nn.Module):
    """Single-channel ResNet-34 with three classification heads.

    The backbone's first convolution is replaced by a 1-input-channel
    convolution whose weights are the first input channel of the original
    kernel. The backbone features go through BatchNorm + ReLU, global
    average pooling and dropout, and then feed three linear heads with 168,
    11 and 7 outputs.

    Args:
        backbone_str: Name of the requested backbone.
            NOTE: currently not used -- a ResNet-34 is always built.
    """

    def __init__(self, backbone_str='resnet-18'):
        super().__init__()
        self.head_dropout = 0.1
        self.backbone = resnet34(pretrained=False)
        num_bottleneck_filters = self.backbone.fc.in_features

        # Swap the 3-channel stem for a 1-channel one, reusing the first
        # input channel of the original kernel as its weights.
        rgb_kernel = self.backbone.conv1.weight.data
        self.backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.backbone.conv1.weight = nn.Parameter(rgb_kernel[:, 0, :, :].unsqueeze(1))

        self.last = nn.Sequential(
            nn.BatchNorm2d(num_bottleneck_filters),
            nn.ReLU(),
        )

        self.fc_graph = nn.Linear(num_bottleneck_filters, 168)
        self.fc_vowel = nn.Linear(num_bottleneck_filters, 11)
        self.fc_conso = nn.Linear(num_bottleneck_filters, 7)

    def forward_backbone(self, x):
        """Run the ResNet stem and its four stages, then ``self.last``."""
        net = self.backbone
        pipeline = (
            net.conv1, net.bn1, net.relu, net.maxpool,
            net.layer1, net.layer2, net.layer3, net.layer4,
            self.last,
        )
        for stage in pipeline:
            x = stage(x)

        return x

    def forward(self, x):
        """Return the ``(grapheme, vowel, consonant)`` head outputs.

        ``x`` must be a 4-D batch ``(batch, channels, height, width)`` of
        pixel values; it is normalised here with ``MEAN`` and ``STD``
        scaled by 255.
        """
        batch_size, _, _, _ = x.shape

        normalised = (x - MEAN * 255.0) / (STD * 255.0)
        features = self.forward_backbone(normalised)

        pooled = F.adaptive_avg_pool2d(features, 1).reshape(batch_size, -1)
        pooled = F.dropout(pooled, p=self.head_dropout, training=self.training)

        return self.fc_graph(pooled), self.fc_vowel(pooled), self.fc_conso(pooled)

In [None]:
# One loader is shared by every ensemble member. Build a loader per model
# instead if you'd like to use a different batch size for different size
# models (tip #12).
data_loader_test = torch.utils.data.DataLoader(
    dataset=bengali_dataset,
    sampler=SequentialSampler(bengali_dataset),
    shuffle=False,
    batch_size=BATCH_SIZE,
    num_workers=N_WORKERS,
)

## Predicting

In [None]:
# Predictions
# One buffer per target, shaped (ensemble member, image, class), holding the
# weighted class probabilities of every model.
# NOTE(review): assumes the sample submission has three rows per image (one
# per target) -- confirm.
size = submission_df.shape[0] // 3
# np.float64 replaces the np.float alias, which NumPy 1.24 removed; the
# resulting dtype is the same.
results = {
    'grapheme_root': np.zeros((len(ENSEMBLES), size, 168), dtype=np.float64),
    'vowel_diacritic': np.zeros((len(ENSEMBLES), size, 11), dtype=np.float64),
    'consonant_diacritic': np.zeros((len(ENSEMBLES), size, 7), dtype=np.float64),
}

In [None]:
# Run every ensemble member over the whole test set and store its weighted
# class probabilities in `results`.

# Pick the device once instead of querying CUDA for every batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for model_idx, ensemble in enumerate(ENSEMBLES):

    if ensemble['model'].lower() == 'resnet':
        model = BengaliResnets(backbone_str=ensemble['model_str'].lower())
    # elif ensemble['model'].lower() == 'densenet':
    #     model = BengaliDensenets(backbone_str=ensemble['model_str'].lower())
    else:
        raise ValueError('Unknown model type: {!r}'.format(ensemble['model']))

    # Load your model's/fold's weights
    model_state = None
    # model_state = torch.load(ensemble['model_state_file'])
    # model.load_state_dict(model_state['model_state_dict'])
    model.eval()
    model.to(device)

    del model_state

    for images, image_idx in data_loader_test:
        # Transfer the uint8 batch first and convert on the device: that is
        # a quarter of the bytes of a float32 batch.
        images = images.to(device).float()

        with torch.no_grad():
            out_graph, out_vowel, out_conso = model(images)

            out_graph = F.softmax(out_graph, dim=1).cpu().numpy() * ensemble['w_grapheme']
            out_vowel = F.softmax(out_vowel, dim=1).cpu().numpy() * ensemble['w_vowel']
            out_conso = F.softmax(out_conso, dim=1).cpu().numpy() * ensemble['w_conso']

        # Place every prediction at the dataset index the loader returned,
        # rather than deriving the rows from the batch number and the global
        # BATCH_SIZE. This stays correct if the loader's batch size changes.
        rows = image_idx.numpy()

        results['grapheme_root'][model_idx, rows, :] = out_graph
        results['vowel_diacritic'][model_idx, rows, :] = out_vowel
        results['consonant_diacritic'][model_idx, rows, :] = out_conso

        del images
        del out_graph, out_vowel, out_conso

    del model

## Ensemble

In [None]:
# Clean-up
del data_loader_test
del bengali_dataset
del test_df

gc.collect()
%reset -f out

In [None]:
# Reload the submission template and preview its first rows.
sample_submission_path = INPUT_PATH + '/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)
submission_df.head()

In [None]:
# For every target, sum the weighted probabilities over the ensemble members
# and write the most likely class into the second column of the submission
# rows whose row_id mentions that target.
for component in ('grapheme_root', 'vowel_diacritic', 'consonant_diacritic'):
    is_component = submission_df['row_id'].str.contains(component)
    row_positions = submission_df.index[is_component.values]
    ensembled = results[component].sum(axis=0)
    submission_df.iloc[row_positions, 1] = ensembled.argmax(axis=1)

In [None]:
# Write the predictions to the working directory, without the index column.
submission_df.to_csv('./submission.csv', index=False)

-----------

**Thanks for reading!** If you find this notebook useful, please vote.