In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Any results you write to the current directory are saved as output.

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import models


class DenseNetHead(nn.Module):
    """Single linear layer used as the classification head on top of `DenseNetBase` features."""

    def __init__(self, in_features: int, n_classes: int):
        """
        Args:
            in_features: size of the feature vector fed to the head.
            n_classes: number of logits produced.
        """
        super().__init__()
        self.classifier = nn.Linear(in_features=in_features, out_features=n_classes)

    def forward(self, x):
        """Return the `n_classes` logits computed by the linear classifier."""
        logits = self.classifier(x)
        return logits


class DenseNetBase(nn.Module):
    """torchvision DenseNet backbone returning a pooled, flattened feature vector."""

    def __init__(self, name: str, pretrained: bool = True):
        """
        Args:
            name: attribute name of the constructor to look up on `torchvision.models`.
            pretrained: forwarded to that constructor as `pretrained=`.
        """
        super().__init__()
        self.base = getattr(models, name)(pretrained=pretrained)
        # Feature width is selected by the suffix of `name`; any name that
        # matches none of the suffixes below falls back to 1920.
        suffix_to_width = (('121', 1024), ('161', 2208), ('169', 1664))
        self.out_features = next(
            (width for suffix, width in suffix_to_width if name.endswith(suffix)),
            1920,
        )

    def forward(self, x):
        """Run the convolutional features, then ReLU -> global average pool -> flatten."""
        feature_map = self.base.features(x)
        activated = F.relu(feature_map, inplace=True)
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        return torch.flatten(pooled, 1)


In [None]:
import torch
from torch import nn
from torchvision import models


class ResNetHead(nn.Module):
    """Global average pooling followed by a linear classifier, applied to a 4-D feature map."""

    def __init__(self, in_features: int, n_classes: int):
        """
        Args:
            in_features: number of channels of the incoming feature map.
            n_classes: number of logits produced.
        """
        super().__init__()

        self.pooling = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc1 = nn.Linear(in_features=in_features, out_features=n_classes)

    def apply_fc_out(self, x):
        """Apply the final fully connected layer to already pooled and flattened features."""
        return self.fc1(x)

    def forward(self, x):
        """Pool the feature map to 1x1, flatten it per sample and classify it."""
        pooled = self.pooling(x)
        return self.apply_fc_out(pooled.flatten(start_dim=1))


class ResNetBase(nn.Module):
    """torchvision ResNet-style backbone that returns the `layer4` feature map.

    The network's own pooling and classifier are skipped in `forward`.
    Optionally the stem (`conv1`, `bn1`) and `layer1` are frozen: their
    parameters stop receiving gradients and their BatchNorm2d layers are kept
    in eval mode even while the rest of the model is training.
    """

    def __init__(self, name: str, pretrained: bool, frozen_start: bool = False):
        """
        Args:
            name: attribute name of the constructor to look up on `torchvision.models`.
            pretrained: forwarded to that constructor as `pretrained=`.
            frozen_start: when true, freeze `layer1`, `conv1` and `bn1`.
        """
        super().__init__()

        self.base = getattr(models, name)(pretrained=pretrained)

        self.frozen_start = frozen_start

        # Channel count of the layer4 output: 512 for resnet18/34, 2048 for
        # every other name handled by this class.
        if name == 'resnet34' or name == 'resnet18':
            self.out_features = 512
        else:
            self.out_features = 2048

        # Modules whose parameters are frozen and whose BatchNorm2d layers
        # must stay in eval mode (see `train`).
        self.frozen = []
        if self.frozen_start:
            self.frozen = [self.base.layer1, self.base.conv1, self.base.bn1]
            for m in self.frozen:
                self._freeze(m)

    def forward(self, x):
        """Run the stem and the four residual stages; return the un-pooled feature map."""
        base = self.base
        x = base.conv1(x)
        x = base.bn1(x)
        x = base.relu(x)
        x = base.maxpool(x)

        x = base.layer1(x)
        x = base.layer2(x)
        x = base.layer3(x)
        x = base.layer4(x)

        return x

    def train(self, mode=True):
        """Set the training mode, keeping BatchNorm2d layers of frozen modules in eval mode.

        Returns:
            self, as `nn.Module.train` does, so that `model.train()` and
            `model.eval()` can be chained or assigned.
        """
        super().train(mode=mode)
        for m in self.frozen:
            self._bn_to_eval(m)
        return self

    def _freeze(self, module):
        """Disable gradient computation for every parameter of `module`."""
        for p in module.parameters():
            p.requires_grad = False

    def _bn_to_eval(self, module):
        """Put every BatchNorm2d inside `module` (including `module` itself) into eval mode."""
        for m in module.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

# STN Network

In [None]:
"""
The source code is from Clova AI; no changes have been made to it.
The code has only been refactored.
https://github.com/clovaai/deep-text-recognition-benchmark/
"""

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE: a later cell rebinds this same global name (to 'cuda:0' / 'cpu').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class TPS_SpatialTransformerNetwork(nn.Module):
    """ Rectification Network of RARE, namely TPS based STN """

    def __init__(self, F, I_size, I_r_size, I_channel_num=1):
        """ Based on RARE TPS
        input:
            F : number of fiducial points (forwarded to LocalizationNetwork and GridGenerator)
            I_size : (height, width) of the input image I (stored; not otherwise used in this class)
            I_r_size : (height, width) of the rectified image I_r
            I_channel_num : the number of channels of the input image I
        """
        super(TPS_SpatialTransformerNetwork, self).__init__()
        self.F = F
        self.I_size = I_size
        self.I_r_size = I_r_size  # = (I_r_height, I_r_width)
        self.I_channel_num = I_channel_num
        self.LocalizationNetwork = LocalizationNetwork(self.F, self.I_channel_num)
        self.GridGenerator = GridGenerator(self.F, self.I_r_size)

    @staticmethod
    def _grid_sample_has_align_corners():
        """ Return True when the installed torch is >= 1.3, i.e. newer than 1.2.x.

        The version is compared numerically on (major, minor). Comparing the
        raw version strings is wrong because it is lexicographic: "1.10.0"
        sorts before "1.2.0".
        """
        release = torch.__version__.split('+')[0].split('.')
        try:
            major, minor = int(release[0]), int(release[1])
        except (IndexError, ValueError):
            # Unrecognised version string (e.g. a custom build): assume a modern torch.
            return True
        return (major, minor) >= (1, 3)

    def forward(self, batch_I):
        """
        input:
            batch_I: Batch Input Image [batch_size x I_channel_num x I_height x I_width]
        output:
            batch_I_r: rectified image [batch_size x I_channel_num x I_r_height x I_r_width]
        """
        batch_C_prime = self.LocalizationNetwork(batch_I)  # batch_size x K x 2
        build_P_prime = self.GridGenerator.build_P_prime(
            batch_C_prime)  # batch_size x n (= I_r_width x I_r_height) x 2
        build_P_prime_reshape = build_P_prime.reshape(
            [build_P_prime.size(0), self.I_r_size[0], self.I_r_size[1], 2])

        # Newer torch versions must be told explicitly to sample with
        # align_corners=True; older ones do not take the argument at all.
        sample_kwargs = {'padding_mode': 'border'}
        if self._grid_sample_has_align_corners():
            sample_kwargs['align_corners'] = True
        batch_I_r = F.grid_sample(batch_I, build_P_prime_reshape, **sample_kwargs)

        return batch_I_r


class LocalizationNetwork(nn.Module):
    """ Localization Network of RARE, which predicts C' (K x 2) from I (I_width x I_height) """

    def __init__(self, F, I_channel_num):
        """
        input:
            F : number of fiducial points to predict
            I_channel_num : number of channels of the input image
        """
        super().__init__()
        self.F = F
        self.I_channel_num = I_channel_num

        def conv_bn_relu(c_in, c_out):
            # 3x3 convolution (stride 1, padding 1, no bias) -> BatchNorm -> in-place ReLU
            return [
                nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3,
                          stride=1, padding=1, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(True),
            ]

        # Four conv stages; the first three are followed by a 2x2 max-pool
        # (halving height and width each time), the last one by a global
        # average pool giving batch_size x 512 x 1 x 1.
        channels = [self.I_channel_num, 64, 128, 256, 512]
        last_stage = len(channels) - 2
        layers = []
        for stage, (c_in, c_out) in enumerate(zip(channels[:-1], channels[1:])):
            layers.extend(conv_bn_relu(c_in, c_out))
            if stage == last_stage:
                layers.append(nn.AdaptiveAvgPool2d(1))
            else:
                layers.append(nn.MaxPool2d(2, 2))
        self.conv = nn.Sequential(*layers)

        self.localization_fc1 = nn.Sequential(nn.Linear(512, 256), nn.ReLU(True))
        self.localization_fc2 = nn.Linear(256, self.F * 2)

        # Init fc2 in LocalizationNetwork: zero weights, so the initial
        # prediction equals the bias, i.e. the fiducial layout built below
        # (see RARE paper Fig. 6 (a)).
        self.localization_fc2.weight.data.fill_(0)
        half = int(F / 2)
        xs = np.linspace(-1.0, 1.0, half)
        top_row = np.stack([xs, np.linspace(0.0, -1.0, num=half)], axis=1)
        bottom_row = np.stack([xs, np.linspace(1.0, 0.0, num=half)], axis=1)
        initial_bias = np.concatenate([top_row, bottom_row], axis=0)
        self.localization_fc2.bias.data = torch.from_numpy(initial_bias).float().view(-1)

    def forward(self, batch_I):
        """
        input:     batch_I : Batch Input Image [batch_size x I_channel_num x I_height x I_width]
        output:    batch_C_prime : Predicted coordinates of fiducial points for input batch [batch_size x F x 2]
        """
        n_samples = batch_I.size(0)
        flat_features = self.conv(batch_I).view(n_samples, -1)
        hidden = self.localization_fc1(flat_features)
        return self.localization_fc2(hidden).view(n_samples, self.F, 2)


class GridGenerator(nn.Module):
    """ Grid Generator of RARE, which produces P_prime by multiplying T with P """

    def __init__(self, F, I_r_size):
        """ Generate P_hat and inv_delta_C for later

        input:
            F : number of fiducial points (F / 2 on the top edge, F / 2 on the bottom edge)
            I_r_size : (height, width) of the rectified image I_r
        """
        super(GridGenerator, self).__init__()
        self.eps = 1e-6
        self.I_r_height, self.I_r_width = I_r_size
        self.F = F
        self.C = self._build_C(self.F)  # F x 2
        self.P = self._build_P(self.I_r_width, self.I_r_height)
        ## for multi-gpu, you need register buffer
        self.register_buffer("inv_delta_C", torch.tensor(
            self._build_inv_delta_C(self.F, self.C)).float())  # F+3 x F+3
        self.register_buffer("P_hat", torch.tensor(
            self._build_P_hat(self.F, self.C, self.P)).float())  # n x F+3
        ## for fine-tuning with different image width, you may use below instead of self.register_buffer
        # self.inv_delta_C = torch.tensor(self._build_inv_delta_C(self.F, self.C)).float().cuda()  # F+3 x F+3
        # self.P_hat = torch.tensor(self._build_P_hat(self.F, self.C, self.P)).float().cuda()  # n x F+3

    def _build_C(self, F):
        """ Return coordinates of fiducial points in I_r; C """
        ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
        ctrl_pts_y_top = -1 * np.ones(int(F / 2))
        ctrl_pts_y_bottom = np.ones(int(F / 2))
        ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
        ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
        C = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
        return C  # F x 2

    def _build_inv_delta_C(self, F, C):
        """ Return inv_delta_C which is needed to calculate T """
        hat_C = np.zeros((F, F), dtype=float)  # F x F
        for i in range(0, F):
            for j in range(i, F):
                r = np.linalg.norm(C[i] - C[j])
                hat_C[i, j] = r
                hat_C[j, i] = r
        # The diagonal distance is 0; set it to 1 so that r^2 * log(r) below
        # evaluates to 0 there instead of hitting log(0).
        np.fill_diagonal(hat_C, 1)
        hat_C = (hat_C ** 2) * np.log(hat_C)
        delta_C = np.concatenate(  # F+3 x F+3
            [
                np.concatenate([np.ones((F, 1)), C, hat_C], axis=1),  # F x F+3
                np.concatenate([np.zeros((2, 3)), np.transpose(C)], axis=1),  # 2 x F+3
                np.concatenate([np.zeros((1, 3)), np.ones((1, F))], axis=1)  # 1 x F+3
            ],
            axis=0
        )
        inv_delta_C = np.linalg.inv(delta_C)
        return inv_delta_C  # F+3 x F+3

    def _build_P(self, I_r_width, I_r_height):
        """ Return the (x, y) coordinates of the pixel centres of I_r, normalised to (-1, 1) """
        I_r_grid_x = (np.arange(-I_r_width, I_r_width,
                                2) + 1.0) / I_r_width  # self.I_r_width
        I_r_grid_y = (np.arange(-I_r_height, I_r_height,
                                2) + 1.0) / I_r_height  # self.I_r_height
        P = np.stack(  # self.I_r_height x self.I_r_width x 2 (meshgrid uses 'xy' indexing)
            np.meshgrid(I_r_grid_x, I_r_grid_y),
            axis=2
        )
        return P.reshape([-1, 2])  # n (= self.I_r_width x self.I_r_height) x 2

    def _build_P_hat(self, F, C, P):
        """ Return P_hat = [1, P, rbf(P, C)], the per-pixel TPS basis """
        n = P.shape[0]  # n (= self.I_r_width x self.I_r_height)
        P_tile = np.tile(np.expand_dims(P, axis=1),
                         (1, F, 1))  # n x 2 -> n x 1 x 2 -> n x F x 2
        C_tile = np.expand_dims(C, axis=0)  # 1 x F x 2
        P_diff = P_tile - C_tile  # n x F x 2
        rbf_norm = np.linalg.norm(P_diff, ord=2, axis=2, keepdims=False)  # n x F
        # eps keeps log() finite when a pixel coincides with a fiducial point
        rbf = np.multiply(np.square(rbf_norm), np.log(rbf_norm + self.eps))  # n x F
        P_hat = np.concatenate([np.ones((n, 1)), P, rbf], axis=1)
        return P_hat  # n x F+3

    def build_P_prime(self, batch_C_prime):
        """ Generate Grid from batch_C_prime [batch_size x F x 2]

        output:
            batch_P_prime : sampling coordinates [batch_size x n x 2], on the same
                            device as batch_C_prime
        """
        batch_size = batch_C_prime.size(0)
        batch_inv_delta_C = self.inv_delta_C.repeat(batch_size, 1, 1)
        batch_P_hat = self.P_hat.repeat(batch_size, 1, 1)
        # Create the zero padding on the input's own device rather than on a
        # module-level global, so the module works wherever it has been moved.
        zero_rows = torch.zeros(batch_size, 3, 2, dtype=torch.float,
                                device=batch_C_prime.device)
        batch_C_prime_with_zeros = torch.cat(
            (batch_C_prime, zero_rows), dim=1)  # batch_size x F+3 x 2
        batch_T = torch.bmm(batch_inv_delta_C,
                            batch_C_prime_with_zeros)  # batch_size x F+3 x 2
        batch_P_prime = torch.bmm(batch_P_hat, batch_T)  # batch_size x n x 2
        return batch_P_prime  # batch_size x n x 2

# Grapheme Model

In [None]:
def build_grapheme_model(base: str, n_classes: int, **kwargs) -> nn.Module:
    """Create a `GraphemeModel`; extra keyword arguments are forwarded to the backbone."""
    model = GraphemeModel(base=base, n_classes=n_classes, **kwargs)
    return model


class GraphemeModel(nn.Module):
    """Backbone followed by a single classification head with `n_classes` outputs.

    The backbone family is selected from the prefix of `base`: 'resne*' uses
    ResNetBase/ResNetHead, 'vgg*' uses VGGBase/VGGHead, and anything else uses
    DenseNetBase/DenseNetHead.
    """

    def __init__(self, *, base: str, n_classes: int, **base_kwargs):
        """
        Args:
            base: backbone name; also passed to the backbone wrapper.
            n_classes: number of logits produced by the head.
            **base_kwargs: forwarded unchanged to the backbone wrapper.
        """
        super().__init__()

        if base.startswith('vgg'):
            # NOTE(review): VGGBase / VGGHead are not defined in the cells
            # visible here - confirm they exist before using a 'vgg*' base.
            self.base = VGGBase(base, **base_kwargs)
            self.head = VGGHead(n_classes=n_classes)
            return

        if base.startswith('resne'):
            backbone_cls, head_cls = ResNetBase, ResNetHead
        else:
            backbone_cls, head_cls = DenseNetBase, DenseNetHead

        self.base = backbone_cls(base, **base_kwargs)
        self.in_features = self.base.out_features
        self.head = head_cls(in_features=self.in_features, n_classes=n_classes)

    def forward(self, x):
        """Return the head's logits for the backbone features of `x`."""
        return self.head(self.base(x))

# Vowel + Consonant Model

In [None]:
from typing import List

def build_vc_model(base: str, n_classes: List[int], **kwargs) -> nn.Module:
    """Create a `VCModel`; extra keyword arguments are forwarded to the backbone."""
    model = VCModel(base=base, n_classes=n_classes, **kwargs)
    return model


class VCModel(nn.Module):
    """TPS rectification -> shared backbone -> two heads (vowel and consonant).

    `n_classes[0]` sizes the vowel head and `n_classes[1]` the consonant head.
    The backbone family is selected from the prefix of `base` in the same way
    as in `GraphemeModel`.
    """

    def __init__(self, *, base: str, n_classes: List[int], **base_kwargs):
        """
        Args:
            base: backbone name; also passed to the backbone wrapper.
            n_classes: output sizes, indexed as [vowel, consonant].
            **base_kwargs: forwarded unchanged to the backbone wrapper.
        """
        super().__init__()

        # Spatial transformer applied to the input before the backbone;
        # configured here for 3-channel 224x224 images with 20 fiducial points.
        self.transformation = TPS_SpatialTransformerNetwork(
            F=20,
            I_size=(224, 224),
            I_r_size=(224, 224),
            I_channel_num=3,
        )

        if base.startswith('vgg'):
            # NOTE(review): VGGBase / VGGHead are not defined in the cells
            # visible here - confirm they exist before using a 'vgg*' base.
            self.base = VGGBase(base, **base_kwargs)
            self.vowel_head = VGGHead(n_classes=n_classes[0])
            self.consonant_head = VGGHead(n_classes=n_classes[1])
            return

        if base.startswith('resne'):
            backbone_cls, head_cls = ResNetBase, ResNetHead
        else:
            backbone_cls, head_cls = DenseNetBase, DenseNetHead

        self.base = backbone_cls(base, **base_kwargs)
        self.in_features = self.base.out_features
        self.vowel_head = head_cls(
            in_features=self.in_features, n_classes=n_classes[0])
        self.consonant_head = head_cls(
            in_features=self.in_features, n_classes=n_classes[1])

    def forward(self, x):
        """Return `(vowel_logits, consonant_logits)` for the batch `x`."""
        features = self.base(self.transformation(x))
        vowel = self.vowel_head(features)
        consonant = self.consonant_head(features)
        return vowel, consonant


In [None]:
from pathlib import Path

import pandas as pd
import os
import torchvision

import torch
from torch.nn import Module
from PIL import Image
import cv2
from torch.utils.data import Dataset, DataLoader
import numpy as np

from tqdm import tqdm
from albumentations import Compose
from albumentations.pytorch import ToTensorV2
import albumentations as A

In [None]:
import torch.nn.functional as F

In [None]:
# Experiment settings.
# NOTE(review): none of these six names is referenced by the cells visible in
# this notebook (the models below are built from GRAPHEME_BASE / VC_BASE and
# GRAPHEME_CLASSES / VC_CLASSES) - presumably left over from training; confirm
# before relying on them.
BASE = 'resnext50_32x4d'  # 'resnext101_32x8d'
HEAD_DROPOUT = 0.5
FROZEN_START = 0
CLASSES = [168, 11, 7]
FP16 = True
HEAD = 'SimpleHead'

In [None]:
# Backbone names passed to build_grapheme_model / build_vc_model.
GRAPHEME_BASE = 'resnext50_32x4d'
VC_BASE = 'resnext50_32x4d'

# Output sizes: one head for the grapheme model, [vowel, consonant] heads for the VC model.
GRAPHEME_CLASSES = 168
VC_CLASSES = [11, 7]

# SIZE is the default side length of the square image returned by crop_resize;
# HEIGHT x WIDTH is the shape each flat pixel row is reshaped to in GraphemeDataset.
SIZE=224
HEIGHT = 137
WIDTH = 236

# Checkpoints loaded with torch.load / load_state_dict in the cells below.
# GRAPHEME_WEIGHTS_FILE= os.path.abspath('/kaggle/input/vcmodels/bengali-experiments/model_best_0.pth')
GRAPHEME_WEIGHTS_FILE = os.path.abspath('/kaggle/input/vcmodels/model_best_g_0.pth')
VC_WEIGHTS_FILE = os.path.abspath('/kaggle/input/vcmodels/stn_resnext50/model_best_1.pth')

In [None]:
def bbox(img):
    """Return `(rmin, rmax, cmin, cmax)`: the first and last row and column of `img`
    that contain at least one truthy element (both ends inclusive).

    Raises IndexError when `img` has no truthy element.
    """
    occupied_rows = np.nonzero(np.any(img, axis=1))[0]
    occupied_cols = np.nonzero(np.any(img, axis=0))[0]
    rmin, rmax = occupied_rows[0], occupied_rows[-1]
    cmin, cmax = occupied_cols[0], occupied_cols[-1]
    return rmin, rmax, cmin, cmax

def crop_resize(img0, size=SIZE, pad=16):
    """Crop `img0` around its bright pixels, pad it to a square and resize it.

    Args:
        img0: 2-D grayscale image; the crop limits are clamped to HEIGHT x WIDTH.
        size: side length of the returned square image.
        pad: extra pixels added to the longer side of the crop before resizing.

    Returns:
        A `size` x `size` 3-channel image (GRAY2BGR conversion of the resized crop).

    Note:
        Pixels below 28 inside the crop are zeroed in place on a slice of
        `img0`, so when `img0` is a NumPy array the caller's image is modified.
    """
    # Bounding box of pixels brighter than 80, ignoring a 5-pixel frame because
    # some images contain lines at the sides. The box is computed on the trimmed
    # view, but its coordinates are applied to the full image unchanged.
    ymin, ymax, xmin, xmax = bbox(img0[5:-5, 5:-5] > 80)

    # The threshold may cut too much, so widen the box by 13 px horizontally
    # and 10 px vertically, clamped to the image limits.
    xmin = max(xmin - 13, 0)
    ymin = max(ymin - 10, 0)
    xmax = min(xmax + 13, WIDTH)
    ymax = min(ymax + 10, HEIGHT)
    crop = img0[ymin:ymax, xmin:xmax]

    # Treat low-intensity pixels as noise.
    crop[crop < 28] = 0

    crop_w = xmax - xmin
    crop_h = ymax - ymin
    side = max(crop_w, crop_h) + pad

    # Pad both axes symmetrically towards `side` so the aspect ratio is kept
    # when rescaling.
    row_pad = (side - crop_h) // 2
    col_pad = (side - crop_w) // 2
    square = np.pad(crop, [(row_pad,), (col_pad,)], mode='constant')

    resized = cv2.resize(square, (size, size))
    return cv2.cvtColor(resized, cv2.COLOR_GRAY2BGR)

In [None]:
class GraphemeDataset(Dataset):
    """Dataset over one parquet file: column 0 is read as the image name and the
    remaining columns as the flattened HEIGHT x WIDTH pixel values."""

    def __init__(self, fname, transform=None):
        """
        Args:
            fname: path of the parquet file to read.
            transform: optional callable invoked as `transform(image=img)` and
                expected to return a mapping with an 'image' entry.
        """
        self.df = pd.read_parquet(fname)
        self.data = self.df.iloc[:, 1:].values
        self.transform = transform

    def __len__(self):
        """Number of rows (images) in the file."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return `(image, name)` for row `idx`."""
        name = self.df.iloc[idx, 0]

        pixels = self.data[idx, :].reshape(HEIGHT, WIDTH).astype(np.uint8)
        # Invert the intensities, then stretch them so the maximum becomes 255.
        inverted = 255 - pixels
        stretched = (inverted * (255.0 / inverted.max())).astype(np.uint8)
        img = crop_resize(stretched)

        if self.transform:
            img = self.transform(image=img)['image']

        return img, name

In [None]:
# Parquet files holding the test images; the inference loop processes them in this order.
TEST = ['/kaggle/input/bengaliai-cv19/test_image_data_0.parquet',
        '/kaggle/input/bengaliai-cv19/test_image_data_1.parquet',
        '/kaggle/input/bengaliai-cv19/test_image_data_2.parquet',
        '/kaggle/input/bengaliai-cv19/test_image_data_3.parquet']

In [None]:
# Test-time preprocessing: per-channel normalisation (the same mean/std for all
# three channels) followed by conversion to a torch tensor. No augmentation.
# NOTE(review): the mean/std values presumably match those used in training - confirm.
transform_test = Compose([
    A.Normalize(mean=(0.0692, 0.0692, 0.0692),
                std=(0.2052, 0.2052, 0.2052)),
    ToTensorV2()
])

In [None]:
# Inference device: the first CUDA GPU when available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Build the grapheme network; `pretrained=False` is forwarded to the backbone
# because the trained weights are loaded from the checkpoint below.
grapheme_model: Module = build_grapheme_model(
    base=GRAPHEME_BASE,
    pretrained=False,
    n_classes=GRAPHEME_CLASSES,
)

print('Creating model ...')
# The checkpoint is passed straight to load_state_dict, so it must be a state
# dict whose keys match GraphemeModel's; tensors are mapped onto `device`.
grapheme_model.load_state_dict(torch.load(GRAPHEME_WEIGHTS_FILE, map_location=device))
grapheme_model = grapheme_model.to(device)
# Switch to evaluation mode; as the cell's last expression this also displays the model.
grapheme_model.eval()

In [None]:
# Build the vowel/consonant network; `pretrained=False` is forwarded to the
# backbone because the trained weights are loaded from the checkpoint below.
vc_model: Module = build_vc_model(
    base=VC_BASE,
    pretrained=False,
    n_classes=VC_CLASSES,
)
print('Creating model ...')
# The checkpoint is passed straight to load_state_dict, so it must be a state
# dict whose keys match VCModel's; tensors are mapped onto `device`.
vc_model.load_state_dict(torch.load(VC_WEIGHTS_FILE, map_location=device))
vc_model = vc_model.to(device)
# Switch to evaluation mode; as the cell's last expression this also displays the model.
vc_model.eval()

In [None]:
# Build the submission in long format: three rows per test image
# (grapheme root, vowel diacritic, consonant diacritic), in that order.
row_id, target = [], []
for fname in TEST:
    ds = GraphemeDataset(fname, transform=transform_test)
    dl = DataLoader(ds, batch_size=256, shuffle=False)
    with torch.no_grad():
        for x, names in tqdm(dl):
            x = x.to(device, dtype=torch.float)
            pred_g = grapheme_model(x)
            pred_v, pred_c = vc_model(x)

            # Softmax is monotonic, so the predicted class is simply the
            # argmax of the logits; `.tolist()` yields plain Python ints.
            labels_g = pred_g.argmax(dim=1).tolist()
            labels_v = pred_v.argmax(dim=1).tolist()
            labels_c = pred_c.argmax(dim=1).tolist()

            for name, label_g, label_v, label_c in zip(names, labels_g, labels_v, labels_c):
                row_id.extend((f'{name}_grapheme_root',
                               f'{name}_vowel_diacritic',
                               f'{name}_consonant_diacritic'))
                target.extend((label_g, label_v, label_c))

In [None]:
# Assemble the collected predictions and write them to submission.csv in the
# working directory, without the DataFrame index column.
sub_df = pd.DataFrame({'row_id': row_id, 'target': target})
sub_df.to_csv('submission.csv', index=False)
# Show the first rows as a quick sanity check.
sub_df.head()