In [None]:
# !pip install timm
# !pip install opencv-python
# !pip install albumentations

In [None]:
# !pip install -U efficientnet_pytorch --no-cache

In [None]:
import os
import glob
import gc
gc.enable()
import multiprocessing
import cv2
import copy
import time
import random
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import base64
import pickle
import urllib
from urllib import request
from urllib.request import urlopen
import uuid

# fold
from sklearn.model_selection import StratifiedKFold

# For downloading images
from io import BytesIO

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision


# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Image Models
import timm

# For Transformer Models
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel, BertConfig

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
r_ = Fore.RED
b_ = Fore.BLUE
c_ = Fore.CYAN
g_ = Fore.GREEN
y_ = Fore.YELLOW
m_ = Fore.MAGENTA
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
from efficientnet_pytorch import EfficientNet

In [None]:
# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
# The bare `device` expression at the end displays the choice as the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Root directory for per-fold checkpoints, tokenizer files and exported text embeddings.
saved_model_path = './saved_models/version 1.0/'
# Directory holding the competition input files (the test caption list is read from here).
dataset_path = '../input/wikipedia-image-caption/'

In [None]:
def optimal_num_of_loader_workers():
    """Pick a DataLoader worker count from the visible CPU and GPU counts.

    Returns:
        int: ``min(cpu_count, 4 * gpu_count)`` when at least one CUDA device is
        visible, otherwise ``cpu_count - 1`` (which is 0 on a single-core machine).
    """
    cpu_total = multiprocessing.cpu_count()
    gpu_total = torch.cuda.device_count()
    if gpu_total:
        # Four workers per GPU, capped by the number of CPU cores.
        return min(cpu_total, gpu_total * 4)
    # CPU-only: leave one core free for the main process.
    return cpu_total - 1

In [None]:
# Central run configuration read by the dataset, model, optimizer and training cells.
CONFIG = {
    "seed": 2021,                 # passed to set_seed() and to the fold splitter
    "epochs": 5,#20,
    'nfolds':5,                   # number of cross-validation folds
    
    "img_size": 600,              # images are resized to img_size x img_size
    "text_model_name": "xlm-roberta-base",
    
    # NOTE(review): "embedding_size" is not read anywhere in the visible notebook — confirm it is still needed.
    "embedding_size": 256,
    "train_batch_size": 4,
    "valid_batch_size": 4,
    "learning_rate": 1e-4,        # default AdamW learning rate (the text model group overrides it)
    # fetch_scheduler() understands 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts' or None.
    # NOTE(review): 'CosineAnnealingWarmRestarts' reads CONFIG['T_0'], which is not defined here.
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,               # eta_min for the cosine schedulers
    'num_workers':optimal_num_of_loader_workers(),
    
    "T_max": 500,                 # T_max for CosineAnnealingLR
    "weight_decay": 1e-6,
    "max_length": 32,             # tokenizer truncation / padding length
    
    # NOTE(review): "n_accumulate" is not read anywhere in the visible notebook.
    "n_accumulate": 1,
}

# The tokenizer is stored alongside the hyper-parameters so every cell shares one instance.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['text_model_name'])

In [None]:
# Display the resolved configuration (including the tokenizer object) as the cell output.
CONFIG

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module (used by the dataset for caption choice and
    retry indices), NumPy, and PyTorch on CPU and on all CUDA devices, and puts
    cuDNN into deterministic mode.

    Args:
        seed (int): Seed value applied to all generators.
    '''
    # The dataset draws from `random`, so it must be seeded along with NumPy and torch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. This only affects Python interpreters
    # started after this point (child processes), not the one already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the configured seed before any data shuffling or model construction below.
set_seed(CONFIG['seed'])

In [None]:
# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps the row order of
# the concatenated frame stable, so the seeded shuffle that follows is reproducible.
train_feathers = sorted(glob.glob('./data/train_feather_files/' + 'train*'))
print(train_feathers)

In [None]:
# Read every feather shard, then concatenate once. Growing a DataFrame with pd.concat inside
# a loop re-copies all previously loaded rows on every iteration.
shards = [pd.read_feather(file) for file in train_feathers]
# pd.concat raises on an empty list, so fall back to an empty frame when no shard was found.
train_df = pd.concat(shards) if shards else pd.DataFrame()
del shards

print("Before removing duplicate rows:",train_df.shape)
train_df = train_df.drop_duplicates() #Drop duplicate rows if any
print("After removing duplicate rows:",train_df.shape)

# Shuffle all rows and rebuild a clean 0..n-1 index (the dataset indexes rows with .at).
train_df = train_df.sample(frac = 1).reset_index(drop = True)
print("Null:", train_df.isnull().any().any())

train_df.head()

In [None]:
# Number of distinct languages, followed by the distinct language codes themselves.
print(len(train_df['language'].unique()), train_df['language'].unique())

In [None]:
# Row count per language; the same column is later used as the stratification label.
print(train_df['language'].value_counts())

In [None]:
# Total number of training rows after de-duplication.
len(train_df)

In [None]:
# Peek at one image URL to see what the downloader will be given.
train_df.at[0, 'image_url']

In [None]:
def url_to_image(img_url, file_name, timeout=30):
    """Download an image and return it as an RGB PIL image, or ``None`` on failure.

    The payload is decoded straight from memory. No temporary file is written, so
    nothing is left on disk when decoding fails and no download directory has to exist.

    Args:
        img_url (str): URL of the image to fetch.
        file_name (str): Unused; kept so existing callers keep working.
        timeout (float): Seconds to wait on the network before giving up.

    Returns:
        PIL.Image.Image | None: The decoded image converted to RGB, or ``None`` if
        the request or the decoding failed for any reason.
    """
    try:
        req = request.Request(img_url)
        req.add_header('User-Agent', 'abc-bot')
        # The context manager closes the connection even if read() raises.
        with request.urlopen(req, timeout=timeout) as response:
            payload = response.read()
        # convert() forces a full decode, so the result no longer depends on the buffer.
        return Image.open(BytesIO(payload)).convert("RGB")
    except Exception:
        # Best-effort download: callers treat None as "pick another sample".
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None

In [None]:
# Albumentations pipelines keyed by split name.
#
# Ordering matters: the colour augmentations (HueSaturationValue, RandomBrightnessContrast)
# expect ordinary image data (uint8, or float32 in [0, 1]). They therefore run BEFORE
# A.Normalize, which turns the image into mean/std-normalized floats. Normalize and the
# tensor conversion are always the last two steps.
data_transforms = {
    "train": A.Compose([
            A.Resize(CONFIG['img_size'], CONFIG['img_size']),
            # Geometric augmentations.
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.Rotate(limit=180, p=0.7),
            A.ShiftScaleRotate(
                shift_limit = 0.1, scale_limit=0.1, rotate_limit=45, p=0.5
            ),
            # Colour augmentations, applied while the image is still un-normalized.
            # NOTE(review): the 0.2 shift limits look very small for uint8 input — confirm
            # the intended magnitude against the albumentations documentation.
            A.HueSaturationValue(
                hue_shift_limit=0.2, sat_shift_limit=0.2,
                val_shift_limit=0.2, p=0.5
            ),
            A.RandomBrightnessContrast(
                brightness_limit=(-0.1, 0.1),
                contrast_limit=(-0.1, 0.1), p=0.5
            ),
            # Mean/std normalization goes last so it sees the final pixel values.
            A.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
                max_pixel_value=255.0, 
                p=1.0
            ),
            ToTensorV2(p=1.0),
        ]),
    
    # Validation: deterministic resize + normalize only.
    "valid": A.Compose([
            A.Resize(CONFIG['img_size'], CONFIG['img_size']),
            A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
            ToTensorV2(p=1.0),
        ])
}

In [None]:
class WikipediaDataset(Dataset):
    """Image/caption pairs whose images are downloaded on demand.

    Each item downloads the row's ``image_url``, picks one caption at random from
    ``caption_title_and_reference_description`` and tokenizes it.

    Args:
        data (pd.DataFrame): Rows with ``image_url``, ``page_title`` and
            ``caption_title_and_reference_description`` columns.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_length (int): Truncation / padding length for the caption tokens.
        transforms: Optional callable invoked as ``transforms(image=ndarray)`` whose
            result is indexed with ``["image"]``.
    """

    # Upper bound on random re-draws when an image cannot be downloaded. Without it a
    # dead network (or a frame full of broken URLs) would hang the worker forever.
    MAX_FETCH_RETRIES = 100

    def __init__(self, data, tokenizer, max_length, transforms=None):
        # Positional 0..n-1 index so that .at[index, column] matches DataLoader indices.
        self.data = data.reset_index(drop=True)
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.transforms = transforms
        
    def __len__(self):
        return len(self.data)

    def _fetch_image(self, index):
        """Return ``(image, index)`` for ``index``, re-drawing a random row on download failure.

        Raises:
            RuntimeError: If ``MAX_FETCH_RETRIES`` consecutive re-draws also fail.
        """
        img = url_to_image(self.data.at[index, "image_url"], self.data.at[index, "page_title"])
        attempts = 0
        while img is None:
            if attempts >= self.MAX_FETCH_RETRIES:
                raise RuntimeError(
                    f"Could not download any image after {self.MAX_FETCH_RETRIES} retries"
                )
            attempts += 1
            index = random.randint(0, len(self.data)-1)
            img = url_to_image(self.data.at[index, "image_url"], self.data.at[index, "page_title"])
        return img, index
    
    def __getitem__(self, index):
        """Return a dict with ``ids``, ``mask`` (long tensors) and ``image``.

        If the requested row's image cannot be downloaded, a randomly chosen row is
        used instead, so the returned sample may not correspond to ``index``.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        # `index` may change here: the caption must come from the row that supplied the image.
        img, index = self._fetch_image(index)
        img = np.array(img)

        caption = random.choice(self.data.at[index, "caption_title_and_reference_description"])
        caption = caption.replace("[SEP]", "</s>") # sep token for xlm-roberta
        inputs = self.tokenizer.encode_plus(
                caption,
                truncation=True,
                add_special_tokens=True,
                max_length=self.max_len,
                padding='max_length'
            )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        if self.transforms:
            img = self.transforms(image=img)["image"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'image': img
        }

In [None]:
class ImageFeatureExtractor(nn.Module):
    """EfficientNet-B7 backbone used as a headless image encoder.

    The classifier (``_fc``) and ``_dropout`` layers of the pretrained network are
    removed; ``forward`` returns the globally pooled feature maps as flat vectors.
    """

    def __init__(self):
        super().__init__()
        backbone = EfficientNet.from_pretrained('efficientnet-b7')
        # Only the convolutional trunk and the pooling layer are needed.
        del backbone._fc, backbone._dropout
        self.model = backbone
        # Declared width of the returned vectors (presumably B7's final channel count).
        self.n_feature_vector_size = 2560

    def forward(self, inputs):
        """Encode a batch of images into one flat feature vector per image."""
        feature_maps = self.model.extract_features(inputs)
        pooled = self.model._avg_pooling(feature_maps)
        # Collapse everything after the batch dimension.
        return pooled.flatten(start_dim=1)
    
# h = ImageFeatureExtractor().to(device)
# print(h(torch.randn(1,3,600,600).to(device)).shape)
# del h
# gc.collect()

In [None]:
class TextExtractorModel(nn.Module):
    """Transformer text encoder followed by an MLP that projects to 2560 dimensions.

    Args:
        text_model (str): Name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, text_model):
        super().__init__()
        self.text_model = AutoModel.from_pretrained(text_model)
        # Read the encoder width from its config instead of hard-coding 768, so a
        # different checkpoint (e.g. a "large" variant) does not break the first Linear.
        hidden_size = self.text_model.config.hidden_size
        self.text_fc = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.Dropout(p=0.2),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Linear(1024, 2560),
            nn.BatchNorm1d(2560)
        )
        # apply() visits every sub-module. Passing the Sequential itself to init_weights
        # would never match nn.Linear, leaving the layers at their default init.
        self.text_fc.apply(self.init_weights)
        
    def init_weights(self, m):
        """Xavier-initialize the weight and zero the bias of ``m`` if it is an ``nn.Linear``."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                m.bias.data.fill_(0)
            
    def forward(self, ids, mask):
        """Return the projected text embedding for a batch of token ids / attention masks."""
        # Element [1] of the encoder output is used as the sentence representation
        # (presumably the pooled output for BERT-style models — confirm for other encoders).
        out = self.text_model(input_ids=ids, attention_mask=mask, output_hidden_states=False)[1]
        text_embeddings = self.text_fc(out)
        return text_embeddings
    

In [None]:
class Loss(nn.Module):
    """Weighted sum of RMSE and cosine-embedding loss between two embedding batches.

    ``loss = 0.75 * sqrt(MSE + eps) + 0.25 * CosineEmbeddingLoss(target=+1)``

    Args:
        eps (float): Added inside the square root so the RMSE gradient stays finite
            when the MSE is exactly zero.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.cosine = nn.CosineEmbeddingLoss()
        self.eps = eps
        
    def forward(self, inputs, targets):
        """Return the scalar loss for two same-shaped ``(batch, dim)`` tensors."""
        # Every pair is a positive pair (label +1). Build the labels on the inputs' own
        # device rather than relying on a notebook-level `device` global.
        positive = torch.ones(inputs.size(0), device=inputs.device)
        c_loss = self.cosine(inputs, targets, positive)
        m_loss = torch.sqrt(self.mse(inputs, targets) + self.eps)
        return 0.75*m_loss + 0.25*c_loss

In [None]:
def fetch_scheduler(optimizer):
    """Build the learning-rate scheduler selected by ``CONFIG['scheduler']``.

    Args:
        optimizer: Optimizer whose learning rate will be scheduled.

    Returns:
        The scheduler instance, or ``None`` when ``CONFIG['scheduler']`` is ``None``.

    Raises:
        ValueError: If ``CONFIG['scheduler']`` names an unsupported scheduler.
        KeyError: If a key required by the chosen scheduler (``T_max`` / ``T_0`` /
            ``min_lr``) is missing from ``CONFIG``.
    """
    name = CONFIG['scheduler']
    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'], eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CONFIG['T_0'], eta_min=CONFIG['min_lr'])
    # Fail loudly on an unknown name instead of hitting an unbound local variable.
    raise ValueError(
        f"Unsupported scheduler {name!r}; expected 'CosineAnnealingLR', "
        "'CosineAnnealingWarmRestarts' or None"
    )

In [None]:
# Create one checkpoint directory per fold up front (no error if it already exists).
for fold_idx in range(CONFIG['nfolds']):
    fold_dir = os.path.join(saved_model_path, f'model{fold_idx}')
    os.makedirs(fold_dir, exist_ok=True)

In [None]:
# Best validation loss per fold, filled in by the training loop.
results = {}

# K-fold Cross Validation model evaluation
print('Total number of folds:', CONFIG['nfolds'])
print('-'*50)

# Stratify on the language column so each fold keeps the overall language mix.
kfold = StratifiedKFold(
    n_splits=CONFIG['nfolds'],
    shuffle=True,
    random_state=CONFIG['seed'],
)
fold_splits = kfold.split(X=train_df, y=train_df['language'])

# Tag every row with the fold in which it serves as validation data.
for k, (train_idx, valid_idx) in enumerate(fold_splits):
    train_df.loc[valid_idx, 'Fold'] = k


# for fold, (train_ids, test_ids) in enumerate(kfold.split(train_dataset)):
# Train one image encoder + text encoder pair per fold and keep the best checkpoint of each.
for fold in range(CONFIG['nfolds']):
    # NOTE(review): fold 0 is skipped here, yet the embedding-export cell loads weights for
    # every fold including 0 — presumably fold 0 was trained in an earlier session; confirm.
    if fold==0:
        continue
    print('FOLD', fold)
    print('-'*50)

    # Rows tagged with this fold are held out for validation; all others are used for training.
    x_train, x_valid = train_df.query(f"Fold != {fold}"), train_df.query(f"Fold == {fold}")
    train_dataset = WikipediaDataset(x_train, CONFIG["tokenizer"], CONFIG["max_length"], transforms=data_transforms["train"])
    valid_dataset = WikipediaDataset(x_valid, CONFIG["tokenizer"], CONFIG["max_length"], transforms=data_transforms["valid"])
    trainloader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], num_workers=CONFIG['num_workers'], 
                             shuffle=True, pin_memory=True, drop_last=True)
    testloader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], num_workers=CONFIG['num_workers'], 
                            shuffle=False, pin_memory=True)
    
    # Fresh models for every fold.
    image_model = ImageFeatureExtractor().to(device)
    text_model = TextExtractorModel(CONFIG['text_model_name']).to(device)
    
    criterion = Loss().to(device)
    # Two parameter groups: the image model uses the default learning rate, the text model 1e-5.
    optimizer = optim.AdamW([
                {'params': image_model.parameters()},
                {'params': text_model.parameters(), 'lr':1e-5}
            ], lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'], betas=(0.9, 0.999), amsgrad=True)
    scheduler = fetch_scheduler(optimizer)
    
    # Starting threshold; any mean validation loss at or below it triggers a checkpoint.
    best_val_loss = 1000
    
    #### TRAINING & EVALUATION  ####
    for epoch in range(CONFIG['epochs']):
        #### TRAINING ####
        image_model.train(); text_model.train();
        current_loss = 0.0
        t = tqdm(enumerate(trainloader), total=len(trainloader), desc="Train: ")
        for batch_id, data in t:
            optimizer.zero_grad()
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            images = data['image'].to(device, dtype=torch.float)
            
            # The loss pulls each image embedding towards its caption embedding.
            image_outputs = image_model(images)
            text_outputs = text_model(ids, mask)
            loss = criterion(image_outputs, text_outputs)
            loss.backward()
            optimizer.step()
            current_loss += float(loss.item())
            # Progress bar shows the running mean loss over the batches seen so far.
            t.set_postfix_str('Training Loss='+str(round(current_loss/(batch_id+1), 4)))
            
            # The scheduler is stepped once per batch, not once per epoch.
            if scheduler is not None:
                scheduler.step()
        
        #### EVALUATION ####
        # Remove any image files left behind in the download directory during training.
        files = glob.glob('./data/train_images/*')
        for f in files:
            os.remove(f)
        with torch.no_grad():
            image_model.eval(); text_model.eval();
            current_loss = 0.0
            t = tqdm(enumerate(testloader), total=len(testloader), desc="Val: ")
            for batch_id, data in t:
                ids = data['ids'].to(device, dtype = torch.long)
                mask = data['mask'].to(device, dtype = torch.long)
                images = data['image'].to(device, dtype=torch.float)

                image_outputs = image_model(images)
                text_outputs = text_model(ids, mask)
                loss = criterion(image_outputs, text_outputs)
                current_loss += float(loss.item())
                t.set_postfix_str('Val Loss='+str(round(current_loss/(batch_id+1), 4)))
                
            # Checkpoint both encoders (and the tokenizer) whenever the mean validation loss
            # matches or beats the best value seen for this fold.
            if (current_loss/len(testloader))<=best_val_loss:
                print(f'loss has been decreased from {best_val_loss} to {(current_loss/len(testloader))}')
                best_val_loss = (current_loss/len(testloader))
                results[fold] = best_val_loss
                torch.save(image_model.state_dict(), os.path.join(saved_model_path, f'model{fold}/image_model{fold}.bin'))
                torch.save(text_model.state_dict(), os.path.join(saved_model_path, f'model{fold}/text_model{fold}.bin'))
                CONFIG["tokenizer"].save_pretrained(os.path.join(saved_model_path, f'model{fold}'))
                
    # Drop the references to this fold's models first: empty_cache() can only hand back GPU
    # memory that is no longer owned by a live tensor, so it must run after del + gc.
    del image_model, text_model, trainloader, testloader, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
import scipy.io as sio

# Load the test captions and swap the dataset's "[SEP]" marker for the XLM-R separator token,
# matching the preprocessing done in WikipediaDataset.
captions_df = pd.read_csv(os.path.join(dataset_path, 'test_caption_list.csv'))
captions = captions_df['caption_title_and_reference_description'].tolist()  
captions = [caption.replace("[SEP]", "</s>") for caption in captions]

# For every fold: restore that fold's text encoder, embed all captions one at a time and
# save the resulting (n_captions, embedding_dim) matrix as a .mat file next to the checkpoint.
# NOTE(review): weights are loaded for every fold, including fold 0, which the training loop
# above skips — model0/text_model0.bin must already exist on disk.
for fold in range(CONFIG['nfolds']):
    caption_tokens = []
    text_model = TextExtractorModel(CONFIG['text_model_name']).to(device)
    weights_path = os.path.join(saved_model_path, f'model{fold}/text_model{fold}.bin')
    # map_location lets checkpoints saved on a GPU be restored on a CPU-only machine.
    text_model.load_state_dict(torch.load(weights_path, map_location=device))
    with torch.no_grad():
        text_model.eval()
        for caption in captions:
            inputs = CONFIG["tokenizer"].encode_plus(
                        caption,
                        truncation=True,
                        add_special_tokens=True,
                        max_length=CONFIG["max_length"],
                        padding='max_length'
                    )
            # unsqueeze(0) adds the batch dimension expected by the model.
            ids = torch.tensor(inputs['input_ids'], dtype=torch.long).to(device).unsqueeze(0)
            mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).to(device).unsqueeze(0)
            text_features = text_model(ids, mask)
            text_features = text_features.cpu().detach().numpy()
            caption_tokens.append(text_features.squeeze())
    # Stack once and reuse the array for both the log line and the export.
    text_embeddings = np.array(caption_tokens)
    print('Fold:', fold, text_embeddings.shape)
    sio.savemat(os.path.join(saved_model_path, f'model{fold}/text_embeddings{fold}.mat'), 
                {'text_embeddings': text_embeddings})
    del caption_tokens, text_embeddings, text_model
    gc.collect()