## Set up TPU
Note: I get a bug when I try to use a TPU-trained model on a GPU or CPU.

In [None]:
import os
# Use .get() so a missing variable triggers the helpful assertion message below
# instead of an opaque KeyError.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [None]:
VERSION = "nightly"  #@param ["1.5" , "20200516", "nightly"]
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  4139  100  4139    0     0  54460      0 --:--:-- --:--:-- --:--:-- 55186
Updating TPU and VM. This may take around 2 minutes.
Updating TPU runtime to pytorch-nightly ...
Collecting cloud-tpu-client
  Downloading https://files.pythonhosted.org/packages/56/9f/7b1958c2886db06feb5de5b2c191096f9e619914b6c31fdf93999fdbbd8b/cloud_tpu_client-0.10-py3-none-any.whl
Collecting google-api-python-client==1.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/9a/b4/a955f393b838bc47cbb6ae4643b9d0f90333d3b4db4dc1e819f36aad18cc/google_api_python_client-1.8.0-py3-none-any.whl (57kB)
[K     |████████████████████████████████| 61kB 2.5MB/s 
Uninstalling torch-1.5.1+cu101:
Installing collected packages: google-api-python-client, cloud-tpu-client
  Found ex

In [None]:
import torch_xla
import torch_xla.core.xla_model as xm #handles most of the basic tasks
import torch_xla.debug.metrics as met
import torch_xla.distributed.parallel_loader as pl #handles dataloading on multiple processes
import torch_xla.distributed.xla_multiprocessing as xmp

## Pre-Config

In [None]:
import os
from pathlib import Path
from google.colab import drive
import shutil

def create_path(path):
    """Make sure the directory ``path`` exists and return it as a ``Path``.

    Args:
        path: a ``pathlib.Path`` or a plain string naming the directory.

    Returns:
        ``pathlib.Path`` pointing at the (now existing) directory.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    # Coerce so callers may pass plain strings as well as Path objects.
    path = Path(path)
    # exist_ok=True already covers the "directory is there" case, so no
    # separate isdir() check is needed.
    path.mkdir(parents=True, exist_ok=True)
    return path

colab_path = Path('/content')

In [None]:
create_path(colab_path/'dataset');
create_path(colab_path/'models');

!git clone --quiet 'https://github.com/tezike/download_google_drive.git'
os.chdir('download_google_drive')
!python download_gdrive.py '10rH0nAxa7mWS289xIyRP-mOOowqiIolL' '../dataset/temp.tgz'
shutil.rmtree('../download_google_drive')
os.chdir('..')

In [None]:
!pip install bcolz transformers -q

[K     |████████████████████████████████| 1.5MB 3.4MB/s 
[K     |████████████████████████████████| 757kB 23.2MB/s 
[K     |████████████████████████████████| 1.1MB 34.3MB/s 
[K     |████████████████████████████████| 890kB 37.1MB/s 
[K     |████████████████████████████████| 3.0MB 47.7MB/s 
[?25h  Building wheel for bcolz (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


## Colab_setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')
root_dir = Path('/content/drive/My Drive')
base_path = create_path(root_dir/'Rakuten')
base_path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


PosixPath('/content/drive/My Drive/Rakuten')

## Imports

In [None]:
import os
import PIL
import bcolz
import shutil
import pandas as pd
import numpy as np
import albumentations as A

from tqdm.notebook import tqdm

from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import SequentialSampler, DataLoader
import torchvision

import transformers

from fastai.vision.data import get_image_files

from sklearn.model_selection import train_test_split

## Config

In [None]:
class Config():
    """Bundle of paths, Hugging Face handles and hyper-parameters used across the notebook.

    Instantiating this class has side effects: it creates the ``dataset`` and
    ``models`` directories under ``base_path`` (via ``create_path``) and calls
    ``from_pretrained`` for the CamemBERT tokenizer and model config.
    """
    def __init__(self):
        # Local Colab working directory and the Drive-backed project directory.
        self.COLAB_PATH = Path('/content/')
        self.BASE_PATH = base_path
        self.DATA_PATH = create_path(base_path/'dataset')
        self.MODEL_PATH = create_path(base_path/'models')
        # Locations inside the extracted SIGIR-2020-EComDC-release folder.
        self.IMAGE_TRAIN_PATH = self.COLAB_PATH/'SIGIR-2020-EComDC-release/image/image_training'
        self.IMAGE_TEST_PATH = self.COLAB_PATH/'SIGIR-2020-EComDC-release/image/image_test'
        self.TEST_FILE = self.COLAB_PATH/'SIGIR-2020-EComDC-release/data/x_test_task1_phase1.tsv'
        self.CLEAN_DF = self.DATA_PATH/'clean_folds_df.csv'
        
        self.MODEL_NAME = 'camembert-base'
        # self.LM_MODEL = transformers.CamembertForMaskedLM.from_pretrained(self.MODEL_NAME)
        # Stored as a class (not an instance); TextNoClasModel instantiates it.
        self.CLAS_MODEL = transformers.CamembertModel #.from_pretrained(MODEL_NAME)
        self.TOKENIZER = transformers.CamembertTokenizer.from_pretrained(
                    pretrained_model_name_or_path=self.MODEL_NAME,
                    do_lower_case=True,
                    )
        self.MODEL_CONFIG = transformers.CamembertConfig.from_pretrained(self.MODEL_NAME)
        # Token sequences are truncated/padded to this length in TextDataset.
        self.MAX_SEQ_LEN = 256
        # Batch size used by the feature-precompute dataloaders.
        self.BATCH_SIZE = 64
        # NOTE(review): TRAIN_BS / VALID_BS / lr are not referenced in the visible part of the notebook.
        self.TRAIN_BS = 16
        self.VALID_BS = 16
        self.NUM_EPOCHS = 4
        # Side length (pixels) images are resized to when ImageDataset(resize=True).
        self.sz = 400
        # Per-channel mean/std passed to albumentations Normalize
        # (presumably the usual ImageNet statistics - confirm).
        self.MEAN = (0.485, 0.456, 0.406)
        self.STD = (0.229, 0.224, 0.225)
        self.lr = 7e-05

config = Config()

In [None]:
device = xm.xla_device()

## Prep Data

In [None]:
!tar -xzf 'dataset/temp.tgz'

## Utils

In [None]:
class AverageMeter:
    """Keeps the most recent value together with a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero out the latest value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Register ``val`` as having been observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

## Fasttext

If running for the first time, uncomment all the cells below and run them; otherwise run only the cell that is already uncommented.

In [None]:
# !git clone https://github.com/facebookresearch/fastText.git
# os.chdir('fastText')
# !pip install '.' -q

Cloning into 'fastText'...
remote: Enumerating objects: 3840, done.[K
remote: Total 3840 (delta 0), reused 0 (delta 0), pack-reused 3840[K
Receiving objects: 100% (3840/3840), 8.21 MiB | 33.77 MiB/s, done.
Resolving deltas: 100% (2412/2412), done.
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone


In [None]:
# os.chdir(config.COLAB_PATH)
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz -q
# # os.chdir(path)

In [None]:
# !gunzip $config.COLAB_PATH/'cc.en.300.bin.gz'

In [None]:
# import fasttext as ft
# en_vecs = ft.load_model(str((config.COLAB_PATH/'cc.en.300.bin')))

In [None]:
# categories = df_all.Prdlbl

In [None]:
# vecs = []
# for n in categories:
#     vecs.append(en_vecs.get_sentence_vector(str(n)))

In [None]:
# all_label_vecs = np.save(str(config.DATA_PATH/'all_label_vecs.npy'), vecs)

In [None]:
all_label_vecs = np.load(str(config.DATA_PATH/'all_label_vecs.npy'))

## Image and text pretrained classifier models (can be skipped)
I'll remove their heads and load their pretrained dicts

In [None]:
#export
class ImageNoClasModel(nn.Module):
    """ResNet-50 with its last child module removed, used as a feature extractor.

    Args:
        pretrained: forwarded to ``torchvision.models.resnet50``.
    """

    def __init__(self, pretrained=False):
        super().__init__()
        backbone = torchvision.models.resnet50(pretrained=pretrained)
        # Drop the final child of the backbone and chain the remaining ones.
        kept_layers = list(backbone.children())
        kept_layers.pop()
        self.model = nn.Sequential(*kept_layers)

    def forward(self, x):
        """Run ``x`` through the truncated backbone and return its output."""
        return self.model(x)

In [None]:
# export
class TextNoClasModel(nn.Module):
    """CamemBERT encoder without a classification head.

    ``forward`` returns the mean-pooled and max-pooled token embeddings
    concatenated along the feature axis, with dropout applied.

    Args:
        pretrained: when True, load weights with
            ``config.CLAS_MODEL.from_pretrained``; otherwise only build the
            architecture from ``config.MODEL_CONFIG``.
        num_classes: unused here; kept so existing callers keep working.
    """
    def __init__(self, pretrained=True, num_classes=27):
        super(TextNoClasModel, self).__init__()
        if pretrained:
            self.model = config.CLAS_MODEL.from_pretrained(config.MODEL_NAME, config=config.MODEL_CONFIG)
        else:
            self.model = config.CLAS_MODEL(config.MODEL_CONFIG)

        self.drop = nn.Dropout(0.4)

    def forward(self, ids, mask):
        """Encode token ids and pool over the sequence axis.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor made of ``[mean_pool, max_pool]`` concatenated on dim 1.
        """
        # Index the first output rather than tuple-unpacking: unpacking only
        # works when the encoder returns exactly two items, and iterating a
        # dict-like output object yields keys rather than tensors. Indexing
        # with [0] works for both plain tuples and such output objects.
        h_0 = self.model(ids, attention_mask=mask)[0]

        mean_pool = torch.mean(h_0, 1)
        max_pool = torch.max(h_0, 1)[0]

        out = torch.cat([mean_pool, max_pool], 1)
        return self.drop(out)

In [None]:
def load_model_dict(model, dict_path, map_location='cpu'):
    """Load the entries of a saved state dict that ``model`` also has.

    Keys present in the checkpoint but absent from ``model`` (e.g. a removed
    classification head) are ignored; keys of ``model`` missing from the
    checkpoint keep their current values.

    Args:
        model: ``nn.Module`` to update in place.
        dict_path: path of a file written with ``torch.save`` / ``xm.save``.
        map_location: forwarded to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint can be read regardless of the device it was saved
            from; ``load_state_dict`` then copies values into the model's
            own parameters.
    """
    model_dict = model.state_dict()
    pretrained_dict = torch.load(dict_path, map_location=map_location)
    needed_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    model_dict.update(needed_dict)
    model.load_state_dict(model_dict)
    print('Loaded successfully!!')

In [None]:
image_model = ImageNoClasModel(pretrained=False)
load_model_dict(image_model, config.MODEL_PATH/'model_resnet50.bin')
text_model = TextNoClasModel(pretrained=False)
load_model_dict(text_model, config.MODEL_PATH/f'torch_xla_pretrained_camembert_fold1.bin')

Loaded successfully!!
Loaded successfully!!


## Get dataloaders for images and text (can be skipped)

In [None]:
#export
def onehot(size, target):
    """Return a float32 vector of length ``size`` with 1.0 at index ``target``."""
    encoded = np.zeros(size, np.float32)
    encoded[target] = 1.0
    return encoded

class ImageDataset(torch.utils.data.Dataset):
    """Dataset yielding normalised image tensors and one-hot targets.

    Args:
        img_paths: indexable collection of image file paths.
        lbls: indexable collection of integer class labels, aligned with
            ``img_paths``.
        augs: when True, apply the albumentations pipeline (train or valid).
        resize: when True, resize every image to ``config.sz`` x ``config.sz``.
        is_train: selects the train pipeline (True) or valid pipeline (False).
        num_classes: length of the one-hot target vector (default 27).
    """
    def __init__(self, img_paths, lbls, augs=True, resize=False, is_train=True, num_classes=27):

        self.img_paths = img_paths
        self.lbls = lbls
        self.augs = augs
        self.is_train = is_train
        self.resize = resize
        self.num_classes = num_classes

    def __getitem__(self, i):
        """Return ``{'image': CxHxW float tensor, 'target': one-hot float tensor}``."""
        img_path = self.img_paths[i]
        lbl = self.lbls[i]

        # Always decode to 3-channel RGB: grayscale / palette / RGBA files
        # would otherwise break the 3-channel Normalize and permute(2, 0, 1).
        # The context manager closes the underlying file handle promptly.
        with PIL.Image.open(img_path) as raw:
            img = raw.convert('RGB')

        if self.resize:
            img = img.resize((config.sz, config.sz), resample=PIL.Image.LANCZOS)

        img = np.asarray(img).astype(np.uint8)

        if self.augs:
            if self.is_train:
                pipeline = self.get_train_transforms()
            else:
                pipeline = self.get_valid_transforms()
            img = pipeline(image=img)['image']

        lbl = onehot(self.num_classes, lbl)

        return {
            'image' : torch.tensor(img, dtype=torch.float).permute(2, 0, 1),
            'target' : torch.tensor(lbl, dtype=torch.float)
            }

    def __len__(self):
        return len(self.lbls)

    def get_train_transforms(self):
        """Normalisation followed by random 90-degree rotations and flips."""
        return A.Compose([
                A.Normalize(config.MEAN, config.STD, max_pixel_value=255.0, always_apply=True),
                A.RandomRotate90(p=0.5),
                A.HorizontalFlip(p=0.5),
                A.VerticalFlip(p=0.5),
            ], p=1.0)

    def get_valid_transforms(self):
        """Normalisation only (no random augmentation)."""
        return A.Compose([
                A.Normalize(config.MEAN, config.STD, max_pixel_value=255.0, always_apply=True),
            ], p=1.0)

    def get_labels(self):
        """Return the labels as a plain list."""
        return list(self.lbls)

In [None]:
# export
class TextDataset(torch.utils.data.Dataset):
    """Tokenises raw strings into fixed-length id/mask tensors plus a float target.

    The tokenizer and the maximum sequence length are read from ``config``.
    """

    def __init__(self, text, label):
        self.text = text
        self.label = label
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_SEQ_LEN

    def __len__(self):
        return len(self.label)

    def __getitem__(self, i):
        # Collapse any run of whitespace into single spaces.
        cleaned = ' '.join(self.text[i].split())

        encoded = self.tokenizer.encode_plus(
            cleaned, None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
        )
        token_ids, attention = encoded['input_ids'], encoded['attention_mask']

        # Right-pad ids and mask with zeros up to max_len.
        # NOTE(review): ids are padded with 0 rather than the tokenizer's pad id - confirm intended.
        filler = [0] * (self.max_len - len(token_ids))

        return {
            'ids': torch.tensor(token_ids + filler, dtype=torch.long),
            'mask': torch.tensor(attention + filler, dtype=torch.long),
            'targets': torch.tensor(self.label[i], dtype=torch.float)
        }

In [None]:
df_all = pd.read_csv(config.DATA_PATH/'df_all.csv').fillna(' ')
df_all['Prdlbl'] = 0

image_ds = ImageDataset(df_all.image_path, df_all.Prdlbl, resize=True, is_train=False)

image_dl = torch.utils.data.DataLoader(image_ds, batch_size=config.BATCH_SIZE, drop_last=False,
                                        num_workers=0, shuffle=False)

In [None]:
text_ds = TextDataset((df_all.Title + 'xxfld' + df_all.Description).values, df_all.Prdlbl.values)

text_dl = torch.utils.data.DataLoader(text_ds, batch_size=config.BATCH_SIZE, drop_last=False,
                                        num_workers=0, shuffle=False)

## Precompute image and text vectors for train (can be skipped)

In [None]:
def precompute_image_vectors(image_dl, model, path, force):
    """Run ``model`` over ``image_dl`` and store the outputs in a bcolz array on disk.

    Args:
        image_dl: dataloader yielding dicts with an ``'image'`` tensor.
        model: feature extractor; its output is squeezed on the last two axes
            and must then have 2048 columns.
        path: bcolz ``rootdir`` to write to.
        force: when False and ``path`` already exists, nothing is computed.
    """
    model.eval()
    if os.path.exists(path) and not force:
        return
    arr = bcolz.carray(np.zeros((0, 2048), np.float32), chunklen=1, mode='w', rootdir=path)

    # Moving the model is loop-invariant, so do it once instead of per batch.
    model = model.to(device)

    with torch.no_grad():
        for batch in tqdm(image_dl):
            out = model(batch['image'].to(device))
            # Drop the two trailing singleton axes left by the backbone.
            out = out.squeeze(-1).squeeze(-1)
            arr.append(out.cpu().numpy())
            # Flush per batch so progress is persisted if the run is interrupted.
            arr.flush()

In [None]:
def precompute_text_vectors(text_dl, model, path, force):
    """Run ``model`` over ``text_dl`` and store the outputs in a bcolz array on disk.

    Args:
        text_dl: dataloader yielding dicts with ``'ids'`` and ``'mask'`` tensors.
        model: text encoder called as ``model(ids, mask)``; its output must
            have ``768 * 2`` columns.
        path: bcolz ``rootdir`` to write to.
        force: when False and ``path`` already exists, nothing is computed.
    """
    model.eval()
    if os.path.exists(path) and not force:
        return
    arr = bcolz.carray(np.zeros((0, 768*2), np.float32), chunklen=1, mode='w', rootdir=path)

    # Moving the model is loop-invariant, so do it once instead of per batch.
    model = model.to(device)

    with torch.no_grad():
        for batch in tqdm(text_dl):
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            out = model(ids, mask)
            arr.append(out.cpu().numpy())
            # Flush per batch so progress is persisted if the run is interrupted.
            arr.flush()

In [None]:
precompute_image_vectors(image_dl, image_model, config.COLAB_PATH/'train_image_vecs', True)
precompute_text_vectors(text_dl, text_model, config.COLAB_PATH/'train_text_vecs', True)

HBox(children=(FloatProgress(value=0.0, max=1327.0), HTML(value='')))




In [None]:
image_vec_precomputed = bcolz.open(config.COLAB_PATH/'train_image_vecs')
text_vec_precomputed = bcolz.open(config.COLAB_PATH/'train_text_vecs')

Due to I/O latency when reading files from Drive, I've settled on creating the precomputed image and text vectors in the Colab environment, then tarring them up and copying them to Google Drive so I can simply download them when needed.

In [None]:
# test (can be removed)

# shutil.copytree(config.COLAB_PATH/'train_image_vecs', config.COLAB_PATH/'train_image_vecs2')
# shutil.copytree(config.COLAB_PATH/'train_text_vecs', config.COLAB_PATH/'train_text_vecs2')

PosixPath('/content/train_text_vecs2')

In [None]:
# test (can be removed)

# shutil.rmtree(config.COLAB_PATH/'train_image_vecs2')
# shutil.rmtree(config.COLAB_PATH/'train_text_vecs2')

In [None]:
!tar -czf 'train_image_vecs.tgz' $'train_image_vecs'
!tar -czf 'train_text_vecs.tgz' $'train_text_vecs'

Push tar files into drive

In [None]:
shutil.copy(config.COLAB_PATH/'train_image_vecs.tgz', config.MODEL_PATH)
shutil.copy(config.COLAB_PATH/'train_text_vecs.tgz', config.MODEL_PATH)

'/content/drive/My Drive/Rakuten/models/train_image_vecs.tgz'

## Precomputed vecs from drive

In [None]:
# TODO: Add the path ids to these files to the github downloader

In [None]:
!git clone --quiet 'https://github.com/tezike/download_google_drive.git'
os.chdir('download_google_drive')
!python download_gdrive.py '1pn98vCPS_PvB6wOHXUuS2ITGt6LzPNAD' '../train_image_vecs.tgz'
!python download_gdrive.py '14aYrg4uaki3Ej_FASF2-T-q0lW1L2tPZ' '../train_text_vecs.tgz'
shutil.rmtree('../download_google_drive')
os.chdir('..')

In [None]:
# Run this once the files have been successfully downloaded by the cell above

!tar -xzf 'train_image_vecs.tgz'
!tar -xzf 'train_text_vecs.tgz'

In [None]:
image_vec_precomputed = bcolz.open(config.COLAB_PATH/'train_image_vecs')
text_vec_precomputed = bcolz.open(config.COLAB_PATH/'train_text_vecs')

## input vector to fasttext vector mapping

In [None]:
class VectorDataset(torch.utils.data.Dataset):
    """Pairs each input vector with its target vector and serves both as float tensors.

    The dataset length is taken from ``inp_vecs``.
    """

    def __init__(self, inp_vecs, targ_vecs):
        self.inp_vecs, self.targ_vecs = inp_vecs, targ_vecs

    def __len__(self):
        return len(self.inp_vecs)

    def __getitem__(self, i):
        source, target = self.inp_vecs[i], self.targ_vecs[i]
        as_float = dict(dtype=torch.float)
        return {
            'inp_vecs': torch.tensor(source, **as_float),
            'targ_vecs': torch.tensor(target, **as_float),
        }

In [None]:
image_vec_precomputed_train, image_vec_precomputed_valid = train_test_split(image_vec_precomputed, test_size=0.2, shuffle=False)
text_vec_precomputed_train, text_vec_precomputed_valid = train_test_split(text_vec_precomputed, test_size=0.2, shuffle=False)

In [None]:
# The split above uses shuffle=False, so the training rows are the first
# `n_train` rows and the validation rows are the ones right after them.
# Slice the label vectors the same way so every input is paired with its own label.
n_train = len(image_vec_precomputed_train)
n_valid = len(image_vec_precomputed_valid)
label_vecs_train = all_label_vecs[:n_train]
label_vecs_valid = all_label_vecs[n_train:n_train + n_valid]

# Only the first SUBSET_SIZE rows of each split are used.
SUBSET_SIZE = 2000

image_train_dataset = VectorDataset(image_vec_precomputed_train[:SUBSET_SIZE], label_vecs_train[:SUBSET_SIZE])
image_valid_dataset = VectorDataset(image_vec_precomputed_valid[:SUBSET_SIZE], label_vecs_valid[:SUBSET_SIZE])

text_train_dataset = VectorDataset(text_vec_precomputed_train[:SUBSET_SIZE], label_vecs_train[:SUBSET_SIZE])
# Validation must use the held-out text vectors, not the training ones.
text_valid_dataset = VectorDataset(text_vec_precomputed_valid[:SUBSET_SIZE], label_vecs_valid[:SUBSET_SIZE])

In [None]:
# from torch.utils.data import SequentialSampler, DataLoader

In [None]:
# image_train_dl = DataLoader(image_train_dataset, batch_size=32, shuffle=False)
image_train_dl = DataLoader(image_train_dataset, batch_size=32*2, shuffle=True, drop_last=True)
image_valid_dl = DataLoader(image_valid_dataset, batch_size=32//2, shuffle=False, drop_last=False)

# text_train_dl = DataLoader(text_train_dataset, batch_size=32, shuffle=False)
text_train_dl = DataLoader(text_train_dataset, batch_size=32*2, shuffle=True, drop_last=True)
text_valid_dl = DataLoader(text_valid_dataset, batch_size=32//2, shuffle=False, drop_last=False)

In [None]:
class VectorNet(nn.Module):
    """Small MLP: Linear -> BatchNorm1d -> ReLU -> Dropout -> Linear.

    Args:
        inp_feat: number of input features.
        mid_feat: width of the hidden layer.
        out_feat: number of output features.
    """

    def __init__(self, inp_feat, mid_feat = 1024, out_feat=300):
        super().__init__()
        # Hidden stage.
        self.lin1 = nn.Linear(inp_feat, mid_feat, bias=True)
        self.bn = nn.BatchNorm1d(mid_feat)
        self.relu = nn.ReLU(inplace=True)
        self.drop = nn.Dropout(p=0.25, inplace=False)
        # Output projection.
        self.lin2 = nn.Linear(mid_feat, out_feat, bias=True)

    def forward(self, inp):
        hidden = self.relu(self.bn(self.lin1(inp)))
        return self.lin2(self.drop(hidden))

In [None]:
image_net = VectorNet(2048).to(device)
text_net = VectorNet(768*2).to(device)

In [None]:
def cos_loss(inp,targ):
    """Return one minus the mean cosine similarity between ``inp`` and ``targ``."""
    similarity = F.cosine_similarity(inp, targ)
    return 1 - similarity.mean()

In [None]:
def train(dl, model, optimizer, device):
    """Train ``model`` for one pass over ``dl`` using the cosine loss.

    Args:
        dl: dataloader yielding dicts with ``'inp_vecs'`` and ``'targ_vecs'``.
        model: network mapping input vectors to target-space vectors.
        optimizer: optimizer built over ``model``'s parameters.
        device: device the batches and the model are moved to.
    """
    # Moving the model is loop-invariant, so do it once instead of per batch.
    model = model.to(device)
    model.train()
    loss_meter = AverageMeter()
    p_bar = tqdm(dl, total=len(dl))

    for i, batch in enumerate(p_bar):
        inp_vecs = batch['inp_vecs'].to(device, dtype=torch.float)
        targ_vecs = batch['targ_vecs'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        out = model(inp_vecs)
        loss = cos_loss(out, targ_vecs)
        loss.backward()
        # The batches do not come from a torch_xla ParallelLoader, so request
        # the barrier explicitly to have the pending graph executed each step.
        xm.optimizer_step(optimizer, barrier=True)

        # Weight by batch size so the running mean is a per-sample average.
        loss_meter.update(loss.item(), n=inp_vecs.size(0))

        if i % 100 == 1: print(f'Training Loss: {loss_meter.avg}')

        p_bar.set_postfix(loss=loss_meter.avg)

def evaluate(dl, model, device):
    """Evaluate ``model`` on ``dl`` and collect its outputs and the targets.

    Args:
        dl: dataloader yielding dicts with ``'inp_vecs'`` and ``'targ_vecs'``.
        model: network mapping input vectors to target-space vectors.
        device: device the batches and the model are moved to.

    Returns:
        ``(all_outs, all_targets)``: two flat lists with one entry (a list of
        floats) per sample, so ``torch.tensor(all_outs)`` is 2-D
        ``(num_samples, features)`` even when the last batch is smaller.
    """
    # Moving the model is loop-invariant, so do it once instead of per batch.
    model = model.to(device)
    model.eval()
    loss_meter = AverageMeter()
    p_bar = tqdm(dl, total=len(dl))

    all_outs, all_targets = [], []

    with torch.no_grad():
        for i, batch in enumerate(p_bar):
            inp_vecs = batch['inp_vecs'].to(device)
            targ_vecs = batch['targ_vecs'].to(device)

            out = model(inp_vecs)
            loss = cos_loss(out, targ_vecs)
            # Weight by batch size so a short final batch does not skew the mean.
            loss_meter.update(loss.item(), n=inp_vecs.size(0))

            if i % 100 == 1: print(f'Evaluation Loss: {loss_meter.avg}')

            # extend (not append): keep one row per sample rather than one
            # nested list per batch, which made the collected data 3-D and
            # failed to convert to a tensor when batch sizes differed.
            all_outs.extend(out.cpu().numpy().tolist())
            all_targets.extend(targ_vecs.cpu().numpy().tolist())

            p_bar.set_postfix(loss=loss_meter.avg)

    return all_outs, all_targets

In [None]:
import gc; gc.collect()

404

In [None]:
image_lr = 2e-02 * xm.xrt_world_size()
optimizer = torch.optim.Adam(image_net.parameters(), lr=image_lr)

model_path  = config.MODEL_PATH/'image2fasttext.bin'

best_metric = float('-inf')
for epoch in range(config.NUM_EPOCHS):
    train(image_train_dl, image_net, optimizer, device)
    all_outs, all_targs = evaluate(image_valid_dl, image_net, device)

    # Compare along the feature axis (dim=-1) so the score is a per-sample
    # cosine similarity however the collected outputs are nested.
    similarity = F.cosine_similarity(
        torch.tensor(all_outs), torch.tensor(all_targs), dim=-1
    ).mean().item()

    if similarity > best_metric:
        xm.master_print('Similarity score increased from ({} --> {}). Saving model!'.format(best_metric, similarity))
        best_metric = similarity
        # Persist the best weights; the inference section loads this file.
        xm.save(image_net.state_dict(), model_path)

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

Training Loss: 0.5851026177406311



HBox(children=(FloatProgress(value=0.0, max=125.0), HTML(value='')))

Evaluation Loss: 0.3421779274940491
Evaluation Loss: 0.34644026382296694

here0
here1
Similarity score increased from (-inf --> 0.4356200397014618). Saving model!


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

Training Loss: 0.3480779826641083



HBox(children=(FloatProgress(value=0.0, max=125.0), HTML(value='')))

Evaluation Loss: 0.3284217119216919
Evaluation Loss: 0.32945774758563323

here0
here1
Similarity score increased from (0.4356200397014618 --> 0.44807374477386475). Saving model!


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

Training Loss: 0.3360714316368103



HBox(children=(FloatProgress(value=0.0, max=125.0), HTML(value='')))

Evaluation Loss: 0.3219001591205597
Evaluation Loss: 0.323193266695621

here0
here1
Similarity score increased from (0.44807374477386475 --> 0.45931363105773926). Saving model!


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

Training Loss: 0.310409814119339


In [None]:
text_lr = 1e-05 * xm.xrt_world_size()
optimizer = torch.optim.Adam(text_net.parameters(), lr=text_lr)

model_path  = config.MODEL_PATH/'text2fasttext.bin'

best_metric = float('-inf')

for epoch in range(config.NUM_EPOCHS):
    train(text_train_dl, text_net, optimizer, device)
    all_outs, all_targs = evaluate(text_valid_dl, text_net, device)

    # `tensor` was never defined in this notebook: use torch.tensor.
    # Reduce to a scalar with .mean() so it can be compared with best_metric,
    # and compare along the feature axis (dim=-1).
    similarity = F.cosine_similarity(
        torch.tensor(all_outs), torch.tensor(all_targs), dim=-1
    ).mean().item()

    if similarity > best_metric:
        # Log before overwriting best_metric so the message shows the old value.
        xm.master_print('Similarity score increased from ({} --> {}). Saving model!'.format(best_metric, similarity))
        best_metric = similarity
        xm.save(text_net.state_dict(), model_path)

In [None]:
def precompute_fasttextlike_vectors(test_dl, model, path, force, out_feat=300):
    """Map precomputed vectors through ``model`` and store the results with bcolz.

    Args:
        test_dl: dataloader yielding dicts with an ``'inp_vecs'`` tensor.
        model: network whose output has ``out_feat`` columns.
        path: bcolz ``rootdir`` to write to.
        force: when False and ``path`` already exists, nothing is computed.
        out_feat: width of the stored vectors. Defaults to 300, the default
            output size of ``VectorNet``.
    """
    model.eval()
    if os.path.exists(path) and not force:
        return
    # The array width must match the model output, not the 768*2 text input width.
    arr = bcolz.carray(np.zeros((0, out_feat), np.float32), chunklen=1, mode='w', rootdir=path)

    model = model.to(device)

    with torch.no_grad():
        for batch in tqdm(test_dl):
            # Only inputs are needed at inference time; the previous reference
            # to `targ_vecs` read a variable that was never assigned.
            inp_vecs = batch['inp_vecs'].to(device)

            out = model(inp_vecs)

            arr.append(out.cpu().numpy())
            arr.flush()

## Image Vector to fasttext vector (inference kinda)

In [None]:
def get_image_dl(bs):
    """Build a non-shuffled dataloader with batch size ``bs`` over the test images.

    Every image gets a placeholder label of 0.
    """
    test_dir = config.COLAB_PATH/'SIGIR-2020-EComDC-release/image/image_test/image_test.cross-modal_phase1'
    files = get_image_files(test_dir)
    placeholder_labels = [0 for _ in range(len(files))]

    dataset = ImageDataset(files, placeholder_labels, resize=True, is_train=False)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=bs,
        shuffle=False,
        drop_last=False,
        num_workers=0,
    )

In [None]:
def get_text_dl(bs):
    """Build a non-shuffled dataloader with batch size ``bs`` over the test text rows.

    Title and Description are joined with the ``'xxfld'`` separator and every
    row gets a placeholder label of 0.
    """
    tsv_path = config.COLAB_PATH/'SIGIR-2020-EComDC-release/data/x_test_task2_phase1.tsv'
    test_df = pd.read_csv(tsv_path, sep='\t').fillna(' ')
    test_df['Prdlbl'] = 0

    texts = (test_df.Title + 'xxfld' + test_df.Description).values
    dataset = TextDataset(texts, test_df.Prdlbl.values)

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=bs,
        shuffle=False,
        drop_last=False,
        num_workers=0,
    )

In [None]:
# Read the checkpoints onto the CPU first so loading does not depend on the
# device they were saved from; load_state_dict then copies the values into
# the parameters that already live on `device`.
image_net = VectorNet(2048).to(device)
image_net.load_state_dict(torch.load(config.MODEL_PATH/'image2fasttext.bin', map_location='cpu'))

text_net = VectorNet(768*2).to(device)
text_net.load_state_dict(torch.load(config.MODEL_PATH/'text2fasttext.bin', map_location='cpu'))

In [None]:
# `bs` was never defined in this notebook; use the configured batch size,
# the same one the train-side precompute dataloaders use.
test_image_dl = get_image_dl(config.BATCH_SIZE)
test_text_dl = get_text_dl(config.BATCH_SIZE)

### precompute the normal image and text vectors

In [None]:
precompute_image_vectors(test_image_dl, image_model, config.COLAB_PATH/'test_image_vecs', False)
precompute_text_vectors(test_text_dl, text_model, config.COLAB_PATH/'test_text_vecs', False)

In [None]:
test_image_vec_precomputed = bcolz.open(config.COLAB_PATH/'test_image_vecs')
test_text_vec_precomputed = bcolz.open(config.COLAB_PATH/'test_text_vecs')

In [None]:
# !tar -czf 'test_image_vecs.tgz' $'test_image_vecs'
# !tar -czf 'test_text_vecs.tgz' $'test_text_vecs'

Push tar files into drive

In [None]:
# shutil.copy(config.COLAB_PATH/'test_image_vecs.tgz', config.MODEL_PATH)
# shutil.copy(config.COLAB_PATH/'test_text_vecs.tgz', config.MODEL_PATH)

'/content/drive/My Drive/Rakuten/models/train_image_vecs.tgz'

### convert those vectors to fasttextlike vectors

In [None]:
# Targets are not used at inference time, so placeholders are enough.
test_all_label_vecs = [0] * len(test_image_vec_precomputed)

test_image_vec_dataset = VectorDataset(test_image_vec_precomputed, test_all_label_vecs)

# Size the text placeholders from the text vectors themselves, so indexing
# stays in range if the two test sets differ in length.
test_text_vec_dataset = VectorDataset(test_text_vec_precomputed, [0] * len(test_text_vec_precomputed))

In [None]:
# `sampler` expects a sampler instance built over the dataset, not the class itself.
test_image_vec_dl = DataLoader(test_image_vec_dataset, batch_size=64,
                               sampler=SequentialSampler(test_image_vec_dataset))

test_text_vec_dl = DataLoader(test_text_vec_dataset, batch_size=64,
                              sampler=SequentialSampler(test_text_vec_dataset))

In [None]:
precompute_fasttextlike_vectors(test_image_vec_dl, image_net, config.COLAB_PATH/'fasttext_image_vecs', False)
precompute_fasttextlike_vectors(test_text_vec_dl, text_net, config.COLAB_PATH/'fasttext_text_vecs', False)

In [None]:
# Open the same directories that precompute_fasttextlike_vectors wrote to
# (directly under COLAB_PATH, with no 'tmp/' prefix).
fasttext_image_vec_precomputed = bcolz.open(config.COLAB_PATH/'fasttext_image_vecs')
fasttext_text_vec_precomputed = bcolz.open(config.COLAB_PATH/'fasttext_text_vecs')