**As we have to deal with a multimodal classification task, the DataLoader from `torchtext.data` cannot be used. This results in a prohibitive computation time if we compute the features on the fly.** \
**Furthermore, the BERT base model is a huge model and is already very accurate, so we will not fine-tune it. We want to use it as a feature extractor only.** \
<font color='red'> **==> We will precompute the textual features and save them in `torch.tensors`.** </font>

# Preliminaries

## Install and import libraries 

In [None]:
!pip install transformers==3.5.1  #to use repo cl-tohoku/bert-japanese
!pip install sentencepiece #to deal with Japanese language
!pip install fugashi #to deal with Japanese language
!pip install ipadic  #to deal with Japanese language

import torch
import os
import fugashi 
import ast
import csv
import pandas as pd
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
from transformers import BertModel
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

## Set computation engine

In [None]:
# Pick the first CUDA GPU when one is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

## Connect to drive

In [None]:
# Mount Google Drive at /content/drive so the paths used below
# (under /content/drive/MyDrive) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

# Load text data and compute features

In [None]:
# Folder on the mounted drive that holds the input CSV files read with pd.read_csv.
source_folder = '/content/drive/MyDrive/data_rakuten' # source folder of the CSV files

In [None]:
# Tokenizer matching the pretrained Japanese BERT checkpoint.
tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

# BERT encoder, used purely as a frozen sentence-embedding extractor.
encoder = BertModel.from_pretrained('cl-tohoku/bert-base-japanese')
for param in encoder.parameters():
    # No fine-tuning: the weights never receive gradients.
    param.requires_grad = False

# Inference mode (disables dropout) so the extracted features are deterministic.
encoder.eval()
# Move the model to the `device` selected earlier instead of calling .cuda()
# unconditionally, which raises on a CPU-only runtime and would put the model
# on a different device than the input tensors.
encoder.to(device)

In [None]:
# Load the CSV tables from `source_folder`; the first column of each file is
# used as the DataFrame index.

# Training labels: `color_tags` is stored as the string form of a Python list,
# so parse each cell back into a real list.
YTrain = pd.read_csv(os.path.join(source_folder, 'y_train_Q9n2dCu.csv'), index_col=0)
YTrain['color_tags'] = YTrain['color_tags'].apply(ast.literal_eval)

# Input tables for the train and test splits.
XTrain = pd.read_csv(os.path.join(source_folder, 'X_train_12tkObq.csv'), index_col=0)
XTest = pd.read_csv(os.path.join(source_folder, 'X_test_gDTIJPh.csv'), index_col=0)

In [None]:
def compute_feature(text):
    """Return the pooled BERT embedding of ``text``.

    The text is encoded with the module-level ``tokenizer``, run through the
    module-level frozen ``encoder`` as a batch of one sequence, and the
    ``pooler_output`` for that sequence is returned.

    Args:
        text: Sentence to embed. Non-string values (such as the NaN pandas
            uses for an empty cell) are embedded as an empty sentence.

    Returns:
        The encoder's ``pooler_output`` with the batch dimension squeezed
        out, located on ``device``.
    """
    max_length = 512  # longest sequence the encoder is fed, special tokens included

    if not isinstance(text, str):
        text = ""

    # encode() adds the [CLS]/[SEP] special tokens: the pooler reads the first
    # position, which must be [CLS] for the output to be meaningful. It also
    # guarantees a non-empty sequence and truncates before the tensor is built.
    indexed_tokens = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    tokens_tensor = torch.tensor([indexed_tokens], dtype=torch.long).to(device)

    # Pure feature extraction: no autograd bookkeeping is needed.
    with torch.no_grad():
        text_features = encoder(input_ids=tokens_tensor, return_dict=True)

    return text_features['pooler_output'].squeeze(0)


In [None]:
# Feature matrix for the training captions: 768 rows, one column per row of XTrain.
Xtrain_item_caption = torch.zeros([768, len(XTrain)])

for idx, item_caption in enumerate(XTrain["item_caption"]):
    # Embed the actual caption text. The original passed the literal string
    # "item_caption", which gave every sample the same embedding.
    # Missing cells (NaN) are skipped and keep their all-zero column.
    if not pd.isna(item_caption):
        Xtrain_item_caption[:, idx] = compute_feature(str(item_caption))
    if idx % 1000 == 0:
        print(idx)  # progress indicator

torch.save(Xtrain_item_caption, '/content/drive/MyDrive/data_rakuten/Xtrain_item_caption.pt')

In [None]:
# Feature matrix for the training item names: 768 rows, one column per row of XTrain.
Xtrain_item_name = torch.zeros([768, len(XTrain)])

for idx, item_name in enumerate(XTrain["item_name"]):
    # Embed the actual item name. The original passed the literal string
    # "item_name", which gave every sample the same embedding.
    # Missing cells (NaN) are skipped and keep their all-zero column.
    if not pd.isna(item_name):
        Xtrain_item_name[:, idx] = compute_feature(str(item_name))
    if idx % 1000 == 0:
        print(idx)  # progress indicator

torch.save(Xtrain_item_name, '/content/drive/MyDrive/data_rakuten/Xtrain_item_name.pt')

In [None]:
# Feature matrix for the test captions: 768 rows, one column per row of XTest.
Xtest_item_caption = torch.zeros([768, len(XTest)])

for idx, item_caption in enumerate(XTest["item_caption"]):
    # Embed the actual caption text. The original passed the literal string
    # "item_caption", which gave every sample the same embedding.
    # Missing cells (NaN) are skipped and keep their all-zero column.
    if not pd.isna(item_caption):
        Xtest_item_caption[:, idx] = compute_feature(str(item_caption))
    if idx % 1000 == 0:
        print(idx)  # progress indicator

torch.save(Xtest_item_caption, '/content/drive/MyDrive/data_rakuten/Xtest_item_caption.pt')

In [None]:
# Feature matrix for the test item names: 768 rows, one column per row of XTest.
Xtest_item_name = torch.zeros([768, len(XTest)])

for idx, item_name in enumerate(XTest["item_name"]):
    # Embed the actual item name. The original passed the literal string
    # "item_name", which gave every sample the same embedding.
    # Missing cells (NaN) are skipped and keep their all-zero column.
    if not pd.isna(item_name):
        Xtest_item_name[:, idx] = compute_feature(str(item_name))
    if idx % 1000 == 0:
        print(idx)  # progress indicator

torch.save(Xtest_item_name, '/content/drive/MyDrive/data_rakuten/Xtest_item_name.pt')

In [None]:
# Maps each color tag to its integer class index (its position in the list).
dico_labels = {
    color: index
    for index, color in enumerate([
        "Beige",
        "Black",
        "Blue",
        "Brown",
        "Burgundy",
        "Gold",
        "Green",
        "Grey",
        "Khaki",
        "Multiple Colors",
        "Navy",
        "Orange",
        "Pink",
        "Purple",
        "Red",
        "Silver",
        "Transparent",
        "White",
        "Yellow",
    ])
}

In [None]:
# Multi-hot label matrix: one row per color class in `dico_labels`, one column
# per row of YTrain. Entry [c, i] is 1 when sample i carries color c.
Ytrain_label = torch.zeros([len(dico_labels), len(YTrain)])

for idx, str_labels in enumerate(YTrain['color_tags']):
    # Write the labels straight into the matrix. The original also allocated
    # a per-row tensor that was never used.
    for color in str_labels:
        Ytrain_label[dico_labels[color], idx] = 1

    if idx % 1000 == 0:
        print(idx)  # progress indicator

torch.save(Ytrain_label, '/content/drive/MyDrive/data_rakuten/Ytrain_label.pt')