# Preliminaries

## Install

In [1]:
%%capture
!pip install transformers
import ast
import csv
import os

import pandas as pd
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel

## Device configuration

In [2]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


## Connect to Google Drive

In [3]:
# Mount Google Drive at /content/drive (Colab-only helper) so the paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Folder on the mounted Drive that holds the challenge CSV files.
source_folder = '/content/drive/MyDrive/data_rakuten'

## Load data

In [5]:
# Load the train/test tables; the first CSV column becomes the DataFrame index.
XTrain = pd.read_csv(os.path.join(source_folder, 'X_train_12tkObq.csv'), index_col=0)
XTest = pd.read_csv(os.path.join(source_folder, 'X_test_gDTIJPh.csv'), index_col=0)

# Pre-computed label tensor, read from the same folder as the CSV files.
# map_location='cpu' keeps the load working on a CPU-only runtime even if the
# tensor was saved from a GPU session; batches are moved to `device` during training.
# NOTE(review): torch.load unpickles the file, so only load files you trust.
Ytrain_label = torch.load(os.path.join(source_folder, 'Ytrain_label.pt'), map_location='cpu')

## Create dataloader

In [52]:
class TextDataset(torch.utils.data.Dataset):
    """Training dataset yielding ``(token_ids, label_vector)`` pairs built from item names.

    Reads the module-level ``XTrain`` frame (column ``item_name``) and the
    ``Ytrain_label`` tensor. Labels are indexed as ``labels[:, idx]``, i.e. one
    column per sample. Item captions are not used.
    """

    # Maximum number of tokens kept per item name; keeps every sequence well
    # below the 512 positions the encoder supports.
    MAX_NAME_TOKENS = 100

    def __init__(self):
        self.text_name = XTrain['item_name']
        self.tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')
        self.labels = Ytrain_label

    def __len__(self):
        """Return the number of item names (= number of samples)."""
        return len(self.text_name)

    def __getitem__(self, idx):
        """Return the token-id tensor and the label column of the ``idx``-th sample.

        Returns:
            tuple: ``(token_ids, labels)`` where ``token_ids`` is a 1-D int64
            tensor of at most ``MAX_NAME_TOKENS`` ids and ``labels`` is
            ``self.labels[:, idx]``.
        """
        # Positional lookup: the labels are addressed by position, and the frame
        # index comes from the CSV (index_col=0), so it is not guaranteed to be 0..N-1.
        # str() guards against missing names, which pandas reads as float NaN.
        text = str(self.text_name.iloc[idx])

        tokens = self.tokenizer.tokenize(text)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)[:self.MAX_NAME_TOKENS]

        # Explicit dtype so that an empty token list still yields an int64 tensor.
        tokens_tensor_name = torch.tensor(token_ids, dtype=torch.long)

        return tokens_tensor_name, self.labels[:, idx]

def generate_batch(data_batch, pad_token_id=0):
    """Collate ``(token_ids, label_vector)`` samples into batch tensors.

    Args:
        data_batch: list of ``(token_ids, labels)`` pairs, where ``token_ids`` is a
            1-D tensor of variable length and ``labels`` has the same shape for
            every sample.
        pad_token_id: id used to right-pad shorter sequences. Defaults to 0, the
            padding index of the encoder's word-embedding table.

    Returns:
        tuple: ``(tokens_batch, labels_batch)`` with shapes
        ``(batch, longest_sequence)`` and ``(batch, *labels.shape)``.
    """
    tokens_batch = [item[0] for item in data_batch]
    labels_batch = [item[1] for item in data_batch]

    tokens_batch = pad_sequence(tokens_batch, batch_first=True, padding_value=pad_token_id)
    # Label vectors all have the same length, so stacking is enough to get a tensor.
    labels_batch = torch.stack(labels_batch)

    return tokens_batch, labels_batch

# Training set and its loader: shuffled mini-batches of 64 samples, collated by generate_batch.
trainSet = TextDataset()
trainLoader = torch.utils.data.DataLoader(
    trainSet,
    batch_size=64,
    shuffle=True,
    collate_fn=generate_batch,
)

In [53]:
class CustomModel(torch.nn.Module):
    """BERT encoder followed by a three-layer MLP head producing 19 logits.

    All encoder parameters are frozen at construction; call ``relaxation`` to
    make part or all of the encoder trainable again.
    """

    def __init__(self):
        super().__init__()

        self.encoder = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-v2')
        # Freeze the whole encoder; only the head below is trainable by default.
        for param in self.encoder.parameters():
            param.requires_grad = False
        self.fc1 = torch.nn.Linear(768, 450)
        self.fc2 = torch.nn.Linear(450, 200)
        self.fc3 = torch.nn.Linear(200, 19)

    def forward(self, tokens_tensor, attention_mask=None):
        """Compute the 19 logits for a batch of token-id sequences.

        Args:
            tokens_tensor: int tensor of token ids, shape ``(batch, seq_len)``.
            attention_mask: optional mask of the same shape (1 = real token,
                0 = padding). When omitted it is derived from the encoder's
                configured ``pad_token_id`` so padded positions are not attended to.

        Returns:
            Tensor of shape ``(batch, 19)``. The batch axis is kept even for a
            batch of one, so the output always lines up with ``(batch, 19)``
            targets in the loss.
        """
        if attention_mask is None:
            pad_id = getattr(self.encoder.config, 'pad_token_id', None)
            if pad_id is not None:
                attention_mask = (tokens_tensor != pad_id).long()

        # Call the module rather than .forward() so that registered hooks run.
        encoded = self.encoder(
            input_ids=tokens_tensor,
            attention_mask=attention_mask,
            return_dict=True,
        )
        text_features = encoded['pooler_output']
        text_features = F.relu(self.fc1(text_features))
        text_features = F.relu(self.fc2(text_features))
        logits = self.fc3(text_features)

        return logits

    def relaxation(self, type_relax):
        """Unfreeze encoder parameters.

        Args:
            type_relax: ``"soft"`` unfreezes the parameters whose names start
                with ``encoder.encoder.layer.11`` or ``encoder.pooler.dense``;
                ``"hard"`` unfreezes every encoder parameter. Any other value
                leaves the model unchanged.
        """
        if type_relax == "soft":
            for name, param in self.named_parameters():
                if name.startswith('encoder.encoder.layer.11') or name.startswith('encoder.pooler.dense'):
                    param.requires_grad = True
        elif type_relax == "hard":
            for param in self.encoder.parameters():
                param.requires_grad = True
    

# Build the classifier, unfreeze the last encoder layer and the pooler ("soft"
# relaxation), then move it to the selected device. The final expression is left
# bare so the notebook displays the module structure.
model = CustomModel()
model.relaxation(type_relax='soft')
model.to(device)

CustomModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

## Train the model

In [54]:
# Hard-coded per-label counts (presumably the number of training samples tagged
# with each of the 19 colours -- TODO confirm against Ytrain_label.sum(dim=1)).
nbr_labels_positive = torch.tensor([25673,71831,34014,33338,2383,8303,21697,28814,8353,12597,25017,10378,24582,10355,23583,12911,3325,51751,14534])

# In a multi-label problem the negatives of a label are the samples that do NOT
# carry it, i.e. (number of samples - positives). The sum of all positive counts
# is not the number of samples, because one sample can carry several labels.
# Ytrain_label is indexed as labels[:, idx] elsewhere, so dim 1 is the sample axis.
nbr_samples = Ytrain_label.shape[1]
nbr_labels_negative = (nbr_samples - nbr_labels_positive).float()

# Per-label negatives/positives ratio, the quantity BCEWithLogitsLoss expects as pos_weight.
coeffs = nbr_labels_negative / nbr_labels_positive.float()
coeffs = coeffs.to(device)

In [55]:
# Multi-label objective on raw logits; the per-label pos_weight (coeffs) is left disabled.
criterion = torch.nn.BCEWithLogitsLoss()
# Adam over all model parameters; the learning rate is multiplied by 0.7 every 5 scheduler steps.
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.7)

# Training
def train(epoch):
    """Run one training epoch over ``trainLoader`` and return the mean batch loss.

    Uses the module-level ``model``, ``criterion``, ``optimizer``, ``device`` and
    ``trainLoader``. Every 100 batches it prints the progress through the epoch
    and the running mean loss.

    Args:
        epoch: epoch number, used only for the printed header.

    Returns:
        float: mean loss over the batches of this epoch (0.0 for an empty loader).
    """
    print('\nEpoch: %d' % epoch)
    model.train()
    train_loss = 0.0
    num_batches = len(trainLoader)

    for batch_idx, (inputs, targets) in enumerate(trainLoader):
        inputs = inputs.to(device)
        # BCEWithLogitsLoss needs floating-point targets of the same shape as the logits.
        targets = targets.to(device).float()

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        if batch_idx % 100 == 0:
            # Progress = batches processed so far over the total number of batches.
            print('{:.0f}%|Train Loss: {:.5f} '.format(100 * (batch_idx + 1) / num_batches, train_loss / (batch_idx + 1)))

    return train_loss / num_batches if num_batches else 0.0


In [None]:
# Training loop: run 100 epochs, advancing the learning-rate scheduler once after each epoch.
for epoch in range(100):
    train(epoch)
    scheduler.step()


Epoch: 0
0%|Train Loss: 0.70422 
3%|Train Loss: 0.33453 
6%|Train Loss: 0.32090 
9%|Train Loss: 0.31308 
12%|Train Loss: 0.31142 
15%|Train Loss: 0.30896 
18%|Train Loss: 0.30770 
21%|Train Loss: 0.30621 


In [10]:
# Training loop: run 100 epochs, advancing the learning-rate scheduler once after each epoch.
# NOTE(review): this cell repeats the code of the training-loop cell just above it;
# running both continues training the same model with the same optimizer and scheduler.
for epoch in range(100):
    train(epoch)
    scheduler.step()


Epoch: 0
0%|Train Loss: 0.69838 
3%|Train Loss: 0.32401 
6%|Train Loss: 0.31578 
9%|Train Loss: 0.31148 
12%|Train Loss: 0.30887 
15%|Train Loss: 0.30719 
18%|Train Loss: 0.30627 
21%|Train Loss: 0.30568 
24%|Train Loss: 0.30498 
27%|Train Loss: 0.30414 
30%|Train Loss: 0.30356 
33%|Train Loss: 0.30269 
36%|Train Loss: 0.30240 
39%|Train Loss: 0.30208 
42%|Train Loss: 0.30182 
45%|Train Loss: 0.30184 
48%|Train Loss: 0.30135 
51%|Train Loss: 0.30060 
54%|Train Loss: 0.30038 
57%|Train Loss: 0.30017 
60%|Train Loss: 0.29981 
63%|Train Loss: 0.29941 
66%|Train Loss: 0.29896 
69%|Train Loss: 0.29863 
72%|Train Loss: 0.29832 
75%|Train Loss: 0.29798 
78%|Train Loss: 0.29766 
81%|Train Loss: 0.29720 
84%|Train Loss: 0.29702 
87%|Train Loss: 0.29672 
90%|Train Loss: 0.29638 
93%|Train Loss: 0.29611 
97%|Train Loss: 0.29583 
100%|Train Loss: 0.29545 

Epoch: 1
0%|Train Loss: 0.30309 
3%|Train Loss: 0.28496 
6%|Train Loss: 0.28492 
9%|Train Loss: 0.28479 
12%|Train Loss: 0.28410 
15%|Train Lo

KeyboardInterrupt: ignored

In [None]:
# Save weights
# The checkpoint path is derived from source_folder, like the CSV paths, so that
# changing the data folder in one place moves every artifact with it.
model_file = os.path.join(source_folder, "textmodelFinetune.pth")
torch.save(model.state_dict(), model_file)

In [None]:
# Load weights
model_file = os.path.join(source_folder, "textmodelFinetune.pth")
# map_location=device remaps tensors saved from a GPU session, so the checkpoint
# also loads on a CPU-only runtime.
state_dict = torch.load(model_file, map_location=device)
model.load_state_dict(state_dict)

## Generate csv file for submission

In [None]:
class TestDataset(torch.utils.data.Dataset):
    """Test dataset yielding the token ids of each item name in ``XTest``.

    Applies the same preprocessing as the training dataset (tokenize, convert to
    ids, keep at most 100 tokens) so the model sees the kind of input it was
    trained on.
    NOTE(review): assumes ``XTest`` has an ``item_name`` column like ``XTrain`` -- confirm.
    """

    # Same cap as the training dataset.
    MAX_NAME_TOKENS = 100

    def __init__(self):
        self.text_name = XTest['item_name']
        self.tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-v2')

    def __len__(self):
        """Return the number of test items."""
        return len(self.text_name)

    def __getitem__(self, idx):
        """Return a 1-D int64 tensor with the token ids of the ``idx``-th item name.

        Sequences have variable length, so the default collate function only
        works with ``batch_size=1``; larger batches need a padding collate_fn.
        """
        # Positional lookup (the frame index comes from the CSV); str() guards
        # against missing names, which pandas reads as float NaN.
        text = str(self.text_name.iloc[idx])
        tokens = self.tokenizer.tokenize(text)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)[:self.MAX_NAME_TOKENS]
        return torch.tensor(token_ids, dtype=torch.long)

# Test set and its loader: one sample per batch, in file order, loaded by two worker processes.
testSet = TestDataset()
testLoader = torch.utils.data.DataLoader(
    testSet,
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [None]:
# Colour tag name for each of the 19 output positions of the classifier.
inv_dico_labels={ 0: "Beige",1:"Black",2:"Blue",3:"Brown",4:"Burgundy",5:"Gold",6:"Green",7:"Grey",
                 8:"Khaki",9:"Multiple Colors",10:"Navy",11:"Orange",12:"Pink",
                 13:"Purple",14:"Red",15:"Silver",16:"Transparent",17:"White",18:"Yellow"}

# Optional cap on the number of rows written (None = whole test set). The earlier
# version of this cell stopped after 302 rows, which looks like a debugging
# leftover; set an integer here to get a quick partial run again.
MAX_SUBMISSION_ROWS = None

model.eval()

# Write predictions to submission.csv, next to the other data files.
submission_path = os.path.join(source_folder, 'submission.csv')

# newline='' is what the csv module requires so that it controls line endings itself.
with open(submission_path, 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    # Two real columns: an unnamed row index and the tag list.
    # NOTE(review): confirm the header against the challenge's sample submission.
    spamwriter.writerow(['', 'color_tags'])
    with torch.no_grad():
        for batch_idx, inputs in enumerate(testLoader):
            if MAX_SUBMISSION_ROWS is not None and batch_idx >= MAX_SUBMISSION_ROWS:
                break
            inputs = inputs.to(device)
            outputs = model(inputs)
            # A logit > 0 means a sigmoid probability > 0.5. reshape(-1) flattens
            # the single-sample batch whether or not the model keeps the batch axis.
            prediction = [
                inv_dico_labels[indice]
                for indice, logit in enumerate(outputs.reshape(-1))
                if logit > 0
            ]
            # Pass the index and the tag list as separate fields: csv.writer adds
            # the surrounding quotes itself when the list text contains commas.
            spamwriter.writerow([batch_idx, str(prediction)])