In [1]:
#Dependencies
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [2]:
# Dataset
class TwinNetDataset(Dataset):
    """Query/product-title pairs with a binary relevance label.

    The training CSV is left-joined with the product catalogue on
    (locale, product_id) and the ``esci_label`` column is collapsed to a
    binary target via ``ESCI_TO_BINARY``.

    Args:
        tokenize1: callable applied to the query text; its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``.
        tokenize2: callable applied to the product title; same contract.
        train_path: optional path of the training CSV. Defaults to
            ``DEFAULT_TRAIN_CSV``.
        products_path: optional path of the product catalogue CSV. Defaults
            to ``DEFAULT_PRODUCTS_CSV``.

    Raises:
        KeyError: if ``esci_label`` holds a value missing from
            ``ESCI_TO_BINARY``.
    """

    # Default CSV locations (an absolute local checkout); pass train_path /
    # products_path to load the data from anywhere else.
    DEFAULT_TRAIN_CSV = r'C:\Users\pshankesi\source\repos\KDDCup2022-ESCI\Data\task_1_query-product_ranking\data\processed\public\task_1_query-product_ranking\train-v0.3.csv'
    DEFAULT_PRODUCTS_CSV = r'C:\Users\pshankesi\source\repos\KDDCup2022-ESCI\Data\task_1_query-product_ranking\data\processed\public\task_1_query-product_ranking\product_catalogue-v0.3.csv'

    # ESCI label -> binary target: only 'irrelevant' is a negative.
    ESCI_TO_BINARY = {
        'exact': 1,
        'substitute': 1,
        'complement': 1,
        'irrelevant': 0,
    }

    def __init__(self, tokenize1, tokenize2, train_path=None, products_path=None):
        self.tokenize1 = tokenize1
        self.tokenize2 = tokenize2

        if train_path is None:
            train_path = self.DEFAULT_TRAIN_CSV
        if products_path is None:
            products_path = self.DEFAULT_PRODUCTS_CSV

        train_df = pd.read_csv(train_path)
        products_df = pd.read_csv(products_path)
        train_df = pd.merge(
            train_df,
            products_df,
            how='left',
            left_on=['query_locale', 'product_id'],
            right_on=['product_locale', 'product_id'],
        )

        labels = train_df['esci_label'].map(self.ESCI_TO_BINARY)
        if labels.isna().any():
            unknown = sorted(train_df.loc[labels.isna(), 'esci_label'].astype(str).unique())
            raise KeyError(f"unmapped esci_label values: {unknown}")
        train_df['label'] = labels.astype('int64')

        training_data = train_df[['query', 'product_title', 'label']].copy()
        # The left join leaves NaN titles for products missing from the
        # catalogue, and read_csv parses blank cells as NaN; a NaN (float)
        # would crash the tokenizer, so normalise both text columns to str.
        text_columns = ['query', 'product_title']
        training_data[text_columns] = training_data[text_columns].fillna('').astype(str)
        # Guarantee a 0..n-1 index so DataLoader positions line up with rows.
        self.training_data = training_data.reset_index(drop=True)

    def __len__(self):
        """Number of query/product pairs."""
        return len(self.training_data)

    def __getitem__(self, index):
        """Tokenize the pair at position ``index``.

        Returns:
            dict with ``'ids'`` and ``'attention_mask'`` (each a two-element
            list: query first, product title second) and ``'labels'``
            (the binary target as an int).
        """
        # Positional access (.iat) instead of label lookup: independent of
        # whatever index the frame carries.
        query = self.training_data['query'].iat[index]
        title = self.training_data['product_title'].iat[index]
        tk1 = self.tokenize1(query)
        tk2 = self.tokenize2(title)

        return {
            'ids': [tk1['input_ids'], tk2['input_ids']],
            'attention_mask': [tk1['attention_mask'], tk2['attention_mask']],
            'labels': int(self.training_data['label'].iat[index]),
        }

In [3]:
# Tokenize function
tokenizer_query = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
def tokenize_fn_query(text, max_length=512):
    """Tokenize ``text`` with the shared XLM-RoBERTa tokenizer.

    The result is padded to exactly ``max_length`` tokens, truncated when
    longer, and returned as PyTorch tensors.

    Args:
        text: the string to encode.
        max_length: fixed sequence length to pad/truncate to. The default of
            512 keeps the original behaviour; a smaller value cuts compute
            for short inputs.

    Returns:
        The tokenizer's encoding; callers in this notebook read its
        ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer_query(
        text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )

In [4]:
# Queries and product titles are both encoded with the same tokenize function.
dataset = TwinNetDataset(
    tokenize1=tokenize_fn_query,
    tokenize2=tokenize_fn_query,
)

In [5]:
# Model Building
class TwinBert(nn.Module):
    """Twin encoder: one XLM-RoBERTa model applied to both sides of a pair.

    Both inputs go through the same ``self.model`` instance, so the two
    branches share weights.
    """

    def __init__(self):
        super().__init__()
        self.model = XLMRobertaModel.from_pretrained("xlm-roberta-base")

    @staticmethod
    def _drop_tokenizer_dim(tensor):
        """Turn a (batch, 1, seq) tensor into (batch, seq).

        Only the singleton middle dimension is removed. A bare ``squeeze()``
        would also drop the batch dimension when the batch holds a single
        example (e.g. the last, partial batch of an epoch), handing the
        encoder a 1-D tensor. Tensors of any other shape pass through as-is.
        """
        if tensor.dim() == 3 and tensor.size(1) == 1:
            return tensor.squeeze(1)
        return tensor

    def forward_once(self, ids, attention_mask):
        """Encode one side of the pair.

        Args:
            ids: token ids, either (batch, seq) or (batch, 1, seq).
            attention_mask: mask with the same layout as ``ids``.

        Returns:
            Whatever ``self.model`` returns for the 2-D inputs.
        """
        ids = self._drop_tokenizer_dim(ids)
        attention_mask = self._drop_tokenizer_dim(attention_mask)
        return self.model(ids, attention_mask)

    def forward(self, ids, attention_mask):
        """Encode both sides of the pair.

        Args:
            ids: two-element sequence, one token-id tensor per side.
            attention_mask: two-element sequence of the matching masks.

        Returns:
            Tuple ``(output1, output2)`` of encoder outputs, in input order.
        """
        output1 = self.forward_once(ids[0], attention_mask[0])
        output2 = self.forward_once(ids[1], attention_mask[1])
        return output1, output2

# Instantiate the twin encoder; TwinBert.__init__ loads the pretrained "xlm-roberta-base" weights.
model = TwinBert()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Batching options for the training DataLoader: shuffled batches of 8,
# loaded in the main process (no worker subprocesses).
train_params = dict(batch_size=8, shuffle=True, num_workers=0)
train_dataloader = DataLoader(dataset, **train_params)

In [7]:
# Loss Function
import torch.nn.functional as F
def loss_fn(output1, output2, labels, scale=1.0):
    """Binary cross-entropy on the cosine similarity of the two embeddings.

    The embedding of each side is the hidden state at sequence position 0.
    Their cosine similarity, multiplied by ``scale``, is used as the logit
    of the "relevant" class.

    Args:
        output1: encoder output for the query; must expose
            ``last_hidden_state`` indexable as [batch, position, hidden].
        output2: encoder output for the product, same contract.
        labels: tensor of 0/1 targets, one per pair (any numeric dtype).
        scale: multiplier applied to the cosine similarity before the
            sigmoid. With the default 1.0 the result equals
            ``BCELoss(sigmoid(cos), labels)``; note that sigmoid(cos) is then
            confined to roughly (0.27, 0.73) because cos lies in [-1, 1], so
            a larger scale is needed for confident predictions.

    Returns:
        Scalar tensor: the mean binary cross-entropy over the batch.
    """
    query_vecs = output1.last_hidden_state[:, 0, :]
    product_vecs = output2.last_hidden_state[:, 0, :]
    logits = F.cosine_similarity(query_vecs, product_vecs) * scale
    # Match the predictions' device/dtype so the loss also works when the
    # model does not live on the same device as the DataLoader output.
    targets = labels.to(device=logits.device, dtype=logits.dtype)
    # Fused sigmoid + BCE: numerically stabler than sigmoid() followed by
    # BCELoss, and avoids building a loss module on every call.
    return F.binary_cross_entropy_with_logits(logits, targets)

In [8]:
# Optimizer: torch.optim.AdamW replaces transformers.AdamW, which is
# deprecated in favour of the PyTorch implementation. eps and weight_decay
# are set explicitly to the values the transformers class used by default
# (1e-6 and 0.0), so the optimisation hyper-parameters stay the same.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training
model.train()
num_train_epochs=1

for epoch in range(num_train_epochs):
    for step, batch in enumerate(train_dataloader):
        ids, attention_mask = batch['ids'], batch['attention_mask']
        labels = batch['labels']

        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optim.zero_grad()

        output1, output2 = model(ids, attention_mask)
        loss = loss_fn(output1, output2, labels)

        loss.backward()
        optim.step()

        # Gather Data and Report
        print(loss.item())




0.5815460085868835
0.43850165605545044


KeyboardInterrupt: 