In [None]:
#!pip install transformers datasets accelerate nvidia-ml-py3
#!pip install accelerate
!pip install bitsandbytes

In [1]:
#Dependencies
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import pandas as pd
from torch.utils.data import Dataset, DataLoader
# Release cached-but-unused GPU memory held by PyTorch's allocator
# (mainly helpful when the notebook is re-run in the same kernel).
torch.cuda.empty_cache()

In [2]:
from torch import cuda
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
device

'cuda'

In [4]:
# DataLoader
class TwinNetDataset(Dataset):
    """Query / product-title pairs with a binary relevance label.

    The training CSV is left-joined with the product catalogue on
    (locale, product_id); each item is the tokenized query, the tokenized
    product title and the binary label derived from ``esci_label``.

    Args:
        tokenize1: Callable applied to the query text. Its result must be
            subscriptable with ``'input_ids'`` and ``'attention_mask'``.
        tokenize2: Callable applied to the product title, same contract.
        train_csv: Optional path to the training CSV. Defaults to
            ``DEFAULT_TRAIN_CSV`` (the location previously hard-coded here).
        products_csv: Optional path to the product catalogue CSV. Defaults to
            ``DEFAULT_PRODUCTS_CSV``.

    Raises:
        KeyError: If ``esci_label`` contains a value missing from
            ``ESCI_TO_BINARY``.
    """

    # Original hard-coded locations, kept as defaults so existing calls still work.
    DEFAULT_TRAIN_CSV = r'C:\Users\prakh\Desktop\AmazonShoppingChallenge\data\processed\public\task_1_query-product_ranking\train-v0.3.csv'
    DEFAULT_PRODUCTS_CSV = r'C:\Users\prakh\Desktop\AmazonShoppingChallenge\data\processed\public\task_1_query-product_ranking\product_catalogue-v0.3.csv'

    # ESCI label -> binary target: only 'irrelevant' is the negative class.
    ESCI_TO_BINARY = {
        'exact': 1,
        'substitute': 1,
        'complement': 1,
        'irrelevant': 0,
    }

    def __init__(self, tokenize1, tokenize2, train_csv=None, products_csv=None):
        self.tokenize1 = tokenize1
        self.tokenize2 = tokenize2

        train_df = pd.read_csv(train_csv if train_csv is not None else self.DEFAULT_TRAIN_CSV)
        products_df = pd.read_csv(products_csv if products_csv is not None else self.DEFAULT_PRODUCTS_CSV)
        merged = pd.merge(
            train_df,
            products_df,
            how='left',
            left_on=['query_locale', 'product_id'],
            right_on=['product_locale', 'product_id'],
        )

        # __getitem__ (not .get) so an unknown label still raises KeyError.
        merged['label'] = merged['esci_label'].map(self.ESCI_TO_BINARY.__getitem__)

        pairs = merged[['query', 'product_title', 'label']].copy()
        # The left join (or an empty catalogue cell) leaves NaN where no title
        # exists; a float NaN would crash the tokenizer, so use an empty string.
        text_columns = ['query', 'product_title']
        pairs[text_columns] = pairs[text_columns].fillna('')
        # Guarantee a 0..n-1 index regardless of how the frame was built.
        self.training_data = pairs.reset_index(drop=True)

    def __len__(self):
        """Return the number of (query, product) pairs."""
        return len(self.training_data)

    def __getitem__(self, index):
        """Return the tokenized pair and its label for positional ``index``."""
        # .iloc makes the lookup positional, which is what DataLoader samplers supply.
        row = self.training_data.iloc[index]
        tk1 = self.tokenize1(row['query'])
        tk2 = self.tokenize2(row['product_title'])

        return {
            'ids_1': tk1['input_ids'],
            'ids_2': tk2['input_ids'],
            'attention_mask_1': tk1['attention_mask'],
            'attention_mask_2': tk2['attention_mask'],
            'labels': self.training_data['label'].iloc[index],
        }

In [5]:
# Tokenize function
tokenizer_query = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")


def tokenize_fn_query(text):
    """Encode ``text`` with the shared XLM-RoBERTa tokenizer.

    The encoding is padded and truncated to 512 tokens and requested as
    PyTorch tensors.
    """
    encode_options = dict(
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    return tokenizer_query(text, **encode_options)

In [6]:
# Queries and product titles are both encoded with the same tokenize function.
dataset = TwinNetDataset(tokenize_fn_query,tokenize_fn_query)

In [7]:
# Loss Function
import torch.nn.functional as F
def loss_fn(output1, output2, labels):
    """Binary cross-entropy between sigmoid(cosine similarity) and ``labels``.

    Args:
        output1: Encoder output for the first input; must expose
            ``last_hidden_state`` indexable as ``[:, 0, :]``.
        output2: Encoder output for the second input, same contract.
        labels: Tensor of 0/1 targets, one per pair.

    Returns:
        Scalar loss tensor (mean over the batch).
    """
    # Vector at position 0 of each sequence (presumably the <s>/CLS token - confirm).
    query_vecs = output1.last_hidden_state[:, 0, :]
    product_vecs = output2.last_hidden_state[:, 0, :]

    # The cosine similarity is used directly as the logit.
    # NOTE(review): cosine lies in [-1, 1], so sigmoid(logit) can never leave
    # roughly [0.27, 0.73]; consider a learnable scale if the loss plateaus.
    logits = F.cosine_similarity(query_vecs, product_vecs).to(torch.float32)
    targets = labels.to(torch.float32)

    # Same value as sigmoid() followed by nn.BCELoss(), but fused, so it is
    # allowed under CUDA fp16 autocast (plain BCELoss is rejected there).
    return F.binary_cross_entropy_with_logits(logits, targets)

In [8]:
from transformers import logging

# Set the transformers library's log verbosity to WARNING.
logging.set_verbosity_warning()
# Model Building
class TwinBert(nn.Module):
    """Twin (siamese) network: one shared XLM-RoBERTa encoder for both inputs."""

    def __init__(self):
        super().__init__()
        # A single encoder instance, so both branches share their weights.
        self.model = XLMRobertaModel.from_pretrained("xlm-roberta-base")

    def forward_once(self, ids, attention_mask):
        """Run the shared encoder on one side of the pair.

        Args:
            ids: Token ids whose last dimension is the sequence length, e.g.
                ``(batch, 1, seq_len)`` or ``(batch, seq_len)``.
            attention_mask: Attention mask with the same layout as ``ids``.

        Returns:
            Whatever the wrapped encoder returns for the flattened
            ``(batch, seq_len)`` inputs.
        """
        # Collapse everything except the sequence axis. A bare .squeeze() would
        # also drop the batch axis when the batch holds a single sample, handing
        # the encoder a 1-D tensor.
        flat_ids = ids.reshape(-1, ids.size(-1))
        flat_mask = attention_mask.reshape(-1, attention_mask.size(-1))
        output = self.model(flat_ids, flat_mask)
        return output

    def forward(self, data):
        """Encode both sides of ``data`` and return the training loss.

        ``data`` must provide ``ids_1``, ``attention_mask_1``, ``ids_2``,
        ``attention_mask_2`` and ``labels``.
        """
        output1 = self.forward_once(data['ids_1'], data['attention_mask_1'])
        output2 = self.forward_once(data['ids_2'], data['attention_mask_2'])
        return loss_fn(output1, output2, data['labels'])

model = TwinBert()
# Move the model to the selected device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TwinBert(
  (model): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [9]:
# DataLoader settings: mini-batches of two shuffled samples, loaded in the
# main process (no worker subprocesses).
train_params = dict(batch_size=2, shuffle=True, num_workers=0)
train_dataloader = DataLoader(dataset, **train_params)

In [None]:
from transformers import TrainingArguments, Trainer, logging
from accelerate import Accelerator
import bitsandbytes as bnb


# Settings shared by every TrainingArguments configuration.
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

# Memory-saving configuration: tiny per-device batch with gradient
# accumulation, gradient checkpointing and fp16 mixed precision.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    **default_args,
)

if training_args.gradient_checkpointing:
    # The Hugging Face encoder lives in the wrapper's `.model` attribute.
    model.model.gradient_checkpointing_enable()

# `adam_bnb_optim` was referenced below without being defined anywhere, which
# raised NameError. It is created here with the same hyper-parameters as the
# optimizer in the training cell.
# NOTE(review): confirm these are the intended lr / betas.
adam_bnb_optim = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))

# `Accelerator(fp16=...)` is a deprecated/removed keyword; `mixed_precision`
# is the supported way to request fp16.
accelerator = Accelerator(mixed_precision="fp16" if training_args.fp16 else "no")
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, train_dataloader)

In [11]:
# Optimizer: 8-bit Adam from bitsandbytes (AdamW / Adafactor are imported here but not used in this cell)
from transformers import AdamW
from transformers.optimization import Adafactor, AdafactorSchedule
import bitsandbytes as bnb

# 8-bit Adam over every parameter of the twin model.
optim = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))

# Training
num_train_epochs = 1
model.train()

for epoch in range(num_train_epochs):
    for step, batch in enumerate(train_dataloader):
        # Move every tensor of the batch onto the training device.
        batch = {key: tensor.to(device) for key, tensor in batch.items()}

        # The model's forward pass returns the loss directly.
        loss = model(batch)

        # Clear old gradients, back-propagate, then update the weights.
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Gather Data and Report
        print(loss.item())



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link
CUDA SETUP: Loading binary C:\Users\prakh\anaconda3\envs\pytorch_gpu\lib\site-packages\bitsandbytes\libbitsandbytes_cpu.so...


  warn(
  warn(
  warn(
  warn(


TypeError: argument of type 'WindowsPath' is not iterable

In [None]:
from numba import cuda
# Select GPU 0 through numba and close its CUDA context - presumably to release
# the GPU memory held by this kernel (confirm).
# NOTE(review): `cuda` is numba's module here and shadows the earlier
# `from torch import cuda`.
cuda.select_device(0)
cuda.close()