## 1 - Import packages

In [1]:
import pandas as pd 
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import get_scheduler
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

## Config

In [2]:
class Config:
    # data preprocessing
    train_batch = 64
    
    # model setup
    model_path = 'siebert/sentiment-roberta-large-english'
    n_sentiments = 5
    
    # training
    epochs = 3
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    learning_rate = 3e-5

## Read Data

In [3]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

In [4]:
train.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


## 2 - Download the finished model and tokenizer
SOURCE: https://huggingface.co/siebert/sentiment-roberta-large-english

In [6]:
tokenizer = AutoTokenizer.from_pretrained(Config.model_path)
model = AutoModelForSequenceClassification.from_pretrained(Config.model_path, num_labels=Config.n_sentiments, ignore_mismatched_sizes=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([5, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Encode the input data (phrases & labels)

In [7]:
tokenized_train_data = train['Phrase'].apply(lambda x: tokenizer(x, truncation=True))
collator = DataCollatorWithPadding(tokenizer)

In [9]:
sent_values = train['Sentiment'].values

for i in range(len(tokenized_train_data)):
    tokenized_train_data[i]['label'] = sent_values[i]

print(tokenized_train_data)
print(tokenized_train_data[0])

0         [input_ids, attention_mask, label]
1         [input_ids, attention_mask, label]
2         [input_ids, attention_mask, label]
3         [input_ids, attention_mask, label]
4         [input_ids, attention_mask, label]
                         ...                
156055    [input_ids, attention_mask, label]
156056    [input_ids, attention_mask, label]
156057    [input_ids, attention_mask, label]
156058    [input_ids, attention_mask, label]
156059    [input_ids, attention_mask, label]
Name: Phrase, Length: 156060, dtype: object
{'input_ids': [0, 250, 651, 9, 11363, 1115, 4216, 16987, 5, 2329, 1580, 14, 99, 16, 205, 13, 5, 29910, 16, 67, 205, 13, 5, 821, 6072, 2156, 103, 9, 61, 10930, 524, 9764, 53, 4146, 9, 61, 5353, 7, 203, 9, 10, 527, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 1}


## Dataloader

In [10]:
train_loader = DataLoader(tokenized_train_data, batch_size=Config.train_batch, shuffle=True, collate_fn=collator)

## 3 - Training the model

In [11]:
def train(model, optimizer, train_loader, lr_scheduler,
          progress_bar, num_epochs=Config.epochs, device=Config.device):
    
    model.train()
    for e in range(num_epochs):
        ###################
        # train the model #
        ###################
        for batch in train_loader:
            # pass the data to the device
            batch = {k: v.to(device) for k, v in batch.items()}
            # forward
            out = model(**batch)
            loss = out.loss
            # optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # update bar and output loss
            progress_bar.update(1)
            print(f'EPOCH: {e}\t LOSS: {loss}')

In [12]:
# train params
model = model.to(Config.device)
progress_bar = tqdm(range(Config.epochs))
optimizer = AdamW(model.parameters(), lr=Config.learning_rate)
n_steps = Config.epochs * len(train_loader)
lr_scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=n_steps)
train(model, optimizer, train_loader, lr_scheduler, progress_bar)

  0%|          | 0/3 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: CUDA out of memory. Tried to allocate 42.00 MiB (GPU 0; 4.00 GiB total capacity; 3.28 GiB already allocated; 0 bytes free; 3.44 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## 4 - Get test predictions and make submission

In [None]:
# Encode the input phrases data 
tokenized_test_data = test['Phrase'].apply(lambda x: tokenizer(x, truncation=True))
# Dataloader
test_loader = DataLoader(tokenized_test_data, batch_size=Config.train_batch, shuffle=True, collate_fn=collator)

In [None]:
def predict(model, test_loader):
    # list to save preds
    test_preds = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            # pass the data to the device
            batch = {k: v.to(device) for k, v in batch.items()}
            # forward
            out = model(**batch)
            # preds raw logits
            logits = out.logits
            # get the most likely predict
            preds = torch.argmax(logits, dim=-1).item()
            test_preds.extend(preds)
            
    return test_preds

In [None]:
# get predictions
test_preds = predict(model, test_loader)

In [None]:
# make submission DataFrame
submission = pd.DataFrame(list(zip(test['PhraseId'], test_preds)), columns=['PhraseId', 'Sentiment'])
submission.head(25)

In [None]:
submission.to_csv('submission.csv', index=False)