## 1 - Import packages

In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd 
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import get_scheduler
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

## Config

In [3]:
class Config:
    # data preprocessing
    train_batch = 32
    
    # model setup
    model_path = 'siebert/sentiment-roberta-large-english'
    n_sentiments = 5
    
    # training
    epochs = 3
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    learning_rate = 3e-5
    print_every = 100

## Read Data

In [4]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')
sample_submission = pd.read_csv('sampleSubmission.csv')

In [5]:
train.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
test.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [7]:
sample_submission.head(5)

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


## 2 - Download the finished model and tokenizer
SOURCE: https://huggingface.co/siebert/sentiment-roberta-large-english

In [8]:
tokenizer = AutoTokenizer.from_pretrained(Config.model_path)
model = AutoModelForSequenceClassification.from_pretrained(Config.model_path, num_labels=Config.n_sentiments, ignore_mismatched_sizes=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([5, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Encode the input data (phrases & labels)

In [9]:
tokenized_train_data = train['Phrase'].apply(lambda x: tokenizer(x, truncation=True))
collator = DataCollatorWithPadding(tokenizer)

In [10]:
sent_values = train['Sentiment'].values

for i in range(len(tokenized_train_data)):
    tokenized_train_data[i]['label'] = sent_values[i]

print(tokenized_train_data)
print(tokenized_train_data[0])

0         [input_ids, attention_mask, label]
1         [input_ids, attention_mask, label]
2         [input_ids, attention_mask, label]
3         [input_ids, attention_mask, label]
4         [input_ids, attention_mask, label]
                         ...                
156055    [input_ids, attention_mask, label]
156056    [input_ids, attention_mask, label]
156057    [input_ids, attention_mask, label]
156058    [input_ids, attention_mask, label]
156059    [input_ids, attention_mask, label]
Name: Phrase, Length: 156060, dtype: object
{'input_ids': [0, 250, 651, 9, 11363, 1115, 4216, 16987, 5, 2329, 1580, 14, 99, 16, 205, 13, 5, 29910, 16, 67, 205, 13, 5, 821, 6072, 2156, 103, 9, 61, 10930, 524, 9764, 53, 4146, 9, 61, 5353, 7, 203, 9, 10, 527, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 1}


## Dataloader

In [11]:
train_loader = DataLoader(tokenized_train_data, batch_size=Config.train_batch, shuffle=True, collate_fn=collator)

## 3 - Training the model

In [12]:
def train(model, optimizer, train_loader, lr_scheduler,
          progress_bar, num_epochs=Config.epochs, device=Config.device, print_every=Config.print_every):
    
    model.train()
    for e in range(num_epochs):
        ###################
        # train the model #
        ###################
        n_iters = 0
        for batch in train_loader:
            # pass the data to the device
            batch = {k: v.to(device) for k, v in batch.items()}
            # forward
            out = model(**batch)
            loss = out.loss
            # optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # update bar and output loss
            progress_bar.update(1)
            if n_iters % print_every == 0:
              print(f'EPOCH: {e}\t LOSS: {loss}')
            n_iters += 1

In [13]:
# train params
model = model.to(Config.device)
progress_bar = tqdm(range(Config.epochs))
optimizer = AdamW(model.parameters(), lr=Config.learning_rate)
n_steps = Config.epochs * len(train_loader)
lr_scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=n_steps)
train(model, optimizer, train_loader, lr_scheduler, progress_bar)

  0%|          | 0/3 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


EPOCH: 0	 LOSS: 1.8150309324264526
EPOCH: 0	 LOSS: 0.9176779389381409
EPOCH: 0	 LOSS: 0.928498387336731
EPOCH: 0	 LOSS: 0.8213568329811096
EPOCH: 0	 LOSS: 0.9981415271759033
EPOCH: 0	 LOSS: 0.8740782141685486
EPOCH: 0	 LOSS: 0.898932158946991
EPOCH: 0	 LOSS: 0.9598965644836426
EPOCH: 0	 LOSS: 0.6587538719177246
EPOCH: 0	 LOSS: 0.6056280732154846
EPOCH: 0	 LOSS: 0.8944194912910461
EPOCH: 0	 LOSS: 0.6544626355171204
EPOCH: 0	 LOSS: 0.9701473116874695
EPOCH: 0	 LOSS: 0.6983817219734192
EPOCH: 0	 LOSS: 0.8921201825141907
EPOCH: 0	 LOSS: 0.8020201325416565
EPOCH: 0	 LOSS: 0.7697359919548035
EPOCH: 0	 LOSS: 0.6693500876426697
EPOCH: 0	 LOSS: 0.61357581615448
EPOCH: 0	 LOSS: 0.5793221592903137
EPOCH: 0	 LOSS: 0.9366759657859802
EPOCH: 0	 LOSS: 0.777459979057312
EPOCH: 0	 LOSS: 0.9879554510116577
EPOCH: 0	 LOSS: 1.0580925941467285
EPOCH: 0	 LOSS: 0.7522875070571899
EPOCH: 0	 LOSS: 1.1362261772155762
EPOCH: 0	 LOSS: 0.7770671844482422
EPOCH: 0	 LOSS: 0.6883115768432617
EPOCH: 0	 LOSS: 0.6610530

## 4 - Get test predictions and make submission

In [14]:
# Encode the input phrases data 
tokenized_test_data = test['Phrase'].apply(lambda x: tokenizer(x, truncation=True))
# Dataloader
test_loader = DataLoader(tokenized_test_data, batch_size=Config.train_batch, shuffle=True, collate_fn=collator)

In [20]:
def predict(model, test_loader, device=Config.device):
    # list to save preds
    test_preds = []
    
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            # pass the data to the device
            batch = {k: v.to(device) for k, v in batch.items()}
            # forward
            out = model(**batch)
            # preds raw logits
            logits = out.logits
            # get the most likely predict
            preds = torch.argmax(logits, dim=-1)
            test_preds.extend(preds)
      
    test_preds = [i.item() for i in test_preds]
            
    return test_preds

In [21]:
# get predictions
test_preds = predict(model, test_loader)

In [22]:
# make submission DataFrame
submission = pd.DataFrame(list(zip(test['PhraseId'], test_preds)), columns=['PhraseId', 'Sentiment'])
submission.head(25)

Unnamed: 0,PhraseId,Sentiment
0,156061,0
1,156062,2
2,156063,2
3,156064,1
4,156065,2
5,156066,2
6,156067,2
7,156068,3
8,156069,2
9,156070,4


In [23]:
submission.to_csv('submission.csv', index=False)