## 1 - Import packages

In [1]:
! pip install transformers





In [2]:
import pandas as pd 
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import get_scheduler, AdamW
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

## Config

In [3]:
class Config:
    """Settings shared by the data-loading, model-setup and training cells."""

    # --- data preprocessing ---
    train_batch = 4  # batch size passed to the DataLoaders

    # --- model setup ---
    model_path = 'siebert/sentiment-roberta-large-english'
    n_sentiments = 5  # used as num_labels when the model is instantiated

    # --- training ---
    epochs = 3
    learning_rate = 3e-5
    print_every = 1000  # the training loop logs the loss every `print_every` batches
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Read Data

In [4]:
# The train/test splits are tab-separated; the sample submission is a regular CSV.
train, test = (pd.read_csv(tsv_path, sep='\t') for tsv_path in ('train.tsv', 'test.tsv'))
sample_submission = pd.read_csv('sampleSubmission.csv')

In [5]:
# Preview the first five training rows (head() defaults to n=5).
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
# Preview the first five test rows (head() defaults to n=5).
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [7]:
# Preview the expected submission layout (head() defaults to n=5).
sample_submission.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


## 2 - Download the pretrained model and tokenizer
SOURCE: https://huggingface.co/siebert/sentiment-roberta-large-english

In [8]:
# Tokenizer and classifier are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(Config.model_path)
# The checkpoint ships a 2-way classification head; requesting n_sentiments outputs
# with ignore_mismatched_sizes=True re-initialises that head instead of raising.
model = AutoModelForSequenceClassification.from_pretrained(
    Config.model_path,
    num_labels=Config.n_sentiments,
    ignore_mismatched_sizes=True,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([5, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Encode the input data (phrases & labels)

In [9]:
def phrase_tokenize(phrase):
    """Encode `phrase` with the notebook-level `tokenizer`, with truncation enabled.

    Returns whatever the tokenizer call returns for that phrase.
    """
    encoding = tokenizer(phrase, truncation=True)
    return encoding

In [None]:
# Collator built from the tokenizer; it is passed to the DataLoaders as collate_fn.
collator = DataCollatorWithPadding(tokenizer)
# Tokenize every training phrase individually (one encoding per row).
tokenized_train_data = train['Phrase'].map(phrase_tokenize)

In [None]:
# Store each row's sentiment on its encoding under the 'label' key.
sent_values = train['Sentiment'].values

for encoding, sentiment in zip(tokenized_train_data, sent_values):
    encoding['label'] = sentiment

# Sanity check: the whole series, then the first labelled encoding.
print(tokenized_train_data)
print(tokenized_train_data[0])

## Dataloader

In [None]:
# Shuffled training batches; the collator assembles each list of encodings into a batch.
train_loader = DataLoader(
    tokenized_train_data,
    batch_size=Config.train_batch,
    shuffle=True,
    collate_fn=collator,
)

## 3 - Training the model

In [None]:
def train(model, optimizer, train_loader, lr_scheduler,
          progress_bar, num_epochs=Config.epochs, device=Config.device, print_every=Config.print_every):
    """Fine-tune `model` in place on the batches yielded by `train_loader`.

    NOTE: defining this function rebinds the notebook-level name `train`, which
    previously referred to the training DataFrame. Cells that read `train[...]`
    must run before this one (or be re-run after reloading the data).

    Args:
        model: module whose forward call accepts the batch as keyword arguments
            and returns an object exposing `.loss`.
        optimizer: optimizer stepped once per batch.
        train_loader: iterable of dict batches whose values support `.to(device)`.
        lr_scheduler: scheduler stepped once per batch, right after the optimizer.
        progress_bar: object with `update(n)`; advanced by 1 per batch, so its
            total should be `num_epochs * len(train_loader)`.
        num_epochs: number of full passes over `train_loader`.
        device: device each batch is moved to before the forward pass.
        print_every: log the loss and learning rate every this many batches
            (counted from the start of each epoch).

    Returns:
        None.
    """
    model.train()
    for e in range(num_epochs):
        for n_iters, batch in enumerate(train_loader):
            # pass the data to the device
            batch = {k: v.to(device) for k, v in batch.items()}
            # forward
            out = model(**batch)
            loss = out.loss
            # optimize
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            # update bar and output loss
            progress_bar.update(1)
            if n_iters % print_every == 0:
                # .item() logs the scalar value rather than the tensor repr
                # (which would include device and grad_fn noise).
                print(f'EPOCH: {e}\t LOSS: {loss.item():.4f}')
                print("Using learning-rate scheduler with LR ", lr_scheduler.get_last_lr())

In [None]:
# train params
model = model.to(Config.device)
# The optimizer and scheduler are stepped once per batch, so the schedule length
# is the total number of batches over all epochs.
n_steps = Config.epochs * len(train_loader)
# train() advances the bar once per batch, so its total must be n_steps
# (a total of Config.epochs would be exceeded after the first few batches).
progress_bar = tqdm(range(n_steps))
# NOTE(review): transformers' AdamW is reported as deprecated in favour of
# torch.optim.AdamW - confirm against the installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=Config.learning_rate)
lr_scheduler = get_scheduler('linear', optimizer=optimizer, num_warmup_steps=0, num_training_steps=n_steps)
train(model, optimizer, train_loader, lr_scheduler, progress_bar)

## 4 - Get test predictions and make submission

In [None]:
# Tokenize the test phrases the same way as the training phrases.
tokenized_test_data = test['Phrase'].map(phrase_tokenize)
# No shuffling here, so predictions come back in the row order of `test`.
test_loader = DataLoader(
    tokenized_test_data,
    batch_size=Config.train_batch,
    collate_fn=collator,
)

In [None]:
def predict(model, test_loader, device=Config.device):
    """Return the predicted class index for every example in `test_loader`.

    Args:
        model: module whose forward call accepts the batch as keyword arguments
            and returns an object exposing `.logits`.
        test_loader: iterable of dict batches whose values support `.to(device)`.
        device: device each batch is moved to before the forward pass.

    Returns:
        A flat list of Python ints (argmax over the last logits dimension),
        in the order the loader yields the examples.
    """
    test_preds = []

    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            # pass the data to the device
            batch = {k: v.to(device) for k, v in batch.items()}
            # forward pass; keep only the raw logits
            logits = model(**batch).logits
            # Most likely class per example. tolist() copies the whole batch to
            # the host at once instead of calling .item() on every 0-d tensor.
            test_preds.extend(torch.argmax(logits, dim=-1).tolist())

    return test_preds

In [None]:
# Run inference over the test loader; predict() returns a flat list with one
# predicted class index per example.
test_preds = predict(model, test_loader)

In [None]:
# make submission DataFrame
# Built column-wise: unlike zip(), which silently truncates to the shorter input,
# this raises if the number of predictions does not match the number of ids.
submission = pd.DataFrame({
    'PhraseId': test['PhraseId'].to_numpy(),
    'Sentiment': test_preds,
})
submission.head(25)

In [None]:
# index=False keeps the DataFrame index out of the file, so only the
# PhraseId and Sentiment columns are written.
submission.to_csv('submission.csv', index=False)