## GPT-2 Finetuning on Sentiment Classification

### Overview

- Compare the performance of different text generation models on a sentiment detection task.
- For this, we will fine-tune the text generation model GPT-2 on the training data and report its performance on the test data.
- Along the way, we will learn how to fine-tune text generation (TG) models and how to apply these models to an example NLP task.

### Model

- Huggingface

### Dataset

- Tweet Sentiment

### Download and import packages

In [None]:
# uninstall
!pip uninstall -y wandb

# download
!pip install transformers

# import
import re
import json
import torch
import random
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

### Dataset load and prep functions

In [None]:
# Dataset class
class SentimentDataset(Dataset):
    """Tokenized prompt/answer pairs for sentiment fine-tuning.

    Each (text, label) pair is rendered as
    ``<|startoftext|>Review: {text}\\nSentiment: {sentiment}<|endoftext|>``
    where the raw label 0 becomes ``negative`` and 4 becomes ``positive``.
    The rendered string is tokenized with truncation and padding to
    ``max_length``.

    Args:
        txt_list: Iterable of review texts.
        label_list: Iterable of raw labels (0 or 4), aligned with ``txt_list``.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` entries.
        max_length: Length passed to the tokenizer for truncation/padding.

    Raises:
        KeyError: If a label is neither 0 nor 4.
    """

    def __init__(self, txt_list, label_list, tokenizer, max_length):
        # per-sample storage, filled in lockstep
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        sentiment_names = {0: 'negative', 4: 'positive'}
        for review, raw_label in zip(txt_list, label_list):
            # resolve the label once; it is used in the prompt and stored as-is
            sentiment = sentiment_names[raw_label]
            rendered = f'<|startoftext|>Review: {review}\nSentiment: {sentiment}<|endoftext|>'
            encoded = tokenizer(
                rendered,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))
            self.labels.append(sentiment)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, sentiment_text)`` for ``idx``."""
        token_ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        sentiment = self.labels[idx]
        return token_ids, mask, sentiment

# Data load function
def load_sentiment_dataset(tokenizer, random_seed = 1, file_path="../input/sentiment140/training.1600000.processed.noemoticon.csv"):
    """Build the training dataset and the raw held-out split from the CSV.

    Reads the header-less CSV at ``file_path``, keeps column 0 as the label
    and column 5 as the text, draws a sample of 10,000 rows, trims each text
    to its first 250 whitespace-separated words and performs a stratified
    95/5 train/test split.

    Args:
        tokenizer: Object exposing ``encode``; used here to measure token
            lengths and then handed to ``SentimentDataset``.
        random_seed: Seed for the train/test split only. The 10,000-row
            sample always uses ``random_state=1``.
        file_path: Path of the CSV file to read.

    Returns:
        A pair ``(train_dataset, (X_test, y_test))`` where ``train_dataset``
        is a ``SentimentDataset`` and the test part stays as plain lists of
        texts and raw labels.
    """
    def truncate_words(string, max_words=250):
        # 250 words was chosen by the original author after a few tries,
        # because the resulting token count stayed below 512
        return " ".join(string.split()[:max_words])

    def longest_encoding(samples):
        # token count of the longest sample
        return max(len(tokenizer.encode(sample)) for sample in samples)

    # load the file and keep only the label and text columns
    table = pd.read_csv(file_path, encoding='ISO-8859-1', header=None)
    table = table[[0, 5]]
    table.columns = ['label', 'text']
    table = table.sample(10000, random_state=1)
    table['text'] = table['text'].apply(truncate_words)

    # stratified split into train and test
    texts = table['text'].tolist()
    raw_labels = table['label'].tolist()
    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        raw_labels,
        shuffle=True,
        test_size=0.05,
        random_state=random_seed,
        stratify=table['label'],
    )

    # longest text across both splits, plus 10 tokens of headroom for the
    # special tokens (sos and eos) and fillers; never below 300
    longest = max(longest_encoding(X_train), longest_encoding(X_test))
    max_length = max(longest + 10, 300)
    print(f"Setting max length as {max_length}")

    # only the training split is wrapped in SentimentDataset
    return SentimentDataset(X_train, y_train, tokenizer, max_length=max_length), (X_test, y_test)

### Load model and tokenizer; Call data Prep

In [None]:
# import 
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

# model
# Checkpoint name handed to `from_pretrained` for both the tokenizer and the model.
model_name = "gpt2"
# Value used to seed torch's random number generator below.
seed = 42

# seed
# Called once here, before the trial loop; it is not re-applied for each trial.
torch.manual_seed(seed)

In [None]:
# Fine-tune GPT-2 and score it on the held-out split, repeated over several trials.
def _collate_lm_batch(samples):
    """Stack (input_ids, attention_mask, label_text) samples into a model batch.

    The label text of each sample is ignored; for causal-LM fine-tuning the
    ``labels`` tensor is a copy of ``input_ids``.
    """
    # NOTE(review): padded positions are included in `labels`; Hugging Face only
    # ignores positions set to -100, so padding presumably contributes to the
    # loss. Masking them would change training dynamics - confirm before changing.
    return {'input_ids': torch.stack([sample[0] for sample in samples]),
            'attention_mask': torch.stack([sample[1] for sample in samples]),
            'labels': torch.stack([sample[0] for sample in samples])}


# raw dataset label -> sentiment word used in the prompts
map_label = {0:'negative', 4: 'positive'}
# captures whatever follows "Sentiment: " on a line of the decoded generation
sentiment_pattern = re.compile("\nSentiment: (.*)")
# fall back to CPU when no GPU is present instead of failing on .cuda()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# iterate for N trials
for trial_no in range(3):

    print("Loading model...")
    # load tokenizer and model; the embedding matrix is resized because the
    # tokenizer gains extra special tokens (bos and pad)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name, bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>', pad_token='<|pad|>')
    model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
    model.resize_token_embeddings(len(tokenizer))

    print("Loading dataset...")
    # the trial number doubles as the seed of the train/test split
    train_dataset, (test_texts, test_labels) = load_sentiment_dataset(tokenizer, trial_no)

    print("Start training...")
    # No evaluation runs during training (the held-out split is raw text, not a
    # Dataset), so `load_best_model_at_end` is not requested: it requires the
    # save and evaluation strategies to match.
    training_args = TrainingArguments(output_dir='results', num_train_epochs=2,
                                      logging_steps=10, save_strategy="epoch",
                                      per_device_train_batch_size=2, per_device_eval_batch_size=2,
                                      warmup_steps=100, weight_decay=0.01, logging_dir='logs')

    Trainer(model=model, args=training_args, train_dataset=train_dataset,
            data_collator=_collate_lm_batch).train()

    # test
    print("Start testing...")
    # eval mode on model
    _ = model.eval()

    # compute prediction on test data
    original, predicted, all_text, predicted_text = [], [], [], []
    for text, label in tqdm(zip(test_texts, test_labels), total=len(test_texts)):
        # The prompt already starts with the bos token, exactly as in the
        # training format, so it is tokenized as-is (no second bos prefix).
        prompt = f'<|startoftext|>Review: {text}\nSentiment:'
        generated = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
        # greedy decoding: one sequence, no sampling parameters needed
        sample_outputs = model.generate(generated, do_sample=False, max_length=512,
                                        num_return_sequences=1)
        pred_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
        # extract the predicted sentiment (last match wins)
        try:
            pred_sentiment = sentiment_pattern.findall(pred_text)[-1]
        except IndexError:
            # the generation contained no "Sentiment: ..." line
            pred_sentiment = "None"
        original.append(map_label[label])
        predicted.append(pred_sentiment)
        all_text.append(text)
        predicted_text.append(pred_text)
    # transform into dataframe and persist the per-trial predictions
    df = pd.DataFrame({'text': all_text, 'predicted': predicted, 'original': original, 'predicted_text': predicted_text})
    df.to_csv(f"result_run_{trial_no}.csv", index=False)
    # compute f1 score
    print(f1_score(original, predicted, average='macro'))