In [None]:
!pip install transformers
!pip install demoji
!pip install sentencepiece

In [None]:
# Fix the seed of every random number generator used by the notebook
# (Python's `random`, NumPy and PyTorch) so that runs are repeatable.

import random

import numpy as np
import torch

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)


In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(dev)

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper, so this
# cell is meant to be run inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project root on the mounted Drive; the data files are read from
# "{home_path}/data". Change this path to match your own Drive layout.
home_path = "/content/drive/MyDrive/NLP 243/project"

In [None]:
# Experiment switches read by the data-loading, cleaning and splitting cells.
training_args = {
    # Add a linear layer before the final layer
    "add_linear": False, 
    # Which training file to load:
    #   "original": the original train set (data/train.csv)
    #   any other value (e.g. "translated"): the translation-augmented train
    #   set (data/full_translate_all.tsv)
    "train_on": "original",
    # Replace emojis with their textual description (demoji)
    "use_demoji": True,
    # Remove @user style mentions
    "remove_mentions": True,
    # Remove every word that contains a digit
    "remove_numbers": True,
    # Remove bare "http" / "https" tokens
    "remove_http": True,
    # Stratify the train/validation split by language
    "stratify_split": True,
    # Drop Korean, Dutch, Arabic and Hindi rows from the training split
    "train_zero_shot": True
}

In [None]:
import torch.nn as nn
from transformers import AutoTokenizer, XLMRobertaModel


class MultiLingualModel(nn.Module):
    """XLM-RoBERTa encoder followed by a single-output regression head.

    The module owns its tokenizer, so ``forward`` is called with raw text and
    tokenizes it internally. Encoder and heads are moved to ``DEVICE``.

    params:
        model_name: checkpoint name/path handed to ``from_pretrained``
        add_linear: when True, an extra Dropout -> ReLU -> Linear block of the
            encoder's hidden width is applied before the regression head
    """

    def __init__(self, model_name, add_linear=False):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = XLMRobertaModel.from_pretrained(
            model_name, output_attentions=False, output_hidden_states=False
        ).to(DEVICE)
        # Take the head width from the loaded checkpoint instead of assuming
        # 768, so checkpoints with another hidden size work as well.
        hidden_size = self.model.config.hidden_size
        self.add_linear = add_linear
        if self.add_linear:
            self.linear = nn.Sequential(
                nn.Dropout(0.2), nn.ReLU(), nn.Linear(hidden_size, hidden_size)
            ).to(DEVICE)
        self.regressor = nn.Sequential(
            nn.Dropout(0.1), nn.ReLU(), nn.Linear(hidden_size, 1)
        ).to(DEVICE)

    def forward(self, sentences):
        """Tokenize ``sentences``, encode them and regress one value each.

        params:
            sentences: raw text input, passed straight to the tokenizer
                (the training loop passes a list of strings)

        returns:
            (out, encoded_input): the regression head output and the
            tokenizer output that was fed to the encoder
        """
        encoded_input = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        ).to(DEVICE)
        # Element 1 of the encoder output is used as the sentence
        # representation (the pooled output, per the transformers docs).
        out = self.model(**encoded_input)[1]
        if self.add_linear:
            out = self.linear(out)
        out = self.regressor(out)
        return out, encoded_input

In [None]:
import pandas as pd
# Load the training file: the original train set, or (for any other value of
# "train_on") the tab-separated translation-augmented set.
data = (
    pd.read_csv(f"{home_path}/data/train.csv")
    if training_args["train_on"] == "original"
    else pd.read_csv(f"{home_path}/data/full_translate_all.tsv", sep="\t")
)

In [None]:
# Quick overview of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Data Cleaning Actions

In [None]:
import demoji
import re

def handle_emoji(x):
    """Spell out emojis as text and turn every colon into a space.

    ``demoji.replace_with_desc`` replaces each emoji in ``x`` with its
    description; afterwards every ":" in the resulting string is replaced by
    a single space.
    """
    described = demoji.replace_with_desc(x)
    return described.replace(":", " ")

# Strip leading and trailing inverted commas
data["text"] = data["text"].apply(lambda x: x.strip("'"))

if training_args["use_demoji"]:
    # Expand emojis with description using demoji library
    data["text"] = data["text"].apply(handle_emoji)

if training_args["remove_mentions"]:
    # get rid of mentions @user @whatever
    data["text"] = data["text"].str.replace(r"@[A-Za-z0-9_]+", "", regex=True)

if training_args["remove_numbers"]:
    # remove words containing numbers
    data["text"] = data["text"].str.replace(r"\w*\d\w*", "", regex=True)

if training_args["remove_http"]:
    # Remove bare "http" / "https" tokens. Only the whitespace *before* the
    # token is consumed (the one after it is a lookahead), so the neighbouring
    # words stay separated, consecutive tokens are all removed, and tokens at
    # the very start or end of the text are matched too.
    data["text"] = data["text"].str.replace(
        r"(?:^|\s)https?(?=\s|$)", "", regex=True
    )

In [None]:
def get_data_loader(data, batch_size=16, train=True):
    """Yield ``(texts, labels)`` mini-batches from a dataframe.

    params:
        data: dataframe with a "text" and a "label" column
        batch_size: maximum number of rows per batch; must be positive
        train: when True the rows are shuffled first with a fixed
            random_state=0 (so the order is identical on every call); when
            False the rows are served in their existing order

    yields:
        (list of texts, tensor of labels); the last batch may be smaller

    raises:
        ValueError: if batch_size is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size < 1:
        # A non-positive step would never advance through the data.
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if train:
        shuffled_data = data.sample(frac=1, random_state=0).reset_index(drop=True)
    else:
        shuffled_data = data
    for start in range(0, len(shuffled_data), batch_size):
        # Positional slice; running past the end is clipped automatically.
        sub_data = shuffled_data.iloc[start : start + batch_size]
        yield sub_data["text"].tolist(), torch.tensor(sub_data["label"].tolist())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation (fixed seed). The split is
# stratified by language when "stratify_split" is enabled.
stratify_labels = data["language"] if training_args["stratify_split"] else None
train_data, valid_data = train_test_split(
    data, test_size=0.2, shuffle=True, random_state=0, stratify=stratify_labels
)

In [None]:
# Zero-shot setting: remove these four languages from the training split
# (valid_data is left untouched by this cell).
if training_args["train_zero_shot"]:
    zero_shot_mask = train_data["language"].isin(["Korean", "Dutch", "Arabic", "Hindi"])
    train_data = train_data[~zero_shot_mask]

In [None]:
# List the distinct languages present in the training split.
train_data.language.unique()

In [None]:
# Overview of the validation split (columns, dtypes, non-null counts).
valid_data.info()

In [None]:
# Overview of the training split (columns, dtypes, non-null counts).
train_data.info()

In [None]:
from tqdm import tqdm


def train_or_valid(model_args, curr_epoch, model, is_train=True):
    """
    Run one full pass over the training or the validation split.

    params:
        model_args: a dict of model parameters; must hold "batch_size",
            "criterion", "optimizer" and "scheduler"
        curr_epoch: Current value of the epoch (used in the progress bar and
            in the name of the prediction column)
        model: model to be trained / evaluated; called as model(list_of_texts)
        is_train: True to train (backward pass, gradient clipping, optimizer
            and scheduler step per batch), False to only evaluate

    returns:
        (mean_loss, y_pred_list, y_list)
            mean_loss: mean of the per-batch losses (nan if no batch was seen)
            y_pred_list: predictions in the order the rows were processed
            y_list: the gold labels in the same order

    side effects:
        Stores the predictions in a "y_pred_{curr_epoch}" column of the global
        train_data (is_train=True) or valid_data (is_train=False) frame.
    """
    loss_list = []
    y_pred_list = []
    y_list = []
    model_args["optimizer"].zero_grad()
    if is_train:
        # Shuffle here (same fixed-seed shuffle the loader would apply) and
        # keep the shuffled frame, so the processing order is known and the
        # predictions can be written back to the rows they belong to.
        epoch_data = train_data.sample(frac=1, random_state=0)
        model.train()
        train_type = "train"
    else:
        epoch_data = valid_data
        model.eval()
        train_type = "valid"
    # train=False: the rows are already in the order they should be served.
    data_loader = get_data_loader(
        epoch_data, batch_size=model_args["batch_size"], train=False
    )
    # torch.cuda.device() only accepts CUDA devices; skip it on the CPU fallback.
    on_cuda = DEVICE.type == "cuda"

    # Autograd is only needed while training; evaluation runs without graphs.
    with torch.set_grad_enabled(is_train), tqdm(data_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {curr_epoch} - {train_type}")
        for X, labels in tepoch:
            model_args["optimizer"].zero_grad()
            y = labels.float().to(DEVICE)
            y_pred, _ = model(X)
            y_pred = y_pred.reshape(-1)
            y_pred_list.extend(y_pred.tolist())
            y_list.extend(y.tolist())
            loss = model_args["criterion"](y_pred, y)
            loss_list.append(loss.item())
            if is_train:
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 2)
                model_args["optimizer"].step()
                model_args["scheduler"].step()
                if on_cuda:
                    with torch.cuda.device(DEVICE):
                        torch.cuda.empty_cache()
            tepoch.set_postfix(loss=sum(loss_list) / len(loss_list))

    column = f"y_pred_{curr_epoch}"
    if is_train:
        # Predictions are in shuffled order: align them on the row index
        # instead of assigning them positionally.
        train_data[column] = pd.Series(y_pred_list, index=epoch_data.index)
    else:
        valid_data[column] = y_pred_list
    mean_loss = sum(loss_list) / len(loss_list) if loss_list else float("nan")
    return mean_loss, y_pred_list, y_list

In [None]:
# Defining parameters for training the model
def get_model_args():
    """Return the training hyper-parameters as a fresh ``{param: value}`` dict."""
    args = dict(
        batch_size=64,
        epoch=15,
        learning_rate=0.0001,
        model_name="cardiffnlp/twitter-xlm-roberta-base",
    )
    return args

In [None]:
import scipy

def compute_r(y, y_pred):
    """Pearson correlation between two equally long sequences of numbers.

    params:
        y: gold values
        y_pred: predicted values

    returns:
        the result of ``scipy.stats.pearsonr(y, y_pred)``; element 0 is the
        correlation coefficient and element 1 the p-value
    """
    # Import the submodule explicitly: a bare `import scipy` does not load
    # `scipy.stats` on every SciPy version.
    from scipy.stats import pearsonr

    return pearsonr(y, y_pred)

In [None]:
# Compute and store the Pearson correlation for each language after every epoch

def reset_language_score():
    """Return a dict with one empty score list per language in ``valid_data``."""
    return {language: [] for language in valid_data["language"].unique()}


def compute_language_correlation(valid_data, epoch, language_score):
    """Append this epoch's per-language correlation to ``language_score``.

    For every language in ``valid_data`` the rows of that language are
    selected, ``compute_r`` is applied to their "y_pred_{epoch}" and "label"
    columns, the full result is printed and its first element is appended to
    ``language_score[language]``.
    """
    pred_column = f"y_pred_{epoch}"
    for language in valid_data["language"].unique():
        rows = valid_data[valid_data["language"] == language]
        r = compute_r(rows[pred_column], rows["label"])
        print(f"correlation for {language} is : {r}")
        language_score[language].append(r[0])

In [None]:
import math
import time

from transformers import get_linear_schedule_with_warmup

model_args = get_model_args()
# Pass the "add_linear" switch from training_args on to the model.
model = MultiLingualModel(
    model_args["model_name"], add_linear=training_args["add_linear"]
)

# Loss and Optimization
# The scheduler is stepped once per batch, so its length is
# (batches per epoch) x (number of epochs).
steps_per_epoch = math.ceil(len(train_data) / model_args["batch_size"])
total_steps = steps_per_epoch * model_args["epoch"]
model_args["criterion"] = nn.MSELoss()
# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 keeps the old class's default (PyTorch defaults to 0.01).
model_args["optimizer"] = torch.optim.AdamW(
    model.parameters(),
    lr=model_args["learning_rate"],
    eps=1e-8,
    weight_decay=0.0,
)
model_args["scheduler"] = get_linear_schedule_with_warmup(
    model_args["optimizer"], num_warmup_steps=0, num_training_steps=total_steps
)

language_score = reset_language_score()
# Log Metrics
epoch_train_loss = []
epoch_valid_loss = []
epoch_valid_r = []

# Baseline: validate once before any training step. It is recorded under
# epoch 0, so its prediction column is overwritten by the first training
# epoch, while its per-language score stays as the first entry of every
# language_score list.
valid_loss, valid_y_pred, valid_y = train_or_valid(model_args, 0, model, False)
compute_language_correlation(valid_data, 0, language_score)

# Begin Training
for epoch in range(model_args["epoch"]):

    # Train the model
    train_loss, _, _ = train_or_valid(model_args, epoch, model)
    epoch_train_loss.append(train_loss)
    # torch.cuda.device() only accepts CUDA devices; skip it on the CPU fallback.
    if DEVICE.type == "cuda":
        with torch.cuda.device(DEVICE):
            torch.cuda.empty_cache()
    # validate the model
    valid_loss, valid_y_pred, valid_y = train_or_valid(model_args, epoch, model, False)
    compute_language_correlation(valid_data, epoch, language_score)
    epoch_valid_loss.append(valid_loss)
    # Overall (all languages) validation correlation for this epoch.
    epoch_valid_r.append(compute_r(valid_y, valid_y_pred)[0])

In [None]:
import matplotlib.pyplot as plt

import os

# Every curve holds one point per validation pass. The training cell validates
# once before training and once per epoch, so the curves can be longer than
# the epoch count; the extra leading points are labelled "init".
num_epochs = model_args["epoch"]
num_points = max((len(score) for score in language_score.values()), default=0)
num_initial = max(num_points - num_epochs, 0)
xi = list(range(num_points))
tick_labels = ["init"] * num_initial + [
    str(epoch) for epoch in range(num_points - num_initial)
]

plt.rcParams["figure.figsize"] = (12, 5)
for lang, score in language_score.items():
    plt.plot(score, label=f"pearson_{lang}")

plt.xticks(xi, tick_labels)
plt.xlabel("Epoch")
plt.ylabel("Pearson's Score")
plt.title("Epoch vs Pearson's Score")
plt.legend(fancybox=True, shadow=True)
# savefig does not create missing folders.
os.makedirs("images", exist_ok=True)
plt.savefig("images/pearson_score-xlmt_base-6lang.png")
plt.show()

In [None]:
# Absolute error of the last epoch's predictions on the validation split.
# The column is derived from the configured epoch count (epochs are numbered
# from 0), so it stays correct when the number of epochs changes.
last_epoch = model_args["epoch"] - 1
valid_data["diff"] = (valid_data[f"y_pred_{last_epoch}"] - valid_data["label"]).abs()

In [None]:
# Write the validation rows to xlm_valid.csv, sorted so that the rows with the
# largest "diff" come first.
valid_data.sort_values("diff", ascending=False).to_csv("xlm_valid.csv")