In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import torch

import random

import numpy as np

# Seed every random number generator the notebook draws from so that a
# Restart & Run All starts from the same random state each time.
random.seed(0)
np.random.seed(0)
# torch was previously left unseeded, which made the freshly initialised
# regression head and the dropout masks differ between runs.
torch.manual_seed(0)
# torch.use_deterministic_algorithms(False)


<IPython.core.display.Javascript object>

In [3]:
# Pick the compute device: the second GPU when the host has one, otherwise
# the first GPU, otherwise the CPU.
if torch.cuda.is_available():
    # "cuda:1" only exists on multi-GPU hosts; fall back to "cuda:0" so the
    # notebook still runs on a single-GPU machine.
    dev = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    dev = "cpu"
DEVICE = torch.device(dev)

<IPython.core.display.Javascript object>

In [4]:
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

# tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
# model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-100-1280")

<IPython.core.display.Javascript object>

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained("xlm-mlm-100-1280", num_labels=1)

<IPython.core.display.Javascript object>

In [6]:
import torch.nn as nn
from transformers import XLMRobertaTokenizer, XLMRobertaModel


class MultiLingualModel(nn.Module):
    """XLM-RoBERTa encoder followed by a dropout + linear regression head.

    The tokenizer is owned by the module, so callers pass raw text and get
    one regression score per input text back.

    params:
        model_name: checkpoint name or path handed to the tokenizer and the
            encoder ``from_pretrained`` loaders
        max_length: optional cap on the tokenised sequence length; None (the
            default) leaves truncation at the tokenizer's own limit
    """

    def __init__(self, model_name, max_length=None):
        super().__init__()
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = XLMRobertaModel.from_pretrained(
            model_name, output_attentions=False, output_hidden_states=False
        ).to(DEVICE)
        # Size the head from the encoder config instead of hard-coding 768,
        # so checkpoints with a different hidden width also work.
        hidden_size = self.model.config.hidden_size
        self.regressor = nn.Sequential(
            nn.Dropout(0.1), nn.Linear(hidden_size, 1)
        ).to(DEVICE)

    def forward(self, sentences):
        """Tokenise ``sentences`` and predict one score per sentence.

        params:
            sentences: a single string or a list of strings

        returns:
            (out, encoded_input): ``out`` is the regressor output with one
            row per sentence and a single column; ``encoded_input`` is the
            tokenizer output that was fed to the encoder (already on DEVICE)
        """
        encoded_input = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(DEVICE)
        # Element 1 of the encoder output is the pooled sentence vector. The
        # load-time warning printed in this notebook reports the pooler
        # weights as newly initialised, so they are learned during training.
        out = self.model(**encoded_input)[1]
        out = self.regressor(out)
        return out, encoded_input

<IPython.core.display.Javascript object>

In [7]:
# Build a model instance for the smoke-test cells below; the training cell
# later rebinds `model` to a freshly loaded instance.
model = MultiLingualModel("cardiffnlp/twitter-xlm-roberta-base")

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this mode

<IPython.core.display.Javascript object>

In [None]:
# Smoke test: push two short sentences through the model and keep both the
# regressor output and the tokenizer output for inspection.
out, tokens = model(["Wikipedia was used to", "This is great"])

In [None]:
out

In [None]:
tokens

In [None]:
del out

In [7]:
# Release cached CUDA memory on the device the notebook is using. Skipped
# when DEVICE is the CPU, where there is no CUDA context to enter.
if DEVICE.type == "cuda":
    with torch.cuda.device(DEVICE):
        torch.cuda.empty_cache()

<IPython.core.display.Javascript object>

In [7]:
import pandas as pd

# Load the training set. The preview below shows the columns "text", "label"
# and "language", plus an unnamed first column.
data = pd.read_csv("data/train.csv")

<IPython.core.display.Javascript object>

In [8]:
data.head()

Unnamed: 0,text,label,language
0,wearing a fake engagement ring so guys won’t a...,1.8,English
1,Bees vs. Wasps. http,1.0,English
2,Here is a nice equation: 0+0-0-0+0=0,1.0,English
3,@user @user Enjoy each new day!😊🇨🇦🐞🐭,1.6,English
4,I can be having a perfectly good day then I th...,1.6,English


<IPython.core.display.Javascript object>

In [9]:
import demoji
import re


def handle_emoji(x):
    """Pass ``x`` through ``demoji.replace_with_desc`` and then turn every
    colon in the result into a space."""
    described = demoji.replace_with_desc(x)
    return described.replace(":", " ")


# Text cleanup, applied in this order:
#   1. demoji (see handle_emoji)
#   2. get rid of mentions @user @whatever
#   3. remove words containing numbers
data["text"] = (
    data["text"]
    .apply(handle_emoji)
    .str.replace(r"@[A-Za-z0-9_]+", "", regex=True)
    .str.replace(r"\w*\d\w*", "", regex=True)
)

<IPython.core.display.Javascript object>

In [10]:
def get_data_loader(data, batch_size=16, train=True):
    """Generate ``(texts, labels)`` mini-batches from a DataFrame.

    params:
        data: DataFrame with a "text" and a "label" column
        batch_size: number of rows per batch (the last batch may be smaller)
        train: when True the rows are shuffled with a fixed seed first;
            when False they are served in their existing order

    yields:
        a list of texts and a tensor built from the matching labels
    """
    frame = data.sample(frac=1, random_state=0).reset_index(drop=True) if train else data
    total_rows = len(frame)
    cursor = 0
    while cursor < total_rows:
        # Positional row slice; slicing past the end simply stops at the end.
        chunk = frame[cursor : cursor + batch_size]
        cursor += batch_size
        texts = chunk["text"].tolist()
        labels = torch.tensor(chunk["label"].tolist())
        yield texts, labels

<IPython.core.display.Javascript object>

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same on every run.
train_data, valid_data = train_test_split(
    data, test_size=0.2, shuffle=True, random_state=0
)

<IPython.core.display.Javascript object>

In [12]:
train_data.head()

Unnamed: 0,text,label,language
8744,我一看到的第一反应竟然是 只狼,1.25,Chinese
6595,J’arrive pas à me concentrer c’est pour ca que...,3.6,French
2797,Con una mano mi cervecita y la otra en en el c...,1.8,Spanish
7047,J'ai l'impression que depuis Everyday Robots...,1.0,French
7419,je suis japonaise c’est officiel woozy face,2.0,French


<IPython.core.display.Javascript object>

In [13]:
from tqdm import tqdm


def train_or_valid(model_args, curr_epoch, model, is_train=True):
    """
    Run one full pass over the training split (with weight updates) or over
    the validation split (forward passes only).

    params:
        model_args: dict providing "batch_size" and "criterion"; "optimizer"
            and "scheduler" are also required when is_train is True
        curr_epoch: label of the current epoch; shown in the progress bar and
            used as the suffix of the prediction column that is written
        model: model to be trained or evaluated; it is called with a list of
            texts and must return a tuple whose first item is the prediction
            tensor
        is_train: True to train on train_data, False to evaluate on valid_data

    returns:
        (mean_loss, y_pred_list, y_list): the mean of the per-batch losses
        (nan if there were no batches), the flat list of predictions and the
        flat list of targets, both in the order the batches were processed

    side effects:
        Stores the predictions in column f"y_pred_{curr_epoch}" of the
        module-level train_data (training) or valid_data (validation).
    """
    loss_list = []
    y_pred_list = []
    y_list = []
    pred_column = f"y_pred_{curr_epoch}"
    if is_train:
        # Shuffle here, with the same fixed seed the loader would use, so the
        # visiting order is known and predictions can be written back to the
        # rows they belong to.
        # NOTE(review): the seed is fixed, so every epoch sees the batches in
        # the same order.
        epoch_data = train_data.sample(frac=1, random_state=0)
        model.train()
        train_type = "train"
    else:
        epoch_data = valid_data
        model.eval()
        train_type = "valid"
    # train=False: the loader must keep the row order of epoch_data.
    data_loader = get_data_loader(
        epoch_data, batch_size=model_args["batch_size"], train=False
    )

    # Autograd is only enabled for training; validation does not need to
    # record the graph, which saves memory.
    with tqdm(data_loader, unit="batch") as tepoch, torch.set_grad_enabled(is_train):
        tepoch.set_description(f"Epoch {curr_epoch} - {train_type}")
        for X, y in tepoch:
            y = y.float().to(DEVICE)
            y_pred, _ = model(X)
            # Flatten to one value per text so it lines up with y.
            y_pred = y_pred.reshape(-1)
            y_pred_list.extend(y_pred.tolist())
            y_list.extend(y.tolist())
            loss = model_args["criterion"](y_pred, y)
            loss_list.append(loss.item())
            if is_train:
                model_args["optimizer"].zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 2)
                model_args["optimizer"].step()
                # The scheduler is stepped once per batch, not once per epoch.
                model_args["scheduler"].step()
            tepoch.set_postfix(loss=sum(loss_list) / len(loss_list))

    if is_train:
        # Predictions are in shuffled order; assign by index label so each
        # value lands on the row it was computed from.
        train_data.loc[epoch_data.index, pred_column] = y_pred_list
    else:
        # Validation rows were visited in their stored order.
        valid_data[pred_column] = y_pred_list
    mean_loss = sum(loss_list) / len(loss_list) if loss_list else float("nan")
    return mean_loss, y_pred_list, y_list

<IPython.core.display.Javascript object>

In [14]:
# Defining parameters for the model
def get_model_args():
    """Return a new ``{param: value}`` dict with the training hyperparameters."""
    hyperparams = dict(batch_size=64, epoch=15, learning_rate=0.0001)
    return hyperparams

<IPython.core.display.Javascript object>

In [15]:
import numpy as np
import scipy

"""
    https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html
    
"""


def compute_r(y, y_pred):
    """Pearson correlation between two equally long numeric sequences.

    params:
        y: first sequence (e.g. the targets)
        y_pred: second sequence (e.g. the predictions)

    returns:
        the result of ``scipy.stats.pearsonr``: the correlation coefficient
        first, the p-value second
    """
    # A bare `import scipy` does not guarantee that the `stats` subpackage is
    # loaded on every SciPy version, so import it explicitly.
    import scipy.stats

    return scipy.stats.pearsonr(y, y_pred)

<IPython.core.display.Javascript object>

In [16]:
def compute_language_correlation(valid_data, epoch):
    """Print, for every distinct value of the "language" column, the result of
    compute_r between the ``y_pred_{epoch}`` column and the "label" column."""
    pred_column = f"y_pred_{epoch}"
    for language in valid_data["language"].unique():
        # Select this language's rows once and reuse them for both columns.
        rows = valid_data[valid_data["language"] == language]
        r = compute_r(rows[pred_column], rows["label"])
        print(f"correlation for {language} is : {r}")

<IPython.core.display.Javascript object>

In [17]:
import time
from transformers import AdamW, get_linear_schedule_with_warmup

model = MultiLingualModel("cardiffnlp/twitter-xlm-roberta-base")
model_args = get_model_args()
# Loss and Optimization
# The scheduler is stepped once per batch, so its length is the number of
# batches per epoch (ceiling division) times the number of epochs.
steps_per_epoch = -(-len(train_data) // model_args["batch_size"])
total_steps = steps_per_epoch * model_args["epoch"]
model_args["criterion"] = nn.MSELoss()
model_args["optimizer"] = AdamW(
    model.parameters(), lr=model_args["learning_rate"], eps=1e-8
)
model_args["scheduler"] = get_linear_schedule_with_warmup(
    model_args["optimizer"], num_warmup_steps=0, num_training_steps=total_steps
)

# Log Metrics
epoch_train_loss = []
epoch_valid_loss = []
epoch_valid_r = []
# Begin Training
# Baseline: validate the model before any training step.
# NOTE(review): the baseline is stored under epoch label 0, so its y_pred_0
# column is overwritten by the validation pass of the first training epoch.
valid_loss, valid_y_pred, valid_y = train_or_valid(model_args, 0, model, False)
compute_language_correlation(valid_data, 0)

for epoch in range(model_args["epoch"]):

    # Train the model
    train_loss, _, _ = train_or_valid(model_args, epoch, model)
    epoch_train_loss.append(train_loss)
    # Release cached CUDA memory between the train and validation passes;
    # skipped on CPU, where there is no CUDA context to enter.
    if DEVICE.type == "cuda":
        with torch.cuda.device(DEVICE):
            torch.cuda.empty_cache()
    # Probe a fixed sentence. Run it in eval mode without autograd so dropout
    # is off and no graph is kept alive for a value that is only printed.
    model.eval()
    with torch.no_grad():
        print(model("je suis japonaise c’est officiel 🥴"))
    # validate the model
    valid_loss, valid_y_pred, valid_y = train_or_valid(model_args, epoch, model, False)
    compute_language_correlation(valid_data, epoch)
    epoch_valid_loss.append(valid_loss)
    # Record the overall Pearson r for this epoch (first item of the result),
    # not the compute_r function object itself.
    epoch_valid_r.append(compute_r(valid_y, valid_y_pred)[0])

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mode

correlation for Italian is : PearsonRResult(statistic=-0.0005086902132619628, pvalue=0.9929408763020414)
correlation for Spanish is : PearsonRResult(statistic=-0.019057547127698835, pvalue=0.7289734631225289)
correlation for English is : PearsonRResult(statistic=-0.029828186403735975, pvalue=0.5979085304913662)
correlation for Chinese is : PearsonRResult(statistic=-0.06900922780174096, pvalue=0.22120421450935226)
correlation for Portuguese is : PearsonRResult(statistic=-0.028175494972319207, pvalue=0.6122471736784157)
correlation for French is : PearsonRResult(statistic=0.026948101483886114, pvalue=0.6397806057242621)


Epoch 0 - train: : 88batch [00:19,  4.55batch/s, loss=0.678]


OutOfMemoryError: CUDA out of memory. Tried to allocate 72.00 MiB (GPU 1; 23.70 GiB total capacity; 21.25 GiB already allocated; 36.31 MiB free; 22.63 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

<IPython.core.display.Javascript object>

In [21]:
epoch_valid_loss

[0.49118588169415794,
 0.47502252260843914,
 0.497125643491745,
 0.5000736157099406,
 0.6068410396575927,
 0.48411247730255125,
 0.4828253189722697,
 0.5049465616544088,
 0.5449186623096466,
 0.5153661012649536,
 0.5320098876953125,
 0.5557697554429372,
 0.5617479185263315,
 0.5785285830497742,
 0.5175983746846516]

<IPython.core.display.Javascript object>

In [None]:
model("je suis japonaise c’est officiel 🥴")

In [None]:
model("Posting some VIP client tickets: http")

In [None]:
model("que asco los besos")

In [None]:
valid_data.head()

In [None]:
# Absolute error of the last epoch's validation predictions. The column name
# is derived from the configured epoch count (15 epochs -> "y_pred_14")
# instead of being hard-coded, and the subtraction is vectorised rather than
# applied row by row.
last_epoch = model_args["epoch"] - 1
valid_data["diff"] = (valid_data[f"y_pred_{last_epoch}"] - valid_data["label"]).abs()

In [None]:
# Write the validation rows to xlm_valid.csv, sorted by "diff" in descending
# order.
valid_data.sort_values("diff", ascending=False).to_csv("xlm_valid.csv")