In [1]:
!pip install accelerate==0.20.1
!pip install transformers[torch] -U
!pip install datasets

Collecting accelerate==0.20.1
  Using cached accelerate-0.20.1-py3-none-any.whl (227 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.25.0
    Uninstalling accelerate-0.25.0:
      Successfully uninstalled accelerate-0.25.0
Successfully installed accelerate-0.20.1
Collecting accelerate>=0.20.3 (from transformers[torch])
  Using cached accelerate-0.25.0-py3-none-any.whl (265 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.20.1
    Uninstalling accelerate-0.20.1:
      Successfully uninstalled accelerate-0.20.1
Successfully installed accelerate-0.25.0


In [2]:
from google.colab import files
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import re
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy import stats
from scipy.stats import pearsonr
import pandas as pd

## **Data read-in**

In [3]:
uploaded = files.upload()

Saving train.csv to train (2).csv
Saving test.csv to test (2).csv


In [4]:
data_file = {"train": "train.csv"}

dataset = load_dataset("csv", data_files=data_file)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 9491
    })
})

In [5]:
temp = dataset["train"].train_test_split(test_size=0.20, seed=42)
dataset["validation"] = temp['test']
dataset["train"] = temp['train']
del(temp)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 7592
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 1899
    })
})

In [6]:
#def clean(examples):
#    new_texts = []
#    for text in examples['text']:
#        text = re.sub('@user', '', text)
#        text = re.sub('http', '', text)
#        text = re.sub('@[\w]+', '', text)
#        text = text.strip()
#        new_texts.append(text)

#    examples['text'] = new_texts
#    return examples

#dataset=dataset.map(clean, batched=True)
#dataset

## **Creating a function to calculate metrics**

In [7]:
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)
    predictions = np.squeeze(logits)

    # loss metrics
    mse = mean_squared_error(labels, logits)

    rmse = mean_squared_error(labels, logits, squared=False)

    mae = mean_absolute_error(labels, logits)

    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)


    # performance metrics
    r2 = r2_score(labels, logits)

    pearson, _ =stats.pearsonr(np.squeeze(np.asarray(labels)), np.squeeze(np.asarray(logits)))

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape, "pearson": pearson}

In [8]:
NUM_EPOCHS = 3

# Specifiy the arguments for the trainer
training_args = TrainingArguments(
    output_dir ='./results',
    num_train_epochs = NUM_EPOCHS,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 20,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = './logs',
    save_total_limit = 10,
    load_best_model_at_end = True,
    # metric_for_best_model = 'rmse',
    metric_for_best_model = 'pearson',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    report_to = 'all',
)

## **Model training and evaluation**


In [9]:
def model_training(model_name):
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1).to("cuda")

  #TOKENIZING
  from transformers import AutoTokenizer
  if 'MiniLM' in model_name:
    # we must load the tokenizer of XLM-R
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

  MAX_LEN = 50

  #DATA ENCODING
  def tokenize(examples):
       return tokenizer(examples["text"], truncation=True, max_length=MAX_LEN, padding='max_length')

  #apply tokenizer and remove the columns that we do not need anymore
  data_encodings=dataset.map(tokenize, batched=True, remove_columns=['text','language'])
  data_encodings


  # Call the Trainer
  trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = data_encodings['train'],
    eval_dataset = data_encodings['validation'],
    compute_metrics = compute_metrics_for_regression,
    #callbacks=[EarlyStoppingCallback(3, 0.0)]
  )

  # Train the model
  trainer.train()

  return data_encodings

In [10]:
def model_testing(model_name, test_dataset):
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1).to("cuda")
  from transformers import AutoTokenizer
  if 'MiniLM' in model_name:
    # we must load the tokenizer of XLM-R
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

  def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to("cuda")
    outputs = model(**inputs)   #output is a tensor
    return outputs[0].item()

  y_test = test_dataset["test"]['label']

  # generate predictions for each text
  y_pred=[get_prediction(text) for text in test_dataset["test"]['text']]

  pearson_r, _ = pearsonr(y_pred, y_test)
  print("Pearson's r coefficient on the test data is: " , pearson_r)

  print("**********************************")
  test_with_pred = pd.DataFrame({
          'text': test_dataset["test"]['text'],
          'language': test_dataset["test"]['language'],
          'predicted_label': y_pred,
          'true_label': test_dataset["test"]['label']
          })

  def calculate_pearsonr_for_diff_lang(language, data):
    language_with_pred = test_with_pred[test_with_pred['language'] == language]
    pearson, _ = pearsonr(language_with_pred['predicted_label'], language_with_pred['true_label'])
    return pearson

  # Example usage:
  languages = set(test_with_pred['language'])

  for lang in languages:
      correlation = calculate_pearsonr_for_diff_lang(lang, test_with_pred)
      print(f"Pearson correlation for {lang} with SVR is {correlation}")

  return

In [11]:
bert = model_training('bert-base-multilingual-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.571621,0.571621,0.756056,0.617418,0.280939,30.160738,0.60598
2,No log,0.515537,0.515537,0.718009,0.568227,0.35149,27.731981,0.620481
3,No log,0.504532,0.504532,0.710304,0.551787,0.365333,26.872511,0.627036


In [12]:
XLM_R = model_training('xlm-roberta-base')

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.546759,0.546759,0.739431,0.602757,0.312215,29.937759,0.623906
2,No log,0.558759,0.558759,0.747502,0.591706,0.297119,28.458852,0.664422
3,No log,0.50162,0.50162,0.708252,0.556557,0.368996,27.111033,0.674618


In [13]:
XLM_T = model_training('cardiffnlp/twitter-xlm-roberta-base')

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.4393,0.4393,0.662797,0.517962,0.447391,25.645233,0.678094
2,No log,0.431653,0.431653,0.657003,0.510731,0.45701,25.092524,0.696242
3,No log,0.441141,0.441141,0.664184,0.511783,0.445075,24.94926,0.698879


In [14]:
TwHIN = model_training('Twitter/TwHIN-BERT-base')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Twitter/TwHIN-BERT-base and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.513228,0.513228,0.7164,0.573064,0.354394,27.974316,0.672887
2,No log,0.456196,0.456196,0.675423,0.519178,0.426136,25.276371,0.690251
3,No log,0.464867,0.464867,0.681812,0.523387,0.415229,25.334921,0.693858


## **Evaluation on the test data**

In [15]:
data_file_test = {"test": "test.csv"}

test_dataset = load_dataset("csv", data_files=data_file_test)

test_dataset

DatasetDict({
    test: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 3881
    })
})

In [16]:
def model_test(model_name, test_dataset):
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1).to("cuda")

  #TOKENIZING
  from transformers import AutoTokenizer
  if 'MiniLM' in model_name:
    # we must load the tokenizer of XLM-R
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

  MAX_LEN = 50

  #DATA ENCODING
  def tokenize(examples):
       return tokenizer(examples["text"], truncation=True, max_length=MAX_LEN, padding='max_length')

  #apply tokenizer and remove the columns that we do not need anymore
  data_encodings=dataset.map(tokenize, batched=True, remove_columns=['text','language'])
  data_encodings

  data_encodings_test=test_dataset.map(tokenize, batched=True, remove_columns=['text','language'])
  data_encodings_test


  # Call the Trainer
  trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = data_encodings['train'],
    eval_dataset = data_encodings_test['test'],
    compute_metrics = compute_metrics_for_regression,
    #callbacks=[EarlyStoppingCallback(3, 0.0)]
  )

  trainer.train()

  return data_encodings

In [17]:
bert_test_pred = model_test('bert-base-multilingual-uncased',test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.704079,0.704079,0.839094,0.662975,0.243958,31.272379,0.507431
2,No log,0.701201,0.701201,0.837377,0.639273,0.247049,29.977555,0.513732
3,No log,0.716659,0.716659,0.846557,0.642054,0.23045,29.992045,0.515355


In [18]:
XLM_R_test_pred = model_test('xlm-roberta-base',test_dataset)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.721476,0.721476,0.849397,0.687564,0.225277,32.716713,0.519041
2,No log,0.749886,0.749886,0.86596,0.676921,0.19477,31.523009,0.537938
3,No log,0.713968,0.713968,0.844966,0.654932,0.23334,30.669339,0.541645


In [19]:
XLM_T_test_pred = model_test('cardiffnlp/twitter-xlm-roberta-base',test_dataset)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.66746,0.66746,0.816982,0.61392,0.283279,29.140601,0.547154
2,No log,0.66418,0.66418,0.814972,0.6105,0.286802,28.862694,0.559928
3,No log,0.669384,0.669384,0.818159,0.610427,0.281214,28.762022,0.566367


In [20]:
TwHIN_test_pred = model_test('Twitter/TwHIN-BERT-base',test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Twitter/TwHIN-BERT-base and are newly initialized: ['classifier.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.68449,0.68449,0.827339,0.632915,0.264992,29.80818,0.541622
2,No log,0.694322,0.694322,0.83326,0.609988,0.254436,28.65158,0.557331
3,No log,0.69187,0.69187,0.831787,0.611409,0.257069,28.665005,0.561448
