In [1]:
!pip install accelerate==0.20.1
!pip install transformers[torch] -U
!pip install datasets

Collecting accelerate==0.20.1
  Using cached accelerate-0.20.1-py3-none-any.whl (227 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.24.1
    Uninstalling accelerate-0.24.1:
      Successfully uninstalled accelerate-0.24.1
Successfully installed accelerate-0.20.1
Collecting accelerate>=0.20.3 (from transformers[torch])
  Using cached accelerate-0.24.1-py3-none-any.whl (261 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.20.1
    Uninstalling accelerate-0.20.1:
      Successfully uninstalled accelerate-0.20.1
Successfully installed accelerate-0.24.1


In [2]:
from google.colab import files
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import re
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from scipy import stats

## **Data read-in**

In [3]:
uploaded = files.upload()

Saving train.csv to train (1).csv


In [4]:
data_file = {"train": "train.csv"}

dataset = load_dataset("csv", data_files=data_file)

LANGUAGES = set(dataset['train']['language'])

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 9491
    })
})

In [5]:
temp = dataset["train"].train_test_split(test_size=0.20, seed=42)
dataset["validation"] = temp['test']
dataset["train"] = temp['train']
del(temp)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 7592
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 1899
    })
})

In [6]:
#def clean(examples):
#    new_texts = []
#    for text in examples['text']:
#        text = re.sub('@user', '', text)
#        text = re.sub('http', '', text)
#        text = re.sub('@[\w]+', '', text)
#        text = text.strip()
#        new_texts.append(text)

#    examples['text'] = new_texts
#    return examples

#dataset=dataset.map(clean, batched=True)
#dataset

## **Creating a function to calculate metrics**

In [7]:
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    # loss metrics
    mse = mean_squared_error(labels, logits)

    rmse = mean_squared_error(labels, logits, squared=False)

    mae = mean_absolute_error(labels, logits)

    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)


    # performance metrics
    r2 = r2_score(labels, logits)

    pearson, _ =stats.pearsonr(np.squeeze(np.asarray(labels)), np.squeeze(np.asarray(logits)))

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape, "pearson": pearson}

In [8]:
NUM_EPOCHS = 3

# Specifiy the arguments for the trainer
training_args = TrainingArguments(
    output_dir ='./results',
    num_train_epochs = NUM_EPOCHS,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 20,
    weight_decay = 0.01,
    learning_rate = 2e-5,
    logging_dir = './logs',
    save_total_limit = 10,
    load_best_model_at_end = True,
    # metric_for_best_model = 'rmse',
    metric_for_best_model = 'pearson',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    report_to = 'all',
)

## **Model training and evaluation**


In [9]:
def model_training(model_name):
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1).to("cuda")

  #TOKENIZING
  from transformers import AutoTokenizer
  if 'MiniLM' in model_name:
    # we must load the tokenizer of XLM-R
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

  MAX_LEN = 50

  #DATA ENCODING
  def tokenize(examples):
       return tokenizer(examples["text"], truncation=True, max_length=MAX_LEN, padding='max_length')

  #apply tokenizer and remove the columns that we do not need anymore
  data_encodings=dataset.map(tokenize, batched=True, remove_columns=['text','language'])
  data_encodings


  # Call the Trainer
  trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = data_encodings['train'],
    eval_dataset = data_encodings['validation'],
    compute_metrics = compute_metrics_for_regression,
    #callbacks=[EarlyStoppingCallback(3, 0.0)]
  )

  # Train the model
  trainer.train()

  return

In [10]:
bert = model_training('bert-base-multilingual-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7592 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.493826,0.493826,0.702728,0.546102,0.3788,26.749963,0.617215
2,No log,0.53988,0.53988,0.734765,0.579424,0.320868,28.064824,0.622784
3,No log,0.500363,0.500363,0.707364,0.547627,0.370577,26.708632,0.629318


In [11]:
XLM_R = model_training('xlm-roberta-base')

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7592 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.488928,0.488928,0.699234,0.543346,0.384962,26.591417,0.637247
2,No log,0.479497,0.479497,0.692457,0.539344,0.396825,26.248657,0.67082
3,No log,0.447083,0.447083,0.668643,0.510229,0.4376,24.964447,0.677153


In [12]:
XLM_T = model_training('cardiffnlp/twitter-xlm-roberta-base')

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7592 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.536008,0.536008,0.732125,0.576175,0.325739,27.748293,0.667796
2,No log,0.455897,0.455897,0.675202,0.533087,0.426512,26.173636,0.693139
3,No log,0.435713,0.435713,0.660085,0.508337,0.451903,24.878721,0.694078


In [13]:
TwHIN = model_training('Twitter/TwHIN-BERT-base')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Twitter/TwHIN-BERT-base and are newly initialized: ['classifier.bias', 'bert.pooler.dense.weight', 'classifier.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7592 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape,Pearson
1,No log,0.460952,0.460952,0.678935,0.537618,0.420153,26.394577,0.673954
2,No log,0.463061,0.463061,0.680486,0.52522,0.4175,25.465743,0.693538
3,No log,0.451049,0.451049,0.671602,0.513189,0.432611,24.926343,0.695919
