In [None]:
import os

# Train on the GPU: expose only device 0 to this process.
# The variable has to be in place before PyTorch is loaded, which is why this
# assignment deliberately sits above `import torch`.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

import torch

# Sanity check that PyTorch can actually see a CUDA device.
print(f"CUDA available: {torch.cuda.is_available()}")

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

## Load and split data

In [None]:
# Load the prepared training set from the working directory; the cells below
# read its `text` and `target` columns.
df = pd.read_csv(filepath_or_buffer="train_prepared.csv")

Split the data so that we have an equal ratio of `target` values in both the train and test sets.

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on `target` keeps the
# class ratio the same in both splits; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'],
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

# Verify that the split preserved the class distribution: print the share of
# positive (1) and negative (0) labels for each split.
for header, split_labels in (
    ("train distribution:", y_train),
    ("\ntest distribution:", y_test),
):
    print(header)
    print("true:  {}%".format((split_labels == 1).sum()/len(split_labels)*100))
    print("false: {}%".format((split_labels == 0).sum()/len(split_labels)*100))


## Retrieve the models

In [None]:
MODEL_NAME = "vinai/bertweet-base"

# Pretrained model with a fresh last (classification) layer;
# num_labels=2 -> two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

# The tokenizer that belongs to the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Input Preparation

Convert the data into the format that BERTweet expects. 
Because BERT is a neural network, it operates on numbers, so we have to create a numeric representation of each sentence:
- split the sentences into words
- add special tokens like [CLS] 
- map words to word ids (based on the learned vocabulary)
- apply padding and truncation

In [None]:
from datasets import Dataset


def _tokenize_batch(batch):
    """Tokenize the `text` column of a batch into fixed-length model inputs."""
    # padding='max_length' -> pad every example to the same length so that
    #                         training can be done on batches
    # truncation=True      -> cut off especially long tweets
    # max_length=128       -> original note: most tweets have 50-80 words
    #                         NOTE(review): the limit counts tokens, not words
    return tokenizer(batch["text"], padding='max_length', truncation=True, max_length=128)


# Wrap each split in a Dataset with `text`/`label` columns, then tokenize it
# batch-wise with the shared helper above.
train_dataset = Dataset.from_dict(
    {'text': X_train.tolist(), 'label': y_train.tolist()}
).map(_tokenize_batch, batched=True)

test_dataset = Dataset.from_dict(
    {'text': X_test.tolist(), 'label': y_test.tolist()}
).map(_tokenize_batch, batched=True)

## Train the model

Create a metrics function which will report the training progress. We care about metrics that are more suitable for imbalanced datasets.

In [None]:
def compute_metrics(eval_pred: EvalPrediction) -> dict:
    """Compute F1, precision and recall of the positive class for one evaluation.

    Args:
        eval_pred: pair ``(predictions, labels)``.
            ``predictions`` holds one row of per-class scores per example,
            ``[[score_class_0, score_class_1], ...]``. NOTE(review): these are
            presumably raw logits rather than percentages; only their argmax
            is used here, so either works.
            ``labels`` holds the true class of each example.

    Returns:
        Dict with the keys ``f1``, ``precision`` and ``recall`` as plain floats.
    """
    predictions, labels = eval_pred

    # Index of the larger score per example (class 0 or 1); axis=1 -> per row.
    predicted_classes = np.argmax(predictions, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average='binary',  # report the scores of the positive class (label 1) only
        # If the model predicts no positives at all, precision is undefined.
        # Report 0 explicitly (the same value the default yields) instead of
        # emitting an UndefinedMetricWarning on every such evaluation.
        zero_division=0,
    )

    return {
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall),
    }

Use the transformers training algorithm to train the Bertweet model.

In [None]:
training_args = TrainingArguments(
    # --- optimisation ---
    optim='adamw_torch',
    learning_rate=3e-5,   # original note: BERTweet paper suggests 2e-5 to 5e-5
    weight_decay=0.01,    # discourage overly large weights
    max_grad_norm=1.0,    # gradient clipping: prevents huge weight updates (exploding gradients)
    # Gradually increase the learning rate at the start to avoid unstable
    # updates early in training.
    # NOTE(review): 500 steps may be a large share of the total number of
    # steps, depending on the dataset size - confirm.
    warmup_steps=500,
    # --- schedule and batching ---
    num_train_epochs=4,
    per_device_train_batch_size=32,  # tweets are short, so the batch can be bigger
    per_device_eval_batch_size=32,
    # --- evaluation and checkpointing ---
    eval_strategy="epoch",            # evaluate the model after each epoch
    save_strategy="epoch",            # write a checkpoint after each epoch (kept in step with evaluation)
    output_dir='./temp_checkpoints',  # checkpoints are needed so the best one can be reloaded
    load_best_model_at_end=True,      # after training, load the best-scoring checkpoint
    metric_for_best_model='f1',       # F1 balances precision and recall on slightly imbalanced data
)

# Trainer API specialised for Hugging Face transformers.
# NOTE(review): the test split is also used to pick the best checkpoint, so the
# reported metrics are optimistic - consider a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Save the best model after training.

In [None]:
# Persist the trained weights and the tokenizer to their own directories.
# Training ran with load_best_model_at_end=True, so the trainer holds the
# best-scoring checkpoint at this point.
trainer.save_model(output_dir='./final_bertweet_model')
tokenizer.save_pretrained(save_directory='./final_bertweet_tokenizer_model')

Optionally, load the model.

In [None]:
# Reload the fine-tuned classifier and its tokenizer from disk.
# The classifier is bound to `model` (not `trainer`): it is a model, not a
# Trainer, and rebinding `trainer` would clobber the Trainer instance while
# leaving `model` pointing at the old in-memory object.
model = AutoModelForSequenceClassification.from_pretrained('./final_bertweet_model')
tokenizer = AutoTokenizer.from_pretrained('./final_bertweet_tokenizer_model')