In [None]:
!pip install git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install bitsandbytes

In [None]:
!pip install datasets

In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

from sklearn.model_selection import train_test_split

import tensorflow as tf
import torch
import datasets
from datasets import Dataset

import os
from google.colab import drive


In [None]:
# Mount Google Drive so the tweet data stored there is visible to this runtime.
drive.mount('/content/drive', force_remount=True)
# Use the absolute path: the original relative chdir only resolves while the
# working directory is still the parent of the mount point, so re-running this
# cell after the first chdir would fail.
os.chdir("/content/drive/My Drive/tweet-data/")


# Read all training data
train_big_df = pd.read_hdf('train-data.h5', 'train')

# Read all testing data
test_big_df = pd.read_hdf('test-data.h5', 'test')

Mounted at /content/drive


In [None]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

In [None]:
# Upper bound on tokens per example; passed to the tokenizer as `max_length`.
MAX_LEN = 512
# Checkpoint name used to load both the tokenizer and the classification model.
roberta_checkpoint = "roberta-large"

In [None]:
# Store the target column as plain integers in both the train and test frames.
for frame in (train_big_df, test_big_df):
    frame['labels'] = frame['labels'].astype(int)

In [None]:
# Wrap each pandas frame in a Hugging Face Dataset and group both under one DatasetDict.
# The DataFrame index is carried along as the `__index_level_0__` column, which is
# dropped later together with the other non-model columns.
train_dataset = Dataset.from_pandas(train_big_df, split="train")
test_dataset = Dataset.from_pandas(test_big_df, split="test")
dataset = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})

In [None]:
# Split the original training set 80/20 into training and validation parts (fixed seed for reproducibility)
data = dataset['train'].train_test_split(train_size=0.8, seed=42)
# `train_test_split` names the held-out part "test"; store it under "val" instead
data['val'] = data.pop("test")
# Attach the separately loaded test split so all three splits live in one DatasetDict
data['test'] = dataset['test']

In [None]:
# Sanity check: print row counts, column names and dtypes of the train and test splits.
data['train'].to_pandas().info()
data['test'].to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24304 entries, 0 to 24303
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tweet_id           24304 non-null  int64 
 1   text               24304 non-null  object
 2   labels             24304 non-null  int64 
 3   __index_level_0__  24304 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 759.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8606 entries, 0 to 8605
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tweet_id           8606 non-null   int64 
 1   text               8606 non-null   object
 2   labels             8606 non-null   int64 
 3   __index_level_0__  8606 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 269.1+ KB


In [None]:
# Balanced class weights: n_samples / (n_classes * n_samples_in_class), with 2 classes.
# The label column is converted to pandas and counted once, rather than repeating
# the Dataset -> DataFrame conversion for every term of the formula.
train_labels = data['train'].to_pandas().labels
label_counts = train_labels.value_counts()
pos_weights = len(train_labels) / (2 * label_counts[1])
neg_weights = len(train_labels) / (2 * label_counts[0])
pos_weights, neg_weights

(1.4830363680742007, 0.7543140906269398)

In [None]:
# Longest training text, measured in characters and in whitespace-separated words.
train_text = data['train'].to_pandas()['text']
max_char = train_text.str.len().max()
max_words = train_text.str.split().str.len().max()

print(f"Max characters: {max_char}")
print(f"Max words: {max_words}")

Max characters: 283
Max words: 56


In [None]:
# Show the first training example to confirm which fields each record carries.
data['train'][0]

{'tweet_id': 903420789361344513,
 'text': 'please see message below we r look 4 donations please bring them to zysa this weekend our friends need help',
 'labels': 0,
 '__index_level_0__': 24268}

In [None]:
# Load the tokenizer belonging to the chosen checkpoint; `add_prefix_space=True` is passed through to it.
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint, add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def roberta_preprocessing_function(examples):
    """Tokenize the ``text`` field of ``examples`` with ``roberta_tokenizer``.

    Truncation is enabled with ``MAX_LEN`` as the maximum length; no padding
    argument is passed here.

    Args:
        examples: Mapping with a ``text`` entry (one example or a batch).

    Returns:
        The tokenizer's output for that text.
    """
    tokenizer_options = {"truncation": True, "max_length": MAX_LEN}
    return roberta_tokenizer(examples['text'], **tokenizer_options)

In [None]:
# Smoke-test the preprocessing function on a single training example.
roberta_preprocessing_function(data['train'][0])

{'input_ids': [0, 2540, 192, 1579, 874, 52, 910, 356, 204, 5215, 2540, 836, 106, 7, 992, 2459, 102, 42, 983, 84, 964, 240, 244, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Columns that are not model inputs and are dropped once tokenization is done
col_to_delete = ['tweet_id','text','__index_level_0__']
# Tokenize every split in batches and remove the columns listed above
roberta_tokenized_datasets = data.map(roberta_preprocessing_function, batched=True, remove_columns=col_to_delete)
# Rename the target column from "labels" to "label", following the Hugging Face naming convention
roberta_tokenized_datasets = roberta_tokenized_datasets.rename_column("labels", "label")
# Return torch tensors when examples are accessed (applied in place)
roberta_tokenized_datasets.set_format("torch")

Map:   0%|          | 0/24304 [00:00<?, ? examples/s]

Map:   0%|          | 0/6077 [00:00<?, ? examples/s]

Map:   0%|          | 0/8606 [00:00<?, ? examples/s]

In [None]:
# Confirm that a tokenized example now holds only label, input_ids and attention_mask tensors.
roberta_tokenized_datasets['train'][0]

{'label': tensor(0),
 'input_ids': tensor([   0, 2540,  192, 1579,  874,   52,  910,  356,  204, 5215, 2540,  836,
          106,    7,  992, 2459,  102,   42,  983,   84,  964,  240,  244,    2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [None]:
# Data collator that pads each batch to the length of its longest example,
# so padding is decided per batch rather than fixed at tokenization time.
from transformers import DataCollatorWithPadding
roberta_data_collator = DataCollatorWithPadding(tokenizer=roberta_tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the pretrained checkpoint with a sequence-classification head sized for 2 labels.
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install peft

Collecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.8.2


In [None]:
from peft import get_peft_model, LoraConfig, TaskType
# LoRA settings for sequence classification: rank 2, alpha 16, 10% dropout, bias terms left untouched.
roberta_peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=2, lora_alpha=16, lora_dropout=0.1, bias="none",
)
# Wrap the base model with the LoRA adapters.
# NOTE(review): this rebinds `roberta_model`, so re-running the cell would wrap an
# already-wrapped model — reload the base model first if the cell must be re-run.
roberta_model = get_peft_model(roberta_model, roberta_peft_config)
# Report how many parameters remain trainable after wrapping.
roberta_model.print_trainable_parameters()

trainable params: 1,248,258 || all params: 356,610,052 || trainable%: 0.35003444042009224


In [None]:
!pip install evaluate

In [None]:
import evaluate
import numpy as np

# Metric objects are loaded lazily and cached, so every evaluation pass reuses them
# instead of calling `evaluate.load` four times per invocation.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called ``name``, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the trainer.
            ``logits`` may also be a tuple whose first element holds the logits.

    Returns:
        dict: Scores under the keys ``"precision"``, ``"recall"``,
        ``"f1-score"`` and ``"accuracy"`` (the trainer expects metric names
        mapped to values).
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Predicted class = index of the largest logit.
    predictions = np.argmax(logits, axis=-1)

    # (name of the metric in `evaluate`, key reported to the trainer)
    metric_keys = (
        ("precision", "precision"),
        ("recall", "recall"),
        ("f1", "f1-score"),
        ("accuracy", "accuracy"),
    )
    scores = {}
    for metric_name, report_key in metric_keys:
        result = _get_metric(metric_name).compute(predictions=predictions, references=labels)
        scores[report_key] = result[metric_name]
    return scores

In [None]:
from transformers import Trainer

class WeightedCELossTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The class weights are read from the notebook-level ``neg_weights`` (class 0)
    and ``pos_weights`` (class 1) values.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and score its logits with weighted cross-entropy.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted for compatibility with Trainer versions that
                pass this argument; not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            KeyError: If ``inputs`` has no ``"labels"`` entry.
        """
        labels = inputs["labels"]
        # Build a label-free view instead of popping, so the caller's batch is left intact.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        # Get model's predictions
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Take device and dtype from the logits: a wrapped model (e.g. DataParallel)
        # does not necessarily expose a `.device` attribute.
        class_weights = torch.tensor(
            [neg_weights, pos_weights], device=logits.device, dtype=logits.dtype
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        # The number of classes is the size of the last logits dimension.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
import torch

In [None]:
# Move the model to the GPU eagerly only when one is present; an unconditional
# `.cuda()` raises on a CPU-only runtime, and the Trainer places the model on its
# own device anyway.
if torch.cuda.is_available():
    roberta_model = roberta_model.cuda()

In [None]:
from transformers import TrainingArguments

# Optimisation hyper-parameters, kept as notebook-level names.
lr = 1e-4
batch_size = 8
num_epochs = 5

training_args = TrainingArguments(
    # NOTE(review): the directory name says "token-classification", but this
    # notebook trains a sequence classifier — consider renaming for new runs.
    output_dir="roberta-large-lora-token-classification",
    # --- schedule and optimiser ---
    num_train_epochs=num_epochs,
    learning_rate=lr,
    # NOTE(review): the "constant" schedule presumably ignores `warmup_ratio`;
    # confirm whether "constant_with_warmup" was intended.
    lr_scheduler_type="constant",
    warmup_ratio=0.1,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- batching ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- evaluate and checkpoint once per epoch, then restore the best checkpoint ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- precision and memory ---
    fp16=False,
    gradient_checkpointing=True,
)

In [None]:
# Assemble the trainer: LoRA-wrapped model, class-weighted loss, dynamic padding,
# and the validation ("val") split for per-epoch evaluation.
roberta_trainer = WeightedCELossTrainer(
    model=roberta_model,
    args=training_args,
    train_dataset=roberta_tokenized_datasets['train'],
    eval_dataset=roberta_tokenized_datasets["val"],
    data_collator=roberta_data_collator,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Fine-tune the model; evaluation runs on the validation split after every epoch
roberta_trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.5075,0.404747,0.663005,0.891802,0.760569,0.806154
2,0.4596,0.380847,0.672365,0.899905,0.76967,0.814053
3,0.4374,0.366981,0.739093,0.823642,0.77908,0.838736
4,0.4615,0.358001,0.725301,0.86082,0.787271,0.839394
5,0.4316,0.362365,0.754018,0.805052,0.7787,0.842027


Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Checkpoint destination directory roberta-large-lora-token-classification/checkpoint-3038 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=15190, training_loss=0.46583178270953984, metrics={'train_runtime': 924.5121, 'train_samples_per_second': 131.442, 'train_steps_per_second': 16.43, 'total_flos': 8857083788047104.0, 'train_loss': 0.46583178270953984, 'epoch': 5.0})

In [None]:
# Re-evaluate on the trainer's eval dataset (the "val" split). With
# `load_best_model_at_end=True`, the scores below match the epoch-4 row of the training log.
roberta_trainer.evaluate()

{'eval_loss': 0.35800111293792725,
 'eval_precision': 0.7253012048192771,
 'eval_recall': 0.8608198284080076,
 'eval_f1-score': 0.7872711421098517,
 'eval_accuracy': 0.8393944380450881,
 'eval_runtime': 25.9797,
 'eval_samples_per_second': 233.914,
 'eval_steps_per_second': 29.254,
 'epoch': 5.0}