1. Employ [Hugging Face](https://huggingface.co/models?pipeline_tag=text-classification&sort=trending&search=sentiment) transformers for the same classification task as in the first assignment.

2. Explore Hugging Face models to find a pre-trained model that is suitable and promising for fine-tuning to your task. It should make sense to pick one that has been pre-trained for the same language and/or text genre.

3. As a bonus, you can also employ a [domain adaptation](https://huggingface.co/learn/llm-course/chapter7/3?fw=pt) approach, explore [parameter-efficient fine-tuning](https://huggingface.co/docs/peft/main/quicktour) (e.g. LoRA), or [prompting language models](https://huggingface.co/docs/transformers/v4.49.0/en/tasks/prompting).

In [None]:
import numpy as np
import pandas as pd
from peft import LoftQConfig, get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from transformers import BitsAndBytesConfig

import utils
from utils import CustomDataset, CustomDataset1

In [None]:
# Read the preprocessed sentiment data: the main CSV first, then the
# validation CSV, each into its own DataFrame.
train_csv_path = "../common/data_sentiment_preprocessed.csv"
val_csv_path = "../common/data_sentiment_preprocessed_val.csv"
combined_sentiment_df, combined_sentiment_df_val = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, val_csv_path)
)

In [None]:
# Separate each frame into inputs (the `text` column) and targets
# (the `sentiment_label` column).
x_train, y_train = combined_sentiment_df["text"], combined_sentiment_df["sentiment_label"]
x_val, y_val = combined_sentiment_df_val["text"], combined_sentiment_df_val["sentiment_label"]

# Applying Pretrained Models

## SIEBERT: Sentiment RoBERTa

In [None]:
# Model card: https://huggingface.co/siebert/sentiment-roberta-large-english?library=transformers
# Article:    https://www.sciencedirect.com/science/article/pii/S0167811622000477

# `pipeline` is already imported in the notebook's import cell, so it is not
# re-imported here.
siebert_roberta = pipeline("text-classification", model="siebert/sentiment-roberta-large-english")

# Sanity-check the pipeline on three short example inputs.
for sample_text in ("I love you!", "I hate you!", "neutral text"):
    print(siebert_roberta(sample_text))


In [None]:
# Map the string labels used for the SiEBERT pipeline ("NEGATIVE"/"POSITIVE")
# to integer class ids, then hand the pipeline, the label map and the
# validation texts/labels to the shared helper in `utils`.
mapper = {label: class_id for class_id, label in enumerate(("NEGATIVE", "POSITIVE"))}
utils.apply_kaggle_model(siebert_roberta, mapper, x_val, y_val)

## DistilBERT: IMDb

In [None]:
# Use a pipeline as a high-level helper. `pipeline` is already imported in the
# notebook's import cell, so it is not re-imported here.
saiffff = pipeline("text-classification", model="saiffff/distilbert-imdb-sentiment")

# Sanity-check the pipeline on three short example inputs.
for sample_text in ("I don't like you!", "this is really good!", "neutral text"):
    print(saiffff(sample_text))

In [None]:
# Map the generic labels used for this pipeline ("LABEL_0"/"LABEL_1") to
# integer class ids, then hand the pipeline, the label map and the validation
# texts/labels to the shared helper in `utils`.
mapper = {label: class_id for class_id, label in enumerate(("LABEL_0", "LABEL_1"))}
utils.apply_kaggle_model(saiffff, mapper, x_val, y_val)

# Fine-tuning Pre-trained Models

## DistilBERT: IMDb

In [None]:
# Model card: https://huggingface.co/saiffff/distilbert-imdb-sentiment
saiffff_checkpoint = "saiffff/distilbert-imdb-sentiment"
saiffff_tokenizer = AutoTokenizer.from_pretrained(saiffff_checkpoint)
saiffff_model = AutoModelForSequenceClassification.from_pretrained(saiffff_checkpoint)

# Report the model size in millions of parameters.
num_parameters = saiffff_model.num_parameters() / 1_000_000
print(f"Number of parameters: {num_parameters:.2f}M")

# Tokenize the training texts, then the validation texts, with this
# checkpoint's tokenizer.
train_encodings, val_encodings = (
    utils.tokenize_data(texts, saiffff_tokenizer) for texts in (x_train, x_val)
)

In [None]:
# Pair the tokenized splits with their labels as the datasets passed to the Trainer.
train_dataset, val_dataset = (
    CustomDataset(train_encodings, y_train),
    CustomDataset(val_encodings, y_val),
)

# Fine-tuning configuration for the DistilBERT checkpoint.
training_args = TrainingArguments(
    output_dir="./saiffff_results",
    logging_dir="./saiffff_logs",
    eval_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)

def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max over
            axis 1 is taken as the predicted class) and ``label_ids`` (the
            reference labels).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    predicted_labels = np.argmax(p.predictions, axis=1)
    true_labels = p.label_ids
    scorers = (("accuracy", accuracy_score), ("f1", f1_score))
    return {metric: scorer(true_labels, predicted_labels) for metric, scorer in scorers}

# Directory that receives both the fine-tuned weights and the tokenizer files.
saiffff_output_dir = "saiffff_model"

trainer = Trainer(
    model=saiffff_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# Save the model and its tokenizer side by side.
trainer.save_model(saiffff_output_dir)
saiffff_tokenizer.save_pretrained(saiffff_output_dir)

## SIEBERT: Sentiment RoBERTa

This model is too large to fine-tune all of its parameters efficiently, so we use parameter-efficient fine-tuning (PEFT) with LoRA instead.

EDIT: PEFT by itself wasn't enough, so we also quantize the model (4-bit LoftQ initialization of the LoRA weights).

In [None]:
# List every sub-module whose qualified name mentions "query" or "value".
# NOTE(review): `siebert_model` is first assigned in the cell below this one,
# so on a fresh kernel this cell raises NameError unless that cell has already
# been run.
for module_name, _module in siebert_model.named_modules():
    if any(keyword in module_name for keyword in ("query", "value")):
        print(module_name)

In [None]:
# All transformers / peft names used here are imported in the notebook's
# import cell rather than re-imported mid-notebook.
model_name = "siebert/sentiment-roberta-large-english"

siebert_tokenizer = AutoTokenizer.from_pretrained(model_name)
# The base model is loaded without a quantization config; quantization enters
# only through the LoftQ-based LoRA initialization configured below.
siebert_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
)

# LoftQ settings: 4-bit quantization for the LoRA weight initialization.
loftq_config = LoftQConfig(loftq_bits=4)

# LoRA adapter for sequence classification: rank 1, alpha 2, dropout 0.1,
# no bias training, weights initialized via LoftQ.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=1,
    lora_alpha=2,
    lora_dropout=0.1,
    bias="none",
    init_lora_weights="loftq",
    loftq_config=loftq_config,
)

# Wrap the freshly loaded base model with the adapter. The base model is
# reloaded above on every run, so re-running this cell never stacks adapters.
siebert_model = get_peft_model(siebert_model, peft_config)

In [None]:
# Tokenize the training texts, then the validation texts, with the SiEBERT tokenizer.
train_encodings, val_encodings = (
    utils.tokenize_data(texts, siebert_tokenizer) for texts in (x_train, x_val)
)

# Pair the tokenized splits with their labels as the datasets passed to the Trainer.
train_dataset, val_dataset = (
    CustomDataset(train_encodings, y_train),
    CustomDataset(val_encodings, y_val),
)

# Fine-tuning configuration for the PEFT-wrapped SiEBERT model.
training_args = TrainingArguments(
    output_dir="./siebert_results",
    logging_dir="./siebert_logs",
    eval_strategy="epoch",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)


def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max over
            axis 1 is taken as the predicted class) and ``label_ids`` (the
            reference labels).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    predicted_labels = np.argmax(p.predictions, axis=1)
    true_labels = p.label_ids
    scorers = (("accuracy", accuracy_score), ("f1", f1_score))
    return {metric: scorer(true_labels, predicted_labels) for metric, scorer in scorers}

# Directory that receives both the fine-tuned model and the tokenizer files.
siebert_output_dir = "siebert_model"

trainer = Trainer(
    model=siebert_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# Save the model and its tokenizer side by side.
trainer.save_model(siebert_output_dir)
siebert_tokenizer.save_pretrained(siebert_output_dir)