In [None]:
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install accelerate

In [None]:
import numpy as np
import pandas as pd
import datasets
import evaluate
import seaborn as sns

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay



In [None]:
# Folder containing the CSV files; the trailing slash matters because file
# names are appended to this string directly when the data is read.
dataset_path = 'dataset/'

In [None]:
# Read the training and gold test splits; both files are ';'-separated.
df_train = pd.read_csv(f"{dataset_path}training_ironita2018_anon_REV_.csv", sep=";")
df_test = pd.read_csv(f"{dataset_path}test_gold_ironita2018_anon_REV_.csv", sep=";")

In [None]:
# Display the training frame as loaded, before any column selection.
df_train

In [None]:
def create_label(dataset):
  """Return the values of the 'irony' column as a new NumPy array.

  Args:
    dataset: mapping-like object (a DataFrame in this notebook) whose
      'irony' entry exposes a ``.values`` attribute.

  Returns:
    A NumPy array built from ``dataset['irony'].values``.
  """
  return np.array(dataset['irony'].values)


In [None]:
# Irony labels of each split as NumPy arrays.
train_labels_irony, test_labels_irony = (
    create_label(frame) for frame in (df_train, df_test)
)

In [None]:
# Number of labels extracted from the test split.
len(test_labels_irony)

In [None]:
# Keep only the tweet text and its irony label in both splits.
df_train = df_train[['text', 'irony']]
df_test = df_test[['text', 'irony']]

In [None]:
# Convert the frames to Hugging Face datasets and split off a validation set.

train = datasets.Dataset.from_pandas(df_train)
test = datasets.Dataset.from_pandas(df_test)

# Hold out 10% of the training data for validation. The seed is fixed so the
# split (and therefore every downstream result) is reproducible across runs.
train_dev = train.train_test_split(test_size=0.1, seed=42)

train = train_dev["train"]
dev = train_dev["test"]

In [None]:
# Load the pre-trained tokenizer and the sequence-classification model.

model_name = 'osiria/distilbert-base-italian-cased'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classification head is created with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Tokenize the data and convert it to the format required by PyTorch.

def tokenize(batch):
    """Tokenize a batch of examples and attach their labels.

    Args:
        batch: mapping with a 'text' entry (passed to the tokenizer) and an
            'irony' entry (copied to the output as 'label').

    Returns:
        The tokenizer output, padded and truncated to at most 512 tokens,
        with an added 'label' entry.
    """
    encoded = tokenizer(batch['text'], truncation=True, padding=True, max_length=512)
    encoded['label'] = batch['irony']
    return encoded

# Tokenize every split in batched mode.
train, dev, test = (
    split.map(tokenize, batched=True) for split in (train, dev, test)
)

# Expose only the model inputs and the label, formatted as torch tensors.
for split in (train, dev, test):
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Training hyper-parameters for the Trainer.

num_epochs = 5

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    # Evaluate, log and checkpoint once per epoch. The evaluation and save
    # strategies are kept identical because load_best_model_at_end is enabled.
    # The former logging_steps=10 was dropped: it only applies to step-based
    # logging and had no effect next to logging_strategy="epoch".
    # NOTE(review): newer transformers releases rename evaluation_strategy to
    # eval_strategy; confirm against the installed version.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

In [None]:
# Evaluation function for the model.

# Load the F1 metric once, instead of reloading it inside compute_metrics
# every time the Trainer runs an evaluation pass.
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
  """Compute the weighted F1 score for one evaluation pass.

  Args:
    eval_pred: pair ``(predictions, labels)``; the predictions are per-class
      scores and are reduced to class ids with an argmax over axis 1.

  Returns:
    The result of the F1 metric's ``compute`` call with weighted averaging.
  """
  predictions, labels = eval_pred
  predicted_classes = np.argmax(predictions, axis=1)

  return f1_metric.compute(predictions=predicted_classes, references=labels, average="weighted")

In [None]:
# Build the Trainer, fine-tune the model and save the result to disk.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("FINETUNED_MODEL")

In [None]:
log_history = trainer.state.log_history

# Collect one record per logged loss and build the frame once at the end,
# rather than growing a DataFrame with pd.concat on every iteration.
loss_records = []
for log_data in log_history:
    epoch = int(log_data["epoch"])
    # Training entries carry "loss", evaluation entries carry "eval_loss";
    # entries with neither key are skipped.
    if "loss" in log_data:
        loss_records.append(
            {"Epoch": epoch, "Loss": log_data["loss"], "Training/Validation": "Training"}
        )
    if "eval_loss" in log_data:
        loss_records.append(
            {"Epoch": epoch, "Loss": log_data["eval_loss"], "Training/Validation": "Validation"}
        )

# Passing the columns explicitly keeps the schema even if no loss was logged.
df = pd.DataFrame(loss_records, columns=["Epoch", "Loss", "Training/Validation"])

# One line per phase (training vs. validation) across epochs.
sns.lineplot(data=df, x="Epoch", y="Loss", hue="Training/Validation")


In [None]:
# Run the fine-tuned model on the test split.
output_predictions = trainer.predict(test)
# Print only the metrics; the raw predictions and label arrays remain
# available on output_predictions for the following cells.
print(output_predictions.metrics)

In [None]:
# Gold labels and predicted classes (argmax over the per-class scores).
y_test = test["label"].tolist()
y_pred = np.argmax(output_predictions.predictions, axis=1)

report = classification_report(y_test, y_pred)
# from_predictions also draws the confusion-matrix plot.
cm = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, xticks_rotation='vertical', cmap='Blues')

print("Classification Report:")
print(report)
print()

print("Confusion Matrix:")
# Print the underlying matrix of counts; printing the display object itself
# would only show its object repr.
print(cm.confusion_matrix)