In [None]:
#@title installa e importa
!pip install datasets scikit-learn transformers

import inspect
import os

import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)




In [None]:
#@title Monta colab
from google.colab import drive

# Mount Google Drive at /content/drive (Colab-only; may prompt for authorization)
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Definizione Percorsi e Configurazioni
# Root of the project on the mounted Google Drive. Every path below is derived
# from it so the locations cannot drift apart (OUTPUT_DIR previously started
# with a stray double slash, unlike the path the evaluation cell loads from).
PROJECT_DIR = "/content/drive/MyDrive/ProjectNLP/00.Amazon_Project"
DATA_DIR = f"{PROJECT_DIR}/data/AMAZON"

# CSV splits read by load_and_prepare_data.
TRAIN_CSV = f"{DATA_DIR}/Train.csv"
EVAL_CSV = f"{DATA_DIR}/Eval.csv"
TEST_CSV = f"{DATA_DIR}/Test.csv"

# Pretrained checkpoint to fine-tune, and where checkpoints / the final model are written.
MODEL_NAME = "microsoft/deberta-base"
OUTPUT_DIR = f"{PROJECT_DIR}/CLASSIFIER_CHECKPOINT"

### Label mapping: sentiment name -> integer class id
label2id = dict(neutral=0, positive=1, negative=2)


def map_label_to_id(label):
    """Return the integer class id registered for ``label`` in ``label2id``.

    Raises:
        KeyError: if ``label`` is not one of the keys of ``label2id``.
    """
    class_id = label2id[label]
    return class_id

### Data loading and preparation
def load_and_prepare_data(train_csv, eval_csv, test_csv):
    """Read the three CSV splits and convert them to ``datasets.Dataset`` objects.

    Every CSV must provide a ``label`` and a ``text`` column. For each split the
    label strings are translated with ``map_label_to_id`` into a new integer
    ``labels`` column and ``text`` is cast to ``str``; only ``text`` and
    ``labels`` are carried into the resulting dataset (the pandas index is
    dropped).

    Args:
        train_csv: path of the training CSV.
        eval_csv: path of the validation CSV.
        test_csv: path of the test CSV.

    Returns:
        A ``(train, eval, test)`` tuple of datasets, in that order.
    """
    frames = [pd.read_csv(path) for path in (train_csv, eval_csv, test_csv)]

    for frame in frames:
        frame['labels'] = frame['label'].apply(map_label_to_id)
        frame['text'] = frame['text'].astype(str)

    splits = tuple(
        Dataset.from_pandas(frame[['text', 'labels']], preserve_index=False)
        for frame in frames
    )
    return splits


train_dataset, eval_dataset, test_dataset = load_and_prepare_data(
    train_csv=TRAIN_CSV,
    eval_csv=EVAL_CSV,
    test_csv=TEST_CSV,
)


In [None]:
#@title Tokenizzazione dei Dati
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch, truncating to 32 tokens."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True, max_length=32)


def _to_model_inputs(split):
    """Tokenize one split and leave only the columns used for training.

    The raw ``text`` column is dropped after tokenization, ``labels`` is
    renamed to ``label``, and the dataset is switched to the torch format.
    """
    encoded = split.map(tokenize_function, batched=True)
    encoded = encoded.remove_columns(["text"])
    encoded = encoded.rename_column("labels", "label")
    encoded.set_format("torch")
    return encoded


# Same processing for the three splits, in train / eval / test order.
train_dataset, eval_dataset, test_dataset = [
    _to_model_inputs(split)
    for split in (train_dataset, eval_dataset, test_dataset)
]

### Model initialization
# Record the label names in the model config so the saved checkpoint carries
# its own id <-> name mapping instead of generic LABEL_0 / LABEL_1 / LABEL_2.
id2label = {class_id: name for name, class_id in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/59497 [00:00<?, ? examples/s]

Map:   0%|          | 0/3305 [00:00<?, ? examples/s]

Map:   0%|          | 0/3306 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#@title Definizione delle Metriche
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over their last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)

    results = {'accuracy': accuracy_score(references, predicted)}
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        references, predicted, average='macro'
    )
    results['precision'] = macro_precision
    results['recall'] = macro_recall
    results['f1'] = macro_f1
    return results

In [None]:
#@title Train con prepruning
# The install cell does not pin transformers, and two keywords used here were
# renamed in newer releases: `evaluation_strategy` -> `eval_strategy`
# (TrainingArguments) and `tokenizer` -> `processing_class` (Trainer).
# Pick whichever spelling the installed version accepts.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
)
_trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=50,
    # Reload the checkpoint with the highest validation accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    learning_rate=5e-5,
    warmup_steps=256,
    lr_scheduler_type="linear",
    **{eval_strategy_key: "epoch"},
)

# Pads every batch dynamically to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when validation accuracy has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{tokenizer_key: tokenizer},
)

### Training
trainer.train()

### Evaluation (on the best checkpoint, reloaded by load_best_model_at_end)
metrics = trainer.evaluate(eval_dataset)
print("Final Eval metrics:", metrics)

# Report test-set metrics under a "test_" prefix so they are not mislabelled "eval_*".
test_metrics = trainer.evaluate(test_dataset, metric_key_prefix="test")
print("Test metrics:", test_metrics)

### Save the model
trainer.save_model(OUTPUT_DIR)


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4207,0.402932,0.84236,0.843039,0.842343,0.842371
2,0.3241,0.417325,0.851437,0.851074,0.851419,0.850456
3,0.2586,0.493706,0.859304,0.858688,0.8593,0.858798
4,0.1715,0.48944,0.865961,0.86762,0.865945,0.866205
5,0.1423,0.551039,0.859304,0.86052,0.859293,0.859694
6,0.201,0.670769,0.857791,0.858591,0.857781,0.858103


Final Eval metrics: {'eval_loss': 0.48943960666656494, 'eval_accuracy': 0.8659606656580938, 'eval_precision': 0.8676196447304387, 'eval_recall': 0.8659454392503543, 'eval_f1': 0.8662054053051458, 'eval_runtime': 9.7443, 'eval_samples_per_second': 339.171, 'eval_steps_per_second': 21.243, 'epoch': 6.0}
Test metrics: {'eval_loss': 0.4917490780353546, 'eval_accuracy': 0.8611615245009074, 'eval_precision': 0.863537476397524, 'eval_recall': 0.8611615245009075, 'eval_f1': 0.8610257101272886, 'eval_runtime': 11.386, 'eval_samples_per_second': 290.356, 'eval_steps_per_second': 18.18, 'epoch': 6.0}


In [None]:
#@title Stampa metriche

# Select the device (GPU when available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the test split
csv_path = "/content/drive/MyDrive/ProjectNLP/00.Amazon_Project/data/AMAZON/Test.csv"
test_df = pd.read_csv(csv_path)

# Load the fine-tuned model and its tokenizer from the checkpoint directory
model_name_or_path = "/content/drive/MyDrive/ProjectNLP/00.Amazon_Project/CLASSIFIER_CHECKPOINT"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
model.to(device)
model.eval()

# Extract texts and labels. Texts are cast to str exactly as in the training
# data preparation, so a missing/non-string cell cannot crash the tokenizer.
texts = test_df['text'].astype(str).tolist()
# Convert the textual labels to integer ids with the training mapping
label2id = {"neutral": 0, "positive": 1, "negative": 2}
labels = [label2id[label] for label in test_df['label'].tolist()]

# Class ids and their names in id order, derived from the mapping so the
# report rows can never be mislabelled if the mapping changes.
id2label = {class_id: name for name, class_id in label2id.items()}
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]

# Tokenize with the same truncation length used for fine-tuning
# (the tokenization cell uses max_length=32).
MAX_LENGTH = 32
encodings = tokenizer(texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']

# Wrap the tensors in a TensorDataset / DataLoader to iterate in batches
dataset = TensorDataset(input_ids, attention_mask, torch.tensor(labels))
loader = DataLoader(dataset, batch_size=32)

# Run inference over the test set
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in loader:
        b_input_ids, b_attention_mask, b_labels = [item.to(device) for item in batch]
        outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(b_labels.cpu().tolist())

# Overall accuracy
accuracy = accuracy_score(all_labels, all_preds)
print("Accuracy complessiva: {:.4f}".format(accuracy))

# Per-class precision, recall and F1. `labels=` pins the row order to the
# class ids, so the report stays correct even if a class never occurs.
report = classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names)
print("\nReport dettagliato (precision, recall, F1):\n", report)


Accuracy complessiva: 0.8612

Report dettagliato (precision, recall, F1):
               precision    recall  f1-score   support

     neutral       0.79      0.86      0.82      1102
    positive       0.94      0.95      0.94      1102
    negative       0.86      0.78      0.82      1102

    accuracy                           0.86      3306
   macro avg       0.86      0.86      0.86      3306
weighted avg       0.86      0.86      0.86      3306

