# FINE TUNING PER MULTICLASS CLASSIFICATION

A differenza del precedente notebook, qui il fine tuning viene effettuato per classificare su 5 possibili label anziché 2.

Per il resto gli step sono analoghi.

È necessario scaricare un dataset con 5 possibili label.

In [1]:
from datasets import load_dataset

# Download (or reuse from the local cache) the Yelp Reviews corpus,
# whose reviews carry ratings from 1 to 5
dataset = load_dataset(path="yelp_review_full")

# Show the available splits, their columns and their sizes
print(dataset)

README.md: 0.00B [00:00, ?B/s]

yelp_review_full/train-00000-of-00001.pa(…):   0%|          | 0.00/299M [00:00<?, ?B/s]

yelp_review_full/test-00000-of-00001.par(…):   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [2]:
from datasets import DatasetDict

# Selects the train and test splits
train_dataset = dataset["train"]
test_dataset = dataset["test"]


def _mentions_restaurant(example):
    """Return True when the review text contains "restaurant" (case-insensitive)."""
    return "restaurant" in example["text"].lower()


def _sample_reviews(reviews, max_reviews, seed=42):
    """Shuffle `reviews` and keep at most `max_reviews` of them.

    Args:
        reviews: a dataset split supporting `len()`, `.shuffle(seed=...)`
            and `.select(indices)`.
        max_reviews: upper bound on the number of rows to keep.
        seed: shuffle seed, fixed so the sample is reproducible.

    Returns:
        The shuffled split restricted to its first
        `min(max_reviews, len(reviews))` rows.
    """
    # Clamp the sample size: selecting more rows than the split holds
    # would raise instead of returning what is available.
    sample_size = min(max_reviews, len(reviews))
    return reviews.shuffle(seed=seed).select(range(sample_size))


# Filters for restaurant-related reviews in the train and test datasets
restaurant_train_reviews = train_dataset.filter(_mentions_restaurant)
restaurant_test_reviews = test_dataset.filter(_mentions_restaurant)

# Keeps at most 5,000 reviews per split (the same cap is used for train and test)
number_of_reviews = 5000
subset_train_reviews = _sample_reviews(restaurant_train_reviews, number_of_reviews)
subset_test_reviews = _sample_reviews(restaurant_test_reviews, number_of_reviews)

# Groups both subsets under the usual "train" / "test" keys
subset_dataset = {
    "train": subset_train_reviews,
    "test": subset_test_reviews,
}
yelp_restaurant_dataset = DatasetDict(subset_dataset)

# Prints the dataset structure
print(yelp_restaurant_dataset)

Filter:   0%|          | 0/650000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 5000
    })
})


In [3]:
# Inspect the first training example (its label and its review text)
yelp_restaurant_dataset["train"][0]

{'label': 2,
 'text': "This place is good, but I think I just ordered the wrong thing. The Hibiscus Enchiladas were just way too sweet for me. I think they should be on the dessert menu and not dinner menu. I'd like to give it another chance and order something else next time, but other than that a good vibe. Seemed like the typical American Mexican restaurant."}

In [4]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"

# Load the tokenizer published with the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded to, and truncated at, 512 tokens, so all
    encoded rows end up with the same length.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)


# Apply the tokenizer to every split, processing the rows in batches
tokenized_datasets = yelp_restaurant_dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [8]:
from transformers import AutoModelForSequenceClassification
import torch

# Load the checkpoint with a classification head sized for 5 labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=5
)

# Pick the compute device: MPS first, then CUDA, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Move the model to the selected device (the returned model is the cell output)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
import numpy as np
from transformers import Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute classification accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: object exposing `predictions` (the model logits, or a
            tuple whose first element is the logits) and `label_ids`
            (the reference labels).

    Returns:
        A dict with a single "accuracy" entry: the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted_labels = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predicted_labels == eval_pred.label_ids))}


# Sets up training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory in which to save results
    eval_strategy="epoch",  # Evaluates model after each epoch
    # Saves a checkpoint after each epoch; a step-based `save_steps`
    # value has no effect with this strategy, so none is passed.
    save_strategy="epoch",
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,  # Logs every 10 steps
    # Reloads the best checkpoint when training ends. No
    # `metric_for_best_model` is set, so the library default criterion
    # (evaluation loss) still decides which checkpoint is "best".
    load_best_model_at_end=True,
)

# Sets up the Trainer; accuracy is reported next to the loss at every evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Fine-tunes the model
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer. They are written to two
# separate directories, so each must be reloaded from its own path.
model.save_pretrained("./results/final_model_multiclass")
tokenizer.save_pretrained("./results/final_tokenizer_multiclass")

# Run evaluation with the trainer's evaluation dataset and keep the metrics
eval_results = trainer.evaluate()

# Show the resulting metrics dictionary
print(eval_results)