In [14]:
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support,  mean_absolute_error, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import torch
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
df = pd.read_csv("/content/drive/MyDrive/Projet finetuning BERT/Womens Clothing E-Commerce Reviews.csv").dropna(subset=['Review Text', 'Rating'])
df['Rating'] = df['Rating'] - 1  # 1-5 -> 0-4
df_test = df.iloc[:1000, :].reset_index(drop=True)
df_train = df.iloc[1000:, :].reset_index(drop=True)

In [6]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [7]:
df.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize function (move tensors to the device)
def tokenize(batch):
    tokenized_inputs = tokenizer(
        batch['Review Text'],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    # Move tensors to the appropriate device (GPU if available)
    tokenized_inputs = {key: tensor.to(device) for key, tensor in tokenized_inputs.items()}

    # Move labels to device as well
    tokenized_inputs["labels"] = torch.tensor(batch['Rating']).to(device)

    return tokenized_inputs

# Load and tokenize datasets, moving tensors to GPU
train_dataset = Dataset.from_pandas(df_train).map(tokenize, batched=True)
test_dataset = Dataset.from_pandas(df_test).map(tokenize, batched=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/21641 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [10]:
# Initializing the model
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=df['Rating'].unique().shape[0])
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=10,
    fp16=True
)



In [12]:

# Function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions

    # Hard predictions are needed for accuracy, precision, recall, and F1
    hard_preds = np.argmax(preds, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, hard_preds, average='weighted')
    acc = accuracy_score(labels, hard_preds)
    mae = mean_absolute_error(labels, hard_preds)

    # Compute ROC AUC for each class
    roc_auc = {}
    for i in range(preds.shape[1]):  # Iterate over each class
        roc_auc[f"roc_auc_class_{i}"] = roc_auc_score((labels == i).astype(int), preds[:, i])

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mae': mae,
        **roc_auc  # This will expand the dictionary to include the roc_auc for each class
    }
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mae,Roc Auc Class 0,Roc Auc Class 1,Roc Auc Class 2,Roc Auc Class 3,Roc Auc Class 4
1,0.7973,0.749037,0.676,0.664545,0.664693,0.676,0.363,0.944006,0.928566,0.888775,0.738504,0.904043
2,0.7252,0.731673,0.699,0.681062,0.680275,0.699,0.343,0.949751,0.931256,0.897085,0.764087,0.906662
3,0.6578,0.727152,0.693,0.679725,0.679143,0.693,0.35,0.951173,0.932297,0.897551,0.761914,0.907583


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=510, training_loss=0.7111737531774184, metrics={'train_runtime': 376.5899, 'train_samples_per_second': 172.397, 'train_steps_per_second': 1.354, 'total_flos': 4270604790797568.0, 'train_loss': 0.7111737531774184, 'epoch': 3.0})

In [16]:
# Evaluating the model on the test dataset
trainer.evaluate()

{'eval_loss': 0.7271522879600525,
 'eval_accuracy': 0.693,
 'eval_f1': 0.6797249254047306,
 'eval_precision': 0.679142715326454,
 'eval_recall': 0.693,
 'eval_mae': 0.35,
 'eval_roc_auc_class_0': 0.9511728005054494,
 'eval_roc_auc_class_1': 0.932297439080256,
 'eval_roc_auc_class_2': 0.8975505303899083,
 'eval_roc_auc_class_3': 0.7619136896645761,
 'eval_roc_auc_class_4': 0.9075834257485634,
 'eval_runtime': 1.671,
 'eval_samples_per_second': 598.455,
 'eval_steps_per_second': 4.788,
 'epoch': 3.0}

### Upload to huggingface

In [1]:
!huggingface-cli login

^C


In [18]:
#zip the results to download
!zip -r results.zip /content/results/

  adding: content/results/ (stored 0%)
  adding: content/results/checkpoint-340/ (stored 0%)
  adding: content/results/checkpoint-340/config.json (deflated 53%)
  adding: content/results/checkpoint-340/trainer_state.json (deflated 77%)
  adding: content/results/checkpoint-340/optimizer.pt (deflated 21%)
  adding: content/results/checkpoint-340/model.safetensors (deflated 7%)
  adding: content/results/checkpoint-340/rng_state.pth (deflated 25%)
  adding: content/results/checkpoint-340/training_args.bin (deflated 51%)
  adding: content/results/checkpoint-340/scheduler.pt (deflated 55%)
  adding: content/results/checkpoint-170/ (stored 0%)
  adding: content/results/checkpoint-170/config.json (deflated 53%)
  adding: content/results/checkpoint-170/trainer_state.json (deflated 73%)
  adding: content/results/checkpoint-170/optimizer.pt (deflated 21%)
  adding: content/results/checkpoint-170/model.safetensors (deflated 7%)
  adding: content/results/checkpoint-170/rng_state.pth (deflated 25%)
