In [1]:
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support,  mean_absolute_error, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import torch
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv("/content/drive/MyDrive/Projet finetuning BERT/Womens Clothing E-Commerce Reviews.csv").dropna(subset=['Review Text', 'Rating'])
df['Rating'] = df['Rating'] - 1  # 1-5 -> 0-4
df_test = df.iloc[:1000, :].reset_index(drop=True)
df_train = df.iloc[1000:, :].reset_index(drop=True)

In [5]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [6]:
df.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize function (move tensors to the device)
def tokenize(batch):
    tokenized_inputs = tokenizer(
        batch['Review Text'],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    # Move tensors to the appropriate device (GPU if available)
    tokenized_inputs = {key: tensor.to(device) for key, tensor in tokenized_inputs.items()}

    # Move labels to device as well
    tokenized_inputs["labels"] = torch.tensor(batch['Rating']).to(device)

    return tokenized_inputs

# Load and tokenize datasets, moving tensors to GPU
train_dataset = Dataset.from_pandas(df_train).map(tokenize, batched=True)
test_dataset = Dataset.from_pandas(df_test).map(tokenize, batched=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/21641 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [9]:
# Initializing the model
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=df['Rating'].unique().shape[0])
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=10,
    fp16=True
)



In [11]:
# Function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions

    # Hard predictions are needed for accuracy, precision, recall, and F1
    hard_preds = np.argmax(preds, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, hard_preds, average='weighted')
    acc = accuracy_score(labels, hard_preds)
    mae = mean_absolute_error(labels, hard_preds)

    # Compute ROC AUC for each class
    roc_auc = {}
    for i in range(preds.shape[1]):  # Iterate over each class
        roc_auc[f"roc_auc_class_{i}"] = roc_auc_score((labels == i).astype(int), preds[:, i])

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'mae': mae,
        **roc_auc  # This will expand the dictionary to include the roc_auc for each class
    }
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Mae,Roc Auc Class 0,Roc Auc Class 1,Roc Auc Class 2,Roc Auc Class 3,Roc Auc Class 4
1,0.846,0.777698,0.686,0.659267,0.65214,0.686,0.353,0.936088,0.925655,0.88141,0.710592,0.894778
2,0.7844,0.743978,0.688,0.668963,0.664959,0.688,0.339,0.949337,0.931618,0.890114,0.693267,0.902653
3,0.6753,0.724892,0.702,0.686692,0.678582,0.702,0.327,0.950659,0.932519,0.897295,0.720085,0.905519
4,0.6247,0.738919,0.701,0.68694,0.700411,0.701,0.323,0.949139,0.932068,0.892058,0.727164,0.904418
5,0.6129,0.724585,0.709,0.696407,0.703096,0.709,0.321,0.948507,0.931536,0.899678,0.744401,0.905688


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=850, training_loss=0.7469626028397504, metrics={'train_runtime': 585.2896, 'train_samples_per_second': 184.874, 'train_steps_per_second': 1.452, 'total_flos': 7117674651329280.0, 'train_loss': 0.7469626028397504, 'epoch': 5.0})

In [13]:
# Evaluating the model on the test dataset
trainer.evaluate()

{'eval_loss': 0.7245850563049316,
 'eval_accuracy': 0.709,
 'eval_f1': 0.6964065143871703,
 'eval_precision': 0.7030962805479717,
 'eval_recall': 0.709,
 'eval_mae': 0.321,
 'eval_roc_auc_class_0': 0.9485073448112462,
 'eval_roc_auc_class_1': 0.9315364040726457,
 'eval_roc_auc_class_2': 0.8996783615252293,
 'eval_roc_auc_class_3': 0.7444010771748022,
 'eval_roc_auc_class_4': 0.9056880733944953,
 'eval_runtime': 1.6057,
 'eval_samples_per_second': 622.795,
 'eval_steps_per_second': 4.982,
 'epoch': 5.0}

### Upload to huggingface

In [18]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [19]:
#create a repo
!huggingface-cli repo create bert_finetuned_for_reviews --type model

[90mgit version 2.34.1[0m
[90mgit-lfs/3.0.2 (GitHub; linux amd64; go 1.18.1)[0m

You are about to create [1mpirate0007/bert_finetuned_for_reviews[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/pirate0007/bert_finetuned_for_reviews[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/pirate0007/bert_finetuned_for_reviews



In [20]:
#save model
model.save_pretrained("./")
tokenizer.save_pretrained("./")

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json',
 './tokenizer.json')

In [21]:
!ls

config.json  logs		results      special_tokens_map.json  tokenizer.json
drive	     model.safetensors	sample_data  tokenizer_config.json    vocab.txt


In [22]:
#create git repository
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [23]:
!huggingface-cli lfs-enable-largefiles .

Local repo set up for largefiles


In [28]:
!git lfs track "pytorch_model.bin"

"pytorch_model.bin" already supported


In [29]:
#add files to git
!git add config.json model.safetensors .gitattributes

In [59]:
!git config --global user.email "adsalim007@gmail.com"
!git config --global user.name "pirate0007"

In [32]:
!git commit -m "Initial BERT model upload"

On branch master
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.config/[m
	[31mdrive/[m
	[31mlogs/[m
	[31mresults/[m
	[31msample_data/[m
	[31mspecial_tokens_map.json[m
	[31mtokenizer.json[m
	[31mtokenizer_config.json[m
	[31mvocab.txt[m

nothing added to commit but untracked files present (use "git add" to track)


In [49]:
!git remote add origin https://huggingface.co/pirate0007/bert_finetuned_for_reviews


error: remote origin already exists.


In [60]:
!git push origin main

fatal: could not read Username for 'https://huggingface.co': No such device or address


In [None]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 

In [61]:
!git config --global credential.helper store

In [54]:
!git branch -M main


In [57]:
!git branch

* [32mmain[m
