In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [2]:
# Load the balanced review dataset (CSV expected in the working directory).
df = pd.read_csv("cleaned_balanced.csv")

In [3]:
# Drop rows whose review text is missing; there is nothing to tokenize for them.
df = df.dropna(subset=['Review'])

In [4]:
# Inspect column dtypes and non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114985 entries, 0 to 114999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  114985 non-null  int64  
 1   Rating      114985 non-null  float64
 2   Review      114985 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 3.5+ MB


In [5]:
# Convert the float rating to a 0-indexed integer class label
# (Ratings 1–5 -> Labels 0–4), as expected by a sequence-classification head.
df['label'] = df['Rating'].astype(int) - 1

# The model is created with num_labels=5, so a label outside 0-4 would only
# surface later as an opaque loss/indexing error. Fail early with a clear message.
if not df['label'].between(0, 4).all():
    raise ValueError("Expected every Rating to be in 1-5 (labels 0-4)")

# Keep only the model input and the target, renaming the review column to 'text'
df = df[['Review', 'label']].rename(columns={'Review': 'text'})

# Stratified 80/20 split so both parts keep the same label proportions;
# random_state fixes the shuffle for reproducibility.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Convert to HuggingFace Dataset format.
# preserve_index=False: after dropna and shuffling the pandas index is no longer
# a plain range, and from_pandas would otherwise store it as an extra
# `__index_level_0__` column in each dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [6]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune.
# Alternative checkpoint (currently disabled): "microsoft/deberta-v3-small"
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Tokenize the "text" field, truncating or padding each entry to exactly 64 tokens."""
    encoded = tokenizer(
        example["text"],
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded


# batched=True passes `tokenize` a batch of rows per call rather than a single row.
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

# Expose only the model inputs and the target, as PyTorch tensors
train_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")
test_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/91988 [00:00<?, ? examples/s]

Map:   0%|          | 0/22997 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a 5-way classification head (one output per label 0–4).
# The head is newly initialized (see the warning below), so it must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted F1.

    `eval_pred` unpacks into (logits, labels); the predicted class for each
    row is the argmax over the last axis of the logits.

    Returns a dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Hyperparameters for the first fine-tuning run (2 epochs).
training_args = TrainingArguments(
    # Checkpointing: one checkpoint per epoch, at most one kept on disk.
    output_dir="./bert-review-classifier2",
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation.
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    fp16=True,
    # Logging: training loss every 50 steps, no external experiment tracker.
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)


In [9]:
# Wire the model, data and metrics together for fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # makes Trainer emit a FutureWarning on construction.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the full training loop; the returned TrainOutput is displayed as the cell result.
trainer.train()



  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
50,1.6256
100,1.5021
150,1.4214
200,1.4255
250,1.3752
300,1.3498
350,1.3532
400,1.3491
450,1.3441
500,1.2566


  return forward_call(*args, **kwargs)


TrainOutput(global_step=11500, training_loss=1.1279316419518512, metrics={'train_runtime': 1143.8773, 'train_samples_per_second': 160.835, 'train_steps_per_second': 10.054, 'total_flos': 6050927922244608.0, 'train_loss': 1.1279316419518512, 'epoch': 2.0})

In [10]:
# Snapshot the model after 2 epochs into its own directory so it can be reloaded later.
trainer.save_model('./epoch 2')

In [11]:
# Score the model on the training split itself, for comparison with the held-out score.
# Trainer.evaluate returns the compute_metrics keys prefixed with "eval_".
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
print(" Training Accuracy after Epoch 2:", train_metrics["eval_accuracy"])
print(" Training F1 after Epoch 2:", train_metrics["eval_f1"])

  return forward_call(*args, **kwargs)


 Training Accuracy after Epoch 2: 0.6540961864591034
 Training F1 after Epoch 2: 0.6512194301334742


In [12]:
# Score the model on the held-out 20% split; the gap to the training score indicates overfitting.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(" Eval Accuracy after Epoch 2:", eval_metrics["eval_accuracy"])
print(" Eval F1 after Epoch 2:", eval_metrics["eval_f1"])

  return forward_call(*args, **kwargs)


 Eval Accuracy after Epoch 2: 0.5097186589555159
 Eval F1 after Epoch 2: 0.5074561233467068


In [13]:
# Reload the weights saved after epoch 2 as the model object for the next training run.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 2')

In [14]:

# Same configuration as the first run, with the epoch budget raised to 4.
# num_train_epochs is the total, so resuming below only trains epochs 3-4
# (the logged steps pick up right after step 11500).
training_args = TrainingArguments(
    output_dir='./bert-review-classifier2',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # makes Trainer emit a FutureWarning on construction.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Resume from the most recent checkpoint found under output_dir.
# NOTE(review): the training_loss in the returned TrainOutput (0.41) is well below
# the per-step losses logged during this run (~0.9); it looks like it is averaged
# over all global steps rather than only the resumed ones, so treat it with care.
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
11550,0.8811
11600,0.9125
11650,0.892
11700,0.9554
11750,0.8753
11800,0.9249
11850,0.9592
11900,0.9307
11950,0.92
12000,0.8586


  return forward_call(*args, **kwargs)


TrainOutput(global_step=23000, training_loss=0.4088684365645699, metrics={'train_runtime': 1121.4251, 'train_samples_per_second': 328.111, 'train_steps_per_second': 20.51, 'total_flos': 1.2101855844489216e+16, 'train_loss': 0.4088684365645699, 'epoch': 4.0})

In [15]:
# Snapshot the model after 4 epochs into its own directory so it can be reloaded later.
trainer.save_model('./epoch 4')

In [16]:
# Score the model on the training split itself, for comparison with the held-out score.
# Trainer.evaluate returns the compute_metrics keys prefixed with "eval_".
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
print(" Training Accuracy after Epoch 4:", train_metrics["eval_accuracy"])
print(" Training F1 after Epoch 4:", train_metrics["eval_f1"])

  return forward_call(*args, **kwargs)


 Training Accuracy after Epoch 4: 0.8343805713788756
 Training F1 after Epoch 4: 0.833761729219002


In [17]:
# Score the model on the held-out 20% split; compare with the epoch-2 result to see
# whether the extra epochs helped generalization or only the training fit.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(" Eval Accuracy after Epoch 4:", eval_metrics["eval_accuracy"])
print(" Eval F1 after Epoch 4:", eval_metrics["eval_f1"])

  return forward_call(*args, **kwargs)


 Eval Accuracy after Epoch 4: 0.4952385093707875
 Eval F1 after Epoch 4: 0.4968694746694446


In [18]:
# Reload the weights saved after epoch 4 as the model object for the next training run.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 4')

In [19]:

# Same configuration as the earlier runs, with the epoch budget raised to 6.
# num_train_epochs is the total, so resuming below only trains epochs 5-6
# (the logged steps pick up right after step 23000).
training_args = TrainingArguments(
    output_dir='./bert-review-classifier2',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # makes Trainer emit a FutureWarning on construction.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Resume from the most recent checkpoint found under output_dir.
# NOTE(review): the training_loss in the returned TrainOutput (0.19) is well below
# the per-step losses logged during this run (~0.6); it looks like it is averaged
# over all global steps rather than only the resumed ones, so treat it with care.
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
23050,0.6233
23100,0.6616
23150,0.6134
23200,0.617
23250,0.6047
23300,0.6268
23350,0.6314
23400,0.6119
23450,0.6129
23500,0.6687


  return forward_call(*args, **kwargs)


TrainOutput(global_step=34500, training_loss=0.18834601963430211, metrics={'train_runtime': 1186.7785, 'train_samples_per_second': 465.064, 'train_steps_per_second': 29.07, 'total_flos': 1.8152783766733824e+16, 'train_loss': 0.18834601963430211, 'epoch': 6.0})

In [20]:
# Snapshot the model after 6 epochs; the evaluation sections below load from this directory.
trainer.save_model('./epoch 6')

In [21]:
# Score the model on the training split itself, for comparison with the held-out score.
# Trainer.evaluate returns the compute_metrics keys prefixed with "eval_".
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
print(" Training Accuracy after Epoch 6:", train_metrics["eval_accuracy"])
print(" Training F1 after Epoch 6:", train_metrics["eval_f1"])

  return forward_call(*args, **kwargs)


 Training Accuracy after Epoch 6: 0.9194242727312258
 Training F1 after Epoch 6: 0.9191717702587051


In [22]:
# Score the model on the held-out 20% split; compare with the epoch-2 and epoch-4 results
# to see whether the extra epochs helped generalization or only the training fit.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
print(" Eval Accuracy after Epoch 6:", eval_metrics["eval_accuracy"])
print(" Eval F1 after Epoch 6:", eval_metrics["eval_f1"])

  return forward_call(*args, **kwargs)


 Eval Accuracy after Epoch 6: 0.4870635300256555
 Eval F1 after Epoch 6: 0.4900742416998637


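**Testing on whole balanced dataset**

Note: this is the same `cleaned_balanced.csv` that the train/test split was drawn from, so about 80% of the rows evaluated here were seen during training. The scores in this section are therefore inflated and are not comparable with the held-out test split (accuracy ≈ 0.487 after epoch 6).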

In [56]:
# Evaluate on the whole balanced dataset.
# NOTE: this is the same CSV the train/test split was drawn from, so most of
# these rows were part of the training split.
new_df = pd.read_csv("cleaned_balanced.csv")

In [57]:
# Drop rows whose review text is missing, mirroring the training preprocessing.
new_df = new_df.dropna(subset=['Review'])

In [58]:
# Inspect column dtypes and non-null counts after the drop.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114985 entries, 0 to 114999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  114985 non-null  int64  
 1   Rating      114985 non-null  float64
 2   Review      114985 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 3.5+ MB


In [59]:
# Derive the 0-indexed class label from the rating, then keep only the review
# text (renamed to 'text') and the label.
new_df = (
    new_df
    .assign(label=new_df['Rating'].astype(int) - 1)
    [['Review', 'label']]
    .rename(columns={'Review': 'text'})
)

In [60]:
# Convert to HuggingFace Dataset format.
# preserve_index=False: after dropna the pandas index has gaps, and from_pandas
# would otherwise store it as an extra `__index_level_0__` column.
new_dataset = Dataset.from_pandas(new_df, preserve_index=False)

In [61]:
# Load the tokenizer from the './epoch 6' directory written by trainer.save_model.
tokenizer = AutoTokenizer.from_pretrained('./epoch 6')

In [62]:
def tokenize(example):
    """Tokenize the "text" field, truncating or padding each entry to exactly 64 tokens."""
    encoded = tokenizer(
        example["text"],
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Tokenize in batches, then expose only the model inputs and the target as PyTorch tensors.
new_dataset = new_dataset.map(tokenize, batched=True)
new_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")

Map:   0%|          | 0/114985 [00:00<?, ? examples/s]

In [63]:
# Load the model saved after epoch 6 for evaluation.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 6')

In [64]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted F1.

    `eval_pred` unpacks into (logits, labels); the predicted class for each
    row is the argmax over the last axis of the logits.

    Returns a dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}


from transformers import TrainingArguments

# Evaluation-only configuration: no training, no external experiment tracker.
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    do_eval=True,
    do_train=False,
    per_device_eval_batch_size=16,
    fp16=True,
)

In [65]:

# Evaluation-only Trainer: no train/eval datasets are attached because the
# dataset is passed directly to trainer.predict().
trainer = Trainer(
    model=model,
    args=training_args,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # makes Trainer emit a FutureWarning on construction.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [66]:
from sklearn.metrics import classification_report
# Display names for labels 0–4, in label order.
target_names = ["Rating 1", "Rating 2", "Rating 3", "Rating 4", "Rating 5"]

# Run inference over the whole dataset; the output carries both the logits
# and the reference labels.
predictions_output = trainer.predict(new_dataset)
y_true = predictions_output.label_ids
y_pred = np.argmax(predictions_output.predictions, axis=1)

# Per-class precision / recall / F1, reported to four decimals.
print("\nClassification Report:")
print(
    classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        digits=4,
    )
)


  return forward_call(*args, **kwargs)



Classification Report:
              precision    recall  f1-score   support

    Rating 1     0.8779    0.8755    0.8767     22994
    Rating 2     0.7950    0.7858    0.7903     22997
    Rating 3     0.7889    0.7997    0.7943     22999
    Rating 4     0.8210    0.8114    0.8161     22998
    Rating 5     0.8816    0.8924    0.8870     22997

    accuracy                         0.8330    114985
   macro avg     0.8329    0.8330    0.8329    114985
weighted avg     0.8329    0.8330    0.8329    114985



**Testing on whole imbalanced dataset**

In [44]:
# Evaluate on the whole imbalanced dataset.
# NOTE(review): it is not visible here whether these rows overlap with the
# balanced training data — confirm before treating this as a held-out score.
new_df = pd.read_csv("cleaned_imbalanced.csv")

In [45]:
# Drop rows whose review text is missing, mirroring the training preprocessing.
new_df = new_df.dropna(subset=['Review'])

In [46]:
# Inspect column dtypes and non-null counts after the drop.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119975 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  119975 non-null  int64 
 1   Rating      119975 non-null  int64 
 2   Review      119975 non-null  object
dtypes: int64(2), object(1)
memory usage: 3.7+ MB


In [47]:
# Derive the 0-indexed class label from the rating, then keep only the review
# text (renamed to 'text') and the label.
new_df = (
    new_df
    .assign(label=new_df['Rating'].astype(int) - 1)
    [['Review', 'label']]
    .rename(columns={'Review': 'text'})
)

In [48]:
# Convert to HuggingFace Dataset format.
# preserve_index=False: after dropna the pandas index has gaps, and from_pandas
# would otherwise store it as an extra `__index_level_0__` column.
new_dataset = Dataset.from_pandas(new_df, preserve_index=False)

In [49]:
# Load the tokenizer from the './epoch 6' directory written by trainer.save_model.
tokenizer = AutoTokenizer.from_pretrained('./epoch 6')

In [50]:
def tokenize(example):
    """Tokenize the "text" field, truncating or padding each entry to exactly 64 tokens."""
    encoded = tokenizer(
        example["text"],
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Tokenize in batches, then expose only the model inputs and the target as PyTorch tensors.
new_dataset = new_dataset.map(tokenize, batched=True)
new_dataset.set_format(columns=["input_ids", "attention_mask", "label"], type="torch")

Map:   0%|          | 0/119975 [00:00<?, ? examples/s]

In [51]:
# Load the model saved after epoch 6 for evaluation.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 6')

In [52]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted F1.

    `eval_pred` unpacks into (logits, labels); the predicted class for each
    row is the argmax over the last axis of the logits.

    Returns a dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}


from transformers import TrainingArguments

# Evaluation-only configuration: no training, no external experiment tracker.
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    do_eval=True,
    do_train=False,
    per_device_eval_batch_size=16,
    fp16=True,
)

In [53]:

# Evaluation-only Trainer: no train/eval datasets are attached because the
# dataset is passed directly to trainer.predict().
trainer = Trainer(
    model=model,
    args=training_args,
    # `processing_class` replaces the deprecated `tokenizer=` argument, which
    # makes Trainer emit a FutureWarning on construction.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [55]:
from sklearn.metrics import classification_report
# Display names for labels 0–4, in label order.
target_names = ["Rating 1", "Rating 2", "Rating 3", "Rating 4", "Rating 5"]

# Run inference over the whole dataset; the output carries both the logits
# and the reference labels.
predictions_output = trainer.predict(new_dataset)
y_true = predictions_output.label_ids
y_pred = np.argmax(predictions_output.predictions, axis=1)

# Per-class precision / recall / F1, reported to four decimals.
print("\nClassification Report:")
print(
    classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        digits=4,
    )
)


  return forward_call(*args, **kwargs)



Classification Report:
              precision    recall  f1-score   support

    Rating 1     0.7377    0.5568    0.6346     29989
    Rating 2     0.2317    0.3795    0.2877     11998
    Rating 3     0.3225    0.3906    0.3533     17996
    Rating 4     0.3971    0.4377    0.4164     23999
    Rating 5     0.7279    0.5953    0.6550     35993

    accuracy                         0.5019    119975
   macro avg     0.4834    0.4720    0.4694    119975
weighted avg     0.5537    0.5019    0.5202    119975

