In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [2]:
# Load the imbalanced review data; it is split into train/test further down.
df = pd.read_csv("cleaned_imbalanced.csv")

In [3]:
# Keep only rows that actually have review text.
df = df.dropna(subset=['Review'])

In [4]:
# Sanity check: column dtypes and non-null counts after dropping empty reviews.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119975 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  119975 non-null  int64 
 1   Rating      119975 non-null  int64 
 2   Review      119975 non-null  object
dtypes: int64(2), object(1)
memory usage: 3.7+ MB


In [5]:
# Map the 1-5 star rating to a 0-indexed class id (0-4) for classification.
# astype(int) is a no-op for integer ratings and truncates float ones.
df['label'] = df['Rating'].astype(int) - 1

# Fail fast on unexpected ratings: a label outside 0-4 would otherwise only
# show up later as a hard-to-read error inside the loss computation.
assert df['label'].between(0, 4).all(), "Rating values must lie in 1-5"

# Keep only the review text and its label; the text column is renamed to 'text'.
df = df[['Review', 'label']].rename(columns={'Review': 'text'})

# Stratified 80/20 split so both parts keep the same label distribution.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Convert to HuggingFace Dataset format.
# preserve_index=False: after dropna and the shuffle in train_test_split the
# frames carry a non-default index, which from_pandas would otherwise store as
# an extra '__index_level_0__' column in the dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [6]:
from transformers import AutoTokenizer

# Checkpoint to fine-tune. DeBERTa-v3-small was an alternative and is disabled:
# model_name = "microsoft/deberta-v3-small"
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Tokenize the ``text`` field, truncating and padding to exactly 64 tokens.

    Used with ``Dataset.map(..., batched=True)``, so ``example`` holds a batch
    of rows rather than a single row.
    """
    encoded = tokenizer(
        example["text"],
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Add the tokenizer outputs as new columns on both splits.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (train_dataset, test_dataset):
    tokenized_split.set_format(type="torch", columns=torch_columns)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/95980 [00:00<?, ? examples/s]

Map:   0%|          | 0/23995 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained encoder with a 5-class classification head, one class per
# label 0-4. The head weights are newly initialised (see the warning below), so
# the model has to be fine-tuned before it is useful.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class of each row is the arg-max over the last axis of the
    logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Settings for the first fine-tuning stage (2 epochs).
training_args = TrainingArguments(
    # Checkpointing: write one per epoch, keep at most one on disk.
    output_dir="./bert-review-classifier2",
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation.
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    fp16=True,
    # Logging: log the loss every 50 steps, no external experiment tracker.
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)


In [9]:
# Wire model, data and metrics together; test_dataset serves as the eval split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()



  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
50,1.5772
100,1.5264
150,1.4623
200,1.3842
250,1.3166
300,1.2959
350,1.2942
400,1.265
450,1.2854
500,1.2837


  return forward_call(*args, **kwargs)


TrainOutput(global_step=11998, training_loss=1.0566092581923037, metrics={'train_runtime': 1220.0448, 'train_samples_per_second': 157.338, 'train_steps_per_second': 9.834, 'total_flos': 6313519828423680.0, 'train_loss': 1.0566092581923037, 'epoch': 2.0})

In [10]:
# Snapshot of the model after 2 epochs; a later cell reloads it from this directory.
trainer.save_model('./epoch 2')

In [12]:
# Score the model on the training split. Metric keys carry the "eval_" prefix
# here as well.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
for heading, metric_key in (
    (" Training Accuracy after Epoch 2:", "eval_accuracy"),
    (" Training F1 after Epoch 2:", "eval_f1"),
):
    print(heading, train_metrics[metric_key])

 Training Accuracy after Epoch 2: 0.6711502396332569
 Training F1 after Epoch 2: 0.6421870966001068


In [13]:
# Score the model on the held-out 20% split.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
held_out_accuracy = eval_metrics["eval_accuracy"]
print(" Eval Accuracy after Epoch 2:", held_out_accuracy)
held_out_f1 = eval_metrics["eval_f1"]
print(" Eval F1 after Epoch 2:", held_out_f1)

  return forward_call(*args, **kwargs)


 Eval Accuracy after Epoch 2: 0.5785788705980413
 Eval F1 after Epoch 2: 0.5489385908452707


In [14]:
# Reload the weights saved after epoch 2.
# NOTE(review): the next cell trains with resume_from_checkpoint=True, which
# presumably restores the model state from the checkpoint under output_dir
# anyway -- confirm whether this reload is needed.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 2')

In [15]:

# Second stage: same settings as before, but num_train_epochs is now 4.
# The logged step counter below continues at ~12000, so 4 is the cumulative
# target rather than the number of additional epochs.
training_args = TrainingArguments(
    # Checkpointing (same directory as the first stage).
    output_dir='./bert-review-classifier2',
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation.
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    fp16=True,
    # Logging.
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Pick up from the existing checkpoint instead of starting at step 0.
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
12000,0.9732
12050,0.8849
12100,0.8801
12150,0.8256
12200,0.8336
12250,0.8462
12300,0.9348
12350,0.8728
12400,0.8848
12450,0.8457


  return forward_call(*args, **kwargs)


TrainOutput(global_step=23996, training_loss=0.39555793771109476, metrics={'train_runtime': 1202.8646, 'train_samples_per_second': 319.171, 'train_steps_per_second': 19.949, 'total_flos': 1.262703965684736e+16, 'train_loss': 0.39555793771109476, 'epoch': 4.0})

In [16]:
# Snapshot of the model after 4 epochs; a later cell reloads it from this directory.
trainer.save_model('./epoch 4')

In [17]:
# Training-split scores after 4 epochs, for comparison with the held-out scores.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
train_accuracy, train_f1 = train_metrics["eval_accuracy"], train_metrics["eval_f1"]
print(" Training Accuracy after Epoch 4:", train_accuracy)
print(" Training F1 after Epoch 4:", train_f1)

  return forward_call(*args, **kwargs)


 Training Accuracy after Epoch 4: 0.8222858928943529
 Training F1 after Epoch 4: 0.8177697647589581


In [18]:
# Held-out scores after 4 epochs.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
for heading, metric_key in (
    (" Eval Accuracy after Epoch 4:", "eval_accuracy"),
    (" Eval F1 after Epoch 4:", "eval_f1"),
):
    print(heading, eval_metrics[metric_key])

  return forward_call(*args, **kwargs)


 Eval Accuracy after Epoch 4: 0.5556574286309648
 Eval F1 after Epoch 4: 0.5486452113432124


In [19]:
# Reload the weights saved after epoch 4.
# NOTE(review): the next cell trains with resume_from_checkpoint=True, which
# presumably restores the model state from the checkpoint under output_dir
# anyway -- confirm whether this reload is needed.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 4')

In [20]:

# Third stage: same settings again, with num_train_epochs raised to 6.
# The logged step counter below continues at ~24000, so 6 is the cumulative
# target rather than the number of additional epochs.
stage_three_settings = dict(
    output_dir='./bert-review-classifier2',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
)
training_args = TrainingArguments(**stage_three_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Pick up from the existing checkpoint instead of starting at step 0.
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
24000,0.5098
24050,0.62
24100,0.6013
24150,0.6102
24200,0.619
24250,0.6429
24300,0.6045
24350,0.6072
24400,0.5905
24450,0.6263


  return forward_call(*args, **kwargs)


TrainOutput(global_step=35994, training_loss=0.1891674054041898, metrics={'train_runtime': 1170.0898, 'train_samples_per_second': 492.167, 'train_steps_per_second': 30.762, 'total_flos': 1.894055948527104e+16, 'train_loss': 0.1891674054041898, 'epoch': 6.0})

In [21]:
# Final snapshot after 6 epochs; the evaluation sections below load both the
# model and the tokenizer from this directory.
trainer.save_model('./epoch 6')

In [22]:
# Training-split scores after 6 epochs.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
for heading, metric_key in (
    (" Training Accuracy after Epoch 6:", "eval_accuracy"),
    (" Training F1 after Epoch 6:", "eval_f1"),
):
    print(heading, train_metrics[metric_key])

  return forward_call(*args, **kwargs)


 Training Accuracy after Epoch 6: 0.907751614919775
 Training F1 after Epoch 6: 0.9061826601113455


In [23]:
# Held-out scores after 6 epochs.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
held_out_accuracy, held_out_f1 = eval_metrics["eval_accuracy"], eval_metrics["eval_f1"]
print(" Eval Accuracy after Epoch 6:", held_out_accuracy)
print(" Eval F1 after Epoch 6:", held_out_f1)

  return forward_call(*args, **kwargs)


 Eval Accuracy after Epoch 6: 0.5378620545947073
 Eval F1 after Epoch 6: 0.5339274340465767


**Testing on the whole balanced dataset**


In [49]:
# Evaluate on the whole balanced dataset.
# NOTE(review): whether these rows overlap with the training split taken from
# cleaned_imbalanced.csv cannot be told from this notebook -- confirm before
# reading the scores as held-out performance.
new_df = pd.read_csv("cleaned_balanced.csv")

In [50]:
# Keep only rows that actually have review text.
new_df = new_df.dropna(subset=['Review'])

In [51]:
# Sanity check: dtypes and non-null counts (Rating is float64 in this file).
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 114985 entries, 0 to 114999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  114985 non-null  int64  
 1   Rating      114985 non-null  float64
 2   Review      114985 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 3.5+ MB


In [52]:
# Same preprocessing as for training: rating 1-5 -> label 0-4, keep text + label.
new_df = (
    new_df
    .assign(label=new_df['Rating'].astype(int) - 1)
    [['Review', 'label']]
    .rename(columns={'Review': 'text'})
)

In [53]:
# preserve_index=False: the dropna above leaves a non-default index, which
# from_pandas would otherwise carry into the dataset as an extra
# '__index_level_0__' column.
new_dataset = Dataset.from_pandas(new_df, preserve_index=False)

In [54]:
# Load the tokenizer from the directory of the final (epoch 6) model snapshot.
tokenizer = AutoTokenizer.from_pretrained('./epoch 6')

In [55]:
def tokenize(example):
    """Encode the ``text`` field, truncated and padded to exactly 64 tokens."""
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(example["text"], **tokenizer_options)


# Tokenize batch-wise, then expose only the model inputs and the label as tensors.
new_dataset = new_dataset.map(tokenize, batched=True)
model_columns = ["input_ids", "attention_mask", "label"]
new_dataset.set_format(type="torch", columns=model_columns)

Map:   0%|          | 0/114985 [00:00<?, ? examples/s]

In [59]:
# Load the final (epoch 6) model snapshot for inference.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 6')

In [60]:
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair."""
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(labels, predicted_classes)
    metrics["f1"] = f1_score(labels, predicted_classes, average="weighted")
    return metrics


from transformers import TrainingArguments

# Inference-only configuration: no training, eval batches of 16, no external
# experiment tracker.
eval_only_settings = dict(
    output_dir="./results",
    per_device_eval_batch_size=16,
    do_train=False,
    do_eval=True,
    report_to="none",
    fp16=True,
)
training_args = TrainingArguments(**eval_only_settings)

In [61]:

# Trainer used purely as an inference harness, so no datasets are attached here.
# NOTE(review): the warning printed for this line is presumably the deprecation
# of the `tokenizer` argument -- confirm against the installed transformers version.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [62]:
from sklearn.metrics import classification_report

# Display names for label ids 0-4, in that order.
target_names = ["Rating 1", "Rating 2", "Rating 3", "Rating 4", "Rating 5"]

# Run inference over the whole dataset and take the arg-max class per row.
predictions_output = trainer.predict(new_dataset)
y_pred = np.argmax(predictions_output.predictions, axis=1)
y_true = predictions_output.label_ids

print("\nClassification Report:")
# Pin the label ids explicitly so every row of the report is tied to its name.
# Without `labels`, a class missing from both y_true and y_pred makes the call
# raise because the number of classes no longer matches target_names.
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(len(target_names))),
        target_names=target_names,
        digits=4,
    )
)


  return forward_call(*args, **kwargs)



Classification Report:
              precision    recall  f1-score   support

    Rating 1     0.5290    0.6961    0.6012     22994
    Rating 2     0.4059    0.2377    0.2998     22997
    Rating 3     0.3888    0.3412    0.3634     22999
    Rating 4     0.4020    0.4227    0.4121     22998
    Rating 5     0.5762    0.6738    0.6212     22997

    accuracy                         0.4743    114985
   macro avg     0.4604    0.4743    0.4595    114985
weighted avg     0.4604    0.4743    0.4595    114985



**Testing on the whole imbalanced dataset**

In [37]:
# Evaluate on the whole imbalanced dataset.
# NOTE(review): this is the same CSV the model was fine-tuned on. 80% of its
# rows went into the training split (train_test_split with test_size=0.2), so
# the scores below are dominated by training examples and are not a held-out
# estimate.
new_df = pd.read_csv("cleaned_imbalanced.csv")

In [38]:
# Keep only rows that actually have review text.
new_df = new_df.dropna(subset=['Review'])

In [39]:
# Sanity check: dtypes and non-null counts after dropping empty reviews.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119975 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  119975 non-null  int64 
 1   Rating      119975 non-null  int64 
 2   Review      119975 non-null  object
dtypes: int64(2), object(1)
memory usage: 3.7+ MB


In [40]:
# Rating 1-5 -> label 0-4, then keep just the review text (as 'text') and label.
zero_based_label = new_df['Rating'].astype(int) - 1
new_df = (
    new_df
    .assign(label=zero_based_label)
    .loc[:, ['Review', 'label']]
    .rename(columns={'Review': 'text'})
)

In [41]:
# preserve_index=False: the dropna above leaves a non-default index, which
# from_pandas would otherwise carry into the dataset as an extra
# '__index_level_0__' column.
new_dataset = Dataset.from_pandas(new_df, preserve_index=False)

In [42]:
# Load the tokenizer from the directory of the final (epoch 6) model snapshot.
tokenizer = AutoTokenizer.from_pretrained('./epoch 6')

In [43]:
def tokenize(example):
    """Tokenize ``example["text"]`` to a fixed length of 64 tokens.

    Longer inputs are truncated and shorter ones padded up to ``max_length``.
    """
    texts = example["text"]
    return tokenizer(texts, max_length=64, padding="max_length", truncation=True)


# Batched tokenization, followed by restricting the visible columns to the
# model inputs and the label, returned as PyTorch tensors.
new_dataset = new_dataset.map(tokenize, batched=True)
new_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

Map:   0%|          | 0/119975 [00:00<?, ? examples/s]

In [44]:
# Load the final (epoch 6) model snapshot for inference.
model = AutoModelForSequenceClassification.from_pretrained('./epoch 6')

In [45]:
def compute_metrics(eval_pred):
    """Accuracy and weighted F1 of the arg-max predictions in ``eval_pred``."""
    raw_scores, reference = eval_pred
    hard_predictions = np.argmax(raw_scores, axis=-1)
    return dict(
        accuracy=accuracy_score(reference, hard_predictions),
        f1=f1_score(reference, hard_predictions, average="weighted"),
    )


from transformers import TrainingArguments

# Evaluation-only run: training is disabled and nothing is sent to an external
# experiment tracker.
training_args = TrainingArguments(
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=16,
    fp16=True,
    output_dir="./results",
    report_to="none",
)

In [46]:

# Trainer used only for prediction; the dataset is passed to predict() later.
inference_components = dict(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**inference_components)

  trainer = Trainer(


In [48]:
from sklearn.metrics import classification_report

# Display names for label ids 0-4, in that order.
target_names = ["Rating 1", "Rating 2", "Rating 3", "Rating 4", "Rating 5"]

# Run inference over the whole dataset and take the arg-max class per row.
predictions_output = trainer.predict(new_dataset)
y_pred = np.argmax(predictions_output.predictions, axis=1)
y_true = predictions_output.label_ids

print("\nClassification Report:")
# Pin the label ids explicitly so every row of the report is tied to its name.
# Without `labels`, a class missing from both y_true and y_pred makes the call
# raise because the number of classes no longer matches target_names.
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(len(target_names))),
        target_names=target_names,
        digits=4,
    )
)


  return forward_call(*args, **kwargs)



Classification Report:
              precision    recall  f1-score   support

    Rating 1     0.8767    0.9191    0.8974     29989
    Rating 2     0.7297    0.6204    0.6706     11998
    Rating 3     0.7612    0.7446    0.7528     17996
    Rating 4     0.7947    0.7790    0.7868     23999
    Rating 5     0.8851    0.9150    0.8998     35993

    accuracy                         0.8338    119975
   macro avg     0.8095    0.7956    0.8015    119975
weighted avg     0.8308    0.8338    0.8316    119975

