In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [2]:
# Load the review file; later cells use its `Rating`, `Review` and `Unnamed: 0` columns.
raw_csv_path = "dataset_without.csv"
df = pd.read_csv(raw_csv_path)

In [6]:
# Remove the leftover `Unnamed: 0` column (presumably an index saved with the CSV).
# `errors="ignore"` keeps the cell re-runnable: without it a second execution raises
# KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Apply the cleaning that the recorded outputs imply. The tokenisation progress bars
# show 119640 + 29911 = 149551 rows, which is exactly what the NaN / word-count filter
# in the "whole dataset" section leaves from this CSV. The cells that originally did
# this are no longer in the notebook, so on a fresh kernel the split would otherwise
# run on the unfiltered frame (including any missing reviews).
df = df.dropna(subset=['Review'])
review_word_counts = df['Review'].astype(str).str.split().str.len()
df = df[review_word_counts.between(3, 200)]  # keep reviews of 3..200 words (inclusive)

# Convert the rating to a 0-indexed class label for the classifier: ratings 1–5 -> labels 0–4.
# `assign` builds a new frame, so nothing is written into a filtered slice.
df = (
    df.assign(label=df['Rating'].astype(int) - 1)[['Review', 'label']]
    .rename(columns={'Review': 'text'})
)

# Stratified 80/20 split with a fixed seed so class proportions and rows are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Convert to HuggingFace Dataset format. The split frames carry a shuffled pandas index;
# preserve_index=False stops it from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [9]:
from transformers import AutoTokenizer

# Checkpoint used for both the tokenizer and the classifier.
# (An alternative that was tried and disabled: "microsoft/deberta-v3-small".)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Encode the ``text`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to a fixed length of 64 tokens.
    Intended for ``Dataset.map(..., batched=True)``.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(example["text"], **encoding_options)

# Tokenise both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors.
for tokenized_split in (train_dataset, test_dataset):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/119640 [00:00<?, ? examples/s]

Map:   0%|          | 0/29911 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForSequenceClassification

# Pretrained encoder with a 5-way classification head (labels 0–4); the head weights
# are newly initialised, as the load message in this cell's output reports.
num_rating_classes = 5
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_rating_classes)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw model outputs into the metrics reported by the ``Trainer``.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            position of the largest logit along the last axis.

    Returns:
        dict with ``"accuracy"`` and ``"f1"`` (F1 averaged with ``average="weighted"``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted, average="weighted")
    return metrics

# Hyper-parameters for the first fine-tuning run (4 epochs).
# NOTE(review): no evaluation schedule is configured here, so the gap between train and
# test scores only becomes visible in the separate evaluate cells further down.
training_args = TrainingArguments(
    # output locations / reporting
    output_dir="./bert-review-classifier4",
    logging_dir="./logs",
    report_to="none",
    # optimisation
    num_train_epochs=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # logging and checkpoint cadence: log every 50 steps, keep one checkpoint per-epoch save
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)


In [12]:
# Wire model, arguments, both splits and the metric function into one Trainer.
# NOTE(review): the truncated warning under this cell is presumably the `tokenizer=`
# deprecation notice — confirm against the installed transformers version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Kept as the last expression so the returned TrainOutput is displayed.
trainer.train()



  trainer = Trainer(


Step,Training Loss
50,1.5241
100,1.2918
150,1.1187
200,1.0669
250,1.0548
300,1.0759
350,1.0104
400,1.0383
450,1.0201
500,0.9955


TrainOutput(global_step=29912, training_loss=0.6416172916062154, metrics={'train_runtime': 2927.4831, 'train_samples_per_second': 163.471, 'train_steps_per_second': 10.218, 'total_flos': 1.573972728219648e+16, 'train_loss': 0.6416172916062154, 'epoch': 4.0})

In [13]:
# Persist the state reached after the 4-epoch run.
epoch4_dir = './epoche 4'
trainer.save_model(epoch4_dir)

In [14]:
# Score the training split itself; keys carry the `eval_` prefix added by `Trainer.evaluate`.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
for caption, metric_key in (
    (" Training Accuracy after Epoch 4:", "eval_accuracy"),
    (" Training F1 after Epoch 4:", "eval_f1"),
):
    print(caption, train_metrics[metric_key])

 Training Accuracy after Epoch 4: 0.9195336008024072
 Training F1 after Epoch 4: 0.9189682640265431


In [15]:
# Score the held-out 20% split.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
for caption, metric_key in (
    (" Eval Accuracy after Epoch 4:", "eval_accuracy"),
    (" Eval F1 after Epoch 4:", "eval_f1"),
):
    print(caption, eval_metrics[metric_key])

 Eval Accuracy after Epoch 4: 0.6479890341345993
 Eval F1 after Epoch 4: 0.647166683555582


In [16]:
# Reload the classifier saved after epoch 4.
# NOTE(review): the next cell resumes from a checkpoint, which presumably reloads the
# weights anyway — confirm whether this load is still needed.
epoch4_dir = './epoche 4'
model = AutoModelForSequenceClassification.from_pretrained(epoch4_dir)

In [18]:

# Same configuration as the first run, except the epoch budget is raised from 4 to 6
# so training can be continued from the existing checkpoint directory.
training_args = TrainingArguments(
    # output locations / reporting
    output_dir='./bert-review-classifier4',
    logging_dir="./logs",
    report_to="none",
    # optimisation
    num_train_epochs=6,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # logging and checkpoint cadence
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
)

# Rebuild the Trainer around the 6-epoch arguments.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Resume training from checkpoint (the recorded log continues after step 29912).
# Kept as the last expression so the returned TrainOutput is displayed.
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(


Step,Training Loss
29950,0.3021
30000,0.3233
30050,0.4017
30100,0.3681
30150,0.3557
30200,0.3968
30250,0.3406
30300,0.37
30350,0.3363
30400,0.3689


TrainOutput(global_step=44868, training_loss=0.10649204205263635, metrics={'train_runtime': 1485.8733, 'train_samples_per_second': 483.11, 'train_steps_per_second': 30.196, 'total_flos': 2.360959092329472e+16, 'train_loss': 0.10649204205263635, 'epoch': 6.0})

In [19]:
# Persist the state reached after continuing to 6 epochs; later cells load both the
# tokenizer and the model from this directory.
epoch6_dir = './epoche 6'
trainer.save_model(epoch6_dir)

In [20]:
# Score the training split after the 6-epoch run.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
for caption, metric_key in (
    (" Training Accuracy after Epoch 6:", "eval_accuracy"),
    (" Training F1 after Epoch 6:", "eval_f1"),
):
    print(caption, train_metrics[metric_key])

 Training Accuracy after Epoch 6: 0.962094617184888
 Training F1 after Epoch 6: 0.9619187890493135


In [21]:
# Score the held-out 20% split after the 6-epoch run.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
for caption, metric_key in (
    (" Eval Accuracy after Epoch 6:", "eval_accuracy"),
    (" Eval F1 after Epoch 6:", "eval_f1"),
):
    print(caption, eval_metrics[metric_key])

 Eval Accuracy after Epoch 6: 0.6370231687339106
 Eval F1 after Epoch 6: 0.6380255213886105


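**Testing on the whole dataset** — note that this set includes the 80% of reviews the model was fine-tuned on, so the scores below are not a held-out estimate (the held-out test split above scores about 0.64 accuracy).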

In [22]:
# Re-read the full review file to score the model on every row.
# The class supports in the classification report below are uneven (14823 to 47118),
# so this data is not class-balanced.
whole_csv_path = "dataset_without.csv"
new_df = pd.read_csv(whole_csv_path)

In [23]:
# Keep only rows that actually have review text.
has_review = new_df['Review'].notna()
new_df = new_df[has_review]

In [24]:
# Whitespace-separated word count per review, added as a helper column for the length filter.
new_df = new_df.assign(word_count=new_df['Review'].astype(str).str.split().str.len())

In [25]:
# Keep reviews with 3 to 200 words; `between` is inclusive at both ends.
new_df = new_df[new_df['word_count'].between(3, 200)]

In [26]:
# The helper column has served its purpose. `errors="ignore"` keeps the cell
# re-runnable: without it a second execution raises KeyError on the missing column.
new_df = new_df.drop(columns=['word_count'], errors='ignore')

In [27]:
# Remove the leftover `Unnamed: 0` column (presumably an index saved with the CSV).
# `errors="ignore"` keeps the cell re-runnable once the column is already gone.
new_df = new_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [28]:
# Sanity check after cleaning: row count, remaining columns and their dtypes / non-null counts.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149551 entries, 0 to 162815
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Rating  149551 non-null  int64 
 1   Review  149551 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.4+ MB


In [29]:
# Same label encoding as for training (rating minus one), then keep just the text and
# label columns under the names the tokenisation step reads.
new_df = (
    new_df.assign(label=new_df['Rating'].astype(int) - 1)
    .loc[:, ['Review', 'label']]
    .rename(columns={'Review': 'text'})
)

In [30]:
# The filtered frame has a non-contiguous index (149551 rows, labels up to 162815);
# preserve_index=False stops it from being stored as an extra `__index_level_0__` column.
new_dataset = Dataset.from_pandas(new_df, preserve_index=False)

In [33]:
# Load the tokenizer from the directory written by `trainer.save_model` after epoch 6.
epoch6_dir = './epoche 6'
tokenizer = AutoTokenizer.from_pretrained(epoch6_dir)

In [34]:
def tokenize(example):
    """Encode the ``text`` field of a batch with the module-level ``tokenizer``.

    Uses the same settings as the training-time definition earlier in the
    notebook: truncate and pad every sequence to 64 tokens.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(example["text"], **encoding_options)

# Tokenise in batches, then expose only model inputs and label as PyTorch tensors.
tokenized_all = new_dataset.map(tokenize, batched=True)
tokenized_all.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
new_dataset = tokenized_all

Map:   0%|          | 0/149551 [00:00<?, ? examples/s]

In [35]:
# Load the classifier saved after the 6-epoch run.
epoch6_dir = './epoche 6'
model = AutoModelForSequenceClassification.from_pretrained(epoch6_dir)

In [36]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted F1.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted class is the
            position of the largest logit along the last axis.

    Returns:
        dict with ``"accuracy"`` and ``"f1"`` (F1 averaged with ``average="weighted"``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted, average="weighted")
    return metrics


from transformers import TrainingArguments

# Arguments for an inference-only Trainer: no training, evaluation batches of 16.
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=16,
    fp16=True,
)

In [37]:

# Trainer used purely as a batched prediction harness: no train or eval dataset is attached.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

  trainer = Trainer(


In [38]:
from sklearn.metrics import classification_report
# Display names for labels 0–4, in label order.
target_names = ["Rating 1", "Rating 2", "Rating 3", "Rating 4", "Rating 5"]

# Run the model over every row and take the highest-scoring class per row.
predictions_output = trainer.predict(new_dataset)
y_true = predictions_output.label_ids
y_pred = np.argmax(predictions_output.predictions, axis=1)

report_text = classification_report(y_true, y_pred, digits=4, target_names=target_names)
print("\nClassification Report:")
print(report_text)



Classification Report:
              precision    recall  f1-score   support

    Rating 1     0.9240    0.9349    0.9294     27888
    Rating 2     0.8255    0.7933    0.8091     14823
    Rating 3     0.8497    0.8493    0.8495     21670
    Rating 4     0.8822    0.8834    0.8828     38052
    Rating 5     0.9363    0.9403    0.9383     47118

    accuracy                         0.8971    149551
   macro avg     0.8835    0.8803    0.8818    149551
weighted avg     0.8967    0.8971    0.8968    149551



In [None]:
import torch

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode (disables dropout)

# Example review text
texts = [
    "I bought this tablet for my 4 year old daughter. Too many advertisements. I would rather pay more and not have a single ad.",
]

# Tokenize with the same 64-token truncation limit used for training and evaluation,
# so a long review is cut at the same point the model saw during fine-tuning
# (this cell previously used 128). `padding=True` pads only to the longest text in the batch.
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors="pt"
).to(device)

# Prediction: no gradients needed; +1 maps class labels 0–4 back to ratings 1–5.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1) + 1

# The printed values are ratings (1–5), not the 0-indexed training labels.
print("Predicted labels:", predictions.cpu().numpy())

Predicted labels: [1]
