In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [None]:
# Load the raw review table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="dataset_without.csv")

In [None]:
# Discard the leftover 'Unnamed: 0' column (presumably an index written out
# when the CSV was saved - TODO confirm). errors="ignore" makes this cell safe
# to re-run: without it a second execution raises KeyError because the column
# has already been removed from `df`.
df = df.drop(columns=['Unnamed: 0'], errors="ignore")

In [None]:
# Convert float rating to int class label (0-indexed for sequence classification)
df['label'] = df['Rating'].astype(int) - 1  # Ratings 1–5 -> Labels 0–4

# Fail early with a readable message instead of deep inside the training loop:
# the classifier is built with 5 output classes, so labels must lie in 0–4.
if not df['label'].between(0, 4).all():
    raise ValueError("Ratings must be in the range 1-5 to map onto labels 0-4")

# Keep only the model inputs, renaming the review column to 'text'
df = df[['Review', 'label']].rename(columns={'Review': 'text'})

# Stratified 80/20 split so both parts keep the same label distribution;
# random_state pins the shuffle for reproducibility.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Convert to HuggingFace Dataset format. preserve_index=False stops the
# shuffled pandas index from being stored as an extra `__index_level_0__`
# column in each dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
from transformers import AutoTokenizer

# Checkpoint used for both the tokenizer and the classifier.
# ("microsoft/deberta-v3-small" was the alternative tried here.)
model_name = "roberta-base"

# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

def tokenize(example):
    """Tokenize the "text" field into fixed-length model inputs.

    Every sequence is truncated or padded to exactly 64 tokens, so all
    encoded rows have the same length.

    Args:
        example: Mapping with a "text" entry. When called through
            `Dataset.map(..., batched=True)` this is a batch of rows.

    Returns:
        The tokenizer output for `example["text"]`.
    """
    encoded = tokenizer(
        example["text"],
        max_length=64,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the columns the model consumes, returned as PyTorch tensors.
_torch_columns = ["input_ids", "attention_mask", "label"]
for _split in (train_dataset, test_dataset):
    _split.set_format(type="torch", columns=_torch_columns)


Map:   0%|          | 0/119640 [00:00<?, ? examples/s]

Map:   0%|          | 0/29911 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained encoder with a 5-way classification head (one output
# per label 0–4). The head weights are newly initialised, as the loading
# message below this cell reports, so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-element iterable of (logits, labels), as handed over
            by the Trainer during evaluation.

    Returns:
        Dict with "accuracy" and "f1" (F1 averaged with average="weighted").
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(raw_scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted, average="weighted")
    return metrics

# Hyperparameters for the initial 4-epoch fine-tuning run.
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # mixed-precision training
    # Checkpointing: one checkpoint per epoch, only the newest is kept
    output_dir="./roberta-review-classifier2",
    save_strategy="epoch",
    save_total_limit=1,
    # Logging: training loss every 50 steps, no external tracker
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)


In [37]:
# Wire the model, hyperparameters, data splits and metric function together.
# `processing_class` replaces the deprecated `tokenizer` argument, which emits
# a FutureWarning on current transformers releases and is scheduled for removal.
# training_args sets no evaluation strategy, so eval_dataset is only used by
# the explicit trainer.evaluate() calls further down.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Fine-tune; the returned TrainOutput is shown as the cell result.
trainer.train()



  trainer = Trainer(


Step,Training Loss
50,1.4786
100,1.2156
150,1.0601
200,1.007
250,0.9872
300,1.0104
350,0.96
400,0.9867
450,0.973
500,0.9484


Step,Training Loss
50,1.4786
100,1.2156
150,1.0601
200,1.007
250,0.9872
300,1.0104
350,0.96
400,0.9867
450,0.973
500,0.9484


TrainOutput(global_step=29912, training_loss=0.7138626198776268, metrics={'train_runtime': 3310.7417, 'train_samples_per_second': 144.548, 'train_steps_per_second': 9.035, 'total_flos': 1.573972728219648e+16, 'train_loss': 0.7138626198776268, 'epoch': 4.0})

In [38]:
# Persist the model as it stands after the 4-epoch run.
trainer.save_model(output_dir="./epoche 4")

In [39]:
# Score the model on the training split, for comparison against the held-out
# split evaluated in the next cell.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
for _caption, _metric_key in (
    (" Training Accuracy after Epoch 4:", "eval_accuracy"),
    (" Training F1 after Epoch 4:", "eval_f1"),
):
    print(_caption, train_metrics[_metric_key])

 Training Accuracy after Epoch 4: 0.8189986626546305
 Training F1 after Epoch 4: 0.8164014053806227


In [40]:
# Score the model on the held-out split.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
for _caption, _metric_key in (
    (" Eval Accuracy after Epoch 4:", "eval_accuracy"),
    (" Eval F1 after Epoch 4:", "eval_f1"),
):
    print(_caption, eval_metrics[_metric_key])

 Eval Accuracy after Epoch 4: 0.6723613386379593
 Eval F1 after Epoch 4: 0.6688587260200864


In [41]:
# Reload the weights saved after epoch 4.
# NOTE(review): the following cell trains with resume_from_checkpoint=True,
# which presumably restores model weights from the output_dir checkpoint and
# so supersedes what is loaded here - confirm whether this reload is needed.
model = AutoModelForSequenceClassification.from_pretrained(
    './epoche 4',
)

In [42]:

# Same settings as the first run, except the epoch budget is raised from 4 to 6.
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # mixed-precision training
    # Checkpointing: same output_dir as the first run, one checkpoint per
    # epoch, only the newest is kept
    output_dir='./roberta-review-classifier2',
    save_strategy="epoch",
    save_total_limit=1,
    # Logging: training loss every 50 steps, no external tracker
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)

# Rebuild the Trainer around the new epoch budget.
# `processing_class` replaces the deprecated `tokenizer` argument, which emits
# a FutureWarning on current transformers releases and is scheduled for removal.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Resume training from checkpoint: with resume_from_checkpoint=True the Trainer
# picks up the most recent checkpoint found in training_args.output_dir and
# continues from its saved model/optimizer/scheduler state up to
# num_train_epochs. It raises if output_dir contains no checkpoint.
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(


Step,Training Loss
29950,0.5509
30000,0.5065
30050,0.582
30100,0.6238
30150,0.5651
30200,0.5638
30250,0.596
30300,0.5268
30350,0.5241
30400,0.5286


TrainOutput(global_step=44868, training_loss=0.17683073864738136, metrics={'train_runtime': 1612.6179, 'train_samples_per_second': 445.14, 'train_steps_per_second': 27.823, 'total_flos': 2.360959092329472e+16, 'train_loss': 0.17683073864738136, 'epoch': 6.0})

In [43]:
# Persist the model as it stands after continuing to epoch 6.
trainer.save_model(output_dir="./epoche 6")

In [44]:
# Score the epoch-6 model on the training split.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
for _caption, _metric_key in (
    (" Training Accuracy after Epoch 6:", "eval_accuracy"),
    (" Training F1 after Epoch 6:", "eval_f1"),
):
    print(_caption, train_metrics[_metric_key])

 Training Accuracy after Epoch 6: 0.8632313607489134
 Training F1 after Epoch 6: 0.8617799939402804


In [45]:
# Score the epoch-6 model on the held-out split.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
for _caption, _metric_key in (
    (" Eval Accuracy after Epoch 6:", "eval_accuracy"),
    (" Eval F1 after Epoch 6:", "eval_f1"),
):
    print(_caption, eval_metrics[_metric_key])

 Eval Accuracy after Epoch 6: 0.6645381297850289
 Eval F1 after Epoch 6: 0.6617333888266799


In [None]:
# Reload the weights saved after epoch 4. The directory must match the one
# written by trainer.save_model('./epoche 4') earlier in this notebook;
# './epoch 4' is never created here, so loading it fails on a fresh run.
model = AutoModelForSequenceClassification.from_pretrained('./epoche 4')

In [None]:

# Variant of the 6-epoch configuration with the learning rate raised to 3e-5;
# all other settings match the previous run.
training_args = TrainingArguments(
    # Optimisation
    num_train_epochs=6,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # mixed-precision training
    # Checkpointing: same output_dir as the earlier runs, one checkpoint per
    # epoch, only the newest is kept
    output_dir='./roberta-review-classifier2',
    save_strategy="epoch",
    save_total_limit=1,
    # Logging: training loss every 50 steps, no external tracker
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)

# Rebuild the Trainer for the higher-learning-rate variant.
# `processing_class` replaces the deprecated `tokenizer` argument, which emits
# a FutureWarning on current transformers releases and is scheduled for removal.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Resume training from checkpoint: with resume_from_checkpoint=True the Trainer
# picks up the most recent checkpoint found in training_args.output_dir and
# continues from its saved model/optimizer/scheduler state.
# NOTE(review): the restored optimizer/scheduler state may take precedence over
# learning_rate=3e-5 in training_args - verify the new rate is actually applied
# (the metrics of this run are nearly identical to the 2e-5 run).
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(


Step,Training Loss
29950,0.5621
30000,0.5407
30050,0.5879
30100,0.6037
30150,0.5998
30200,0.5834
30250,0.5964
30300,0.5438
30350,0.524
30400,0.5424


TrainOutput(global_step=44868, training_loss=0.17858587182162225, metrics={'train_runtime': 1584.8264, 'train_samples_per_second': 452.946, 'train_steps_per_second': 28.311, 'total_flos': 2.360959092329472e+16, 'train_loss': 0.17858587182162225, 'epoch': 6.0})

In [None]:
# Persist the model produced by the learning_rate=3e-5 continuation.
trainer.save_model(output_dir="./epoch 6")

In [None]:
# Score the learning_rate=3e-5 model on the training split.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
for _caption, _metric_key in (
    (" Training Accuracy after Epoch 6:", "eval_accuracy"),
    (" Training F1 after Epoch 6:", "eval_f1"),
):
    print(_caption, train_metrics[_metric_key])

 Training Accuracy after Epoch 6: 0.8719658976930792
 Training F1 after Epoch 6: 0.870887813426694


In [None]:
# Score the learning_rate=3e-5 model on the held-out split.
eval_metrics = trainer.evaluate(eval_dataset=test_dataset)
for _caption, _metric_key in (
    (" Eval Accuracy after Epoch 6:", "eval_accuracy"),
    (" Eval F1 after Epoch 6:", "eval_f1"),
):
    print(_caption, eval_metrics[_metric_key])

 Eval Accuracy after Epoch 6: 0.6646384273344255
 Eval F1 after Epoch 6: 0.6627130722239273
