In [None]:
# pandas is imported in this cell as well: the notebook's shared import cell
# comes *after* this one, so on a fresh kernel (Restart & Run All) `pd` would
# otherwise be undefined when the CSV is read.
import pandas as pd

# Load the pre-cleaned Yelp review sample into a DataFrame.
df = pd.read_csv('/content/processed_data/yelp_cleaned_sample.csv') #importing dataset

In [None]:
#Import Libraries
from datasets import Dataset
from evaluate import load
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report

import pandas as pd
import numpy as np
import torch

#Optimizing model and tuning hyperparameters

In [None]:
#Map original labels to binary (0-1 Negative, 3-4 Positive)
def map_to_binary(label):
    """Collapse an original label into a binary class.

    Returns 0 when `label` is 0 or 1, 1 when it is 3 or 4, and None for
    any other value so the caller can drop those rows.
    """
    if label in (0, 1):
        return 0
    if label in (3, 4):
        return 1
    return None

# Attach the binary target, discard rows whose label maps to None (stored as
# a missing value), and keep the target as an integer column.
df = (
    df.assign(binary_label=df['label'].apply(map_to_binary))
    .dropna(subset=['binary_label'])
    .astype({'binary_label': int})
)

# Data split: 80/20, stratified on the binary target, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['binary_label'],
    random_state=42,
)

# Convert to Hugging Face datasets, keeping only the text and its target.
# The target column is renamed to "labels" in the same step.
# preserve_index=False: after dropna/train_test_split the frames carry a
# shuffled, non-default index, which from_pandas would otherwise store as an
# extra `__index_level_0__` column that nothing downstream uses.
train_dataset = Dataset.from_pandas(
    train_df[['clean_text', 'binary_label']],
    preserve_index=False,
).rename_column("binary_label", "labels")
test_dataset = Dataset.from_pandas(
    test_df[['clean_text', 'binary_label']],
    preserve_index=False,
).rename_column("binary_label", "labels")

# Tokenizer matching the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(examples):
    """Tokenize the `clean_text` field of a batch with the module-level tokenizer.

    Sequences are truncated and padded with `padding='max_length'`; the
    tokenizer's output is returned unchanged.
    """
    texts = examples['clean_text']
    return tokenizer(texts, truncation=True, padding='max_length')
# Tokenize each split in batches, then expose only the model inputs and the
# targets as torch tensors.
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset.set_format(
    type="torch",
    columns=['input_ids', 'attention_mask', 'labels'],
)

test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset.set_format(
    type="torch",
    columns=['input_ids', 'attention_mask', 'labels'],
)


# Create the model: DistilBERT with a two-label classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

#Define Metrics

def compute_metrics(pred):
    """Score one evaluation pass.

    Reads `pred.predictions` (the predicted class is the argmax over
    axis 1) and `pred.label_ids`, and returns a dict with the keys
    "accuracy", "recall" and "f1".
    """
    y_pred = np.argmax(pred.predictions, axis=1)
    y_true = pred.label_ids
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
    }

#Training Args
training_args = TrainingArguments(
    output_dir="./bert_yelp",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,  # small batch size to reduce resource load
    per_device_eval_batch_size=8,  # small batch size to reduce resource load
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Mixed precision needs a CUDA device; a hard-coded True makes
    # TrainingArguments raise on CPU-only runtimes. With a GPU present this
    # still evaluates to True, so GPU runs are unchanged.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,  # effective train batch per device: 8 * 2 = 16
    # NOTE(review): presumably requires a Hugging Face login earlier in the
    # session; no login step is visible in this notebook - confirm.
    push_to_hub=True,
)

#Trainer Set-up
# Note: `trainer` is assigned again in the next cell before training starts.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Build the trainer on the tokenized splits. The datasets prepared earlier in
# the notebook are named `train_dataset` / `test_dataset`; the previous
# `tokenized_train` / `tokenized_test` names are never defined, so this cell
# raised NameError on a fresh kernel.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
results = trainer.evaluate()
# compute_metrics returns "accuracy", which is read here as "eval_accuracy".
print(f"\n Final Evaluation Accuracy: {results['eval_accuracy']:.4f}")