In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np


In [2]:
# 1. Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
df=pd.read_csv(r"E:\Gradious_Final_Project\database\Final_Dataset.csv")
df.head()

Unnamed: 0,Index,Review,Cleaned_Review,Cleaned_Name,Severity,Urgency,Customer_Satisfaction,Review_Days,Response_Days,Predicted_Category,Probabilities,Mapped_Category,Hygiene,Food Quality,Atmosphere,Value for Money,Service Issue,Positive Review,Food Options,Review_Date
0,0,"Stop eating at this place, I have visited bang...",stop eating place visited bangalores nd punes ...,pramod kumar,High,Urgent,No Response,180,-1,"['Quality, taste, or freshness issues with foo...","[0.9965850710868835, 0.9946374893188477, 0.844...","['Food Quality', 'Hygiene', 'Service Issue']",1,1,0,0,1,0,0,10-05-2024
1,1,Food 2/5\nService 2/5\nAmbience 2/5 …,food service ambience,abhinav deep,Medium,Non-Urgent,High Satisfaction,365,365,"['Poor customer service or staff behavior', 'Q...","[0.8227755427360535, 0.7571902871131897]","['Service Issue', 'Food Quality']",0,1,0,0,1,0,0,07-11-2023
2,2,Idiotic varieties for the price they have char...,idiotic varieties price charged varieties boil...,vijay nammi,High,Urgent,No Response,30,-1,"['Quality, taste, or freshness issues with foo...","[0.9860756993293762, 0.98301100730896]","['Food Quality', 'Value for Money']",0,1,0,1,0,0,0,07-10-2024
3,3,"I am posting this live now, this is one of the...",posting live one worst places dont visit pathe...,surya ajay,High,Urgent,High Satisfaction,365,365,"['Poor customer service or staff behavior', 'Q...","[0.992651641368866, 0.9484805464744568]","['Service Issue', 'Food Quality']",0,1,0,0,1,0,0,07-11-2023
4,4,"We are pure vegetarians, I ordered veg biryani...",pure vegetarians ordered veg biryani swiggy go...,sai hithesh,Low,Non-Urgent,No Response,180,-1,"['Poor customer service or staff behavior', 'C...","[0.9473494291305542, 0.8612152338027954, 0.833...","['Service Issue', 'Value for Money', 'Food Qua...",0,1,0,1,1,0,0,10-05-2024


In [5]:
categories_to_add = ['Service Issue', 'Food Options', 'Food Quality', 'Atmosphere', 'Value for Money', 'Hygiene', 'Positive Review']

In [6]:
# 2. Split Data into Training and Test Sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Cleaned_Review'], df[categories_to_add], test_size=0.2, random_state=42
)

In [7]:
# 3. Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

In [8]:
# 4. Convert Labels to Tensors
train_labels = torch.tensor(train_labels.values).float()
test_labels = torch.tensor(test_labels.values).float()

In [9]:
# 5. Custom Dataset Class
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):  # Corrected constructor method name
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):  # Corrected method name
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):  # Corrected method name
        return len(self.labels)

In [10]:

train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)

In [11]:
# Load BERT model for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7).to(device)
criterion = torch.nn.BCEWithLogitsLoss()  # Standard binary cross-entropy loss
optimizer = AdamW(model.parameters(), lr=2e-5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:

# Training function
def train(model, train_loader, optimizer, criterion, accumulation_steps=4):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)
        loss = loss / accumulation_steps
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        
        total_loss += loss.item()

    return total_loss / len(train_loader)


In [None]:
# # Evaluation function
# def evaluate(model, test_loader, threshold=0.5):
#     model.eval()
#     all_preds = []
#     all_labels = []
#     with torch.no_grad():
#         for batch in test_loader:
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].cpu().numpy()
#             outputs = model(**inputs)
#             preds = torch.sigmoid(outputs.logits).cpu().numpy()

#             # Apply threshold for multi-label classification
#             preds = (preds > threshold).astype(int)
#             all_preds.extend(preds)
#             all_labels.extend(labels)

#     return np.array(all_preds), np.array(all_labels)

In [13]:
# Training loop
epochs = 3
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss:.4f}")


Epoch 1/3, Training Loss: 0.1104
Epoch 2/3, Training Loss: 0.0802
Epoch 3/3, Training Loss: 0.0611


In [14]:
category_labels = ['Service Issue', "Food Quality", "Atmosphere", "Value for Money", "Hygiene","Food Options","Positive Review"]

In [28]:
def evaluate(model, test_loader, threshold=0.5):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].cpu().numpy()  # True labels
            outputs = model(**inputs)
            preds = torch.sigmoid(outputs.logits).cpu().numpy()  # Get probabilities
            
            # Initialize predictions
            binary_preds = np.zeros((preds.shape[0], preds.shape[1]))

            # Process predictions
            for idx, prob in enumerate(preds):
                # Step 1: Check for categories with probability > threshold
                predicted_categories = []
                probabilities = []

                for i, score in enumerate(prob):
                    if score > threshold:  # Using threshold passed to the function
                        predicted_categories.append(category_labels[i])
                        probabilities.append(score)

                # Convert the predicted categories to binary format
                for category in category_labels:
                    if category in predicted_categories:
                        binary_preds[idx, category_labels.index(category)] = 1

            all_preds.append(binary_preds)
            all_labels.append(labels)

    # Convert lists to arrays
    return np.vstack(all_preds), np.vstack(all_labels)


In [43]:
# Sample Usage of the Evaluate Function
predicted_labels, true_labels = evaluate(model, test_loader, threshold=0.5)


In [44]:
# Check the lengths
print("Length of true labels:", true_labels.shape[0])
print("Length of predicted labels:", predicted_labels.shape[0])

Length of true labels: 841
Length of predicted labels: 841


In [46]:
# Generate the classification report
predicted_labels_binary = (predicted_labels > 0.5).astype(float)  # Binary representation based on probabilities

# Generate the classification report
print(classification_report(true_labels, predicted_labels_binary, target_names=category_labels))
print("Accuracy:", accuracy_score(true_labels, predicted_labels_binary))

                 precision    recall  f1-score   support

  Service Issue       0.86      0.84      0.85       392
   Food Quality       0.65      0.40      0.49       278
     Atmosphere       0.85      0.78      0.82       465
Value for Money       0.70      0.49      0.57       103
        Hygiene       0.83      0.44      0.57       184
   Food Options       0.69      0.74      0.72        58
Positive Review       0.77      0.57      0.66       160

      micro avg       0.80      0.65      0.72      1640
      macro avg       0.76      0.61      0.67      1640
   weighted avg       0.79      0.65      0.71      1640
    samples avg       0.80      0.70      0.71      1640

Accuracy: 0.38406658739595717


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:


def classify_reviews(model, reviews, category_labels, threshold=0.5):
    model.eval()
    all_preds = []
    all_probs = []

    # Tokenize the reviews (ensure you have a tokenizer defined)
    tokenized_reviews = tokenizer(reviews, padding=True, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**tokenized_reviews)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()  # Get probabilities

        for prob in preds:
            predicted_categories = []
            probabilities = []

            # Step 1: Check for categories with probability > threshold
            for i, score in enumerate(prob):
                if score > threshold:
                    predicted_categories.append(category_labels[i])
                    probabilities.append(score)

            # Step 2: If no categories above threshold, select the category with the highest probability
            if not predicted_categories:
                max_prob = max(prob)
                max_category = category_labels[prob.tolist().index(max_prob)]
                predicted_categories.append(max_category)
                probabilities.append(max_prob)

            all_preds.append(predicted_categories)
            all_probs.append(probabilities)

    return all_preds, all_probs

# Sample usage
sample_reviews = [
    "The food was excellent and the service was great!",
    "Service is bad",
    "atmosphere, hygience, food options is worst"
]

predicted_categories, predicted_probabilities = classify_reviews(model, sample_reviews, category_labels, threshold=0.5)

# Display results
for review, categories, probs in zip(sample_reviews, predicted_categories, predicted_probabilities):
    print(f"Review: {review}")
    print(f"Predicted Categories: {categories}")
    print(f"Probabilities: {probs}")
    print("-----")


Review: The food was excellent and the service was great!
Predicted Categories: ['Positive Review']
Probabilities: [0.917658]
-----
Review: Service is bad
Predicted Categories: ['Service Issue']
Probabilities: [0.9763981]
-----
Review: atmosphere, hygience, food options is worst
Predicted Categories: ['Atmosphere', 'Value for Money', 'Food Options']
Probabilities: [0.8749844, 0.58649784, 0.88801455]
-----


In [50]:
import os

# Create a directory to save the model and tokenizer
model_save_path = 'E:\Gradious_Final_Project\model'
os.makedirs(model_save_path, exist_ok=True)

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

print(f'Model and tokenizer saved to {model_save_path}')

Model and tokenizer saved to E:\Gradious_Final_Project\model
