In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset


In [None]:
# Attach Google Drive to the Colab filesystem so later cells can write under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Negations whose stem changes when expanded; a generic "n't" -> " not" rule
# would otherwise produce "wo not" / "ca not" / "sha not".
_IRREGULAR_NEGATIONS = {
    "won't": "will not",
    "can't": "can not",
    "shan't": "shall not",
}
_IRREGULAR_NEGATION_RE = re.compile(r"\b(?:won't|can't|shan't)\b")

# Regular contraction suffixes and their expansions.
# There is deliberately no bare "'t" rule: once "n't" is handled, the only
# remaining "'t" occurrences are opening quotes ('top') or forms like 'til,
# and expanding those to " not" would inject a false negation.
_CONTRACTION_SUFFIXES = {
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am",
}
# A suffix only counts when it is attached to a preceding letter and ends at a
# word boundary, so opening quotes such as 'sushi' or 'delicious' are left alone.
_CONTRACTION_SUFFIX_RE = re.compile(r"(?<=[a-z])(?:n't|'(?:re|s|d|ll|ve|m))\b")


def preprocess_text(text):
    """Normalize a raw review into lowercase text made of letters and whitespace.

    Steps, in order:
      1. lowercase the text;
      2. strip HTML markup with BeautifulSoup's ``html.parser``;
      3. map typographic apostrophes to the ASCII apostrophe;
      4. expand English contractions ("won't" -> "will not", "he's" -> "he is");
      5. delete every character that is not ``a-z`` or whitespace.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Runs of whitespace are not collapsed, so removing
        punctuation can leave consecutive spaces.
    """
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()

    # Curly quotes would otherwise never match the ASCII contraction patterns.
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    # Irregular forms first, so the generic "n't" rule cannot split them.
    text = _IRREGULAR_NEGATION_RE.sub(lambda m: _IRREGULAR_NEGATIONS[m.group(0)], text)
    text = _CONTRACTION_SUFFIX_RE.sub(lambda m: _CONTRACTION_SUFFIXES[m.group(0)], text)

    text = re.sub(r'[^a-z\s]', '', text)

    return text

In [None]:
# Download (or reuse the cached copy of) the Yelp full-review dataset and
# show the first training record as a quick sanity check.
dataset = load_dataset('Yelp/yelp_review_full')
first_train_example = dataset['train'][0]
print(first_train_example)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}


In [None]:
# Number of rows in the training split. Taking len() of the split itself avoids
# materialising the whole 'text' column just to count it.
print(len(dataset['train']))

650000


In [None]:
def _frame_from_train_slice(start, stop):
    """Build a DataFrame from rows [start, stop) of the training split and add
    a 'cleaned_text' column produced by preprocess_text."""
    frame = pd.DataFrame(dataset['train'][start:stop])
    frame['cleaned_text'] = frame['text'].apply(preprocess_text)
    return frame


# First 5000 training rows are used for fitting, the next 500 for validation.
train_data = _frame_from_train_slice(0, 5000)
val_data = _frame_from_train_slice(5000, 5500)

# Drop the reference to the full dataset; re-running this cell afterwards
# requires loading the dataset again first.
dataset = None

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [None]:
# Row count per label in the training frame, to check class balance.
label_counts = train_data.groupby('label').size()
print(label_counts)

label
0     883
1    1126
2    1119
3     978
4     894
dtype: int64


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(text):
    """Tokenize one cleaned review for BERT.

    The text is wrapped in special tokens, truncated or padded to exactly
    512 tokens, and returned as PyTorch tensors.

    Args:
        text: A single review string.

    Returns:
        The tokenizer's encoding for ``text``; downstream cells read its
        'input_ids' and 'attention_mask' entries.
    """
    # Call the tokenizer directly: encode_plus is deprecated in favour of
    # __call__, which produces the same encoding for a single string.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Tokenize every cleaned review; each result is a Series (indexed like its
# source frame) holding one tokenizer encoding per row.
encoded_train_data = train_data['cleaned_text'].apply(tokenize_texts)
encoded_val_data = val_data['cleaned_text'].apply(tokenize_texts)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import BertForSequenceClassification
import torch

# BERT with a single-output head, configured for regression rather than
# classification (the head weights are freshly initialised and need training).
PRETRAINED_MODEL_NAME = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=1)
setattr(model.config, "problem_type", "regression")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW


def _stack_field(encodings, field):
    """Stack one field ('input_ids' or 'attention_mask') of every per-row
    encoding into a single tensor, squeezing each entry first."""
    return torch.stack([encoding[field].squeeze() for encoding in encodings])


def _scaled_labels(frame):
    """Return the frame's 'label' column divided by (5 - 1) as a float32 tensor,
    so labels 0..4 map onto 0.0..1.0."""
    scaled = frame['label'].astype(float) / (5 - 1)
    return torch.tensor(scaled.values, dtype=torch.float32)


train_input_ids = _stack_field(encoded_train_data, 'input_ids')
train_attention_masks = _stack_field(encoded_train_data, 'attention_mask')
train_labels = _scaled_labels(train_data)

val_input_ids = _stack_field(encoded_val_data, 'input_ids')
val_attention_masks = _stack_field(encoded_val_data, 'attention_mask')
val_labels = _scaled_labels(val_data)

# Training batches are reshuffled every epoch; validation order stays fixed.
dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

num_epochs = 10
total_training_steps = num_epochs * len(train_dataloader)

# Mean-squared error on the scaled labels, AdamW, and a learning rate that
# decays linearly to zero over all optimisation steps (no warm-up).
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Early-stopping state: stop after `patience` consecutive epochs without a
# new best validation loss.
patience = 2
counter = 0
best_val_loss = np.inf

# Fine-tune the model with early stopping. After every epoch the mean
# validation loss is compared with the best seen so far: an improvement saves
# a checkpoint and resets the counter, otherwise the counter grows and
# training stops once it reaches `patience`.
checkpoint_path = 'best_model.pth'

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed explicitly
        # with `criterion`, so a model-internal loss would be wasted work.
        outputs = model(input_ids, attention_mask=attention_mask)
        # Drop only the trailing size-1 output dimension so a batch holding a
        # single example still yields a 1-D tensor matching `labels`.
        logits = outputs.logits.squeeze(-1)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze(-1)
            loss = criterion(logits, labels)

            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")

    # Early Stopping and Model Checkpointing
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), checkpoint_path)  # Save the best model
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered")
            break

# Load the best model (map_location keeps the weights loadable on the current
# device even if the checkpoint was written from a different one).
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

Epoch 1/10, Average Loss: 0.0595
Epoch 1/10, Validation Loss: 0.0510
Epoch 2/10, Average Loss: 0.0334
Epoch 2/10, Validation Loss: 0.0392
Epoch 3/10, Average Loss: 0.0235
Epoch 3/10, Validation Loss: 0.0320
Epoch 4/10, Average Loss: 0.0177
Epoch 4/10, Validation Loss: 0.0350
Epoch 5/10, Average Loss: 0.0144
Epoch 5/10, Validation Loss: 0.0320
Early stopping triggered


<All keys matched successfully>

In [None]:
import os

# Persist the fine-tuned weights to Google Drive. The target folder is created
# first because torch.save fails when the parent directory does not exist.
drive_path = '/content/drive/MyDrive/Models/reviewbody_scoring_v1.pth'
os.makedirs(os.path.dirname(drive_path), exist_ok=True)
torch.save(model.state_dict(), drive_path)

In [None]:
def predict_score(review):
    """Predict a star rating for a single review with the fine-tuned model.

    The review goes through the same ``preprocess_text`` cleaning and the same
    512-token limit that were used to build the training inputs, so inference
    sees text in the form the model was trained on.

    Args:
        review: The raw review string.

    Returns:
        A float star score: the model's single output multiplied by 4 plus 1,
        which undoes the label / 4 scaling used for training and shifts the
        0-based label onto a 1-based scale. The value is not clamped, so it
        can fall slightly outside 1..5.
    """
    model.eval()

    cleaned_review = preprocess_text(review)
    # A single sequence needs no padding; truncation keeps it within 512 tokens.
    encoded = tokenizer(cleaned_review, return_tensors='pt', truncation=True, max_length=512)
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**model_inputs)

    return outputs.logits.item() * 4 + 1

In [None]:
# Hand-written sample reviews for a manual smoke test of the scorer: entries
# alternate between short and long texts and cover positive, negative and
# mixed opinions.
reviews = [
    "Amazing restaurant! The sushi was incredibly fresh, and the service was top-notch. Highly recommend!",
    "What a delightful dining experience! The ambiance was warm and inviting, with soft music playing in the background. I ordered the grilled steak, which was cooked to perfection, and the accompanying sides were equally delicious. The staff was attentive and friendly, making sure we had everything we needed. I can't wait to return!",
    "Fantastic brunch spot! The pancakes were fluffy, and the coffee was strong. Will definitely be back!",
    "This hidden gem exceeded all my expectations. The menu featured a variety of unique dishes, and I opted for the roasted vegetable tart, which was bursting with flavor. The owner personally checked in on us, adding a lovely touch to the experience. The dessert selection was divine, especially the chocolate lava cake. A must-visit for anyone in the area!",
    "Terrible experience! The food was cold, and the service was slow. Not worth the price.",
    "I had high hopes for this restaurant, but it was a total letdown. We arrived to find a long wait despite having a reservation. Once seated, our server was inattentive and seemed overwhelmed. The food took forever to arrive, and when it did, my chicken was dry and overcooked. To top it off, the prices were exorbitant for such poor quality. I won't be returning.",
    "Disappointing meal. The pasta was overcooked, and the sauce lacked flavor. Not impressed.",
    "I was really looking forward to dining here, but it was a huge disappointment. The restaurant was noisy and chaotic, making it hard to enjoy our meal. My burger was undercooked, and my friend's salad was wilted and unappetizing. The staff seemed overwhelmed and uninterested in providing good service. Overall, a frustrating experience that I wouldn't recommend.",
    "Great atmosphere and drinks, but the food was just average. I expected more for the price.",
    "This restaurant has potential, but it fell short in some areas. The location is fantastic, with a beautiful view of the city skyline. The cocktails were creative and delicious, but the food was hit or miss. My appetizer, the calamari, was excellent, but my main dish, the risotto, was undercooked and bland. The service was friendly but slow. I’d give it another try, hoping for improvements."
]

# Score each sample review and print it next to its predicted rating.
# map() is lazy, so every prediction is still computed right before its line
# is printed.
for rev, score in zip(reviews, map(predict_score, reviews)):
    print(f"review: {rev}, score: {score:.2f}")


UnboundLocalError: local variable 'inputs' referenced before assignment