<a href="https://colab.research.google.com/github/samcast1/Short-Term-Investments-Model/blob/main/notebooks/4.1_sc_sentiment_analysis_refined.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Objective: Create a model, based on a pre-trained BERT model, that is suited to detecting sentiment in Airbnb reviews.**

**Plan: Train and evaluate the model on every review gathered from the web scrape - approximately 890,000 reviews in total.**

Use ratings as labels and review text as predictors.

Colab offers GPUs, but I may need something more substantial - potentially a high-performing Google Cloud VM.

This is the script that should get me off the ground. The primary change will be to first concatenate the reviews from every city into a single DataFrame prior to the train/test split.

In [None]:
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U
!pip install tensorboard

import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertConfig, TrainerCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# City slug used to locate the scraped-and-cleaned review file.
city = 'birmingham-al'
# Must be an f-string: without the prefix the literal text "{city}" would be
# used as the file name and the read below would look for the wrong file.
file_path = f'{city}_reviews_clean.csv'
reviews_df = pd.read_csv(file_path)

# Shuffle the rows and rebuild a clean 0..n-1 index. The shuffle is seeded so
# that the later train/test split (which uses random_state=42) is reproducible
# from run to run; an unseeded shuffle would change the split every time.
reviews_df = reviews_df.sample(frac=1, random_state=42).reset_index(drop=True)

def map_star_to_label(star_rating):
    """Shift a star rating down by one to obtain a class label.

    Args:
        star_rating: Numeric rating value (presumably 1-5 stars; the
            notebook configures five output classes).

    Returns:
        ``star_rating - 1``.
    """
    zero_based_label = star_rating - 1
    return zero_based_label

# Derive the class label column from the star rating of each review.
reviews_df['label'] = reviews_df['rating'].apply(map_star_to_label)

# Review text is the model input; the derived label is the target.
review_texts = reviews_df['review_text']
review_labels = reviews_df['label']

# 70/30 split, stratified on the label so both splits keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.3,
    random_state=42,
    stratify=review_labels,
)

# Sanity check: print the size of every split.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(f"Maximum token length: {tokenizer.model_max_length}")


def _encode_reviews(texts):
    """Tokenize a collection of review strings with truncation and padding."""
    return tokenizer(list(texts), truncation=True, padding=True)


train_encodings = _encode_reviews(X_train)
test_encodings = _encode_reviews(X_test)

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable
            collection (e.g. the output of a tokenizer call).
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``labels``
        tensor built from the matching label.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits and their labels as torch datasets.
train_dataset = SentimentDataset(train_encodings, y_train.tolist())
test_dataset = SentimentDataset(test_encodings, y_test.tolist())


# Start from the stock bert-base-uncased configuration and set up a
# five-class classification head.
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_hidden_layers = 12
config.num_labels = 5

# Inverse-frequency class weights (total rows / rows in class), computed over
# the whole DataFrame, i.e. train and test rows together.
total_samples = len(reviews_df)
class_counts = reviews_df['label'].value_counts()
class_weights = {
    label: total_samples / class_counts[label]
    for label in class_counts.index
}

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Order the weights by class id 0..4 so position i weights class i.
ordered_weights = [class_weights[class_id] for class_id in range(5)]
class_weights_tensor = torch.tensor(ordered_weights, dtype=torch.float).to(device)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

def forward(input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
    """Class-weighted forward pass that replaces ``model.forward``.

    Runs the BERT encoder of the module-level ``model``, feeds the pooled
    output through the classification head, and, when labels are given,
    computes a cross-entropy loss weighted by ``class_weights_tensor``.

    Args:
        input_ids: Token id tensor for the batch.
        attention_mask: Optional attention mask tensor.
        token_type_ids: Optional segment id tensor.
        labels: Optional tensor of class labels.

    Returns:
        ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits``.
    """

    def _on_device(tensor):
        # The signature allows None for every argument, so only move real
        # tensors; None is passed through for the encoder to handle.
        return tensor if tensor is None else tensor.to(device)

    outputs = model.bert(
        input_ids=_on_device(input_ids),
        attention_mask=_on_device(attention_mask),
        token_type_ids=_on_device(token_type_ids),
    )
    # Index 1 of the encoder output is the pooled representation. Apply the
    # head's dropout before the classifier, as the stock
    # BertForSequenceClassification forward does; it is a no-op in eval mode.
    pooled_output = model.dropout(outputs[1])
    logits = model.classifier(pooled_output)

    if labels is None:
        return logits

    # Weighted loss so that rarer classes contribute more per sample.
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
    loss = loss_fct(logits.view(-1, model.num_labels), labels.view(-1).to(device))
    return (loss, logits)


# Override the instance's forward so the Trainer uses the weighted loss above.
model.forward = forward

# Hyper-parameters and bookkeeping options for training run 1, collected in
# one mapping and handed to TrainingArguments in a single call.
run1_settings = dict(
    # Where checkpoints and logs are written.
    output_dir='./results/run1',
    logging_dir='./logs/run1',
    report_to='tensorboard',
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.005,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args_run1 = TrainingArguments(**run1_settings)

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: Prediction object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) is not reported.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['precision'] = macro_precision
    metrics['recall'] = macro_recall
    metrics['f1'] = macro_f1
    return metrics

# Assemble the Trainer for run 1 from the model, the run-1 training
# arguments, both datasets and the metric function.
trainer_run1 = Trainer(
    model=model,
    args=training_args_run1,
    train_dataset=train_dataset,
    # NOTE(review): the test split doubles as the evaluation set, and the
    # training arguments enable load_best_model_at_end, so checkpoint
    # selection sees the test data - consider a separate validation split.
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Run training; the value returned by train() is kept in `output`.
output = trainer_run1.train()
# Evaluate on the eval_dataset configured above and keep the metrics.
eval_result = trainer_run1.evaluate()

%load_ext tensorboard
%tensorboard --logdir=./logs/run1