In [1]:
import numpy as np
import pandas as pd
import os
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

In [3]:
# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')
from nltk.corpus import stopwords

# Read the airline reviews CSV into a DataFrame
file_path = "/content/airlines_reviews.csv"
df = pd.read_csv(file_path)

# Retain only the review text and its label, discarding rows missing either
reviews = df[['Reviews', 'Recommended']].dropna()

# Encode the label as an integer: 'yes' (any casing) -> 1, anything else -> 0
reviews['Recommended'] = reviews['Recommended'].map(
    lambda answer: int(answer.lower() == 'yes')
)

# Data Preprocessing
# Cache so the NLTK stop-word list is read and turned into a set only once,
# rather than on every call (i.e. once per review).
_STOP_WORDS_CACHE = {}

def clean_text(text, stop_words=None):
    """Normalise a raw review into lowercase, space-separated tokens.

    Steps, in order: strip HTML tags, replace every character that is not an
    ASCII letter or digit with a space, lowercase, split on whitespace, and
    drop stop words.

    Args:
        text: Raw review string.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used (loaded on first
            use and cached for later calls).

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    if stop_words is None:
        if 'english' not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE['english']

    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)  # Remove special characters
    tokens = text.lower().split()  # Lowercase, then tokenise on whitespace
    return " ".join(word for word in tokens if word not in stop_words)

# Normalise every review and store the result next to the raw text
reviews['cleaned_text'] = reviews['Reviews'].map(clean_text)

# Hold out 20% of the rows for testing; stratify on the label so both
# partitions keep the same class proportions, and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    reviews['cleaned_text'],
    reviews['Recommended'],
    test_size=0.2,
    random_state=42,
    stratify=reviews['Recommended'],
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# TF-IDF features: fit the vocabulary (capped at 5000 terms) on the training
# reviews only, then reuse that fitted vocabulary to transform the test reviews
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [5]:
# Baseline classifier: logistic regression with default settings on the TF-IDF features
log_reg = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred_log_reg = log_reg.predict(X_test_tfidf)

# Report hold-out accuracy followed by the per-class metrics
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy:", log_reg_accuracy)
print("Classification Report:", classification_report(y_test, y_pred_log_reg), sep="\n")

Logistic Regression Accuracy: 0.9006172839506172
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89       763
           1       0.90      0.91      0.91       857

    accuracy                           0.90      1620
   macro avg       0.90      0.90      0.90      1620
weighted avg       0.90      0.90      0.90      1620



Analysis of Logistic Regression Sentiment Classification Results

The results of the Logistic Regression model on the airline reviews dataset indicate a strong performance, achieving an accuracy of 90.06%. This means that out of 1,620 test samples, 1,459 were correctly classified. The classification report provides further insights into the model’s performance across different evaluation metrics: precision, recall, and F1-score.

For class 0 (Not Recommended), the model achieved a precision of 0.90, meaning that when the model predicted a review as "Not Recommended," it was correct 90% of the time. The recall score of 0.89 indicates that the model correctly identified 89% of all actual "Not Recommended" reviews. The F1-score of 0.89 shows a balanced performance between precision and recall.

For class 1 (Recommended), the model demonstrated a precision of 0.90 and a recall of 0.91, meaning that 91% of actual "Recommended" reviews were correctly identified. The F1-score of 0.91 confirms the model's strong ability to classify positive reviews effectively.

The macro average scores (0.90 for precision, recall, and F1-score) indicate that the model performs consistently across both classes. Similarly, the weighted average scores (also 0.90) suggest that the model maintains a balanced classification performance, considering the distribution of both positive and negative reviews.

In [6]:
# BERT Model Implementation
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Args:
        texts: Iterable of review strings (list, pandas Series, ...).
        labels: Iterable of integer class labels, one per text.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=max_len, return_tensors='pt')`` whose result exposes
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            batch dimension of 1.
        max_len: Length passed to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise as lists so __getitem__ always indexes by position.
        # A pandas Series with a shuffled index would otherwise be looked up
        # by label and raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize review ``item`` and return its tensors plus the label.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch
            dimension removed) and ``'labels'`` as a ``torch.long`` scalar.
        """
        encoding = self.tokenizer(
            self.texts[item],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' yields a leading batch dim of 1; drop it so
            # the DataLoader can stack items into (batch, ...) tensors.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[item], dtype=torch.long)
        }

# Sequence length handed to the datasets below as their max_len
max_len = 128

# Tokenizer that ships with the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Seed PyTorch's RNGs so the classifier-head initialisation, batch shuffling
# and dropout are repeatable across runs (same value as the split's
# random_state). Some GPU kernels may still be non-deterministic.
torch.manual_seed(42)

# Create DataLoader for BERT
# NOTE(review): X_train / X_test hold the stop-word-stripped `cleaned_text`,
# so BERT never sees the raw reviews — confirm this is intended.
train_dataset = ReviewDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_len)
test_dataset = ReviewDataset(X_test.tolist(), y_test.tolist(), tokenizer, max_len)

# Shuffle only the training batches; keep the test order fixed for evaluation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load Pre-trained BERT Model with a fresh 2-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training Parameters
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
criterion = nn.CrossEntropyLoss()

from tqdm import tqdm  # Import tqdm for progress bar

# Training Loop with tqdm
epochs = 3
for epoch in range(epochs):
    model.train()  # Enable training-mode behaviour (e.g. dropout)
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")  # Add progress bar

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Standard step: clear old gradients, forward, loss, backward, update
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())  # Update progress bar with loss

    # Mean per-batch loss over the epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 405/405 [02:23<00:00,  2.83it/s, loss=0.278]


Epoch 1, Loss: 0.3092631365965914


Epoch 2: 100%|██████████| 405/405 [02:26<00:00,  2.76it/s, loss=0.166]


Epoch 2, Loss: 0.19631498755864155


Epoch 3: 100%|██████████| 405/405 [02:26<00:00,  2.76it/s, loss=0.00691]

Epoch 3, Loss: 0.11886339161522043





In [8]:
# Evaluation
def evaluate_model(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and score its predictions.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function does not depend on a notebook-level ``device`` global.

    Returns:
        Tuple ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()  # Switch off training-only behaviour such as dropout
    predictions, true_labels = [], []
    with torch.no_grad():  # No gradients needed for inference
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit per row
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            # Labels are only compared on the host, so skip the device round trip
            true_labels.extend(batch['labels'].cpu().numpy())
    return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions)

# Score the fine-tuned BERT model on the held-out test batches
bert_accuracy, bert_report = evaluate_model(model, test_loader)
print("BERT Accuracy:", bert_accuracy)
print("BERT Classification Report:", bert_report, sep="\n")

BERT Accuracy: 0.8864197530864197
BERT Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       763
           1       0.93      0.85      0.89       857

    accuracy                           0.89      1620
   macro avg       0.89      0.89      0.89      1620
weighted avg       0.89      0.89      0.89      1620



Analysis of BERT Sentiment Classification Results

The BERT-based sentiment analysis model achieved an accuracy of 88.64%, meaning it correctly classified 1,436 out of 1,620 test samples. The classification report provides a detailed breakdown of precision, recall, and F1-score across different classes.

For class 0 (Not Recommended), the model attained a precision of 0.85, indicating that when it predicted a review as "Not Recommended," it was correct 85% of the time. The recall of 0.93 suggests that 93% of all actual "Not Recommended" reviews were correctly classified. The F1-score of 0.89 reflects a balanced trade-off between precision and recall.

For class 1 (Recommended), the model demonstrated a precision of 0.93, meaning that 93% of predicted "Recommended" reviews were correct. The recall of 0.85 shows that 85% of actual "Recommended" reviews were correctly identified, with an F1-score of 0.89, similar to class 0.

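The macro average (0.89) and weighted average (0.89) match, and both classes reach the same F1-score (0.89), so overall performance is consistent across both classes. The recall gap (0.93 for class 0 vs. 0.85 for class 1) does show, however, that the model leans toward predicting "Not Recommended."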

Comparison to Logistic Regression

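Compared to the Logistic Regression model (90.06% accuracy, 0.90 macro F1-score), BERT achieves slightly lower accuracy (88.64%) and F1-score (0.89). However, BERT shows a higher recall for class 0 (0.93 vs. 0.89) and higher precision for class 1 (0.93 vs. 0.90), suggesting that it captures more subtle relationships in text data but might still require further tuning (for example, more epochs or a different learning rate) for optimal performance. Note that BERT was fine-tuned on the same cleaned text as the TF-IDF model (lowercased, punctuation and stop words removed), which may limit its advantage over the simpler baseline.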