### Sentiment Analysis for Reviews from 20 Apps

In [1]:
from encodings import search_function
import os

# import packages
import pandas as pd
import numpy as np
import re
import emoji

In [2]:
# read data
data_path = "data/all_combined.csv"
odf = pd.read_csv(data_path, encoding='utf-8', on_bad_lines="skip")

#### Clean Data and Establish new tag
1. To classify the reviews into 3 groups: negative, neutral and positive, I use a new mapping rule to create a new field called `flag`
2. Create a new flag to indicate whether the content contains emojis. It's unknown that whether the existence of emojis will affect the performance of the model
3. Drop some non-English reviews
4. Drop blank data

In [3]:
# create new field to re-classify the score
score_to_flag = {1:-1, 2:-1, 3:0, 4:1, 5:1}
odf['flag'] = odf['score'].map(score_to_flag)

In [4]:
# to show if the content includes emoji
def contains_emoji(text):
    if not isinstance(text, str):
        return False
    return any(char in emoji.EMOJI_DATA for char in text)

odf["has_emoji"] = odf["content"].apply(contains_emoji)

In [5]:
# filter whose score is not in 1-5
new_df = odf[odf["score"].between(1, 5, inclusive="both")]

# strip out of non-ascii char or non-emoji part in the content
def clean_review_content(text):
    """
    Clean review content by:
    1. Preserving emojis
    2. Removing non-ASCII characters except emojis
    3. Stripping extra whitespaces
    
    :param text (str): Input review text
    :param str: Cleaned review text
    """
    
    if pd.isna(text) or not isinstance(text, str):
        return ''
    try:
        # Extract emojis and save them
        emojis = ''.join(c for c in text if c in emoji.EMOJI_DATA)
        
        # Remove non-ASCII characters, keeping emojis
        cleaned_text = ''.join(c for c in text if (ord(c) < 128) or (c in emoji.EMOJI_DATA))
        
        # Remove multiple whitespaces and trim
        cleaned_text = ' '.join(cleaned_text.split())
        
        # Combine cleaned text with preserved emojis
        return (cleaned_text + ' ' + emojis).strip()
    
    except Exception as e:
        print(f"Error processing text: {text}. Error: {e}")
        return ''

new_df["cleaned_review"] = new_df["content"].apply(clean_review_content)


In [6]:
# drop NULL data
new_df = new_df.dropna(subset=["cleaned_review", "score", "content"])
new_df = new_df[new_df["cleaned_review"] != ""]
print(f"The original data size: {odf.shape}")
print(f"The cleaned data size: {new_df.shape}")

The original data size: (200000, 6)
The cleaned data size: (196688, 7)


In [7]:
new_df.head(20)

Unnamed: 0,reviewId,content,score,app,flag,has_emoji,cleaned_review
0,e2996bb1-cdf1-4f76-a4e5-88d47b3b8d5e,Oop,5,Facebook,1,False,Oop
1,d32653e0-f81f-43d5-8d61-e0d5ce419eb0,Facebook is a nice app,5,Facebook,1,False,Facebook is a nice app
2,6a4cd47e-44ee-4bfd-b1f7-f0c044e8419e,best,5,Facebook,1,False,best
3,d8e578db-d679-4fd6-ab11-cef028134049,Open Facebook update,5,Facebook,1,False,Open Facebook update
4,f6b1b7d5-5028-42c8-bb30-b57c6bf1a02b,Facebook bhoot aacha chize hai aap sabhi log b...,5,Facebook,1,True,Facebook bhoot aacha chize hai aap sabhi log b...
5,3130b4cf-53b7-4823-8669-a44e52e3a3d5,Nice 💯👍🫦,5,Facebook,1,True,Nice 💯👍🫦 💯👍🫦
6,9b870115-9d74-4d97-bcc3-37ae9dd0a1a2,Too many annoying useless notifications.,1,Facebook,-1,False,Too many annoying useless notifications.
7,0f45902f-ed8b-4a8a-a0d5-dda4e06d30b2,Soferrr gandaaa Siya sistaa!!!,5,Facebook,1,False,Soferrr gandaaa Siya sistaa!!!
8,88e523fa-e2f4-4ad1-a40b-4315dcf56345,Best 👌 👍,4,Facebook,1,True,Best 👌 👍 👌👍
9,3aa36269-df4a-4eb3-919a-c92d45d10af5,nice,5,Facebook,1,False,nice


#### Modelling

In [7]:
# import necessary packages
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix, 
    classification_report
)
from typing import List, Tuple
from tqdm import tqdm
# natural language tool kits
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [8]:
# download necessary NLTK resources
# stopwords: words such as 
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

## set random seed
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [9]:
class ReviewDataset(Dataset):
    
    """
    Initialize the dataset with reviews, features and labels.
    
    
    :param texts: List/array of review texts
    :param app_features: List/array of feature vectors for each app
    :param lables: List/array of corresponding labels/ratings

    """
    def __init__(self, texts, app_features, lables):
        self.texts = texts
        self.app_features = app_features
        self.lables = lables
    
    """
    Return the total number of samples in the dataset.
    Required by PyTorch Dataset class.
    
    :returns int: Number of reviews/samples in the dataset
    
    """
    def __len__(self):
        return len(self.texts)
    
    """
    Fetch a single sample from the dataset at the specified index.
    Required by PyTorch Dataset class.
    
    :param idx: Index of the sample to retrieve
    :returns tuple: (review_text, app_features, label) for the specified index
    """
    def __getitem__(self, idx):
        return self.texts[idx], self.app_features[idx], self.lables[idx]

In [11]:
# create LSTM model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim, n_layers, dropout, app_feature_dim):
        """
        Initialize LSTM Model for Sentiment Classification with App Features
        
        :param vocab_size (int): Total number of unique words in vocabulary
        :param embedding_dim (int): Size of word embedding vectors
        :param output_dim (int): Number of output classes (e.g., 2 for binary sentiment)
        :param hidden_dim (int): Number of features in LSTM hidden state
        :param n_layers (int): Number of stacked LSTM layers
        :param dropout (float): Dropout rate for regularization
        :param app_feature_dim (int): Dimension of additional app-specific features
        """
        super().__init__()
        
        self.vocab_size = vocab_size
        
        # Create embedding layer to convert word indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer processes sequence of word embeddings
        # batch_first=True means input shape is (batch_size, sequence_length, features)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=n_layers,
                            dropout=dropout if n_layers > 1 else 0,
                            batch_first=True)
        
        # First fully connected layer combines LSTM output with app features
        self.fc1 = nn.Linear(hidden_dim + app_feature_dim, 128)
        
        # Output layer produces final classification
        self.fc2 = nn.Linear(128, output_dim)
        
        # Dropout layer for regularization to prevent overfitting
        self.dropout = nn.Dropout(dropout)
        
        # ReLU activation function adds non-linearity
        self.relu = nn.ReLU()
    
    def forward(self, text, app_features):
        """
        Forward pass of the model
        
        
        :param text (torch.Tensor): Input tensor of word indices, shape (batch_size, sequence_length)
        :param app_features (torch.Tensor): Additional app-specific features, shape (batch_size, app_feature_dim)
        
        
        :returns torch.Tensor: Model predictions, shape (batch_size, output_dim)
        """
        # Convert word indices to embeddings and apply dropout
        embedded = self.dropout(self.embedding(text)) # Shape: (batch_size, seq_len, embedding_dim)
        
        # Process sequence through LSTM
        output, (hidden, cell) = self.lstm(embedded)
        
        # Use final hidden state from last LSTM layer
        hidden = self.dropout(hidden[-1, :, :])
        # Concatenate hidden state with app features
        combined = torch.cat([hidden, app_features], dim=1) # Shape: (batch_size, hidden_dim)
        
        # Pass through fully connected layers with dropout and ReLU
        x = self.relu(self.fc1(combined))
        x = self.dropout(x)
        
        # Return final predictions
        return self.fc2(x)

In [71]:
class SentimentAnalyzer:
    
    """
    A comprehensive sentiment analysis pipeline that processes app reviews
    Combines text processing, LSTM modeling, and app-specific features
    
    Key Features:
    - Text preprocessing with emoji preservation
    - Custom vocabulary building
    - LSTM-based sentiment classification
    - Integration of app-specific features
    - Built-in training and evaluation pipeline
    
    :param df: DataFrame containing review data
    :param content_col: Name of column containing review text
    :param label_col: Name of column containing sentiment labels
    :param app_col: Name of column containing app identifiers
    :param test_size: Proportion of data to use for testing (0-1)
    :param random_state: Random seed for reproducible results
    """
    def __init__(self, df: pd.DataFrame, content_col: str = "cleaned_review",
                 label_col: str = "flag", app_col: str = "app", 
                 test_size: float = 0.2, random_state: int = 42):
        
        # Create a copy of input data to avoid modifications to original
        self.df = df.copy()
        
        # Preprocess all review texts
        self.df['processed_text'] = self.df[content_col].apply(self.preprocess_text)
        
        # Build vocabulary from processed texts
        self.vocab = self.build_vocab(self.df['processed_text'])
        
        # Create word-to-index mapping with special tokens
        self.word_to_idx = {word: i + 1 for i, word in enumerate(self.vocab)}
        self.word_to_idx['<PAD>'] = 0  
        self.word_to_idx['<UNK>'] = len(self.word_to_idx)
        
        # Convert texts to numerical sequences
        self.encoded_texts = self.encode_texts(self.df['processed_text'])
        
        # Encode sentiment labels
        label_encoder = LabelEncoder()
        self.labels = torch.tensor(label_encoder.fit_transform(self.df[label_col]))
        
        # One-hot encode app features
        app_encoder = OneHotEncoder(sparse_output=False)
        self.app_features = torch.tensor(
            app_encoder.fit_transform(self.df[app_col].values.reshape(-1, 1)),
            dtype=torch.float32
        )
        
        # Split data into training and test sets
        self.X_train, self.X_test, self.app_train, self.app_test, self.y_train, self.y_test = train_test_split(
            self.encoded_texts,
            self.app_features,
            self.labels,
            test_size=test_size,
            random_state=random_state,
            stratify=self.labels
        )
        
        # Create PyTorch datasets
        self.train_dataset = ReviewDataset(self.X_train, self.app_train, self.y_train)
        self.test_dataset = ReviewDataset(self.X_test, self.app_test, self.y_test)
        
        # Define model hyperparameters
        self.model_params = {
            'vocab_size': len(self.word_to_idx),
            'embedding_dim': 100,
            'hidden_dim': 128,
            'output_dim': len(np.unique(self.labels)),
            'n_layers': 2,
            'dropout': 0.5,
            'app_feature_dim': self.app_features.shape[1]
        }
        
       # Initialize model, loss function, and optimizer
        self.model = SentimentLSTM(**self.model_params)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters())
    
        
    def preprocess_text(self, text: str) -> str:
        """
        Preprocess text while preserving emojis
        
        Steps:
        1. Convert to lowercase
        2. Tokenize while keeping emojis intact
        3. Remove stopwords (except emojis)
        4. Join tokens back into text
        """
        # Convert to lowercase
        text = text.lower()
        
        def tokenize_with_emojis(text):
            """
            Custom tokenizer that preserves emojis as distinct tokens
            Handles the text character by character to properly separate
            emojis from regular words
            """
            tokens = []
            current_token = []
            
            for char in text:
                if emoji.is_emoji(char):
                    # If there's a current token, save it
                    if current_token:
                        tokens.append(''.join(current_token))
                        current_token = []
                    # Add emoji as a separate token
                    tokens.append(char)
                elif char.isalnum():
                    current_token.append(char)
                else:
                    # If there's a current token, save it
                    if current_token:
                        tokens.append(''.join(current_token))
                        current_token = []
            
            # Handle any remaining token
            if current_token:
                tokens.append(''.join(current_token))
            
            return tokens
        
        # Tokenize and remove stopwords (preserve emojis)
        tokens = tokenize_with_emojis(text)  
        stop_words = set(stopwords.words('english')) 
        tokens = [token for token in tokens if token not in stop_words or emoji.is_emoji(token)]
        return ' '.join(tokens) 
    
    def build_vocab(self, texts: pd.Series, max_vocab_size: int = 10000) -> List[str]:
        """
        Build vocabulary from texts, including emojis
        
        
        :param texts (pd.Series): Series of processed texts
        :param max_vocab_size (int): Maximum vocabulary size
        
        
        :returns List[str]: Vocabulary words and emojis
        """
        # Count word frequencies
        word_freq = {}
        for text in texts:
            for token in text.split():
                word_freq[token] = word_freq.get(token, 0) + 1
        
        # Sort by frequency and limit vocabulary
        vocab = sorted(word_freq, key=word_freq.get, reverse=True)[:max_vocab_size]
        return vocab
    
    def encode_texts(self, texts: pd.Series, max_length: int = 100) -> torch.Tensor:
        """
        Encode texts to tensor, handling emojis
        
        :param texts (pd.Series): Series of processed texts
        :param max_length (int): Maximum sequence length
        
        :returns torch.Tensor: Encoded texts
        """
        encoded_texts = []
        for text in texts:
            # Convert tokens to indices
            indices = [
                self.word_to_idx.get(token, self.word_to_idx['<UNK>']) 
                for token in text.split()
            ]
            
            # Pad or truncate to fixed length
            if len(indices) > max_length:
                indices = indices[:max_length]
            else:
                indices = indices + [self.word_to_idx['<PAD>']] * (max_length - len(indices))
            
            encoded_texts.append(indices)
        
        return torch.tensor(encoded_texts)
    
    def train(self, epochs: int = 10, batch_size: int = 32):
        """
        Train the LSTM model
        
        
        :param epochs (int): Number of training epochs
        :param batch_size (int): Batch size for training
        """
        train_loader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(self.test_dataset, batch_size=batch_size, shuffle=False)
        
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            train_total_loss = 0
            
            for texts, app_features, labels in train_loader:
                # Clear gradients
                self.optimizer.zero_grad()
                
                # Forward pass
                outputs = self.model(texts, app_features)
                loss = self.criterion(outputs, labels)
                
                # Backward pass and optimization
                loss.backward()
                self.optimizer.step()
                
                train_total_loss += loss.item()
            
            # Calculate average training loss
            avg_train_loss = train_total_loss / len(train_loader)
            
            # Validation phase
            self.model.eval()
            test_total_loss = 0
            
            with torch.no_grad():
                for texts, app_features, labels in test_loader:
                    outputs = self.model(texts, app_features)
                    loss = self.criterion(outputs, labels)
                    test_total_loss += loss.item()
            
            # Calculate average test loss
            avg_test_loss = test_total_loss / len(test_loader)
            
            # Print both losses
            print(f'Epoch {epoch+1}/{epochs}:')
            print(f'Training Loss: {avg_train_loss:.4f}')
            print(f'Test Loss: {avg_test_loss:.4f}\n')
            
    def evaluate(self) -> dict:
        """
        Evaluate model performance
        
       
        :returns dict: Performance metrics
        """
        self.model.eval()
        test_loader = DataLoader(self.test_dataset, batch_size=32, shuffle=False)
        
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for texts, app_features, labels in test_loader:
                outputs = self.model(texts, app_features)
                _, preds = torch.max(outputs, 1)
                
                all_preds.extend(preds.numpy())
                all_labels.extend(labels.numpy())
        
        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(all_labels, all_preds),
            'precision': precision_score(all_labels, all_preds, average='weighted'),
            'recall': recall_score(all_labels, all_preds, average='weighted'),
            'f1_score': f1_score(all_labels, all_preds, average='weighted'),
            'confusion_matrix': confusion_matrix(all_labels, all_preds),
            'classification_report': classification_report(all_labels, all_preds)
        }
        
        return metrics
    
    
            

In [72]:
def main():
        set_seed(507)
        # Initialize Sentiment Analyzer
        analyzer = SentimentAnalyzer(new_df)
        
        # Train the model
        analyzer.train(epochs=10)
        
        # Evaluate the model
        metrics = analyzer.evaluate()
        
        # Print metrics
        print("Model Performance Metrics:")
        for key, value in metrics.items():
            print(f"{key}:\n{value}\n")

if __name__ == '__main__':
    main()

Epoch 1/10:
Training Loss: 0.7297
Test Loss: 0.7190

Epoch 2/10:
Training Loss: 0.7216
Test Loss: 0.7185

Epoch 3/10:
Training Loss: 0.7208
Test Loss: 0.7184

Epoch 4/10:
Training Loss: 0.7201
Test Loss: 0.7178

Epoch 5/10:
Training Loss: 0.7197
Test Loss: 0.7183

Epoch 6/10:
Training Loss: 0.6469
Test Loss: 0.5410

Epoch 7/10:
Training Loss: 0.5349
Test Loss: 0.5071

Epoch 8/10:
Training Loss: 0.5115
Test Loss: 0.4948

Epoch 9/10:
Training Loss: 0.4992
Test Loss: 0.4892

Epoch 10/10:
Training Loss: 0.4900
Test Loss: 0.4851

Model Performance Metrics:
accuracy:
0.8352483603640246

precision:
0.7878294310216243

recall:
0.8352483603640246

f1_score:
0.8099420265903265

confusion_matrix:
[[ 7314     0  2867]
 [  752     0  1355]
 [ 1507     0 25543]]

classification_report:
              precision    recall  f1-score   support

           0       0.76      0.72      0.74     10181
           1       0.00      0.00      0.00      2107
           2       0.86      0.94      0.90     27050


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [73]:
# increase epochs
def main():
        set_seed(507)
        # Initialize Sentiment Analyzer
        analyzer = SentimentAnalyzer(new_df)
        
        # Train the model
        analyzer.train(epochs=30)
        
        # Evaluate the model
        metrics = analyzer.evaluate()
        
        # Print metrics
        print("Model Performance Metrics:")
        for key, value in metrics.items():
            print(f"{key}:\n{value}\n")

if __name__ == '__main__':
    main()

Epoch 1/30:
Training Loss: 0.7297
Test Loss: 0.7190

Epoch 2/30:
Training Loss: 0.7216
Test Loss: 0.7185

Epoch 3/30:
Training Loss: 0.7208
Test Loss: 0.7184

Epoch 4/30:
Training Loss: 0.7201
Test Loss: 0.7178

Epoch 5/30:
Training Loss: 0.7197
Test Loss: 0.7183

Epoch 6/30:
Training Loss: 0.6469
Test Loss: 0.5410

Epoch 7/30:
Training Loss: 0.5349
Test Loss: 0.5071

Epoch 8/30:
Training Loss: 0.5115
Test Loss: 0.4948

Epoch 9/30:
Training Loss: 0.4992
Test Loss: 0.4892

Epoch 10/30:
Training Loss: 0.4900
Test Loss: 0.4851

Epoch 11/30:
Training Loss: 0.4838
Test Loss: 0.4893

Epoch 12/30:
Training Loss: 0.4800
Test Loss: 0.4833

Epoch 13/30:
Training Loss: 0.4758
Test Loss: 0.4875

Epoch 14/30:
Training Loss: 0.4713
Test Loss: 0.4840

Epoch 15/30:
Training Loss: 0.4689
Test Loss: 0.4911

Epoch 16/30:
Training Loss: 0.4666
Test Loss: 0.4861

Epoch 17/30:
Training Loss: 0.4641
Test Loss: 0.4848

Epoch 18/30:
Training Loss: 0.4623
Test Loss: 0.4887

Epoch 19/30:
Training Loss: 0.4599
Te

#### Modified LSTM Model


In [10]:
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt


In [11]:
def check_gpu_status():
   if torch.backends.mps.is_available() and torch.backends.mps.is_built():
        print("MPS (Apple Silicon GPU) is available")
        print("Using Apple Metal Performance Shaders (MPS) for GPU acceleration")
        print(f"PyTorch version: {torch.__version__}")
   else:
        print("MPS device not found, using CPU instead")

In [12]:
class FocalLoss(nn.Module):
    """
    Implementation of Focal Loss for handling class imbalance in classification tasks.
    
    Focal Loss modifies standard cross entropy by reducing the loss contribution from easy examples
    and increasing the importance of hard-to-classify examples.
    """
    def __init__(self, alpha, gamma=2):
        """
        :prarm alpha: A tensor of weights for each class to handle class imbalance
        :param gamma: Focusing parameter that adjusts how much to down-weight easy examples (default: 2)
        """
        super().__init__()
        self.gamma = gamma
        self.register_buffer('alpha', alpha)  # 类别权重
        
    def forward(self, inputs, targets):
        """
        Calculate the focal loss.
        
        :param inputs: Model predictions (logits)
        :param targets: Ground truth labels
        
        :returns focal_loss: Computed focal loss value
        """
        # Calculate standard cross entropy loss for each sample
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        
        # Calculate probability of correct class (pt)
        # pt closer to 1 means the model is more confident in its prediction
        pt = torch.exp(-ce_loss)
        
        # Calculate focal loss:
        # - For easy examples (pt close to 1), (1-pt)**gamma reduces their contribution
        # - For hard examples (pt close to 0), (1-pt)**gamma keeps their contribution high
        focal_loss = ((1-pt)**self.gamma * ce_loss).mean()
        return focal_loss

In [13]:
## LSTMmodelNew
class SentimentLSTM_new(nn.Module):
    def __init__(self, 
                 vocab_size, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 dropout, 
                 app_feature_dim,
                 bidirectional=True):  # Whether to use bidirectional LSTM
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            num_layers=n_layers, 
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True,
            bidirectional=bidirectional 
        )
        
        # Attention layer to focus on important parts of the sequence
        self.attention = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1)
        
        # Calculate LSTM output dimension based on directionality
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.feature_fusion = nn.Sequential(
            nn.Linear(lstm_output_dim + app_feature_dim, hidden_dim), # Combine features
            nn.ReLU(), # Non-linear activation
            nn.Dropout(dropout), # Regularization
            nn.BatchNorm1d(hidden_dim), # Normalize activations
            nn.Linear(hidden_dim, hidden_dim // 2), # Reduce dimension
            nn.ReLU(), # Non-linear activation
            nn.Dropout(dropout), # Additional regularization
            nn.BatchNorm1d(hidden_dim // 2) # Final normalization
        )
        
        # Final classification layer
        self.fc = nn.Linear(hidden_dim // 2, output_dim)
        
        # Layer normalization for LSTM output
        self.layer_norm = nn.LayerNorm(hidden_dim * 2 if bidirectional else hidden_dim)
        
    def attention_net(self, lstm_output, final_state):
        # Calculate attention weights for each time step
        attn_weights = F.softmax(
            self.attention(lstm_output).squeeze(-1), dim=1
        )
         # Apply attention weights to get context vector
        context = torch.bmm(
            attn_weights.unsqueeze(1), 
            lstm_output
        ).squeeze(1)
        return context
        
    def forward(self, text, app_features):
        # Apply word embeddings with dropout
        embedded = F.dropout(self.embedding(text), 0.3, training=self.training)
        
        # Process sequence through LSTM
        lstm_output, (hidden, cell) = self.lstm(embedded)
        
        # Apply attention mechanism
        attn_output = self.attention_net(
            lstm_output,
            hidden
        )
        # Normalize LSTM output
        normalized_output = self.layer_norm(attn_output)
        
        # Combine LSTM output with application features
        combined = torch.cat([normalized_output, app_features], dim=1)
        
        # Process through feature fusion network
        fused_features = self.feature_fusion(combined)
        
        return self.fc(fused_features)

In [19]:
class SentimentAnalyzer_new:
    def __init__(self, 
                 df: pd.DataFrame, 
                 content_col: str = 'cleaned_review', 
                 label_col: str = 'flag',
                 app_col: str = 'app',
                 test_size: float = 0.2,
                 random_state: int = 42):
        
        """
        Initialize the sentiment analyzer with data preprocessing and model setup.
        
        
        :param df: Input DataFrame containing review text and labels
        :param content_col: Column name for review text
        :param label_col: Column name for sentiment labels
        :param app_col: Column name for app/product identifiers
        :param test_size: Proportion of data to use for testing
        :param random_state: Random seed for reproducibility
        """
        # Set up GPU/CPU device for computation
        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        check_gpu_status()
        print(f"Using device: {self.device}")
        
        # Create a copy of input data and preprocess text
        self.df = df.copy()
        self.df['processed_text'] = self.df[content_col].apply(self.preprocess_text)
        
        # Create vocabulary and word indexing system
        self.vocab = self.build_vocab(self.df['processed_text'])
        self.word_to_idx = {word: i+1 for i, word in enumerate(self.vocab)}
        self.word_to_idx['<PAD>'] = 0
        self.word_to_idx['<UNK>'] = len(self.word_to_idx)
        
        # Convert text to numerical sequences
        self.encoded_texts = self.encode_texts(self.df['processed_text']).to(self.device)
        
        # Encode labels
        label_encoder = LabelEncoder()
        self.labels = torch.tensor(
            label_encoder.fit_transform(self.df[label_col]),
            device=self.device
        )
        
        # One-hot encode app/product features
        app_encoder = OneHotEncoder(sparse_output=False)
        self.app_features = torch.tensor(
            app_encoder.fit_transform(self.df[app_col].values.reshape(-1, 1)),
            dtype=torch.float32,
            device=self.device
        )
        
        # split the dataset into training set and test set on cpu
        (self.X_train, self.X_test, 
         self.app_train, self.app_test,
         self.y_train, self.y_test) = train_test_split(
            self.encoded_texts.cpu().numpy(),
            self.app_features.cpu().numpy(),
            self.labels.cpu().numpy(), 
            test_size=test_size, 
            random_state=random_state,
            stratify=self.labels.cpu().numpy()
        )
        
        # Prepare data for SMOTE oversampling
        batch_size = len(self.X_train)
        seq_length = self.X_train.shape[1]
        embedding_dim = 1
        
        X_train_reshaped = np.reshape(self.X_train, (batch_size, -1))
        X_train_combined = np.concatenate([X_train_reshaped, self.app_train], axis=1)
        
        print("Applying SMOTE oversampling...")
        smote = SMOTE(random_state=random_state)
        X_resampled, y_resampled = smote.fit_resample(X_train_combined, self.y_train)
        
        # Separate text and app features after SMOTE
        text_features_dim = seq_length * embedding_dim
        X_text_resampled = X_resampled[:, :text_features_dim]
        X_app_resampled = X_resampled[:, text_features_dim:]
        
        # move processed data to the device
        self.X_train = torch.tensor(
            X_text_resampled.reshape(-1, seq_length),
            dtype=torch.long,
            device=self.device
        )
        self.X_test = torch.tensor(self.X_test, dtype=torch.long, device=self.device)
        self.app_train = torch.tensor(X_app_resampled, dtype=torch.float32, device=self.device)
        self.app_test = torch.tensor(self.app_test, dtype=torch.float32, device=self.device)
        self.y_train = torch.tensor(y_resampled, device=self.device)
        self.y_test = torch.tensor(self.y_test, device=self.device)
        
        print(f"After SMOTE - Class distribution: {np.bincount(y_resampled)}")
        
        # Create PyTorch datasets
        self.train_dataset = ReviewDataset(self.X_train, self.app_train, self.y_train)
        self.test_dataset = ReviewDataset(self.X_test, self.app_test, self.y_test)
        
        self.train_losses = []
        self.test_losses = []
        self.learning_rates = []
        self.gradient_norms = []
        
        # initialize the model
        self.model_params = {
            'vocab_size': len(self.word_to_idx),
            'embedding_dim': 200,
            'hidden_dim': 256,
            'output_dim': len(np.unique(self.labels.cpu().numpy())),
            'n_layers': 3,
            'dropout': 0.3,
            'app_feature_dim': self.app_features.shape[1],
            'bidirectional': True
        }
        
        self.model = SentimentLSTM_new(**self.model_params).to(self.device)
        
        # set up loss function and optimizer
        class_weights = self.calculate_class_weights()
        self.criterion = FocalLoss(alpha=class_weights)
        
        self.optimizer = optim.AdamW(
            self.model.parameters(),
            lr=1e-4,
            weight_decay=0.01
        )
        
        # self.scheduler = optim.lr_scheduler.OneCycleLR(
        #     self.optimizer,
        #     max_lr=2e-4,
        #     epochs=10,
        #     steps_per_epoch=len(self.train_dataset) // 32 + 1,
        #     pct_start=0.3
        # )
        
        self.scheduler = None
    
    def preprocess_text(self, text: str) -> str:
        """
        Preprocess text by converting to lowercase, tokenizing, and removing stop words.
        Preserves emojis as separate tokens.
        """
        if pd.isna(text) or not isinstance(text, str):
            return ''
            
        text = text.lower()
        
        def tokenize_with_emojis(text):
            """Helper function to tokenize text while preserving emojis"""
            tokens = []
            current_token = []
            
            for char in text:
                if emoji.is_emoji(char):
                    if current_token:
                        tokens.append(''.join(current_token))
                        current_token = []
                    tokens.append(char)
                elif char.isalnum():
                    current_token.append(char)
                else:
                    if current_token:
                        tokens.append(''.join(current_token))
                        current_token = []
            
            if current_token:
                tokens.append(''.join(current_token))
            
            return tokens
        
        tokens = tokenize_with_emojis(text)
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words or emoji.is_emoji(token)]
        
        return ' '.join(tokens)
    
    def build_vocab(self, texts: pd.Series, max_vocab_size: int = 10000) -> List[str]:
        """
        Build vocabulary from text data, limiting to most frequent words.
        Returns list of words sorted by frequency.
        """
        word_freq = {}
        for text in texts:
            for token in text.split():
                word_freq[token] = word_freq.get(token, 0) + 1
        
        vocab = sorted(word_freq, key=word_freq.get, reverse=True)[:max_vocab_size]
        return vocab
    
    def encode_texts(self, texts: pd.Series, max_length: int = 100) -> torch.Tensor:
        """
        Convert text to numerical sequences using word indices.
        Pads or truncates sequences to specified length.
        """
        encoded_texts = []
        for text in texts:
            indices = [
                self.word_to_idx.get(token, self.word_to_idx['<UNK>']) 
                for token in text.split()
            ]
            
            if len(indices) > max_length:
                indices = indices[:max_length]
            else:
                indices = indices + [self.word_to_idx['<PAD>']] * (max_length - len(indices))
            
            encoded_texts.append(indices)
        
        return torch.tensor(encoded_texts)
    
    def calculate_class_weights(self):
        """Calculate class weights to handle class imbalance"""
        class_counts = torch.bincount(self.y_train)
        total = len(self.y_train)
        weights = total / (len(class_counts) * class_counts)
        return weights.to(self.device)
    
    def calculate_gradient_norm(self):
        """Calculate total gradient norm for monitoring"""
        total_norm = 0
        for p in self.model.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        return total_norm ** 0.5
    
    def save_checkpoint(self, epoch, batch_idx, loss):
        """Save training checkpoint with proper state dict handling"""
        checkpoint = {
            'epoch': epoch,
            'batch_idx': batch_idx,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict() if self.scheduler else None,
            'loss': loss,
            'train_losses': self.train_losses,
            'test_losses': self.test_losses,
            'learning_rates': self.learning_rates,
            'gradient_norms': self.gradient_norms
        }
        torch.save(checkpoint, 'training_checkpoint.pt')
        print(f"\nCheckpoint saved at epoch {epoch+1}, batch {batch_idx+1}")
    
    def load_checkpoint(self):
        """Load training checkpoint with proper state dict handling"""
        if os.path.exists('training_checkpoint.pt'):
            checkpoint = torch.load('training_checkpoint.pt', map_location=self.device)
            
            # Load model state
            self.model.load_state_dict(checkpoint['model_state_dict'])
            
            # Load optimizer state
            optimizer_state = checkpoint['optimizer_state_dict']
            # Move optimizer state to correct device
            for state in optimizer_state['state'].values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(self.device)
            self.optimizer.load_state_dict(optimizer_state)
            
            # Load scheduler state if it exists
            if checkpoint['scheduler_state_dict'] and self.scheduler:
                self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            
            # Load training history
            self.train_losses = checkpoint['train_losses']
            self.test_losses = checkpoint['test_losses']
            self.learning_rates = checkpoint['learning_rates']
            self.gradient_norms = checkpoint['gradient_norms']
            
            start_epoch = checkpoint['epoch']
            start_batch = checkpoint['batch_idx'] + 1
            
            print(f"Resuming from epoch {start_epoch+1}, batch {start_batch}")
            return start_epoch, start_batch
            
        return 0, 0
    
    def train(self, epochs: int = 10, batch_size: int = 8):
        """
        Train the model with checkpointing
        """
        train_loader = DataLoader(
            self.train_dataset, 
            batch_size=batch_size, 
            shuffle=True,
            pin_memory=True
        )
        
        test_loader = DataLoader(
            self.test_dataset,
            batch_size=batch_size,
            shuffle=False,
            pin_memory=True
        )
        
        # Initialize or load scheduler
        if self.scheduler is None:
            self.scheduler = optim.lr_scheduler.OneCycleLR(
                self.optimizer,
                max_lr=1e-4,
                epochs=epochs,
                steps_per_epoch=len(train_loader),
                pct_start=0.3
            )
        
        # Try to load checkpoint
        start_epoch, start_batch = self.load_checkpoint()
        
        print(f"Training starts with {epochs} epochs")
        print(f"Total training steps per epoch: {len(train_loader)}")
        
        try:
            for epoch in range(start_epoch, epochs):
                self.model.train()
                total_train_loss = 0
                epoch_gradient_norms = []
                avg_batch_loss = float('inf')
                
                pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')
                
                # Skip already processed batches if resuming
                if epoch == start_epoch and start_batch > 0:
                    for _ in range(start_batch):
                        next(iter(pbar))
                
                for batch_idx, (texts, app_features, labels) in enumerate(pbar, start=start_batch):
                    try:
                        texts = texts.to(self.device)
                        app_features = app_features.to(self.device)
                        labels = labels.to(self.device)
                        
                        self.optimizer.zero_grad()
                        
                        # Forward pass
                        outputs = self.model(texts, app_features)
                        loss = self.criterion(outputs, labels)
                        
                        # Update avg_batch_loss
                        total_train_loss += loss.item()
                        avg_batch_loss = total_train_loss / (batch_idx + 1)
                        
                        # Backward pass
                        loss.backward()
                        
                        # Gradient clipping
                        grad_norm_before = self.calculate_gradient_norm()
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
                        grad_norm_after = self.calculate_gradient_norm()
                        epoch_gradient_norms.append(grad_norm_after)
                        
                        # Gradient control
                        for param in self.model.parameters():
                            if param.grad is not None:
                                torch.clamp_(param.grad, -1, 1)
                        
                        self.optimizer.step()
                        self.scheduler.step()
                        
                        # Monitoring
                        current_lr = self.scheduler.get_last_lr()[0]
                        self.learning_rates.append(current_lr)
                        
                        pbar.set_postfix({
                            'train_loss': f'{avg_batch_loss:.4f}',
                            'lr': f'{current_lr:.6f}',
                            'grad_norm': f'{grad_norm_after:.4f}'
                        })
                        
                        # Save checkpoint every 5000 batches
                        if (batch_idx + 1) % 5000 == 0:
                            self.save_checkpoint(epoch, batch_idx, avg_batch_loss)
                        
                    except RuntimeError as e:
                        print(f"Error in batch {batch_idx}: {e}")
                        # Save checkpoint on error
                        self.save_checkpoint(epoch, max(0, batch_idx-1), avg_batch_loss)
                        raise e
                
                # Calculate epoch metrics
                avg_train_loss = total_train_loss / len(train_loader)
                self.train_losses.append(avg_train_loss)
                
                # Evaluation phase
                self.model.eval()
                total_test_loss = 0
                
                print("\nRunning evaluation...")
                with torch.no_grad():
                    for texts, app_features, labels in test_loader:
                        texts = texts.to(self.device)
                        app_features = app_features.to(self.device)
                        labels = labels.to(self.device)
                        
                        outputs = self.model(texts, app_features)
                        loss = self.criterion(outputs, labels)
                        total_test_loss += loss.item()
                
                avg_test_loss = total_test_loss / len(test_loader)
                self.test_losses.append(avg_test_loss)
                
                # Print epoch summary
                print(f'\nEpoch {epoch+1}/{epochs} Summary:')
                print(f'Training Loss: {avg_train_loss:.4f}')
                print(f'Test Loss: {avg_test_loss:.4f}')
                print(f'Learning Rate: {current_lr:.6f}')
                print(f'Average Gradient Norm: {np.mean(epoch_gradient_norms):.4f}')
                
                # Save checkpoint at end of epoch
                self.save_checkpoint(epoch, len(train_loader)-1, avg_train_loss)
                
        except Exception as e:
            print(f"Training interrupted: {str(e)}")
            print("You can resume training from the last checkpoint")
            raise e
                
            
    
    def evaluate(self):
        self.model.eval()
        test_loader = DataLoader(
            self.test_dataset, 
            batch_size=32, 
            shuffle=False,
            pin_memory=True
        )
        
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for texts, app_features, labels in test_loader:
                texts = texts.to(self.device)
                app_features = app_features.to(self.device)
                labels = labels.to(self.device)
                
                outputs = self.model(texts, app_features)
                _, preds = torch.max(outputs, 1)
                
                # move prediction result back to cpu
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        
        if self.device.type == "mps":
            torch.mps.empty_cache()
        
        metrics = {
            'accuracy': accuracy_score(all_labels, all_preds),
            'precision': precision_score(all_labels, all_preds, average='weighted'),
            'recall': recall_score(all_labels, all_preds, average='weighted'),
            'f1_score': f1_score(all_labels, all_preds, average='weighted'),
            'confusion_matrix': confusion_matrix(all_labels, all_preds),
            'classification_report': classification_report(all_labels, all_preds)
        }
        
        return metrics
    
    def plot_training_history(self):
        """Plot training metrics history"""
        plt.figure(figsize=(15, 5))
        
        # Plot losses
        plt.subplot(1, 3, 1)
        plt.plot(self.train_losses, label='Train Loss')
        plt.plot(self.test_losses, label='Test Loss')
        plt.title('Loss History')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        
        # Plot learning rates
        plt.subplot(1, 3, 2)
        plt.plot(self.learning_rates)
        plt.title('Learning Rate History')
        plt.xlabel('Step')
        plt.ylabel('Learning Rate')
        
        # Plot gradient norms
        plt.subplot(1, 3, 3)
        plt.plot(self.gradient_norms)
        plt.title('Gradient Norm History')
        plt.xlabel('Step')
        plt.ylabel('Gradient Norm')
        
        plt.tight_layout()
        plt.show()

In [None]:
set_seed(507)
analyzer = SentimentAnalyzer_new(new_df)
try:
    analyzer.train(epochs=30, batch_size=32)
except Exception as e:
    print(f"Training interrupted: {e}")
        
# Train the model
analyzer.train(epochs=30)
        
# Evaluate the model
metrics = analyzer.evaluate()
        
# Print metrics
print("Model Performance Metrics:")
for key, value in metrics.items():
    print(f"{key}:\n{value}\n")