# Q1

### Deep Learning (Neural Network with Vector Embeddings)

In [1]:
%pip install torch==2.0.1 transformers pandas numpy scikit-learn tqdm textblob sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [1]:
from transformers import AutoTokenizer, AutoModel, pipeline
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from textblob import TextBlob
from multiprocessing import Pool, cpu_count
from functools import partial

In [2]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from tqdm import tqdm


class BertEmbedder:
    def __init__(self, model_name='distilbert-base-uncased'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        self.model.to(self.device)
        self.batch_size = 32
        self.max_length = 256  # Reduced from 512 for efficiency

    def process_batch(self, texts):
        # Convert each text to string and strip whitespace
        texts = [str(text).strip() for text in texts]
        
        # Tokenize
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Get embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
            # Use mean pooling instead of just [CLS] token
            attention_mask = inputs['attention_mask']
            token_embeddings = outputs.last_hidden_state
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            return embeddings.cpu().numpy()

    def get_embeddings(self, df):
        # Process pros, cons, and headlines separately
        print("Processing pros embeddings...")
        pros_emb = self._process_text_column(df['pros'])
        print("Processing cons embeddings...")
        cons_emb = self._process_text_column(df['cons'])
        print("Processing headline embeddings...")
        headline_emb = self._process_text_column(df['headline'])
        
        return pros_emb, cons_emb, headline_emb
    
    def _process_text_column(self, texts):
        embeddings = []
        for i in tqdm(range(0, len(texts), self.batch_size)):
            batch = texts[i:i + self.batch_size].tolist()
            batch_embeddings = self.process_batch(batch)
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

In [20]:
class DataPreprocessor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.firm_stats = {}
        self.bert = BertEmbedder()
        self.min_reviews_for_firm = 50  # Increased threshold for firm features
        
    def _calculate_firm_statistics(self, df):
        """Calculate enhanced firm statistics."""
        firm_stats = df.groupby('firm').agg({
            'rating': ['count', 'mean', 'std', 'median']
        }).round(3)
        firm_stats.columns = ['count', 'mean', 'std', 'median']
        firm_stats = firm_stats.reset_index()
        
        # Identify firms with enough reviews
        frequent_firms = firm_stats[firm_stats['count'] >= self.min_reviews_for_firm]['firm'].tolist()
        
        self.firm_stats = {
            'frequent_firms': set(frequent_firms),
            'stats': firm_stats.set_index('firm').to_dict(),
            'global_mean': df['rating'].mean(),
            'global_std': df['rating'].std(),
            'global_median': df['rating'].median()
        }
        
    def _get_sentiment_features(self, text):
        """Extract sentiment features from text using TextBlob."""
        if not isinstance(text, str) or text.strip() == '':
            return 0.0
        
        try:
            blob = TextBlob(text.strip())
            return blob.sentiment.polarity
        except:
            return 0.0
    
    def _scale_features(self, df, is_test=False):
        if not is_test:
            df[self.numerical_features] = self.scaler.fit_transform(df[self.numerical_features])
        else:
            df[self.numerical_features] = self.scaler.transform(df[self.numerical_features])
        return df
    
    def _create_features(self, df):
        df = df.copy()
        
        # Handle missing values
        df['pros'] = df['pros'].fillna('').astype(str)
        df['cons'] = df['cons'].fillna('').astype(str)
        df['headline'] = df['headline'].fillna('').astype(str)
        
        # Basic text features
        df['pros_length'] = df['pros'].str.strip().str.len()
        df['cons_length'] = df['cons'].str.strip().str.len()
        df['pros_cons_ratio'] = (df['pros_length'] + 1) / (df['cons_length'] + 1)
        
        # Sentiment
        print('Sentiment...')
        df['cons_sentiment'] = df['cons'].apply(self._get_sentiment_features)
        df['pros_sentiment'] = df['pros'].apply(self._get_sentiment_features)
        df['headline_sentiment'] = df['headline'].apply(self._get_sentiment_features)
        
        # Enhanced firm features
        df['is_frequent_firm'] = df['firm'].isin(self.firm_stats['frequent_firms']).astype(int)
        df['firm_mean_rating'] = df['firm'].map(self.firm_stats['stats']['mean']).fillna(self.firm_stats['global_mean'])
        df['firm_rating_std'] = df['firm'].map(self.firm_stats['stats']['std']).fillna(self.firm_stats['global_std'])
        
        # Year features (bucketed)
        df['year_bucket'] = pd.qcut(df['year_review'], q=5, labels=['very_old', 'old', 'medium', 'recent', 'very_recent'])
        year_dummies = pd.get_dummies(df['year_bucket'], prefix='year')
        
        # Get BERT embeddings
        print("Getting BERT embeddings...")
        pros_emb, cons_emb, headline_emb = self.bert.get_embeddings(df)
        
        # Create feature matrix
        emb_features = np.hstack([pros_emb, cons_emb, headline_emb])
        emb_cols = [f'emb_{i}' for i in range(emb_features.shape[1])]
        
        # Combine all features
        numerical_features = ['pros_length', 'cons_length', 'pros_cons_ratio',
                            'firm_mean_rating', 'firm_rating_std', 'cons_sentiment', 'pros_sentiment' ,'headline_sentiment']
        
        df_final = pd.concat([
            df[numerical_features],
            year_dummies,
            pd.DataFrame(emb_features, columns=emb_cols)
        ], axis=1)
        
        return df_final
    
    def preprocess_data(self, df_small, df_large, df_test, random_state=42):
        print("Combining training datasets...")
        df_train = pd.concat([df_small, df_large], axis=0, ignore_index=True)
        
        print("Calculating firm statistics...")
        self._calculate_firm_statistics(df_train)
        
        print("Processing training data...")
        X_train_full = self._create_features(df_train)
        
        print("Processing test data...")
        X_test = self._create_features(df_test)
        
        y_train_full = df_train['rating']
        
        # Create stratified split with larger validation set
        print("Creating validation split...")
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_full, y_train_full,
            test_size=0.15,  # Increased validation size
            random_state=random_state,
            stratify=y_train_full
        )
        
        # Scale features
        numerical_features = X_train.select_dtypes(include=['float64']).columns
        scaler = StandardScaler()
        X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
        X_val[numerical_features] = scaler.transform(X_val[numerical_features])
        X_test[numerical_features] = scaler.transform(X_test[numerical_features])
        
        return X_train, X_val, y_train, y_val, X_test

In [21]:
def load_and_preprocess_data(small_train_path, large_train_path, test_path):
    """Load and preprocess all datasets."""
    print("Loading datasets...")
    df_small = pd.read_csv(small_train_path)
    df_large = pd.read_csv(large_train_path)
    df_test = pd.read_csv(test_path)
    
    preprocessor = DataPreprocessor()
    return preprocessor.preprocess_data(df_small, df_large, df_test)  

In [None]:
small_train_path = "./424_F2024_Final_PC_small_train_v1.csv"
large_train_path = "./424_F2024_Final_PC_large_train_v1.csv"
test_path = "./424_F2024_Final_PC_test_without_response_v1.csv"

X_train, X_val, y_train, y_val, X_test = load_and_preprocess_data(
    small_train_path, large_train_path, test_path
)

print("\nProcessed data shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_val: {y_val.shape}")
print(f"X_test: {X_test.shape}")

print("\nFeatures created:")
for col in X_train.columns:
    print(f"- {col}")

Loading datasets...
Using device: cuda
Combining training datasets...
Calculating firm statistics...
Processing training data...
Sentiment...


100%|██████████| 18750/18750 [13:31<00:00, 23.10it/s]


Processing headline embeddings...


100%|██████████| 18750/18750 [02:56<00:00, 106.10it/s]


Processing test data...
Sentiment...
Getting BERT embeddings...
Processing pros embeddings...


100%|██████████| 3125/3125 [01:56<00:00, 26.81it/s]


Processing cons embeddings...


100%|██████████| 3125/3125 [03:21<00:00, 15.50it/s]


Processing headline embeddings...


100%|██████████| 3125/3125 [00:30<00:00, 102.91it/s]


Creating validation split...

Processed data shapes:
X_train: (510000, 2317)
X_val: (90000, 2317)
y_train: (510000,)
y_val: (90000,)
X_test: (100000, 2317)

Features created:
- pros_length
- cons_length
- pros_cons_ratio
- firm_mean_rating
- firm_rating_std
- cons_sentiment
- pros_sentiment
- headline_sentiment
- year_very_old
- year_old
- year_medium
- year_recent
- year_very_recent
- emb_0
- emb_1
- emb_2
- emb_3
- emb_4
- emb_5
- emb_6
- emb_7
- emb_8
- emb_9
- emb_10
- emb_11
- emb_12
- emb_13
- emb_14
- emb_15
- emb_16
- emb_17
- emb_18
- emb_19
- emb_20
- emb_21
- emb_22
- emb_23
- emb_24
- emb_25
- emb_26
- emb_27
- emb_28
- emb_29
- emb_30
- emb_31
- emb_32
- emb_33
- emb_34
- emb_35
- emb_36
- emb_37
- emb_38
- emb_39
- emb_40
- emb_41
- emb_42
- emb_43
- emb_44
- emb_45
- emb_46
- emb_47
- emb_48
- emb_49
- emb_50
- emb_51
- emb_52
- emb_53
- emb_54
- emb_55
- emb_56
- emb_57
- emb_58
- emb_59
- emb_60
- emb_61
- emb_62
- emb_63
- emb_64
- emb_65
- emb_66
- emb_67
- emb_68
- 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, r2_score

class TextPathway(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.pathway = nn.Sequential(
            nn.Linear(input_dim, 300),  # Embedding-like layer
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(300, 100),  # Bidirectional-like transformation
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(100, 64),  # Dense reduction
            nn.ReLU(),
            nn.Dropout(0.1)
        )
    
    def forward(self, x):
        return self.pathway(x)

class RatingPredictor(nn.Module):
    def __init__(self, bert_dim, num_categorical):
        super().__init__()
        
        # Separate pathways for each text type
        self.pros_pathway = TextPathway(bert_dim)
        self.cons_pathway = TextPathway(bert_dim)
        self.headline_pathway = TextPathway(bert_dim)
        
        # Pathway for categorical features
        self.categorical_pathway = nn.Sequential(
            nn.Linear(num_categorical, 32),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        
        # Combined processing
        combined_dim = 64 * 3 + 32  # Three text pathways + categorical
        
        self.final_layers = nn.Sequential(
            nn.Linear(combined_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Dropout(0.1),
            
            nn.Linear(16, 1),
            nn.Sigmoid()  # Scale to [0,1], will be rescaled to [1,5]
        )
    
    def forward(self, x):
        # Split input into components
        bert_dim = x.shape[1] - self.categorical_pathway[0].in_features
        dim_per_text = bert_dim // 3
        
        pros_emb = x[:, :dim_per_text]
        cons_emb = x[:, dim_per_text:2*dim_per_text]
        headline_emb = x[:, 2*dim_per_text:bert_dim]
        categorical = x[:, bert_dim:]
        
        # Process each pathway
        pros_features = self.pros_pathway(pros_emb)
        cons_features = self.cons_pathway(cons_emb)
        headline_features = self.headline_pathway(headline_emb)
        categorical_features = self.categorical_pathway(categorical)
        
        # Combine all features
        combined = torch.cat([
            pros_features,
            cons_features,
            headline_features,
            categorical_features
        ], dim=1)
        
        # Final processing
        output = self.final_layers(combined)
        
        # Scale from [0,1] to [1,5]
        return 1 + 4 * output.squeeze()

class ReviewDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.FloatTensor(X.values)
        self.y = torch.FloatTensor(y.values) if y is not None else None
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

class DeepLearningTrainer:
    def __init__(self, bert_dim, num_categorical, device=None):
        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device
        print(f"Using device: {self.device}")
        
        self.bert_dim = bert_dim
        self.num_categorical = num_categorical
    
    def train_epoch(self, model, train_loader, criterion, optimizer):
        model.train()
        total_loss = 0
        pbar = tqdm(train_loader, desc="Training")
        
        for X_batch, y_batch in pbar:
            X_batch, y_batch = X_batch.to(self.device), y_batch.to(self.device)
            
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})
        
        return total_loss / len(train_loader)
    
    def train_and_evaluate(self, X_train, X_val, y_train, y_val, 
                          batch_size=64, epochs=50, patience=5):
        # Create datasets
        train_dataset = ReviewDataset(X_train, y_train)
        val_dataset = ReviewDataset(X_val, y_val)
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)
        
        # Initialize model
        model = RatingPredictor(self.bert_dim, self.num_categorical).to(self.device)
        
        # Loss and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=2, verbose=True
        )
        
        # Training loop
        best_val_loss = float('inf')
        best_model = None
        no_improve = 0
        
        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")
            
            # Train
            train_loss = self.train_epoch(model, train_loader, criterion, optimizer)
            
            # Validate
            model.eval()
            val_predictions = []
            val_actuals = []
            
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch = X_batch.to(self.device)
                    y_pred = model(X_batch)
                    val_predictions.extend(y_pred.cpu().numpy())
                    val_actuals.extend(y_batch.numpy())
            
            val_predictions = np.array(val_predictions)
            val_actuals = np.array(val_actuals)
            
            val_mse = mean_squared_error(val_actuals, val_predictions)
            val_r2 = r2_score(val_actuals, val_predictions)
            
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val MSE: {val_mse:.4f}")
            print(f"Val R2: {val_r2:.4f}")
            
            # Learning rate scheduling
            scheduler.step(val_mse)
            
            # Early stopping
            if val_mse < best_val_loss:
                best_val_loss = val_mse
                best_model = model.state_dict()
                no_improve = 0
            else:
                no_improve += 1
                if no_improve >= patience:
                    print("Early stopping triggered")
                    break
        
        # Load best model
        model.load_state_dict(best_model)
        return model

    def make_predictions(self, model, X_test):
        model.eval()
        test_dataset = ReviewDataset(X_test)
        test_loader = DataLoader(test_dataset, batch_size=64)
        predictions = []
        
        with torch.no_grad():
            for X_batch in test_loader:
                X_batch = X_batch.to(self.device)
                y_pred = model(X_batch)
                predictions.extend(y_pred.cpu().numpy())
        
        return np.clip(np.array(predictions), 1, 5)

In [None]:
# Calculate dimensions
bert_dim = len([col for col in X_train.columns if col.startswith('emb_')])
num_categorical = len([col for col in X_train.columns if not col.startswith('emb_')])

# Initialize trainer
trainer = DeepLearningTrainer(bert_dim//3, num_categorical)  # divide by 3 because we have 3 text inputs

# Train model
model = trainer.train_and_evaluate(X_train, X_val, y_train, y_val)

# Make predictions
predictions = trainer.make_predictions(model, X_test)

Using device: cuda

Epoch 1/50


Training: 100%|██████████| 7969/7969 [01:27<00:00, 90.76it/s, loss=0.7600]


Train Loss: 0.6038
Val MSE: 0.5361
Val R2: 0.5644

Epoch 2/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 90.08it/s, loss=0.6812]


Train Loss: 0.5507
Val MSE: 0.5266
Val R2: 0.5722

Epoch 3/50


Training: 100%|██████████| 7969/7969 [01:27<00:00, 90.87it/s, loss=0.5180]


Train Loss: 0.5391
Val MSE: 0.5273
Val R2: 0.5715

Epoch 4/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 90.37it/s, loss=0.6258]


Train Loss: 0.5316
Val MSE: 0.5189
Val R2: 0.5784

Epoch 5/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 89.77it/s, loss=0.4356]


Train Loss: 0.5262
Val MSE: 0.5117
Val R2: 0.5843

Epoch 6/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 89.82it/s, loss=0.4488]


Train Loss: 0.5211
Val MSE: 0.5094
Val R2: 0.5861

Epoch 7/50


Training: 100%|██████████| 7969/7969 [01:29<00:00, 89.21it/s, loss=0.4055]


Train Loss: 0.5170
Val MSE: 0.5129
Val R2: 0.5832

Epoch 8/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 89.67it/s, loss=0.4444]


Train Loss: 0.5133
Val MSE: 0.5124
Val R2: 0.5837

Epoch 9/50


Training: 100%|██████████| 7969/7969 [01:29<00:00, 89.45it/s, loss=0.5961]


Train Loss: 0.5094
Val MSE: 0.5107
Val R2: 0.5851
Epoch 00009: reducing learning rate of group 0 to 1.0000e-04.

Epoch 10/50


Training: 100%|██████████| 7969/7969 [01:29<00:00, 89.39it/s, loss=0.4213]


Train Loss: 0.4976
Val MSE: 0.5054
Val R2: 0.5894

Epoch 11/50


Training: 100%|██████████| 7969/7969 [01:29<00:00, 89.13it/s, loss=0.5206]


Train Loss: 0.4932
Val MSE: 0.5052
Val R2: 0.5896

Epoch 12/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 90.25it/s, loss=0.4381]


Train Loss: 0.4900
Val MSE: 0.5039
Val R2: 0.5906

Epoch 13/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 90.04it/s, loss=0.6143]


Train Loss: 0.4870
Val MSE: 0.5040
Val R2: 0.5905

Epoch 14/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 90.23it/s, loss=0.7461]


Train Loss: 0.4841
Val MSE: 0.5084
Val R2: 0.5869

Epoch 15/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 89.72it/s, loss=0.4926]


Train Loss: 0.4807
Val MSE: 0.5047
Val R2: 0.5899
Epoch 00015: reducing learning rate of group 0 to 5.0000e-05.

Epoch 16/50


Training: 100%|██████████| 7969/7969 [01:30<00:00, 87.92it/s, loss=0.3737]


Train Loss: 0.4719
Val MSE: 0.5047
Val R2: 0.5900

Epoch 17/50


Training: 100%|██████████| 7969/7969 [01:28<00:00, 89.65it/s, loss=0.3685]


Train Loss: 0.4687
Val MSE: 0.5052
Val R2: 0.5895
Early stopping triggered


In [None]:
def create_neural_net_submission(trainer, model, X_test, X_train, y_train, student_id="20886030", anonymous_name="zesty"):
    """
    Create submission file for neural network model.
    
    Parameters:
    -----------
    trainer : DeepLearningTrainer
        The trainer object used to make predictions
    model : RatingPredictor
        The trained neural network model
    X_test : pd.DataFrame
        Test data
    X_train : pd.DataFrame
        Training data
    y_train : pd.Series
        Training labels
    """
    # Get predictions using the trainer
    predictions = trainer.make_predictions(model, X_test)
    
    # Calculate R² and MSE on training data using the trainer
    train_predictions = trainer.make_predictions(model, X_train)
    train_r2 = r2_score(y_train, train_predictions)
    train_mse = mean_squared_error(y_train, train_predictions)
    
    # Model description
    model_description = "Deep Learning (DistilBERT Embeddings + Neural Network)"
    
    # Create submission file
    with open('submission_neural_net.csv', 'w') as f:
        f.write(f"{student_id}\n")
        f.write(f"{anonymous_name}\n")
        f.write(f"{train_r2:.4f}\n")
        f.write(f"{model_description}\n")
        
        # Write predictions
        for pred in predictions:
            f.write(f"{pred:.4f}\n")
    
    print(f"Submission file created with:")
    print(f"Student ID: {student_id}")
    print(f"Anonymous name: {anonymous_name}")
    print(f"Training R²: {train_r2:.4f}")
    print(f"Training MSE: {train_mse:.4f}")  # Print the MSE
    print(f"Model: {model_description}")
    print(f"Number of predictions: {len(predictions)}")
    
    return 'submission_neural_net.csv'


In [None]:
# After training the model:
submission_file = create_neural_net_submission(
    trainer=trainer,  # Pass the trainer object
    model=model,
    X_test=X_test,
    X_train=X_train,
    y_train=y_train
)

Submission file created with:
Student ID: 20886030
Anonymous name: zesty
Training R²: 0.6388
Training MSE: 0.4445
Model: Deep Learning (DistilBERT Embeddings + Neural Network)
Number of predictions: 100000


# Q3

In [10]:
!pip install wordcloud

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

def load_original_data(small_train_path, large_train_path):
    """Load and combine the original training datasets"""
    df_small = pd.read_csv(small_train_path)
    df_large = pd.read_csv(large_train_path)
    return pd.concat([df_small, df_large], axis=0, ignore_index=True)

def analyze_word_frequencies(small_train_path, large_train_path):
    # Load original data
    print("Loading data...")
    df_train = load_original_data(small_train_path, large_train_path)
    
    # Download required NLTK data
    nltk.download('punkt')
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    
    def clean_text(text):
        """Clean text by removing special characters and converting to lowercase"""
        if not isinstance(text, str):
            return ""
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        # Remove extra whitespace
        text = ' '.join(text.split())
        return text
    
    def get_word_freq(texts):
        """Get word frequencies excluding stopwords"""
        words = []
        for text in texts:
            tokens = word_tokenize(clean_text(text))
            words.extend([word for word in tokens if word not in stop_words and len(word) > 2])
        return Counter(words)
    
    print("Creating visualizations...")
    # Create figure with subplots
    plt.figure(figsize=(20, 12))
    
    # Process pros and cons separately for each rating
    for idx, text_type in enumerate(['pros', 'cons']):
        plt.subplot(1, 2, idx+1)
        
        # Store frequencies for each rating
        frequencies = []
        ratings = sorted(df_train['rating'].unique())
        
        for rating in ratings:
            texts = df_train[df_train['rating'] == rating][text_type]
            freq = get_word_freq(texts)
            # Get top 10 words
            top_words = dict(sorted(freq.items(), key=lambda x: x[1], reverse=True)[:10])
            frequencies.append(pd.Series(top_words))
        
        # Create DataFrame with frequencies
        freq_df = pd.DataFrame(frequencies, index=ratings)
        
        # Create heatmap
        sns.heatmap(freq_df.fillna(0), 
                   cmap='YlOrRd', 
                   annot=True, 
                   fmt='.0f',
                   cbar_kws={'label': 'Word Frequency'})
        
        plt.title(f'Top 10 Words in {text_type.capitalize()} by Rating')
        plt.xlabel('Words')
        plt.ylabel('Rating')
    
    plt.tight_layout()
    plt.savefig('word_frequencies.png', dpi=300, bbox_inches='tight')
    plt.close()

    print("Creating word clouds...")
    # Create word clouds for each rating
    plt.figure(figsize=(20, 15))
    for i, rating in enumerate(sorted(df_train['rating'].unique()), 1):
        plt.subplot(3, 2, i)
        
        # Combine pros and cons for this rating
        texts = (df_train[df_train['rating'] == rating]['pros'].fillna('') + ' ' + 
                df_train[df_train['rating'] == rating]['cons'].fillna(''))
        
        # Create and generate word cloud
        text = ' '.join(texts)
        wordcloud = WordCloud(width=800, height=400,
                            background_color='white',
                            max_words=50).generate(clean_text(text))
        
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(f'{rating}-Star Reviews')
        plt.axis('off')
    
    plt.tight_layout()
    plt.savefig('word_clouds.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    print("Analysis complete! Check 'word_frequencies.png' and 'word_clouds.png' for results.")

# Example usage:
analyze_word_frequencies(
    small_train_path="424_F2024_Final_PC_small_train_v1.csv",
    large_train_path="424_F2024_Final_PC_large_train_v1.csv"
)

Loading data...


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Creating visualizations...
Creating word clouds...
Analysis complete! Check 'word_frequencies.png' and 'word_clouds.png' for results.


In [None]:
def create_all_visualizations(trainer, model, X_train, X_test, y_train):
    """Create all visualizations using the trainer and model"""
    
    def create_feature_importance_plot(trainer, model, X_train, filename='feature_importance.png'):
        print("Creating feature importance plot...")
        model.eval()
        importances = []
        
        # Convert to tensor and move to same device as model
        X = torch.FloatTensor(X_train.values).to(trainer.device)
        
        # Calculate importance for each feature
        for i in range(X.shape[1]):
            X_modified = X.clone()
            X_modified[:, i] += torch.std(X[:, i]) * 0.1
            
            with torch.no_grad():
                original_output = model(X)
                modified_output = model(X_modified)
                importance = torch.mean(torch.abs(modified_output - original_output))
                importances.append(importance.cpu().item())
        
        # Create importance DataFrame
        importance_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Importance': importances
        }).sort_values('Importance', ascending=False)
        
        # Plot
        plt.figure(figsize=(12, 6))
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
        plt.title('Top 20 Feature Importances')
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()
        
        return importance_df

    # Alternative version if you want to include only the most important embedding features
    def create_correlation_plot_with_top_embeddings(X_train, importance_df, top_n=10, filename='correlation_matrix.png'):
        """Create correlation matrix including only top important features"""
        print("Creating correlation matrix plot with top features...")

        # Get non-embedding columns
        non_embedding_cols = [col for col in X_train.columns if not col.startswith('emb_')]

        # Get top important embedding features
        top_embedding_cols = [col for col in importance_df['Feature'].head(top_n) 
                             if col.startswith('emb_')]

        # Combine columns
        selected_cols = non_embedding_cols + top_embedding_cols

        # Calculate correlation matrix for selected features
        corr_matrix = X_train[selected_cols].corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, 
                    cmap='RdBu_r', 
                    center=0,
                    annot=True,
                    fmt='.2f')
        plt.title('Feature Correlation Matrix\n(Basic Features + Top Important Embeddings)')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    def compare_distributions(X_train, X_test, filename='distribution_comparison.png'):
        """Compare feature distributions between train and test sets"""
        print("Creating distribution comparison plots...")
        
        # Select numerical features (limit to first 15 for visibility)
        numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns[:15]
        
        n_features = len(numerical_features)
        n_cols = 3
        n_rows = (n_features + n_cols - 1) // n_cols
        
        plt.figure(figsize=(15, 5*n_rows))
        
        for idx, feature in enumerate(numerical_features, 1):
            plt.subplot(n_rows, n_cols, idx)
            
            sns.kdeplot(data=X_train[feature], label='Train', alpha=0.5)
            sns.kdeplot(data=X_test[feature], label='Test', alpha=0.5)
            
            plt.title(f'Distribution of {feature}')
            plt.legend()
        
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    def analyze_prediction_errors(trainer, model, X_train, y_train, filename='prediction_errors.png'):
        """Analyze prediction errors across different rating levels"""
        print("Creating prediction error analysis plots...")
        
        # Get predictions using the trainer's prediction method
        y_pred = trainer.make_predictions(model, X_train)
        
        # Create error analysis DataFrame
        error_df = pd.DataFrame({
            'Actual': y_train,
            'Predicted': y_pred,
            'Error': y_pred - y_train
        })
        
        plt.figure(figsize=(15, 10))
        
        # Plot 1: Actual vs Predicted
        plt.subplot(2, 2, 1)
        sns.scatterplot(data=error_df, x='Actual', y='Predicted', alpha=0.1)
        plt.plot([1, 5], [1, 5], 'r--')
        plt.title('Actual vs Predicted Ratings')
        
        # Plot 2: Error Distribution
        plt.subplot(2, 2, 2)
        sns.histplot(data=error_df, x='Error', bins=50)
        plt.title('Distribution of Prediction Errors')
        
        # Plot 3: Box plot of errors by rating
        plt.subplot(2, 2, 3)
        sns.boxplot(data=error_df, x='Actual', y='Error')
        plt.title('Prediction Errors by Actual Rating')
        
        # Plot 4: Mean Absolute Error by rating
        plt.subplot(2, 2, 4)
        mae_by_rating = error_df.groupby('Actual')['Error'].apply(lambda x: np.abs(x).mean())
        sns.barplot(x=mae_by_rating.index, y=mae_by_rating.values)
        plt.title('Mean Absolute Error by Rating')
        
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()
    
    # Create all visualizations
    importance_df = create_feature_importance_plot(trainer, model, X_train)
    create_correlation_plot_with_top_embeddings(X_train, importance_df)
    compare_distributions(X_train, X_test)
    analyze_prediction_errors(trainer, model, X_train, y_train)
    
    print("All visualizations have been created!")
    return importance_df

In [None]:
visualizations = create_all_visualizations(
    trainer=trainer,  # Your DeepLearningTrainer instance
    model=model,      # Your trained model
    X_train=X_train,
    X_test=X_test,
    y_train=y_train
)

Creating feature importance plot...
Creating correlation matrix plot with top features...
Creating distribution comparison plots...
Creating prediction error analysis plots...
All visualizations have been created!


# Q4

In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import nltk
import re
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

def analyze_company_ratings(df_small_path, df_large_path):
    """
    Analyze what drives ratings at different company quality levels.
    
    Parameters:
    -----------
    df_small_path : str
        Path to small training dataset CSV
    df_large_path : str
        Path to large training dataset CSV
    """
    # Download required NLTK data
    try:
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('averaged_perceptron_tagger')
    except:
        print("NLTK data already downloaded or error in downloading")
    
    def load_and_prepare_data():
        """Load and prepare the datasets"""
        # Load data
        df_small = pd.read_csv(df_small_path)
        df_large = pd.read_csv(df_large_path)
        df = pd.concat([df_small, df_large], ignore_index=True)
        
        # Calculate company average ratings
        company_ratings = df.groupby('firm')['rating'].agg(['mean', 'count']).reset_index()
        
        # Only consider companies with at least 10 reviews
        company_ratings = company_ratings[company_ratings['count'] >= 10]
        
        # Categorize companies
        company_ratings['quality_category'] = pd.qcut(
            company_ratings['mean'], 
            q=3, 
            labels=['Low', 'Medium', 'High']
        )
        
        # Create mapping dictionary
        company_category_map = dict(zip(company_ratings['firm'], company_ratings['quality_category']))
        
        # Add categories to original dataframe
        df['company_quality'] = df['firm'].map(company_category_map)
        
        return df, company_ratings
    
    def clean_text(text):
        """Clean text data"""
        if not isinstance(text, str):
            return ""
        
        # Convert to lowercase and remove special characters
        text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        return text
    
    def extract_key_phrases(texts, top_n=10):
        """Extract key phrases from text"""
        # Combine all texts
        combined_text = ' '.join([clean_text(text) for text in texts if isinstance(text, str)])
        
        # Tokenize
        words = word_tokenize(combined_text)
        
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words and len(word) > 2]
        
        # Count frequencies
        word_freq = Counter(words)
        
        return dict(word_freq.most_common(top_n))
    
    def analyze_topics_by_category(df):
        """Analyze topics for each company quality category"""
        results = {}
        
        for category in ['Low', 'Medium', 'High']:
            category_df = df[df['company_quality'] == category]
            
            results[category] = {
                'pros_topics': extract_key_phrases(category_df['pros']),
                'cons_topics': extract_key_phrases(category_df['cons']),
                'avg_rating': category_df['rating'].mean(),
                'n_companies': category_df['firm'].nunique(),
                'n_reviews': len(category_df)
            }
        
        return results
    
    def create_visualizations(results):
        """Create visualizations for the analysis"""
        # Set up the plots
        fig, axes = plt.subplots(2, 1, figsize=(15, 20))
        
        # Plot pros
        pros_data = []
        for category in ['Low', 'Medium', 'High']:
            for word, freq in results[category]['pros_topics'].items():
                pros_data.append({
                    'Category': category,
                    'Word': word,
                    'Frequency': freq
                })
        
        pros_df = pd.DataFrame(pros_data)
        sns.barplot(
            data=pros_df,
            x='Frequency',
            y='Word',
            hue='Category',
            ax=axes[0]
        )
        axes[0].set_title('Most Common Positive Aspects by Company Quality')
        
        # Plot cons
        cons_data = []
        for category in ['Low', 'Medium', 'High']:
            for word, freq in results[category]['cons_topics'].items():
                cons_data.append({
                    'Category': category,
                    'Word': word,
                    'Frequency': freq
                })
        
        cons_df = pd.DataFrame(cons_data)
        sns.barplot(
            data=cons_df,
            x='Frequency',
            y='Word',
            hue='Category',
            ax=axes[1]
        )
        axes[1].set_title('Most Common Complaints by Company Quality')
        
        plt.tight_layout()
        plt.savefig('company_analysis.png')
        plt.close()
        
        # Create summary table
        summary_data = []
        for category in ['Low', 'Medium', 'High']:
            summary_data.append({
                'Category': category,
                'Average Rating': f"{results[category]['avg_rating']:.2f}",
                'Number of Companies': results[category]['n_companies'],
                'Number of Reviews': results[category]['n_reviews'],
                'Top Pros': ', '.join(list(results[category]['pros_topics'].keys())[:5]),
                'Top Cons': ', '.join(list(results[category]['cons_topics'].keys())[:5])
            })
        
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_csv('company_analysis_summary.csv', index=False)
        
        return summary_df
    
    # Main execution
    print("Loading and preparing data...")
    df, company_ratings = load_and_prepare_data()
    
    print("Analyzing topics by category...")
    results = analyze_topics_by_category(df)
    
    print("Creating visualizations...")
    summary_df = create_visualizations(results)
    
    print("\nAnalysis Complete!")
    print("\nKey Findings:")
    print("-------------")
    for category in ['Low', 'Medium', 'High']:
        print(f"\n{category}-rated companies:")
        print(f"Average rating: {results[category]['avg_rating']:.2f}")
        print(f"Number of companies: {results[category]['n_companies']}")
        print("Top pros:", ', '.join(list(results[category]['pros_topics'].keys())[:5]))
        print("Top cons:", ', '.join(list(results[category]['cons_topics'].keys())[:5]))
    
    return results, summary_df

In [30]:
results, summary = analyze_company_ratings(
    "424_F2024_Final_PC_small_train_v1.csv",
    "424_F2024_Final_PC_large_train_v1.csv"
)

Loading and preparing data...


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Analyzing topics by category...
Creating visualizations...

Analysis Complete!

Key Findings:
-------------

Low-rated companies:
Average rating: 3.39
Number of companies: 138
Top pros: good, work, great, people, company
Top cons: work, management, pay, hours, get

Medium-rated companies:
Average rating: 3.72
Number of companies: 137
Top pros: good, work, great, people, company
Top cons: work, hours, management, long, life

High-rated companies:
Average rating: 4.14
Number of companies: 138
Top pros: great, work, good, people, benefits
Top cons: work, company, management, hours, get
