# Phase 3: Advanced & Optimized Modeling

## 1. Objective
We aim to build a high-performance sentiment classifier using **Review Text**, **Brand**, and **Category**. 

### Performance Optimizations:
1. **Parallelism**: Using `n_jobs=-1` for multi-core execution.
2. **Dimensionality Reduction**: Using `TruncatedSVD` (Latent Semantic Analysis) to condense text features.
3. **Fast Gradient Boosting**: Using XGBoost with `tree_method='hist'` for rapid training.
4. **Reliability**: Unified Scikit-Learn Pipelines to prevent data leakage.

## 2. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, confusion_matrix

try:
    import xgboost as xgb
except ImportError:
    print("XGBoost not found. Run: !pip install xgboost")

tqdm.pandas()

# Load cleaned data
data_path = os.path.join('..', 'data', 'interim', 'cleaned_amazon.csv')
df = pd.read_csv(data_path)

# Target Binning (Neg: 0, Neu: 1, Pos: 2)
# Ratings that are missing or not in the mapping (anything other than 1-5)
# come back from Series.map as NaN.
df['sentiment'] = df['reviews.rating'].map({1: 0, 2: 0, 3: 1, 4: 2, 5: 2})

# Clean metadata and text. 'sentiment' is part of the subset so rows whose
# rating could not be mapped never reach the stratified split or the models.
df = df.dropna(subset=['cleaned_text', 'brand', 'categories', 'sentiment'])
df = df[df['cleaned_text'].str.strip().astype(bool)]

# With the NaN rows gone the label can be stored as an integer class id
# (a column that ever held NaN is float otherwise).
df = df.assign(sentiment=df['sentiment'].astype(int))

print(f"Final Dataset Shape: {df.shape}")

## 3. Data Splitting (Stratified)
We maintain an 80/20 split, ensuring sentiment ratios are preserved.

In [None]:
# Model inputs: the review text plus two categorical metadata columns.
feature_columns = ['cleaned_text', 'brand', 'categories']
X = df[feature_columns]
y = df['sentiment']

# Hold out 20% of the rows; stratifying on y keeps the class ratios the same
# in both partitions, and the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

## 4. Optimized Feature Pipeline
We use `TruncatedSVD` to reduce dimensionality after TF-IDF. 

**Note**: Multinomial Naive Bayes requires non-negative feature values, and the components produced by SVD can be negative, so it cannot consume the SVD output. We therefore define a separate raw TF-IDF pipeline for it.

In [None]:
# TF-IDF settings shared by both preprocessors: unigrams + bigrams, English
# stop words removed, terms seen in fewer than 5 documents dropped, and the
# vocabulary capped at 5000 entries.
tfidf_settings = dict(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    min_df=5,
)
categorical_columns = ['brand', 'categories']

# Standard preprocessor for most models: TF-IDF followed by a 100-component
# truncated SVD on the text, one-hot encoding on the metadata columns.
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(**tfidf_settings)),
    ('svd', TruncatedSVD(n_components=100, random_state=42)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'cleaned_text'),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='drop',
)

# Raw TF-IDF preprocessor (specifically for MultinomialNB): same text and
# metadata handling, but without the SVD step.
nb_preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(**tfidf_settings), 'cleaned_text'),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
)

print("Preprocessors defined (Standard + NB-specific).")

## 5. Model 0: Multinomial Naive Bayes (Classic Baseline)
Naive Bayes is a computationally cheap and effective baseline for text.

In [None]:
%%time
# Baseline: raw TF-IDF + one-hot features fed into Multinomial Naive Bayes.
nb_pipeline = Pipeline([
    ('preprocessor', nb_preprocessor),
    ('clf', MultinomialNB())
])

# Make sure the output directory exists before training, so a missing
# '../models' folder is not discovered only when the fitted model is saved.
os.makedirs('../models', exist_ok=True)

print("Training Naive Bayes Baseline...")
nb_pipeline.fit(X_train, y_train)

# Persist the whole fitted pipeline (preprocessing + classifier) as one file.
joblib.dump(nb_pipeline, '../models/nb_baseline_pipeline.pkl')

## 6. Optimized Model 1: Logistic Regression (+ Parallelism)
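We pass `n_jobs=-1` to make all available CPU cores usable. Note that scikit-learn's `LogisticRegression` only uses `n_jobs` to parallelize over classes in one-vs-rest mode, so with the default multinomial `lbfgs` setup the speed-up may be negligible.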

In [None]:
%%time
# SVD-reduced text + one-hot metadata fed into a class-weighted logistic
# regression ('balanced' re-weights classes inversely to their frequency).
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=2000, n_jobs=-1, random_state=42))
])

# Make sure the output directory exists before training, so a missing
# '../models' folder is not discovered only when the fitted model is saved.
os.makedirs('../models', exist_ok=True)

print("Training Optimized Logistic Regression...")
lr_pipeline.fit(X_train, y_train)

# Persist the whole fitted pipeline (preprocessing + classifier) as one file.
joblib.dump(lr_pipeline, '../models/optimized_lr_pipeline.pkl')

## 7. Optimized Model 2: XGBoost (+ Hist method)
Using `tree_method='hist'` drastically reduces training time on large datasets.

In [None]:
%%time
# The optional import at the top of the notebook only binds `xgb` when the
# package is installed. Checking for the name explicitly (instead of wrapping
# the whole cell in `except NameError`) keeps unrelated NameErrors raised
# during training visible instead of reporting them as "XGBoost missing."
if 'xgb' in globals():
    from sklearn.base import clone

    # Make sure the output directory exists before the training run starts.
    os.makedirs('../models', exist_ok=True)

    xgb_pipeline = Pipeline([
        # Pipeline fits its steps in place, so passing `preprocessor` directly
        # would refit the very object lr_pipeline also holds. An unfitted
        # clone gives this pipeline its own independent copy.
        ('preprocessor', clone(preprocessor)),
        ('clf', xgb.XGBClassifier(
            tree_method='hist',
            n_jobs=-1,
            random_state=42,
            eval_metric='mlogloss'
        ))
    ])

    print("Training Fast XGBoost...")
    xgb_pipeline.fit(X_train, y_train)

    # Persist the whole fitted pipeline (preprocessing + classifier) as one file.
    joblib.dump(xgb_pipeline, '../models/optimized_xgb_pipeline.pkl')
else:
    print("XGBoost missing.")

## 8. Comparative Evaluation

In [None]:
def evaluate(pipeline, name, X=None, y=None):
    """Print a classification report and plot a confusion matrix for a fitted pipeline.

    Args:
        pipeline: Fitted estimator exposing ``predict``.
        name: Label used in the report header and the plot title.
        X: Features to predict on. Defaults to the notebook's ``X_test``.
        y: True labels matching ``X``. Defaults to the notebook's ``y_test``.

    Returns:
        None. The report is printed and the heatmap is shown.
    """
    # `is None` rather than truthiness: a DataFrame has no unambiguous bool value.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    # Pin the label set explicitly. Without it, scikit-learn infers the labels
    # from the data: if a class is absent from both y and the predictions, the
    # three display names no longer match (classification_report raises) and
    # the confusion matrix shrinks, misaligning the heatmap tick labels.
    class_ids = [0, 1, 2]
    class_names = ['Neg', 'Neu', 'Pos']

    y_pred = pipeline.predict(X)
    print(f"\n--- {name} Report ---")
    print(classification_report(y, y_pred, labels=class_ids, target_names=class_names))

    cm = confusion_matrix(y, y_pred, labels=class_ids)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='inferno', xticklabels=class_names, yticklabels=class_names)
    plt.title(f"{name} Confusion Matrix")
    plt.show()

evaluate(nb_pipeline, "Naive Bayes Baseline")
evaluate(lr_pipeline, "Optimized LogReg")

# xgb_pipeline is only defined when the XGBoost training cell ran successfully.
# Test for the name explicitly: a blanket `except NameError: pass` would also
# hide NameErrors raised inside evaluate() and skip the model without a trace.
if 'xgb_pipeline' in globals():
    evaluate(xgb_pipeline, "Optimized XGBoost")
else:
    print("Skipping XGBoost evaluation: xgb_pipeline is not defined.")