# Amazon Musical Instruments Reviews - Sentiment Analysis

This notebook provides an interactive exploration of the sentiment analysis pipeline for the Amazon Musical Instruments Reviews dataset.

## 1. Setup and Imports

In [None]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's src/ directory importable. The path is relative to the
# notebook's working directory. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import custom modules
from utils.data_loader import DataLoader, get_data_summary
from preprocessing.feature_engineering import FeatureEngineer
from preprocessing.text_preprocessing import TextPreprocessor
from models.sentiment_models import SentimentModelTrainer
from evaluation.model_evaluation import ModelEvaluator

# Plot appearance: matplotlib's 'default' style, with seaborn's "husl"
# palette applied afterwards.
plt.style.use('default')
sns.set_palette("husl")

# Silence all warnings for the rest of the session to keep cell output compact.
import warnings
warnings.filterwarnings(action='ignore')

# Confirm the environment: one line per message, same text as separate prints.
print(
    "Setup completed successfully!",
    f"Python version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

## 2. Data Loading and Exploration

In [None]:
# Load the dataset through the project's DataLoader, rooted at ../data
# (relative to the notebook's working directory).
data_path = "../data"
loader = DataLoader(data_path)

# Prefer the processed dataset and fall back to the raw one if it cannot be
# loaded. Catching Exception (rather than a bare `except:`) lets
# KeyboardInterrupt / SystemExit propagate, and the reason for the fallback
# is printed instead of being silently discarded.
try:
    df = loader.load_processed_data()
    print("Loaded processed data")
except Exception as exc:
    print(f"Could not load processed data ({type(exc).__name__}: {exc}); falling back to raw data")
    df = loader.load_raw_data()
    print("Loaded raw data")

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Ask the loader for its summary of the dataset and print it entry by entry.
info = loader.get_data_info()

print("DATASET INFORMATION:")
print("=" * 40)
for field, detail in info.items():
    # The 'data_types' entry is deliberately left out of the printout.
    if field == 'data_types':
        continue
    print(f"{field}: {detail}")

In [None]:
# Two side-by-side panels: review counts per star rating (left) and the
# distribution of review lengths in characters (right).
fig, (ax_ratings, ax_lengths) = plt.subplots(1, 2, figsize=(10, 6))

# Left panel: number of reviews for each value of the 'overall' column.
rating_counts = df['overall'].value_counts().sort_index()
ax_ratings.bar(rating_counts.index, rating_counts.values, color='skyblue')
ax_ratings.set_xlabel('Rating (Stars)')
ax_ratings.set_ylabel('Count')
ax_ratings.set_title('Rating Distribution')
ax_ratings.grid(True, alpha=0.3)

# Percentage labels above each bar, as a share of all rows in df.
# The gap is 1% of the tallest bar instead of a fixed "+ 50" count, so the
# labels sit just above the bars whatever the size of the dataset.
total = len(df)
label_gap = 0.01 * rating_counts.max()
for rating, count in rating_counts.items():
    ax_ratings.text(rating, count + label_gap, f'{count/total*100:.1f}%',
                    ha='center', va='bottom')

# Right panel: character length of each review; missing reviews count as 0.
review_lengths = df['reviewText'].fillna('').apply(len)
ax_lengths.hist(review_lengths, bins=50, color='lightcoral', alpha=0.7)
ax_lengths.set_xlabel('Review Length (characters)')
ax_lengths.set_ylabel('Frequency')
ax_lengths.set_title('Review Length Distribution')
ax_lengths.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Engineer features only when the target column is not present yet, so a
# dataset that already carries 'sentiment_binary' is left untouched.
if 'sentiment_binary' in df.columns:
    print("Features already engineered")
else:
    print("Applying feature engineering...")
    feature_engineer = FeatureEngineer()
    df = feature_engineer.fit_transform(df)

    print(f"Added features: {feature_engineer.get_feature_names()}")

# Class balance of the target, ordered by label value.
print("\nTarget Variable Distribution:")
target_counts = df['sentiment_binary'].value_counts().sort_index()
print(target_counts)

In [None]:
# Histograms of six engineered features on a 2x3 grid, one feature per panel.
features_to_plot = [
    'review_length', 'review_word_count', 'helpful_votes',
    'exclamation_count', 'question_count', 'capital_letter_ratio'
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for panel, feature in zip(axes, features_to_plot):
    # A feature missing from df leaves its panel empty.
    if feature not in df.columns:
        continue
    panel.hist(df[feature], bins=30, alpha=0.7)
    panel.set_title(feature.replace("_", " ").title())
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Text Preprocessing

In [None]:
# Demonstrate preprocessing on a single review: the 11th non-null entry of
# the 'reviewText' column.
non_null_reviews = df['reviewText'].dropna()
sample_text = non_null_reviews.iloc[10]

divider = "-" * 50

print("ORIGINAL TEXT:")
print(divider)
# Show only the first 300 characters, followed by an ellipsis.
print(sample_text[:300] + "...")

# Preprocessor with every cleaning step switched on, as named by its
# keyword arguments.
preprocessing_options = dict(
    lowercase=True,
    remove_punctuation=True,
    remove_digits=True,
    remove_stopwords=True,
    apply_lemmatization=True,
)
preprocessor = TextPreprocessor(**preprocessing_options)

# fit_transform takes a collection of texts; wrap the sample in a list and
# take the single result back out.
processed_batch = preprocessor.fit_transform([sample_text])
processed_text = processed_batch[0]

print("\nPROCESSED TEXT:")
print(divider)
print(processed_text)

In [None]:
# Preprocess a reproducible random subset so the notebook stays fast.
# The sample size is capped at the number of available rows: DataFrame.sample
# raises ValueError when n exceeds the population (sampling without
# replacement), which previously broke this cell on datasets under 1000 rows.
sample_size = min(1000, len(df))
sample_df = df.sample(n=sample_size, random_state=42).copy()

print("Processing text data...")
# Missing reviews are treated as empty strings before preprocessing.
sample_df['reviewText_processed'] = preprocessor.fit_transform(
    sample_df['reviewText'].fillna('')
)

print("Text preprocessing completed!")
print(f"Sample size: {len(sample_df)} reviews")

## 5. Model Training (Sample)

In [None]:
# Numeric columns handed to the trainer alongside the processed review text.
additional_features = [
    'helpful_votes', 'total_votes', 'helpfulness_ratio',
    'review_length', 'review_word_count', 'summary_length',
    'exclamation_count', 'question_count'
]

# Trainer configured for the sampled data; cv_folds is kept at 3 for faster
# execution in the notebook.
trainer_settings = {
    'random_state': 42,
    'test_size': 0.2,
    'cv_folds': 3,
    'use_smote': True,
}
trainer = SentimentModelTrainer(**trainer_settings)

# Hand the sample to the trainer: processed text as input, binary sentiment
# as the target, plus the numeric columns listed above.
trainer.prepare_data(
    sample_df,
    text_column='reviewText_processed',
    target_column='sentiment_binary',
    additional_features=additional_features,
)

print("Data prepared for training")

In [None]:
# Only the two quicker models are trained in the notebook.
models_to_train = ['logistic_regression', 'naive_bayes']

# Training results, keyed by model name.
results = {}
for name in models_to_train:
    print(f"Training {name}...")
    outcome = trainer.train_model(name, perform_cv=True)
    results[name] = outcome
    print(f"Test F1-Score: {outcome['test_f1']:.4f}")

print("\nModel training completed!")

## 6. Model Evaluation

In [None]:
# Collect the trainer's results summary and print it under a heading.
summary_df = trainer.get_results_summary()
print("MODEL COMPARISON:", summary_df, sep="\n")

In [None]:
# Detailed evaluation of every trained model on the held-out test split.
evaluator = ModelEvaluator(save_plots=False)

# Evaluation output, keyed by model name.
evaluation_results = {}

for model_name in models_to_train:
    # Skip any model that was requested but is not stored on the trainer.
    if model_name not in trainer.models:
        continue
    model = trainer.models[model_name]

    # NOTE(review): predictions use trainer.X_test_text only; confirm the
    # stored models do not also expect the additional numeric features that
    # were passed to prepare_data.
    y_test_pred = model.predict(trainer.X_test_text)

    # Positive-class probabilities are optional. Catch Exception rather than
    # using a bare `except:` so KeyboardInterrupt / SystemExit still
    # propagate, and report why probabilities are unavailable.
    try:
        y_test_proba = model.predict_proba(trainer.X_test_text)[:, 1]
    except Exception as exc:
        print(f"No probability scores for {model_name} ({type(exc).__name__}: {exc})")
        y_test_proba = None

    evaluation_results[model_name] = evaluator.evaluate_single_model(
        trainer.y_test, y_test_pred, y_test_proba, model_name
    )

In [None]:
# Grouped bar chart of the headline metrics, one group of bars per model.
# Nothing is drawn when no model was evaluated.
if evaluation_results:
    comparison_df = evaluator.compare_models(evaluation_results)

    fig, ax = plt.subplots(figsize=(10, 6))

    width = 0.15
    x = np.arange(len(comparison_df))
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

    # Each metric occupies its own slot within a group, offset by its
    # position in `metrics`; metrics absent from the table are skipped.
    for slot, metric in enumerate(metrics):
        if metric not in comparison_df.columns:
            continue
        ax.bar(x + slot * width, comparison_df[metric], width, label=metric)

    ax.set(xlabel='Models', ylabel='Score', title='Model Performance Comparison')
    # Tick at the middle of a four-bar group.
    ax.set_xticks(x + width * 1.5)
    ax.set_xticklabels(comparison_df['Model'])
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Confusion matrix for the trainer's best model, using the matrix already
# computed during evaluation. The earlier version also re-ran predict() on
# the test set into an unused variable; that wasted work has been removed.
if trainer.best_model:
    best_model_name = trainer.best_model['name']

    # Guard the lookup: the best model may not have been evaluated, in which
    # case indexing evaluation_results would raise KeyError.
    if best_model_name in evaluation_results:
        evaluator.plot_confusion_matrix(
            evaluation_results[best_model_name]['confusion_matrix'],
            class_names=['Negative', 'Positive'],
            model_name=best_model_name
        )
    else:
        print(f"No evaluation results for best model '{best_model_name}'; skipping confusion matrix")

## 7. Model Prediction Examples

In [None]:
# Three hand-written reviews to try the best model on: clearly positive,
# clearly negative, and lukewarm.
test_reviews = [
    "This guitar is amazing! Great sound quality and very well built. Highly recommended!",
    "Terrible product. Broke after one week. Complete waste of money.",
    "It's okay, not bad but not great either. Does the job."
]

if trainer.best_model:
    # Run the new reviews through the already-fitted preprocessor.
    test_reviews_processed = preprocessor.transform(test_reviews)

    # Labels come from the trainer, probabilities from the best model itself.
    predictions = trainer.predict(test_reviews_processed)
    probabilities = trainer.best_model['model'].predict_proba(test_reviews_processed)

    print("PREDICTION EXAMPLES:")
    print("=" * 50)

    rows = zip(test_reviews, predictions, probabilities)
    for number, (review, pred, prob) in enumerate(rows, start=1):
        if pred == 1:
            sentiment = "Positive"
        else:
            sentiment = "Negative"
        # Confidence is the largest probability in this review's row.
        confidence = max(prob)

        print(f"Review {number}: {review[:50]}...")
        print(f"Prediction: {sentiment} (Confidence: {confidence:.3f})")
        print()

## 8. Summary and Next Steps

In [None]:
# Closing recap: sample size, number of models trained, and the best model.
print("SENTIMENT ANALYSIS PIPELINE SUMMARY")
print("=" * 50)
print(f"Dataset size: {len(sample_df)} reviews (sample)")
print(f"Models trained: {len(results)}")

if trainer.best_model:
    best_name = trainer.best_model['name']
    best_score = trainer.best_model['score']
    print(f"Best model: {best_name} (F1-Score: {best_score:.4f})")

# Suggested follow-ups, printed one per line.
next_steps = (
    "- Run the complete pipeline with main.py for full dataset",
    "- Experiment with hyperparameter tuning",
    "- Try advanced models (XGBoost, Neural Networks)",
    "- Deploy the best model for production use",
    "- Collect more recent data for model updates",
)
print("\nNext Steps:")
print("\n".join(next_steps))