# Brazilian E-Commerce Analysis Pipeline

This notebook demonstrates the complete analysis pipeline using our refactored codebase. It includes:
1. Data Loading and Preprocessing
2. Exploratory Data Analysis (daily order volume and product category distribution)
3. Customer Analysis
4. Sales Analysis
5. Model Evaluation Example

In [None]:
import sys
from pathlib import Path

# Make the project root (the parent of the current working directory)
# importable so that the `src.*` imports below resolve.
# NOTE(review): this relies on the kernel's working directory being a direct
# child of the project root — confirm for your setup.
project_root = Path.cwd().parent
# Append only once: re-running this cell must not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_processing.data_loader import OlistDataLoader
from src.visualization.plot_utils import (
    set_plotting_style,
    plot_time_series,
    plot_category_distribution,
    plot_correlation_matrix
)
from src.models.model_evaluation import (
    evaluate_classifier,
    evaluate_regression,
    evaluate_time_series
)

## 1. Data Loading and Preprocessing

In [None]:
# Point the loader at the raw data directory under the project root.
data_dir = project_root / 'data' / 'raw'
loader = OlistDataLoader(data_dir)

# Run the loader's preprocessing; the result is looked up by dataset name below.
processed_data = loader.get_preprocessed_data()

# Pull out the four datasets used in the rest of the notebook.
orders, customer_features, products, reviews = (
    processed_data[name]
    for name in ('orders', 'customer_features', 'products', 'reviews')
)

## 2. Exploratory Data Analysis

In [None]:
# Apply the project's shared plotting style before drawing any figure.
set_plotting_style()

# Daily order volume: truncate each purchase timestamp to midnight so that all
# orders placed on the same calendar day fall into one group. Grouping on the
# raw timestamp would instead count orders per distinct timestamp value.
purchase_day = orders['order_purchase_timestamp'].dt.floor('D')
daily_orders = (
    orders.groupby(purchase_day)
    .size()
    .rename_axis('date')
    .reset_index(name='count')
)

fig = plot_time_series(
    data=daily_orders,
    date_column='date',
    value_column='count',
    title='Daily Order Volume'
)
plt.show()

# Product category distribution
fig = plot_category_distribution(
    data=products,
    category_column='product_category_name',
    title='Product Categories Distribution'
)
plt.show()

## 3. Customer Analysis

In [None]:
# Per-customer metric columns whose pairwise correlations are plotted below.
customer_metrics = ['order_count', 'total_spend', 'avg_order_value', 'days_since_last_purchase']

# Draw the correlation matrix restricted to the metric columns above.
fig = plot_correlation_matrix(
    title='Customer Metrics Correlation',
    columns=customer_metrics,
    data=customer_features,
)
plt.show()

## 4. Sales Analysis

In [None]:
# Monthly sales analysis
# Tag every order with its purchase month. `assign` returns a new frame, so the
# frame handed out by the loader is not modified in place, while anything that
# later reads `orders['month']` keeps working. Re-running the cell is idempotent.
orders = orders.assign(month=orders['order_purchase_timestamp'].dt.to_period('M'))

# Number of orders per month (the 'sales' column is an order count).
monthly_sales = orders.groupby('month').size().reset_index(name='sales')

# Convert each monthly period to the timestamp at the start of that month for
# plotting; the vectorized accessor avoids a per-row string round-trip.
monthly_sales['month'] = monthly_sales['month'].dt.to_timestamp()

fig = plot_time_series(
    data=monthly_sales,
    date_column='month',
    value_column='sales',
    title='Monthly Sales Trend'
)
plt.show()

## 5. Model Evaluation Example

In [None]:
# Example: evaluate a time series forecast on synthetic data (illustration only).
# A seeded generator keeps the printed metrics and the figure reproducible
# across kernel restarts and re-runs of this cell.
rng = np.random.default_rng(42)
dates = pd.date_range(start='2022-01-01', end='2022-12-31', freq='D')
actual = pd.Series(rng.normal(100, 10, len(dates)), index=dates)
# The dummy "forecast" is the actual series plus independent Gaussian noise.
forecast = actual + rng.normal(0, 5, len(dates))

metrics = evaluate_time_series(actual, forecast)
print("Time Series Forecast Metrics:")
for metric, value in metrics.items():
    print(f"{metric.upper()}: {value:.2f}")

# Visualize actual vs forecast. `forecast` is already a Series indexed by
# `dates`, so it is passed through without re-wrapping.
fig = plot_time_series(
    data=pd.DataFrame({'date': dates, 'value': actual}),
    date_column='date',
    value_column='value',
    title='Actual vs Forecast',
    forecast=forecast
)
plt.show()