# Advanced Analytics for Brazilian E-Commerce Dataset

This notebook focuses on advanced analytics, including customer churn prediction, sales forecasting, and recommendation systems.

## 1. Customer Churn Prediction

We aim to predict customer churn using features such as recency, frequency, monetary value, and delivery performance.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Import our data loading module
import sys
sys.path.append('..')
from src.data_processing.data_loader import OlistDataLoader
from config.settings import RAW_DATA_DIR, DATE_COLUMNS, FEATURE_GROUPS

# Set random seed for reproducibility
np.random.seed(42)

# Initialize the data loader and pull the preprocessed tables used by the
# churn model below.
try:
    data_loader = OlistDataLoader(RAW_DATA_DIR)
    datasets = data_loader.load_all_datasets()

    # Get preprocessed data
    processed_data = data_loader.get_preprocessed_data()
    orders = processed_data['orders']
    customer_features = processed_data['customer_features']
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please ensure all data files are in the data/raw/ directory")
    # Re-raise after printing the hint: every statement after this block
    # needs `customer_features`, so continuing would only replace the real
    # error with a confusing NameError further down.
    raise
else:
    # Only report success when nothing above raised.
    print("Data loaded and preprocessed successfully!")

# Display the first few rows of the features
print("\nCustomer features for churn prediction:")
display(customer_features.head())

# Check class balance. `.get(..., 0.0)` keeps the report working when one of
# the two classes is absent from the data instead of raising a KeyError.
churn_distribution = customer_features['churned'].value_counts(normalize=True) * 100
churned_pct = churn_distribution.get(1, 0.0)
active_pct = churn_distribution.get(0, 0.0)
print(f"\nChurn distribution: {churned_pct:.2f}% churned, {active_pct:.2f}% active")

# Prepare features and target: the identifier, the label itself and the raw
# purchase dates are excluded from the model inputs.
X = customer_features.drop(['customer_id', 'churned', 'first_purchase_date', 'last_purchase_date'], axis=1)
y = customer_features['churned']

# Column groups: the state code is the only categorical input; every other
# remaining column of X is treated as numeric.
categorical_features = ['customer_state']
numerical_features = [name for name in X.columns if name not in categorical_features]

# Numeric columns: fill gaps with the median, then standardize.
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numerical_features),
        ('cat', categorical_steps, categorical_features)
    ]
)

# Hold out 20% of the customers, preserving the churn ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Preprocessing and classifier live in one pipeline so the transformers are
# fitted on the training split only.
classifier = GradientBoostingClassifier(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])
model.fit(X_train, y_train)

# Score the held-out customers.
y_pred = model.predict(X_test)

print("\nModel Evaluation:")
print(classification_report(y_test, y_pred))

# Confusion matrix heatmap (rows: actual, columns: predicted).
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 2. Sales Forecasting

We use a seasonal ARIMA (SARIMA) model to forecast weekly sales for a given product category and customer state.

In [None]:
# Import libraries for time series forecasting
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Import additional libraries for time series forecasting
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima import auto_arima
import warnings

# Import time series settings from config
from config.settings import (
    TIME_WINDOWS,
    FORECAST_HORIZONS,
    FORECAST_HORIZON,
    SEASONAL_PERIODS,
    FIGURE_SIZES,
    COLORS
)

# Suppress warnings
warnings.filterwarnings('ignore')

# PLOT_STYLE is used below but is not part of the settings import above.
# NOTE(review): assumed to live in config.settings with the other plot
# settings — confirm. Fall back to matplotlib's built-in 'default' style when
# it cannot be imported, so the cell does not die with a NameError.
try:
    from config.settings import PLOT_STYLE
except ImportError:
    PLOT_STYLE = 'default'

# Use proper figure size and colors from config
plt.style.use(PLOT_STYLE)
default_figsize = FIGURE_SIZES['medium']

# Sales Forecasting Functions
def prepare_sales_data(category=None, state=None, datasets=None):
    """
    Prepare a weekly sales series for one product category in one state.

    Orders are joined with their items, products and customers, filtered to
    the requested category/state, summed per calendar day, padded with zeros
    for days without sales, and finally resampled to weekly totals.

    Parameters:
        category (str): Product category to forecast (default: most frequent)
        state (str): Customer state to forecast (default: most frequent)
        datasets (dict): Optional mapping with the tables 'orders',
            'order_items', 'products' and 'customers'. When omitted the
            tables are fetched from the module-level ``data_loader``.

    Returns:
        tuple: (weekly_sales, category, state) - The prepared time series data
        and metadata, or (None, None, None) if anything goes wrong.
    """
    try:
        # Use data from centralized loader unless the caller supplied tables
        if datasets is None:
            datasets = data_loader.get_preprocessed_data()
        orders = datasets['orders']
        products = datasets['products']
        # NOTE(review): the 'order_items' and 'customers' keys are assumed to
        # be exposed by the loader alongside 'orders'/'products' - confirm.
        order_items = datasets['order_items']
        customers = datasets['customers']

        # Keep only the columns each join needs. `product_id` must survive the
        # first merge because the second merge joins on it, and trimming
        # `orders` avoids suffixed duplicates if it already carries extra
        # customer columns.
        sales_data = (
            orders[['order_id', 'customer_id', 'order_purchase_timestamp']]
            .merge(order_items[['order_id', 'product_id', 'price']], on='order_id')
            .merge(products[['product_id', 'product_category_name']], on='product_id')
            .merge(customers[['customer_id', 'customer_state']], on='customer_id')
        )
        if sales_data.empty:
            raise ValueError(
                "no sales records left after joining orders, items, products and customers"
            )
        sales_data['order_purchase_timestamp'] = pd.to_datetime(
            sales_data['order_purchase_timestamp']
        )

        # Select category and state if not provided
        if category is None:
            category = sales_data['product_category_name'].value_counts().index[0]
        if state is None:
            state = sales_data['customer_state'].value_counts().index[0]

        print(f"Preparing data for category '{category}' in state '{state}'")

        # Filter and aggregate data
        in_scope = (
            (sales_data['product_category_name'] == category)
            & (sales_data['customer_state'] == state)
        )
        filtered_sales = sales_data[in_scope]
        if filtered_sales.empty:
            print(
                f"Warning: no sales found for category '{category}' in state "
                f"'{state}'; the series will be all zeros"
            )

        # Group by the purchase day at midnight so the keys line up with the
        # daily calendar built below.
        purchase_day = filtered_sales['order_purchase_timestamp'].dt.normalize()
        daily_sales = filtered_sales.groupby(purchase_day)['price'].sum()

        # Create complete time series over the span of ALL sales. The bounds
        # are normalized to midnight: otherwise every generated date inherits
        # the time-of-day of the first order and never matches a daily key.
        all_days = pd.date_range(
            start=sales_data['order_purchase_timestamp'].min().normalize(),
            end=sales_data['order_purchase_timestamp'].max().normalize(),
            freq='D',
        )
        daily_sales = daily_sales.reindex(all_days, fill_value=0).rename_axis('date')

        # Resample to weekly data
        weekly_sales = daily_sales.resample('W').sum()

        return weekly_sales, category, state

    except Exception as e:
        print(f"Error preparing sales data: {e}")
        return None, None, None

def train_forecast_model(weekly_sales):
    """
    Fit a SARIMA model to a weekly sales series.

    The (p, d, q)(P, D, Q, m) orders are chosen by a stepwise ``auto_arima``
    search with a 52-period season; the winning orders are then used to fit
    a ``SARIMAX`` model on the same series.

    Parameters:
        weekly_sales (pd.Series): Weekly sales time series data

    Returns:
        object: Fitted SARIMA model, or None if the search or the fit fails
    """
    try:
        # Order search settings: seasonal model, 52 weeks per season,
        # differencing orders left for auto_arima to determine.
        search_options = dict(
            seasonal=True,
            m=52,
            start_p=0,
            start_q=0,
            max_p=3,
            max_q=3,
            d=None,
            max_d=2,
            D=None,
            max_D=1,
            trace=True,
            error_action='ignore',
            suppress_warnings=True,
            stepwise=True,
        )
        best = auto_arima(weekly_sales, **search_options)
        order, seasonal_order = best.order, best.seasonal_order

        print(f"Best ARIMA model: {order}, seasonal: {seasonal_order}")

        # Refit with statsmodels using the selected orders; stationarity and
        # invertibility constraints are switched off.
        sarima = SARIMAX(
            weekly_sales,
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        fitted = sarima.fit(disp=False)
        return fitted

    except Exception as e:
        print(f"Error training forecast model: {e}")
        return None

def evaluate_forecast_model(model, test_data, category, state):
    """
    Evaluate the performance of the sales forecasting model.

    Forecasts ``len(test_data)`` steps ahead, prints RMSE and MAPE against
    the held-out values and plots actual vs. forecast with the confidence
    band. Any failure is reported with a printed message; nothing is
    returned.

    Parameters:
        model: Fitted SARIMA model
        test_data (pd.Series): Test data to evaluate against; assumed to
            start right after the period the model was fitted on, so the
            i-th forecast step corresponds to the i-th test value
        category (str): Product category name
        state (str): Customer state
    """
    try:
        # Make predictions
        predictions = model.get_forecast(steps=len(test_data))
        forecast = predictions.predicted_mean
        conf_int = predictions.conf_int()

        # Compare position by position on plain arrays: index-aligned Series
        # arithmetic would silently yield NaN if the forecast index differs
        # from the test index.
        actual = np.asarray(test_data, dtype=float)
        predicted = np.asarray(forecast, dtype=float)
        errors = actual - predicted

        # Calculate error metrics
        rmse = np.sqrt(np.mean(errors ** 2))

        # MAPE is undefined for zero actuals, and zero-sales weeks are common
        # because missing days are filled with 0 upstream. Use only the
        # non-zero periods instead of letting one division by zero turn the
        # whole metric into inf/NaN.
        nonzero = actual != 0
        if nonzero.any():
            mape = np.mean(np.abs(errors[nonzero] / actual[nonzero])) * 100
            mape_text = f"{mape:.2f}%"
        else:
            mape_text = "n/a (all actual values are zero)"

        print(f"\nForecast Evaluation for {category} in {state}:")
        print(f"Root Mean Square Error: {rmse:.2f}")
        print(f"Mean Absolute Percentage Error: {mape_text}")

        # Plot actual vs predicted
        plt.figure(figsize=(12, 6))
        plt.plot(test_data.index, actual, label='Actual')
        plt.plot(test_data.index, predicted, label='Forecast')
        plt.fill_between(test_data.index,
                         conf_int.iloc[:, 0],
                         conf_int.iloc[:, 1],
                         color='gray', alpha=0.2)
        plt.title(f'Sales Forecast Evaluation - {category} in {state}')
        plt.xlabel('Date')
        plt.ylabel('Sales (BRL)')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.show()

    except Exception as e:
        print(f"Error evaluating forecast model: {e}")

def run_sales_forecast(category=None, state=None, test_size=0.2):
    """
    Run the sales forecasting pipeline end to end.

    Builds the weekly series, splits it chronologically, fits the model on
    the earlier part and evaluates it on the later part. Stops quietly as
    soon as data preparation or training reports a failure (returns None).

    Parameters:
        category (str): Product category to forecast
        state (str): Customer state to forecast
        test_size (float): Proportion of data to use for testing
    """
    weekly_sales, category, state = prepare_sales_data(category, state)
    if weekly_sales is not None:
        # Chronological split: the first (1 - test_size) share is history,
        # the remainder is held out for evaluation.
        split_at = int(len(weekly_sales) * (1 - test_size))
        history, holdout = weekly_sales[:split_at], weekly_sales[split_at:]

        fitted = train_forecast_model(history)
        if fitted is not None:
            evaluate_forecast_model(fitted, holdout, category, state)

## 3. Recommendation System

We build an item-based collaborative filtering recommender that uses cosine similarity between product categories to suggest new categories to customers.

In [None]:
# Import libraries for recommendation systems
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Use data from the centralized loader for recommendation system
datasets = data_loader.get_preprocessed_data()
orders = datasets['orders']
products = datasets['products']
# `order_items` is needed below but was never loaded in this notebook.
# NOTE(review): the 'order_items' key is assumed to be exposed by the loader
# like 'orders' and 'products' - confirm.
order_items = datasets['order_items']

# Build the matrices unless BOTH already exist: the functions below need the
# similarity table as well, so checking only the user-item matrix could leave
# `item_similarity_df` undefined.
if 'user_item_matrix' not in globals() or 'item_similarity_df' not in globals():
    print("Creating user-item matrix...")
    # Create user-item matrix (customer-product interactions)
    user_item_data = order_items.merge(orders[['order_id', 'customer_id']], on='order_id')
    user_item_data = user_item_data.merge(products[['product_id', 'product_category_name']], on='product_id')

    # Count purchases of each product category by each customer
    purchase_counts = user_item_data.groupby(['customer_id', 'product_category_name']).size().reset_index(name='purchase_count')

    # Create a pivot table: customers x product categories
    # (0 where a customer never bought from a category)
    user_item_matrix = purchase_counts.pivot(
        index='customer_id',
        columns='product_category_name',
        values='purchase_count'
    ).fillna(0)

    # Calculate item-item similarity matrix using cosine similarity.
    # Transposing puts categories on the rows, so similarities are computed
    # between categories rather than between customers.
    sparse_user_item = csr_matrix(user_item_matrix.values)
    item_similarity = cosine_similarity(sparse_user_item.T)
    item_similarity_df = pd.DataFrame(
        item_similarity,
        index=user_item_matrix.columns,
        columns=user_item_matrix.columns
    )
else:
    print("Using existing user-item matrix...")

# Function to get top N similar items
def get_similar_categories(category_name, n=5):
    """
    Return the product categories most similar to the given one.

    Similarities are read from the module-level ``item_similarity_df``; the
    queried category itself is left out of the result.

    Parameters:
        category_name (str): Name of the product category to find similarities for
        n (int): Number of similar categories to return (default: 5)

    Returns:
        pd.Series: Top N similar categories with their similarity scores
        (empty if the category is unknown or an error occurs)
    """
    try:
        if category_name in item_similarity_df.index:
            # Highest similarity first, then drop the category's own entry.
            ranked = item_similarity_df[category_name].sort_values(ascending=False)
            others = ranked.drop(category_name, errors='ignore')
            return others.head(n)

        print(f"Category '{category_name}' not found in the dataset")
        return pd.Series()
    except Exception as e:
        print(f"Error finding similar categories: {e}")
        return pd.Series()

# Function to recommend products for a customer
def recommend_for_customer(customer_id, n_recommendations=5):
    """
    Generate product category recommendations for a specific customer.

    Each category is scored as the sum, over the categories the customer has
    bought from, of (similarity to that category * purchase count). Scores
    are read from the module-level ``user_item_matrix`` and
    ``item_similarity_df``. Categories the customer already bought from are
    excluded.

    Parameters:
        customer_id (str): ID of the customer to generate recommendations for
        n_recommendations (int): Number of recommendations to generate (default: 5)

    Returns:
        pd.Series: Top N recommended categories with their scores (empty if
        the customer is unknown or an error occurs)
    """
    try:
        if customer_id not in user_item_matrix.index:
            print(f"Customer '{customer_id}' not found in the dataset")
            return pd.Series(dtype=float)

        # Get the customer's purchase history, keeping only bought categories
        customer_purchases = user_item_matrix.loc[customer_id]
        purchased = customer_purchases[customer_purchases > 0]

        if purchased.empty:
            # Nothing to base a score on: every category scores zero.
            recommendation_scores = pd.Series(0.0, index=user_item_matrix.columns)
        else:
            # One matrix-vector product replaces the per-category Python
            # loop: similarity columns of the bought categories, weighted by
            # how often each was bought.
            recommendation_scores = item_similarity_df[purchased.index].dot(purchased)

        # Remove categories the customer has already purchased
        recommendation_scores = recommendation_scores.drop(purchased.index, errors='ignore')

        return recommendation_scores.sort_values(ascending=False).head(n_recommendations)
    except Exception as e:
        print(f"Error generating recommendations: {e}")
        return pd.Series(dtype=float)