## 2. Sales Forecasting

We use time series models to forecast sales by category and region.

In [None]:
# Import libraries for time series forecasting
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Import additional libraries for seasonal decomposition, autocorrelation plots,
# and automatic ARIMA order selection
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima import auto_arima
import warnings

# Silence all warnings from here on: no category or message filter is given,
# so the 'ignore' action applies to every warning
warnings.simplefilter('ignore')

# Load the raw Olist tables from the local data/ directory.
# (payments is read here but is not referenced by the rest of this cell.)
orders = pd.read_csv('data/olist_orders_dataset.csv')
order_items = pd.read_csv('data/olist_order_items_dataset.csv')
customers = pd.read_csv('data/olist_customers_dataset.csv')
payments = pd.read_csv('data/olist_order_payments_dataset.csv')
products = pd.read_csv('data/olist_products_dataset.csv')

# Join orders to their items, then attach the product category and the
# customer's state. No `how` is passed, so each step is pandas' default inner
# join and rows without a match in the joined table are dropped.
sales_data = (
    orders
    .merge(order_items, on='order_id')
    .merge(products[['product_id', 'product_category_name']], on='product_id')
    .merge(customers[['customer_id', 'customer_state']], on='customer_id')
)

# Parse the purchase timestamp and derive calendar fields from it in one step.
# assign() evaluates its keyword arguments in order, so the lambdas after the
# first one already see the parsed datetime column.
sales_data = sales_data.assign(
    order_purchase_timestamp=lambda df: pd.to_datetime(df['order_purchase_timestamp']),
    date=lambda df: df['order_purchase_timestamp'].dt.date,
    year=lambda df: df['order_purchase_timestamp'].dt.year,
    month=lambda df: df['order_purchase_timestamp'].dt.month,
    day=lambda df: df['order_purchase_timestamp'].dt.day,
    dayofweek=lambda df: df['order_purchase_timestamp'].dt.dayofweek,
)

# Sum item prices per (date, category, state), flatten the group keys back into
# columns, and convert the date key to a datetime column
daily_sales = (
    sales_data
    .groupby(['date', 'product_category_name', 'customer_state'])['price']
    .sum()
    .reset_index()
    .assign(date=lambda df: pd.to_datetime(df['date']))
)

# Demonstrate on the most frequent category and the most frequent state.
# value_counts() sorts by descending row count, so the first index label is the
# most common value. The two picks are made independently of each other.
top_category = sales_data['product_category_name'].value_counts().index[0]
top_state = sales_data['customer_state'].value_counts().index[0]

print(f"Forecasting sales for category '{top_category}' in state '{top_state}'")

# Keep only the daily totals that belong to the chosen category/state pair
category_state_sales = daily_sales.loc[
    daily_sales['product_category_name'].eq(top_category)
    & daily_sales['customer_state'].eq(top_state)
]

# Daily calendar spanning the whole aggregated sales table (all categories and
# states, not only the selected pair)
date_range = pd.date_range(daily_sales['date'].min(), daily_sales['date'].max())

# Left-join the selected pair's daily totals onto the calendar. Days with no
# matching row come back as NaN and are filled with 0; the date column then
# becomes the index.
ts_data = (
    date_range.to_frame(index=False, name='date')
    .merge(category_state_sales[['date', 'price']], how='left', on='date')
    .fillna(0)
    .set_index('date')
)

# Sum the daily values into weekly ('W') buckets for more stable patterns
weekly_sales = ts_data.resample('W')['price'].sum()

# Pick SARIMA orders automatically, refit with statsmodels, and plot a 12-week
# forecast. The model is fitted on the full weekly series; no hold-out set or
# accuracy metric is computed here. Any exception raised along the way is
# caught and reported as a one-line message instead of stopping the notebook.
try:
    # Stepwise order search with a 52-week seasonal period.
    # NOTE(review): a 52-week season needs several years of weekly history to
    # be estimated reliably - confirm weekly_sales is long enough.
    auto_model = auto_arima(
        weekly_sales,
        seasonal=True,
        m=52,
        start_p=0,
        max_p=3,
        start_q=0,
        max_q=3,
        d=None,
        max_d=2,
        D=None,
        max_D=1,
        stepwise=True,
        trace=True,
        error_action='ignore',
        suppress_warnings=True
    )

    # Keep the selected orders so the statsmodels refit below can reuse them
    best_order = auto_model.order
    best_seasonal_order = auto_model.seasonal_order

    print(f"Best ARIMA model: {auto_model.order}, seasonal: {auto_model.seasonal_order}")

    # Refit with SARIMAX using the selected orders, with the stationarity and
    # invertibility constraints switched off
    model = SARIMAX(
        weekly_sales,
        order=best_order,
        seasonal_order=best_seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    results = model.fit(disp=False)
    print(results.summary())

    # Forecast 12 steps (weeks) past the end of the series; conf_int() is called
    # with its default level
    future_forecast = results.get_forecast(steps=12)
    future_forecast_mean = future_forecast.predicted_mean
    future_forecast_ci = future_forecast.conf_int()

    # Draw history, point forecast, and the interval band on a single axes
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(weekly_sales.index, weekly_sales, label='Historical Data')
    ax.plot(future_forecast_mean.index, future_forecast_mean, label='Future Forecast')
    ax.fill_between(
        future_forecast_mean.index,
        future_forecast_ci.iloc[:, 0],  # lower bound column
        future_forecast_ci.iloc[:, 1],  # upper bound column
        color='k',
        alpha=0.1,
        label='95% Confidence Interval'
    )
    ax.set_title(f'12-Week Sales Forecast for {top_category} in {top_state}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales (BRL)')
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

    print("\nSales forecasting model complete.")

except Exception as e:
    # Best-effort cell: report the failure message and let the notebook continue
    print(f"Error in time series forecasting: {e}")