In [1]:
# Install Category encoders
try:
    import category_encoders
except ImportError:
    !pip install category_encoders

# Install xgboost
try:
    import xgboost
except ImportError:
    !pip install xgboost

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.3/233.3 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading statsmodels-0.14.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m119.6 MB/s[0m eta [36m0

In [2]:
# Standard libraries
import numpy as np
import pandas as pd
import gc
import os
import json
import pickle
from datetime import datetime

# XGBoost
import xgboost as xgb

# Category encoders
from category_encoders import TargetEncoder

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import DateFormatter, MonthLocator

# Google Colab
from google.colab import drive

print("All imports successful!")


All imports successful!


In [3]:
# Mount Drive
drive.mount('/content/drive')

# Configuration
CONFIG = {
    'data_path': "/content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/Saved Data/final_df_v2.parquet",
    'model_dir': "/content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/",
    'output_dir': "/content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/Forecast_Plots/",
    'validation_days': 28,
    'random_state': 42
}

os.makedirs(CONFIG['output_dir'], exist_ok=True)

print("Configuration loaded!")
print(f"Data path: {CONFIG['data_path']}")
print(f"Model directory: {CONFIG['model_dir']}")
print(f"Output directory: {CONFIG['output_dir']}")


Mounted at /content/drive
Configuration loaded!
Data path: /content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/Saved Data/final_df_v2.parquet
Model directory: /content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/
Output directory: /content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/Forecast_Plots/


In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Memory usage decreased to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

def plot_state_forecast(state_data, state_name, save_path):
    # Set style
    plt.style.use('seaborn-v0_8-darkgrid')

    # Create figure
    fig, ax = plt.subplots(figsize=(16, 6))

    # Define colors and styles
    plot_config = {
        'Train': {
            'color': '#808080',
            'linewidth': 1.5,
            'alpha': 0.6,
            'linestyle': '-',
            'zorder': 1
        },
        'Real Values': {
            'color': '#2ca02c',
            'linewidth': 2.5,
            'alpha': 1.0,
            'linestyle': '-',
            'zorder': 3  # Draw on top
        },
        'Forecast': {
            'color': '#1f77b4',
            'linewidth': 2.0,
            'alpha': 0.7,
            'linestyle': '--',  # Dashed line to distinguish from Real Values
            'zorder': 2
        }
    }

    # Plot in specific order: Train first, then Forecast, then Real Values on top
    for plot_type in ['Train', 'Forecast', 'Real Values']:
        type_data = state_data[state_data['type'] == plot_type]
        if len(type_data) > 0:
            config = plot_config[plot_type]
            ax.plot(type_data['date'],
                   type_data['sales_count'],
                   label=plot_type,
                   color=config['color'],
                   linewidth=config['linewidth'],
                   alpha=config['alpha'],
                   linestyle=config['linestyle'],
                   zorder=config['zorder'])

    # Formatting
    ax.set_xlabel('Date', fontsize=12, fontweight='bold')
    ax.set_ylabel('Daily sales count', fontsize=12, fontweight='bold')
    ax.set_title(f'Forecast for all stores in {state_name}',
                fontsize=16, fontweight='bold', pad=20)

    # Format x-axis
    ax.xaxis.set_major_locator(MonthLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%b'))

    # Add legend
    ax.legend(loc='upper left', fontsize=11, frameon=True,
             fancybox=True, shadow=True, title='type', title_fontsize=12)

    # Grid
    ax.grid(True, alpha=0.3, linestyle='--')

    # Add vertical lines for train/val split
    train_end = state_data[state_data['type'] == 'Train']['date'].max()
    val_start = state_data[state_data['type'] == 'Real Values']['date'].min()

    if pd.notna(train_end):
        ax.axvline(train_end, color='black', linestyle=':', alpha=0.5, linewidth=1.5)
    if pd.notna(val_start):
        ax.axvline(val_start, color='black', linestyle=':', alpha=0.5, linewidth=1.5)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"  Saved: {save_path}")


In [5]:
print("="*80)
print("LOADING DATA")
print("="*80)

# Load data
print("\nLoading data...")
df = pd.read_parquet(CONFIG['data_path'])
df = reduce_mem_usage(df)
gc.collect()

print(f"Shape: {df.shape}, Memory: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# Define features
target = 'sales_count'
cols_to_drop = ['date', 'wm_yr_wk']
categorical_features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
all_features = [col for col in df.columns if col != target and col not in cols_to_drop]

print(f"Features: {len(all_features)} ({len(categorical_features)} categorical)")

# Check available states
states = df['state_id'].unique()
print(f"\nStates in data: {sorted(states)}")

# Time-based split
df = df.sort_values(['item_id', 'store_id', 'date']).reset_index(drop=True)
split_date = df['date'].unique()[-CONFIG['validation_days']]

train_df = df[df['date'] < split_date].copy()
val_df = df[df['date'] >= split_date].copy()

print(f"\nTrain: {len(train_df):,} rows ({train_df['date'].min()} to {train_df['date'].max()})")
print(f"Val:   {len(val_df):,} rows ({val_df['date'].min()} to {val_df['date'].max()})")

del df, split_date
gc.collect()


LOADING DATA

Loading data...
Memory usage decreased to 9820.56 Mb (0.0% reduction)
Shape: (59181090, 55), Memory: 9.59 GB
Features: 52 (9 categorical)

States in data: ['CA', 'TX', 'WI']

Train: 58,327,370 rows (2011-01-29 00:00:00 to 2016-04-24 00:00:00)
Val:   853,720 rows (2016-04-25 00:00:00 to 2016-05-22 00:00:00)


0

In [6]:
print("\n" + "="*80)
print("LOADING TRAINED MODEL")
print("="*80)

# Load model
model_path = os.path.join(CONFIG['model_dir'], 'xgb_model_final.json')
print(f"\nLoading model from: {model_path}")

if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model not found at {model_path}")

model = xgb.Booster()
model.load_model(model_path)
print("Model loaded successfully")

# Load encoder
encoder_path = os.path.join(CONFIG['model_dir'], 'encoder_final.pkl')
print(f"\nLoading encoder from: {encoder_path}")

if not os.path.exists(encoder_path):
    raise FileNotFoundError(f"Encoder not found at {encoder_path}")

with open(encoder_path, 'rb') as f:
    encoder = pickle.load(f)
print("Encoder loaded successfully")



LOADING TRAINED MODEL

Loading model from: /content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/xgb_model_final.json
Model loaded successfully

Loading encoder from: /content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/encoder_final.pkl
Encoder loaded successfully


In [7]:
print("\n" + "="*80)
print("ENCODING DATA AND GENERATING PREDICTIONS")
print("="*80)

# Encode training data
print("\nEncoding training data...")
X_train = encoder.transform(train_df[all_features])
print(f"Training shape: {X_train.shape}")

# Encode validation data
print("Encoding validation data...")
X_val = encoder.transform(val_df[all_features])
print(f"Validation shape: {X_val.shape}")

# Create DMatrix
dtrain = xgb.DMatrix(X_train)
dval = xgb.DMatrix(X_val)

# Make predictions
print("\nGenerating predictions...")
train_pred = model.predict(dtrain)
val_pred = model.predict(dval)

print(f"Train predictions: {len(train_pred):,}")
print(f"Val predictions: {len(val_pred):,}")

# Add predictions to dataframes
train_df['prediction'] = train_pred
val_df['prediction'] = val_pred

del X_train, X_val, dtrain, dval, train_pred, val_pred
gc.collect()


ENCODING DATA AND GENERATING PREDICTIONS

Encoding training data...
Training shape: (58327370, 52)
Encoding validation data...
Validation shape: (853720, 52)

Generating predictions...
Train predictions: 58,327,370
Val predictions: 853,720


45

In [8]:
print("\n" + "="*80)
print("PREPARING DATA FOR VISUALIZATION")
print("="*80)

# Select last N days of training data for visualization (e.g., last 90 days)
viz_train_days = 90
train_viz_cutoff = train_df['date'].unique()[-viz_train_days]
train_viz = train_df[train_df['date'] >= train_viz_cutoff].copy()

print(f"\nUsing last {viz_train_days} days of training data for visualization")
print(f"Visualization train period: {train_viz['date'].min()} to {train_viz['date'].max()}")

# 1. Aggregate Training Data (History/Actuals)
print("\nAggregating training data by state and date...")
# FIX: Only aggregate 'sales_count' to avoid duplicate columns later
train_agg = train_viz.groupby(['state_id', 'date'], as_index=False)['sales_count'].sum()
train_agg['type'] = 'Train'

print(f"Train aggregated: {len(train_agg):,} rows")

# 2. Aggregate Validation Data (Actuals)
print("Aggregating validation actual values by state and date...")
val_actual_agg = val_df.groupby(['state_id', 'date'], as_index=False)['sales_count'].sum()
val_actual_agg['type'] = 'Real Values'

print(f"Val actual aggregated: {len(val_actual_agg):,} rows")

# 3. Aggregate Validation Predictions (Forecast)
print("Aggregating validation predictions by state and date...")
val_pred_agg = val_df.groupby(['state_id', 'date'], as_index=False)['prediction'].sum()
val_pred_agg = val_pred_agg.rename(columns={'prediction': 'sales_count'})
val_pred_agg['type'] = 'Forecast'

print(f"Val forecast aggregated: {len(val_pred_agg):,} rows")

# Combine all data
print("\nCombining all data for plotting...")
plot_data = pd.concat([train_agg, val_actual_agg, val_pred_agg], ignore_index=True)
plot_data = plot_data.sort_values(['state_id', 'date']).reset_index(drop=True)

print(f"Total plot data: {len(plot_data):,} rows")
print(f"States: {plot_data['state_id'].nunique()}")
print(f"Date range: {plot_data['date'].min()} to {plot_data['date'].max()}")

# Verify data
print(f"\nData types in plot_data:")
for type_val in ['Train', 'Real Values', 'Forecast']:
    count = len(plot_data[plot_data['type'] == type_val])
    print(f"  {type_val}: {count} rows")

# Clean up
del train_viz, train_agg, val_actual_agg, val_pred_agg, train_df, val_df
gc.collect()


PREPARING DATA FOR VISUALIZATION

Using last 90 days of training data for visualization
Visualization train period: 2016-01-26 00:00:00 to 2016-04-24 00:00:00

Aggregating training data by state and date...
Train aggregated: 270 rows
Aggregating validation actual values by state and date...
Val actual aggregated: 84 rows
Aggregating validation predictions by state and date...
Val forecast aggregated: 84 rows

Combining all data for plotting...
Total plot data: 438 rows
States: 3
Date range: 2016-01-26 00:00:00 to 2016-05-22 00:00:00

Data types in plot_data:
  Train: 270 rows
  Real Values: 84 rows
  Forecast: 84 rows


  train_agg = train_viz.groupby(['state_id', 'date'], as_index=False)['sales_count'].sum()
  val_actual_agg = val_df.groupby(['state_id', 'date'], as_index=False)['sales_count'].sum()
  val_pred_agg = val_df.groupby(['state_id', 'date'], as_index=False)['prediction'].sum()


0

In [9]:

print("\n" + "="*80)
print("CREATING COMBINED PLOT")
print("="*80)

# Create subplot for all states
n_states = len(states)
fig, axes = plt.subplots(n_states, 1, figsize=(16, 5*n_states))

if n_states == 1:
    axes = [axes]

colors = {
    'Train': '#808080',
    'Real Values': '#2ca02c',
    'Forecast': '#1f77b4'
}

for idx, state in enumerate(states):
    ax = axes[idx]
    state_data = plot_data[plot_data['state_id'] == state]

    # Plot each type
    for plot_type in ['Train', 'Real Values', 'Forecast']:
        type_data = state_data[state_data['type'] == plot_type]
        if len(type_data) > 0:
            ax.plot(type_data['date'],
                   type_data['sales_count'],
                   label=plot_type,
                   color=colors[plot_type],
                   linewidth=2 if plot_type != 'Train' else 1.5,
                   alpha=0.9 if plot_type != 'Train' else 0.6)

    # Formatting
    ax.set_ylabel('Daily sales count', fontsize=11, fontweight='bold')
    ax.set_title(f'Forecast for all stores in {state}',
                fontsize=13, fontweight='bold', pad=10)
    ax.xaxis.set_major_locator(MonthLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%b'))
    ax.legend(loc='upper left', fontsize=10, title='type')
    ax.grid(True, alpha=0.3, linestyle='--')

    # Add vertical lines
    train_end = state_data[state_data['type'] == 'Train']['date'].max()
    if pd.notna(train_end):
        ax.axvline(train_end, color='black', linestyle=':', alpha=0.5, linewidth=1.5)

axes[-1].set_xlabel('Date', fontsize=12, fontweight='bold')

plt.tight_layout()
combined_path = os.path.join(CONFIG['output_dir'], 'forecast_all_states_combined.png')
plt.savefig(combined_path, dpi=300, bbox_inches='tight')
plt.close()

print(f"Combined plot saved to: {combined_path}")



CREATING COMBINED PLOT
Combined plot saved to: /content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/Forecast_Plots/forecast_all_states_combined.png


In [10]:
print("\n" + "="*80)
print("FORECAST VISUALIZATION COMPLETE!")
print("="*80)
print(f"\nGenerated {len(states)} individual state plots")
print(f"All plots saved to: {CONFIG['output_dir']}")
print("\nFiles created:")
print(f"  - Individual plots: forecast_[STATE].png")
print(f"  - Combined plot: forecast_all_states_combined.png")
print(f"  - Summary statistics: state_forecast_summary.csv")



FORECAST VISUALIZATION COMPLETE!

Generated 3 individual state plots
All plots saved to: /content/drive/MyDrive/SU Works/CPSC_5305_Intro_to_DS/Rizvans Works/XGBoost_Results/Forecast_Plots/

Files created:
  - Individual plots: forecast_[STATE].png
  - Combined plot: forecast_all_states_combined.png
  - Summary statistics: state_forecast_summary.csv
