# Day 1 :

# <u>*Business Problem Understanding*</u>
## <u>1. Project Objective</u>
* Predict next month's product-level sales for a retail chain to optimize inventory management and reduce stockouts/overstock situations.
## <u>2. Business Context</u>
1. Why forecast?: Retailers typically lose 8-10% of sales due to stockouts and 10-15% due to overstocking

2. Use Case: Monthly purchase order planning, warehouse staffing, promotional planning

3. Impact: Every 1% improvement in forecast accuracy can reduce inventory costs by 2-3%

## <u>3. Scope & Constraints</u>
1. Forecast Horizon: 1 month ahead

2. Frequency: Monthly predictions

3. Granularity: Product-level forecasting (may aggregate to category level if data sparse)

4. Time Frame: Use 2-3 years of historical data

5. Assumption: No major business disruptions (store openings/closures, pandemics)

# Day 2 :



In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import pickle
import os

In [33]:
# Load the raw sales data from the CSV file in the working directory.
df = pd.read_csv('walmart.csv')

In [34]:
# Row and column counts of the raw frame.
df.shape

(6435, 8)

In [35]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [36]:
# Column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


In [37]:
# The raw dates are day-first strings (DD-MM-YYYY); parse them with an explicit format.
DATE_FORMAT = '%d-%m-%Y'
df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT, dayfirst=True)

In [39]:
df.isna().sum()  # missing-value count per column (none in the raw data)

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

### Converting weekly data to monthly

In [40]:
# Create month column
# Each weekly date is converted to its calendar month (a monthly Period) so
# that rows can be grouped by month in the next step.
df['YearMonth'] = df['Date'].dt.to_period('M')

In [47]:
# Collapse the weekly rows into one row per (store, month).
monthly_aggregations = {
    'Weekly_Sales': 'sum',    # total sales across the weeks dated in the month
    'Temperature': 'mean',
    'Fuel_Price': 'mean',
    'CPI': 'mean',
    'Unemployment': 'mean',
    'Holiday_Flag': 'max',    # largest weekly flag value in the month
}
monthly_all = (
    df.groupby(['Store', 'YearMonth'])
      .agg(monthly_aggregations)
      .reset_index()
)

In [48]:
# Name the summed sales column for what it now holds, and derive a timestamp
# column from the monthly period for date-based sorting and filtering.
monthly_all = (
    monthly_all
    .rename(columns={'Weekly_Sales': 'Monthly_Sales'})
    .assign(Date=lambda frame: frame['YearMonth'].dt.to_timestamp('M'))
)

In [49]:
# Preview the monthly frame to confirm the aggregation and the new Date column.
monthly_all.head()

Unnamed: 0,Store,YearMonth,Monthly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Date
0,1,2010-02,6307344.1,41.845,2.54875,211.236828,8.106,1,2010-02-28
1,1,2010-03,5871293.98,52.58,2.686,211.241116,8.106,0,2010-03-31
2,1,2010-04,7422801.92,65.34,2.7744,210.552135,7.808,0,2010-04-30
3,1,2010-05,5929938.64,76.0525,2.8185,210.547812,7.808,0,2010-05-31
4,1,2010-06,6084081.46,82.3925,2.66575,211.356237,7.808,0,2010-06-30


## Creating features

In [50]:
# Order rows chronologically within each store and renumber the index from 0.
monthly_all = monthly_all.sort_values(by=['Store', 'Date'], ignore_index=True)

In [51]:
# Calendar features taken from each row's date.
month_dates = monthly_all['Date']
monthly_all['Month'] = month_dates.dt.month
monthly_all['Year'] = month_dates.dt.year

In [52]:
# Create lag features per store
# Accumulator for the per-store frames produced by the loop in the next cell.
all_features_data = []

In [53]:
# Process each store
# Reset the accumulator here so this cell is idempotent: without the reset,
# re-running the cell appends a second copy of every store's rows.
all_features_data = []

for store_id in monthly_all['Store'].unique():
    store_df = monthly_all[monthly_all['Store'] == store_id].copy()

    # Sort by date so that positional shifts below step through time
    store_df = store_df.sort_values('Date').reset_index(drop=True)

    # Lag features: sales 1, 2 and 12 rows earlier for the same store.
    # NOTE(review): a row equals a month only if no month is missing for the
    # store — confirm the series has no gaps.
    store_df['Prev_Month_Sales'] = store_df['Monthly_Sales'].shift(1)
    store_df['Prev_2_Months_Sales'] = store_df['Monthly_Sales'].shift(2)
    store_df['Prev_Year_Sales'] = store_df['Monthly_Sales'].shift(12)

    # Rolling averages over the current row and the preceding rows;
    # min_periods=1 keeps early rows by averaging whatever history exists.
    store_df['Rolling_3M_Avg'] = store_df['Monthly_Sales'].rolling(window=3, min_periods=1).mean()
    store_df['Rolling_6M_Avg'] = store_df['Monthly_Sales'].rolling(window=6, min_periods=1).mean()

    # Month-over-month change (fractional change versus the previous row)
    store_df['MoM_Change'] = store_df['Monthly_Sales'].pct_change()

    # Target: Next month's sales (NaN on the store's last row)
    store_df['Next_Month_Sales'] = store_df['Monthly_Sales'].shift(-1)

    all_features_data.append(store_df)

In [54]:
# Combine all stores back
# Stack the per-store frames into one frame with a fresh 0..n-1 index.
all_data = pd.concat(all_features_data, ignore_index=True)

In [55]:
# Missing values per column; the shifted lag and target columns are expected to have some.
all_data.isna().sum()

Store                    0
YearMonth                0
Monthly_Sales            0
Temperature              0
Fuel_Price               0
CPI                      0
Unemployment             0
Holiday_Flag             0
Date                     0
Month                    0
Year                     0
Prev_Month_Sales        45
Prev_2_Months_Sales     90
Prev_Year_Sales        540
Rolling_3M_Avg           0
Rolling_6M_Avg           0
MoM_Change              45
Next_Month_Sales        45
dtype: int64

In [56]:
# Keep only fully populated rows: this removes rows lacking a lag feature
# as well as rows with no next-month target.
all_data = all_data.dropna(how='any')

## Preparing Data For Modeling

In [57]:
# Model inputs, in the column order used for training.
features = [
    # identifiers / calendar
    'Store',
    'Month',
    # external monthly aggregates
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Holiday_Flag',
    # sales history
    'Prev_Month_Sales',
    'Prev_2_Months_Sales',
    'Prev_Year_Sales',
    'Rolling_3M_Avg',
    'Rolling_6M_Avg',
    'MoM_Change',
]

In [58]:
# Feature matrix and next-month target, both taken from the cleaned frame.
X, y = all_data[features], all_data['Next_Month_Sales']

In [59]:
# Split data (time-based, keep stores together)
# Series.unique() returns values in order of first appearance, not sorted, so
# sort explicitly: the cutoff must be read from a chronological list of dates.
unique_dates = all_data['Date'].drop_duplicates().sort_values().to_numpy()
# The first 80% of distinct dates are used for training; the date at
# split_idx is the first date of the test period.
split_idx = int(len(unique_dates) * 0.8)
train_date_cutoff = unique_dates[split_idx]

In [60]:
# Rows dated before the cutoff train the model; the cutoff date onward is held out.
is_train_row = all_data['Date'] < train_date_cutoff
is_test_row = all_data['Date'] >= train_date_cutoff

X_train, y_train = X[is_train_row], y[is_train_row]
X_test, y_test = X[is_test_row], y[is_test_row]

In [62]:
# Standardise every feature except the Store identifier.
from sklearn.preprocessing import StandardScaler

feature_cols_to_scale = [name for name in features if name != 'Store']
scaler = StandardScaler()

In [63]:
# Work on copies so the unscaled train/test frames stay available.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()

In [64]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows, so no test-set statistics enter the fit.
X_train_scaled[feature_cols_to_scale] = scaler.fit_transform(X_train[feature_cols_to_scale])
X_test_scaled[feature_cols_to_scale] = scaler.transform(X_test[feature_cols_to_scale])

## Training XGBoost

In [65]:
# One gradient-boosted regressor shared by all stores.
xgb_params = {
    'n_estimators': 150,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,   # fixed seed
    'n_jobs': -1,
}
model_all = xgb.XGBRegressor(**xgb_params)

In [66]:
# Train the model
# Fit on the scaled training features; as the cell's last expression, the
# fitted estimator is also displayed as the cell output.
model_all.fit(X_train_scaled, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [67]:
# Predict on both splits so training and test error can be compared.
y_pred_train, y_pred_test = (
    model_all.predict(split) for split in (X_train_scaled, X_test_scaled)
)

In [68]:
# Calculate metrics
def calculate_metrics(y_true, y_pred, label="Dataset"):
    """Print and return MAE, RMSE and MAPE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    label : str, default "Dataset"
        Name printed in the report header.

    Returns
    -------
    dict
        ``{'mae': ..., 'rmse': ..., 'mape': ...}``. ``mape`` is a percentage
        computed over the rows whose actual value is non-zero; it is NaN when
        every actual value is zero.
    """
    # Plain float arrays keep the arithmetic positional: a pandas Series on
    # either side would otherwise be aligned by index label, not by position.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Percentage error is undefined where the actual value is zero; skip those
    # rows instead of letting a division by zero turn the whole metric into inf.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        mape = np.nan

    print(f"{label} Metrics:")
    print(f"  MAE: ${mae:,.2f}")
    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  MAPE: {mape:.2f}%")
    print()

    return {'mae': mae, 'rmse': rmse, 'mape': mape}

In [69]:
# Report training and test error together so the gap between them is visible.
# The header emoji was mis-decoded (UTF-8 bytes read as cp1252); it is restored here.
print("\n📈 Model Performance (ALL Stores):")
train_metrics = calculate_metrics(y_train, y_pred_train, "Training")
test_metrics = calculate_metrics(y_test, y_pred_test, "Testing")


📈 Model Performance (ALL Stores):
Training Metrics:
  MAE: $22,264.60
  RMSE: $29,990.19
  MAPE: 0.62%

Testing Metrics:
  MAE: $473,127.85
  RMSE: $644,203.31
  MAPE: 10.92%



## Training ARIMA models

In [70]:
sample_stores = [1, 5, 10, 20]  # Sample stores to analyze
arima_results = {}  # empty container for ARIMA output; presumably keyed by store id — TODO confirm