In [11]:
import pandas as pd

df = pd.read_csv("warehouse_sales_export.csv", )

print(df.head())


           ds category      region      y
0  2023-01-01    dairy    Barishal  182.0
1  2023-01-01    dairy  Chattogram  198.0
2  2023-01-01    dairy       Dhaka  146.0
3  2023-01-01    dairy      Khulna  187.0
4  2023-01-01    dairy  Mymensingh  225.0


In [12]:
print("Dataset info ")
print(df.info())
print("Dataset describe ")
print(df.describe())

Dataset info 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78912 entries, 0 to 78911
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ds        78912 non-null  object 
 1   category  78912 non-null  object 
 2   region    78912 non-null  object 
 3   y         78912 non-null  float64
dtypes: float64(1), object(3)
memory usage: 2.4+ MB
None
Dataset describe 
                  y
count  78912.000000
mean     209.007984
std       42.942520
min       32.000000
25%      179.000000
50%      207.000000
75%      237.000000
max      411.000000


In [13]:
# Check for missing values and handle them if necessary:
df.isnull().sum()


ds          0
category    0
region      0
y           0
dtype: int64

Data Processing

In [14]:
df['order_date'] = pd.to_datetime(df['ds'])


Feature Engineering

In [15]:
from IPython.core.display_functions import display
# Extract time-based features
df['day_of_week'] = df['order_date'].dt.dayofweek
df['month'] = df['order_date'].dt.month
df['quarter'] = df['order_date'].dt.quarter

# Lag features
df['lag_1'] = df['y'].shift(1)
df['lag_7'] = df['y'].shift(7)
df['lag_30'] = df['y'].shift(30)

# Rolling features
df['rolling_7'] = df['y'].rolling(7).mean()
df['rolling_30'] = df['y'].rolling(30).mean()

# One-hot encode categorical features
df = pd.get_dummies(df, columns=['region', 'category'])
df.dropna(inplace=True)  # remove rows with NaN after lag/rolling

display(df)

Unnamed: 0,ds,y,order_date,day_of_week,month,quarter,lag_1,lag_7,lag_30,rolling_7,...,region_Sylhet,category_dairy,category_fish,category_fruits,category_meat,category_oil,category_rice,category_snacks,category_spices,category_vegetables
30,2023-01-01,204.0,2023-01-01,6,1,1,173.0,195.0,182.0,214.142857,...,False,False,False,False,True,False,False,False,False,False
31,2023-01-01,179.0,2023-01-01,6,1,1,204.0,266.0,198.0,201.714286,...,True,False,False,False,True,False,False,False,False,False
32,2023-01-01,244.0,2023-01-01,6,1,1,179.0,278.0,146.0,196.857143,...,False,False,False,False,False,True,False,False,False,False
33,2023-01-01,139.0,2023-01-01,6,1,1,244.0,135.0,187.0,197.428571,...,False,False,False,False,False,True,False,False,False,False
34,2023-01-01,158.0,2023-01-01,6,1,1,139.0,217.0,225.0,189.000000,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78907,2025-12-31,233.0,2025-12-31,2,12,4,174.0,304.0,181.0,210.857143,...,False,False,False,False,False,False,False,False,False,True
78908,2025-12-31,212.0,2025-12-31,2,12,4,233.0,196.0,220.0,213.142857,...,False,False,False,False,False,False,False,False,False,True
78909,2025-12-31,201.0,2025-12-31,2,12,4,212.0,314.0,157.0,197.000000,...,False,False,False,False,False,False,False,False,False,True
78910,2025-12-31,209.0,2025-12-31,2,12,4,201.0,139.0,217.0,207.000000,...,False,False,False,False,False,False,False,False,False,True


Split Train/Test

In [16]:
train_size = int(len(df) * 0.8)
train = df.iloc[:train_size]
test = df.iloc[train_size:]

Model Selection

We will compare 3 models for demand forecast:

* Prophet (Time-series model)

* XGBoost / LightGBM (ML regression)

* Simple baseline (like last week average)

## Prophet Model

In [20]:
# ! pip install Prophet

In [22]:
from prophet import Prophet

prophet_df = df.groupby('order_date')['y'].sum().reset_index()
prophet_df.rename(columns={'order_date': 'ds',}, inplace=True)

model_prophet = Prophet(daily_seasonality=True, yearly_seasonality=True)
model_prophet.fit(prophet_df.iloc[:train_size])

future = model_prophet.make_future_dataframe(periods=len(test))
forecast = model_prophet.predict(future)


03:44:45 - cmdstanpy - INFO - Chain [1] start processing
03:44:45 - cmdstanpy - INFO - Chain [1] done processing


In [23]:
# Evaluate
from sklearn.metrics import mean_absolute_error

mae_prophet = mean_absolute_error(forecast['yhat'][-len(test):], test['y'])
print("Prophet MAE:", mae_prophet)

Prophet MAE: 14640.171502582376


## XGBoost / LightGBM Regression

In [30]:
df.columns

Index(['ds', 'y', 'order_date', 'day_of_week', 'month', 'quarter', 'lag_1',
       'lag_7', 'lag_30', 'rolling_7', 'rolling_30', 'region_Barishal',
       'region_Chattogram', 'region_Dhaka', 'region_Khulna',
       'region_Mymensingh', 'region_Rajshahi', 'region_Rangpur',
       'region_Sylhet', 'category_dairy', 'category_fish', 'category_fruits',
       'category_meat', 'category_oil', 'category_rice', 'category_snacks',
       'category_spices', 'category_vegetables'],
      dtype='object')

In [28]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# 1️⃣ Define features
features = [c for c in df.columns if c not in ['order_date', 'y']]

# 2️⃣ Convert datetime columns to numeric features
for df_split in [train, test]:
    if 'order_date' in df_split.columns:
        df_split['order_date'] = pd.to_datetime(df_split['order_date'])
        df_split['day'] = df_split['order_date'].dt.day
        df_split['month'] = df_split['order_date'].dt.month
        df_split['year'] = df_split['order_date'].dt.year
        df_split['day_of_week'] = df_split['order_date'].dt.dayofweek
        df_split.drop('order_date', axis=1, inplace=True)

# 3️⃣ Make sure all features are numeric
X_train = train[features].apply(pd.to_numeric, errors='coerce').fillna(0)
y_train = train['y']
X_test = test[features].apply(pd.to_numeric, errors='coerce').fillna(0)
y_test = test['y']

# 4️⃣ Train XGBoost
xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.05)
xgb_model.fit(X_train, y_train)

# 5️⃣ Predict & Evaluate
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("XGBoost MAE:", mae_xgb)

XGBoost MAE: 31.586419617577395


## Baseline Model

Simple last week average forecast:

In [31]:
y_pred_baseline = test['lag_7'].values
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
print("Baseline MAE:", mae_baseline)

Baseline MAE: 48.533815047220635


## Compare Models

In [32]:
print(f"MAE Comparison:")
print(f"Prophet: {mae_prophet:.2f}")
print(f"XGBoost: {mae_xgb:.2f}")
print(f"Baseline: {mae_baseline:.2f}")

MAE Comparison:
Prophet: 14640.17
XGBoost: 31.59
Baseline: 48.53


In [35]:
import pandas as pd

if mae_xgb < mae_prophet and mae_xgb < mae_baseline:
    final_model = xgb_model
    
    # Prepare next 30 rows for prediction
    future_features = df[features].tail(30).copy()

    # Convert datetime if present
    if 'order_date' in future_features.columns:
        future_features['order_date'] = pd.to_datetime(future_features['order_date'])
        future_features['day'] = future_features['order_date'].dt.day
        future_features['month'] = future_features['order_date'].dt.month
        future_features['year'] = future_features['order_date'].dt.year
        future_features['day_of_week'] = future_features['order_date'].dt.dayofweek
        future_features = future_features.drop('order_date', axis=1)
    
    # Drop any leftover object columns (like 'ds' from Prophet)
    object_cols = future_features.select_dtypes(include='object').columns
    future_features = future_features.drop(columns=object_cols)
    
    # Ensure numeric
    future_features = future_features.apply(pd.to_numeric, errors='coerce').fillna(0)
    
    # Predict
    forecast_final = final_model.predict(future_features)
    
elif mae_prophet < mae_baseline:
    final_model = model_prophet
    future = final_model.make_future_dataframe(periods=30)
    forecast_final = final_model.predict(future)['yhat'][-30:]
else:
    forecast_final = df['lag_7'].tail(30).values  # baseline

ValueError: feature_names mismatch: ['ds', 'day_of_week', 'month', 'quarter', 'lag_1', 'lag_7', 'lag_30', 'rolling_7', 'rolling_30', 'region_Barishal', 'region_Chattogram', 'region_Dhaka', 'region_Khulna', 'region_Mymensingh', 'region_Rajshahi', 'region_Rangpur', 'region_Sylhet', 'category_dairy', 'category_fish', 'category_fruits', 'category_meat', 'category_oil', 'category_rice', 'category_snacks', 'category_spices', 'category_vegetables'] ['day_of_week', 'month', 'quarter', 'lag_1', 'lag_7', 'lag_30', 'rolling_7', 'rolling_30', 'region_Barishal', 'region_Chattogram', 'region_Dhaka', 'region_Khulna', 'region_Mymensingh', 'region_Rajshahi', 'region_Rangpur', 'region_Sylhet', 'category_dairy', 'category_fish', 'category_fruits', 'category_meat', 'category_oil', 'category_rice', 'category_snacks', 'category_spices', 'category_vegetables']
expected ds in input data