In [1]:
import pandas as pd

forecast_df = pd.read_csv("../data/forecast_features.csv")
forecast_df['order_date'] = pd.to_datetime(forecast_df['order_date'])


In [2]:
target = 'weekly_sales'

feature_cols = [
    'year',
    'month',
    'week_of_year',
    'sales_lag_1',
    'sales_lag_4',
    'sales_lag_12',
    'rolling_mean_4',
    'rolling_mean_8'
]


In [3]:
train = forecast_df[forecast_df['order_date'] < '2018-01-01']
test  = forecast_df[forecast_df['order_date'] >= '2018-01-01']


In [4]:
train.head()

Unnamed: 0,order_date,region,category,weekly_sales,total_quantity,avg_discount,total_profit,year,month,week_of_year,sales_lag_1,sales_lag_4,sales_lag_12,rolling_mean_4,rolling_mean_8
0,2014-06-08,Central,Furniture,330.479,10,0.55,-249.7729,2014,6,23,2717.087,1406.941,76.728,1076.5605,1176.71875
1,2014-06-15,Central,Furniture,1110.802,9,0.2,-18.0584,2014,6,24,330.479,34.79,429.63,807.445,1038.47525
2,2014-06-22,Central,Furniture,840.9056,12,0.16,12.5176,2014,6,25,1110.802,147.424,393.83,1076.448,901.967625
3,2014-07-13,Central,Furniture,512.358,3,0.3,-14.6388,2014,7,28,840.9056,2717.087,45.48,1249.8184,1006.397075
4,2014-07-20,Central,Furniture,998.112,7,0.45,-154.425,2014,7,29,512.358,330.479,1436.427,698.63615,887.598325


In [5]:
forecast_df['order_date'].min(), forecast_df['order_date'].max()

(Timestamp('2014-04-27 00:00:00'), Timestamp('2017-12-31 00:00:00'))

In [6]:
split_date = forecast_df['order_date'].quantile(0.8)

train = forecast_df[forecast_df['order_date'] < split_date]
test  = forecast_df[forecast_df['order_date'] >= split_date]

In [7]:
test['naive_forecast'] = test['sales_lag_1']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['naive_forecast'] = test['sales_lag_1']


In [8]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    objective='reg:squarederror'
)


In [9]:
_ = model.fit(train[feature_cols], train[target])


In [10]:
test['ml_forecast'] = model.predict(test[feature_cols])
test[['weekly_sales', 'naive_forecast', 'ml_forecast']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ml_forecast'] = model.predict(test[feature_cols])


Unnamed: 0,weekly_sales,naive_forecast,ml_forecast
125,907.426,2585.5548,369.822205
126,24.5,907.426,819.239746
127,766.155,24.5,751.163513
128,2232.5464,766.155,262.295929
129,132.901,2232.5464,862.058228


In [11]:
forecast_df['weekly_sales'].describe()

count      629.000000
mean      1122.464312
std       1294.119859
min          3.984000
25%        283.920000
50%        728.090000
75%       1503.285200
max      10030.622000
Name: weekly_sales, dtype: float64

In [12]:
from sklearn.metrics import mean_absolute_error

mae_naive = mean_absolute_error(test['weekly_sales'], test['naive_forecast'])
mae_xgb   = mean_absolute_error(test['weekly_sales'], test['ml_forecast'])

print("Naive MAE:", mae_naive)
print("XGBoost MAE:", mae_xgb)


Naive MAE: 1211.34087751938
XGBoost MAE: 1037.188893253013


In [13]:
final_forecast = test[
    [
        'order_date',
        'region',
        'category',
        'weekly_sales',
        'naive_forecast',
        'ml_forecast'
    ]
]

final_forecast.to_csv("../data/sales_forecast_output.csv", index=False)

In [14]:
import numpy as np

In [15]:
forecast_df['log_weekly_sales'] = np.log1p(forecast_df['weekly_sales'])


In [16]:
forecast_df['discount_lag_1'] = (
    forecast_df
    .groupby(['region', 'category'])['avg_discount']
    .shift(1)
)

In [17]:
feature_cols = [
    'year', 'month', 'week_of_year',
    'sales_lag_1', 'sales_lag_4', 'sales_lag_12',
    'rolling_mean_4', 'rolling_mean_8',
    'discount_lag_1'
]

target = 'log_weekly_sales'


In [18]:
model = XGBRegressor(
    n_estimators=400,
    learning_rate=0.03,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective='reg:squarederror'
)



In [19]:
forecast_df = forecast_df.sort_values(
    by=['region', 'category', 'order_date']
)

forecast_df['discount_lag_1'] = (
    forecast_df
    .groupby(['region', 'category'])['avg_discount']
    .shift(1)
)

In [20]:
forecast_df = forecast_df.dropna().reset_index(drop=True)

In [21]:
split_date = forecast_df['order_date'].quantile(0.8)

train = forecast_df[forecast_df['order_date'] < split_date]
test  = forecast_df[forecast_df['order_date'] >= split_date]

In [22]:
train.shape, test.shape


((500, 17), (125, 17))

In [23]:
_ = model.fit(train[feature_cols], train[target])

In [24]:
test['log_forecast'] = model.predict(test[feature_cols])
test['ml_forecast'] = np.expm1(test['log_forecast'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['log_forecast'] = model.predict(test[feature_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['ml_forecast'] = np.expm1(test['log_forecast'])


In [25]:
print(test.columns.tolist())
print(test.shape)

['order_date', 'region', 'category', 'weekly_sales', 'total_quantity', 'avg_discount', 'total_profit', 'year', 'month', 'week_of_year', 'sales_lag_1', 'sales_lag_4', 'sales_lag_12', 'rolling_mean_4', 'rolling_mean_8', 'log_weekly_sales', 'discount_lag_1', 'log_forecast', 'ml_forecast']
(125, 19)


In [26]:
test = test.copy()   # VERY important
test['naive_forecast'] = test['sales_lag_1']

In [27]:
test[['weekly_sales', 'sales_lag_1', 'naive_forecast']].head()

Unnamed: 0,weekly_sales,sales_lag_1,naive_forecast
125,24.5,907.426,907.426
126,766.155,24.5,24.5
127,2232.5464,766.155,766.155
128,132.901,2232.5464,2232.5464
129,479.158,132.901,132.901


In [28]:
from sklearn.metrics import mean_absolute_error

mae_naive = mean_absolute_error(
    test['weekly_sales'],
    test['naive_forecast']
)

mae_tuned = mean_absolute_error(
    test['weekly_sales'],
    test['ml_forecast']
)

print("Naive MAE :", mae_naive)
print("Tuned XGB MAE:", mae_tuned)

Naive MAE : 1199.4173952
Tuned XGB MAE: 995.3780088789063


In [29]:
final_df = test[[
    'order_date',
    'region',
    'category',
    'weekly_sales',
    'naive_forecast',
    'ml_forecast'
]].copy()

final_df['error_naive'] = final_df['weekly_sales'] - final_df['naive_forecast']
final_df['error_ml'] = final_df['weekly_sales'] - final_df['ml_forecast']

final_df['abs_error_naive'] = final_df['error_naive'].abs()
final_df['abs_error_ml'] = final_df['error_ml'].abs()

final_df.head()


Unnamed: 0,order_date,region,category,weekly_sales,naive_forecast,ml_forecast,error_naive,error_ml,abs_error_naive,abs_error_ml
125,2017-05-21,Central,Furniture,24.5,907.426,231.571289,-882.926,-207.071289,882.926,207.071289
126,2017-05-28,Central,Furniture,766.155,24.5,484.72876,741.655,281.42624,741.655,281.42624
127,2017-06-04,Central,Furniture,2232.5464,766.155,436.959381,1466.3914,1795.587019,1466.3914,1795.587019
128,2017-06-11,Central,Furniture,132.901,2232.5464,310.824554,-2099.6454,-177.923554,2099.6454,177.923554
129,2017-06-18,Central,Furniture,479.158,132.901,691.993042,346.257,-212.835042,346.257,212.835042


In [30]:
import pandas as pd
import numpy as np

# Number of weeks to forecast
future_weeks = 10

# Copy your existing final_df (historical)
forecast_df = final_df.copy()

# Ensure required lag/feature columns exist in historical data
for col in ['sales_lag_1', 'sales_lag_4', 'sales_lag_12', 
            'rolling_mean_4', 'rolling_mean_8', 'avg_discount', 'discount_lag_1',
            'year', 'month', 'week_of_year']:
    if col not in forecast_df.columns:
        if col == 'year':
            forecast_df['year'] = forecast_df['order_date'].dt.year
        elif col == 'month':
            forecast_df['month'] = forecast_df['order_date'].dt.month
        elif col == 'week_of_year':
            forecast_df['week_of_year'] = forecast_df['order_date'].dt.isocalendar().week
        elif col == 'discount_lag_1':
            forecast_df['discount_lag_1'] = forecast_df['avg_discount'].shift(1).fillna(method='ffill').fillna(0)
        else:
            forecast_df[col] = forecast_df['weekly_sales'].shift(1).fillna(method='ffill').fillna(0)

# Prepare recursive forecasting
history = forecast_df.copy()
future_predictions = []

region_list = history['region'].unique()
for region in region_list:
    for i in range(future_weeks):
        # Next week date
        next_week_date = history['order_date'].max() + pd.Timedelta(weeks=1)
        
        # New row for prediction
        row = {'order_date': next_week_date, 'region': region, 'category': 'Furniture'}
        
        # Date features
        row['year'] = next_week_date.year
        row['month'] = next_week_date.month
        row['week_of_year'] = next_week_date.isocalendar()[1]
        
        # Use only historical weekly_sales for lags & rolling means
        hist_sales = history.loc[(history['region'] == region) & (history['weekly_sales'].notna()), 'weekly_sales']
        
        row['sales_lag_1'] = hist_sales.iloc[-1] if len(hist_sales) >= 1 else 0
        row['sales_lag_4'] = hist_sales.iloc[-4:].mean() if len(hist_sales) >= 4 else hist_sales.mean() if len(hist_sales) > 0 else 0
        row['sales_lag_12'] = hist_sales.iloc[-12:].mean() if len(hist_sales) >= 12 else hist_sales.mean() if len(hist_sales) > 0 else 0
        row['rolling_mean_4'] = hist_sales.iloc[-4:].mean() if len(hist_sales) >= 4 else hist_sales.mean() if len(hist_sales) > 0 else 0
        row['rolling_mean_8'] = hist_sales.iloc[-8:].mean() if len(hist_sales) >= 8 else hist_sales.mean() if len(hist_sales) > 0 else 0
        
        # Discount features
        last_discount = history.loc[history['region'] == region, 'avg_discount'].dropna()
        last_discount_val = last_discount.iloc[-1] if len(last_discount) > 0 else 0
        row['avg_discount'] = last_discount_val
        row['discount_lag_1'] = last_discount_val
        
        # Prepare DataFrame for model prediction
        row_df = pd.DataFrame([row])
        
        # Predict using trained XGBoost model
        row['ml_forecast'] = np.expm1(model.predict(row_df[feature_cols])[0])
        
        # Add prediction temporarily for recursive lags
        temp_row = row.copy()
        temp_row['weekly_sales'] = row['ml_forecast']
        history = pd.concat([history, pd.DataFrame([temp_row])], ignore_index=True)
        
        # For Tableau, keep weekly_sales = NaN
        row['weekly_sales'] = np.nan
        
        # Append to future predictions
        future_predictions.append(row)

# Combine historical + future
future_df = pd.DataFrame(future_predictions)
tableau_df = pd.concat([forecast_df, future_df], ignore_index=True)

# Recalculate errors only for historical data
tableau_df['error_naive'] = tableau_df['weekly_sales'] - tableau_df['naive_forecast']
tableau_df['error_ml'] = tableau_df['weekly_sales'] - tableau_df['ml_forecast']
tableau_df['abs_error_naive'] = tableau_df['error_naive'].abs()
tableau_df['abs_error_ml'] = tableau_df['error_ml'].abs()

# Save final CSV for Tableau
tableau_df.to_csv("../data/sales_forecast_trend_tableau.csv", index=False)

print("10-week recursive forecast ready for Tableau!")


10-week recursive forecast ready for Tableau!


  forecast_df[col] = forecast_df['weekly_sales'].shift(1).fillna(method='ffill').fillna(0)
  forecast_df[col] = forecast_df['weekly_sales'].shift(1).fillna(method='ffill').fillna(0)
  forecast_df[col] = forecast_df['weekly_sales'].shift(1).fillna(method='ffill').fillna(0)
  forecast_df[col] = forecast_df['weekly_sales'].shift(1).fillna(method='ffill').fillna(0)
  forecast_df[col] = forecast_df['weekly_sales'].shift(1).fillna(method='ffill').fillna(0)
  forecast_df[col] = forecast_df['weekly_sales'].shift(1).fillna(method='ffill').fillna(0)
  forecast_df['discount_lag_1'] = forecast_df['avg_discount'].shift(1).fillna(method='ffill').fillna(0)


In [31]:
final_df.to_csv("../data/sales_forecast_tableau.csv", index=False)