In [38]:
import pandas as pd
import holidays
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
df=pd.read_csv("train.csv")
df

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [9]:
df['date'].max()

'2017-12-31'

In [10]:
df['date'].min()

'2013-01-01'

In [15]:
df.groupby(['store'])['sales'].sum()

store
1     4315603
2     6120128
3     5435144
4     5012639
5     3631016
6     3627670
7     3320009
8     5856169
9     5025976
10    5360158
Name: sales, dtype: int64

In [16]:
df.dtypes

date     object
store     int64
item      int64
sales     int64
dtype: object

In [18]:
df['date']=pd.to_datetime(df['date'])
df.dtypes

date     datetime64[ns]
store             int64
item              int64
sales             int64
dtype: object

In [21]:
df['year']=df['date'].dt.year
df['month']=df['date'].dt.month
df['day']=df['date'].dt.day

In [30]:
df['is_weekend']=df['date'].dt.day_of_week>=5
df['is_weekend']=df['is_weekend'].astype(int)
year_range=list(range(2013,2017))
india_holidays=holidays.India(years=year_range)
df['is_holiday']=df['date'].isin(india_holidays)

  df['is_holiday']=df['date'].isin(india_holidays)


In [None]:
df['day_of_Week']=df['date'].dt.day_of_week
df['week_of_year']=df['date'].dt.isocalendar().week

Unnamed: 0,date,store,item,sales,year,month,day,is_weekend,is_holiday
0,2013-01-01,1,1,13,2013,1,1,0,False
1,2013-01-02,1,1,11,2013,1,2,0,False
2,2013-01-03,1,1,14,2013,1,3,0,False
3,2013-01-04,1,1,13,2013,1,4,0,False
4,2013-01-05,1,1,10,2013,1,5,1,False
...,...,...,...,...,...,...,...,...,...
912995,2017-12-27,10,50,63,2017,12,27,0,False
912996,2017-12-28,10,50,59,2017,12,28,0,False
912997,2017-12-29,10,50,74,2017,12,29,0,False
912998,2017-12-30,10,50,62,2017,12,30,1,False


In [36]:
df['sales_lag_1'] = df.groupby(['store', 'item'])['sales'].shift(1)
df['sales_lag_7'] = df.groupby(['store', 'item'])['sales'].shift(7)
df['rolling_mean_7'] = df.groupby(['store', 'item'])['sales'].transform(lambda x: x.shift(1).rolling(7).mean())
df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
df['is_month_end'] = df['date'].dt.is_month_end.astype(int)

In [37]:
df

Unnamed: 0,date,store,item,sales,year,month,day,is_weekend,is_holiday,sales_lag_1,sales_lag_7,rolling_mean_7,is_month_start,is_month_end
0,2013-01-01,1,1,13,2013,1,1,0,False,,,,1,0
1,2013-01-02,1,1,11,2013,1,2,0,False,13.0,,,0,0
2,2013-01-03,1,1,14,2013,1,3,0,False,11.0,,,0,0
3,2013-01-04,1,1,13,2013,1,4,0,False,14.0,,,0,0
4,2013-01-05,1,1,10,2013,1,5,1,False,13.0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912995,2017-12-27,10,50,63,2017,12,27,0,False,41.0,51.0,61.000000,0,0
912996,2017-12-28,10,50,59,2017,12,28,0,False,63.0,63.0,62.714286,0,0
912997,2017-12-29,10,50,74,2017,12,29,0,False,59.0,75.0,62.142857,0,0
912998,2017-12-30,10,50,62,2017,12,30,1,False,74.0,70.0,62.000000,0,0


Train test Split by date range

In [39]:
df = df.sort_values(by='date').reset_index(drop=True)
df

Unnamed: 0,date,store,item,sales,year,month,day,is_weekend,is_holiday,sales_lag_1,sales_lag_7,rolling_mean_7,is_month_start,is_month_end
0,2013-01-01,1,1,13,2013,1,1,0,False,,,,1,0
1,2013-01-01,7,12,26,2013,1,1,0,False,,,,1,0
2,2013-01-01,7,46,27,2013,1,1,0,False,,,,1,0
3,2013-01-01,8,12,54,2013,1,1,0,False,,,,1,0
4,2013-01-01,9,12,35,2013,1,1,0,False,,,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912995,2017-12-31,9,34,21,2017,12,31,1,False,20.0,22.0,20.857143,0,1
912996,2017-12-31,10,34,32,2017,12,31,1,False,21.0,29.0,23.142857,0,1
912997,2017-12-31,1,35,55,2017,12,31,1,False,66.0,53.0,52.000000,0,1
912998,2017-12-31,8,33,100,2017,12,31,1,False,67.0,86.0,67.142857,0,1


In [42]:
df['store'] = df['store'].astype('category')
df['item'] = df['item'].astype('category')

In [43]:
print(df[['sales_lag_1', 'sales_lag_7', 'rolling_mean_7']].isna().sum())

sales_lag_1        500
sales_lag_7       3500
rolling_mean_7    3500
dtype: int64


In [44]:
print(df[['sales_lag_1', 'sales_lag_7', 'rolling_mean_7']].isna().mean() * 100)

sales_lag_1       0.054765
sales_lag_7       0.383352
rolling_mean_7    0.383352
dtype: float64


In [45]:
df = df.dropna(subset=['sales_lag_1', 'sales_lag_7', 'rolling_mean_7']).reset_index(drop=True)

In [46]:
train_df = df[df['date'] < '2017-01-01']
test_df = df[df['date'] >= '2017-01-01']

In [47]:
test_df

Unnamed: 0,date,store,item,sales,year,month,day,is_weekend,is_holiday,sales_lag_1,sales_lag_7,rolling_mean_7,is_month_start,is_month_end
727000,2017-01-01,4,27,18,2017,1,1,1,False,25.0,27.0,19.285714,1,0
727001,2017-01-01,1,39,36,2017,1,1,1,False,45.0,41.0,32.714286,1,0
727002,2017-01-01,1,46,38,2017,1,1,1,False,54.0,47.0,43.000000,1,0
727003,2017-01-01,7,29,43,2017,1,1,1,False,54.0,44.0,38.714286,1,0
727004,2017-01-01,9,35,66,2017,1,1,1,False,59.0,66.0,56.428571,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909495,2017-12-31,9,34,21,2017,12,31,1,False,20.0,22.0,20.857143,0,1
909496,2017-12-31,10,34,32,2017,12,31,1,False,21.0,29.0,23.142857,0,1
909497,2017-12-31,1,35,55,2017,12,31,1,False,66.0,53.0,52.000000,0,1
909498,2017-12-31,8,33,100,2017,12,31,1,False,67.0,86.0,67.142857,0,1


In [48]:
features=['store', 'item', 'year', 'month', 'day', 'is_weekend', 'is_holiday',
            'sales_lag_1', 'sales_lag_7', 'rolling_mean_7', 'is_month_start', 'is_month_end']

In [49]:
x_train=train_df[features]
y_train=train_df['sales']
x_test=test_df[features]
y_test=test_df['sales']

In [61]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
import joblib

armia model 

In [80]:
def univariate_arima_forecast(train_df, test_df):
    sales_series = train_df.groupby('date')['sales'].sum()
    model = ARIMA(sales_series, order=(5, 1, 0))
    model_fit = model.fit()

    # Save model
    joblib.dump(model_fit, 'models/arima.pkl')

    # Forecast for test period
    steps = len(test_df['date'].unique())
    forecast = model_fit.forecast(steps=steps)

    # Assign forecast directly to sorted test dates
    forecast_full = test_df[['date']].drop_duplicates().copy()
    forecast_full = forecast_full.sort_values('date')
    forecast_full['forecast'] = forecast.values

    return forecast_full['forecast'].values, model_fit



linear regression

In [64]:
def multivariate_regression_forecast(train_df, test_df):
    features = ['store', 'item', 'year', 'month', 'day', 'is_weekend', 'is_holiday',
                'sales_lag_1', 'sales_lag_7', 'rolling_mean_7', 'is_month_start', 'is_month_end']
    
    X_train = train_df[features]
    y_train = train_df['sales']
    X_test = test_df[features]

    model = LinearRegression()
    model.fit(X_train, y_train)

    joblib.dump(model, 'models/linear_regression.pkl')
    y_pred = model.predict(X_test)

    return y_pred, model


Random Forest

In [65]:
def bagging_rf_forecast(train_df, test_df):
    features = ['store', 'item', 'year', 'month', 'day', 'is_weekend', 'is_holiday',
                'sales_lag_1', 'sales_lag_7', 'rolling_mean_7', 'is_month_start', 'is_month_end']
    
    X_train = train_df[features]
    y_train = train_df['sales']
    X_test = test_df[features]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    joblib.dump(model, 'models/random_forest.pkl')
    y_pred = model.predict(X_test)

    return y_pred, model


In [69]:
def boosting_xgb_forecast(train_df, test_df):
    features = ['store', 'item', 'year', 'month', 'day', 'is_weekend', 'is_holiday',
                'sales_lag_1', 'sales_lag_7', 'rolling_mean_7', 'is_month_start', 'is_month_end']
    
    X_train = train_df[features]
    y_train = train_df['sales']
    X_test = test_df[features]

    model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42,enable_categorical=True)
    model.fit(X_train, y_train)

    joblib.dump(model, 'models/xgboost.pkl')
    y_pred = model.predict(X_test)

    return y_pred, model


In [58]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
import numpy as np

In [59]:
def evaluate_model(y_true, y_pred):
    print(f"MAE:  {mean_absolute_error(y_true, y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.2f}")

In [81]:
# Forecasting and retrieving trained models
y_arima_test, arima_model = univariate_arima_forecast(train_df, test_df)
print("arima completed")
y_reg_test, lr_model = multivariate_regression_forecast(train_df, test_df)
print("linear completed")
y_rf_test, rf_model = bagging_rf_forecast(train_df, test_df)
print("random forest completed")
y_xgb_test, xgb_model = boosting_xgb_forecast(train_df, test_df)
print("xgboost completed")

# Evaluation
y_true = test_df['sales'].values


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


arima completed
linear completed
random forest completed
xgboost completed


In [82]:
# Drop NaNs and align
y_arima_test = pd.Series(y_arima_test)
valid_idx = ~y_arima_test.isna()
y_arima_test = y_arima_test[valid_idx]
y_true_aligned = y_true[valid_idx]



IndexError: boolean index did not match indexed array along axis 0; size of axis is 182500 but size of corresponding boolean axis is 365

In [83]:
print("y_arima_test shape:", len(y_arima_test))
print("y_arima_test head:\n", y_arima_test[:10])  # works even if it's a Series or array

print("y_true shape:", len(y_true))
print("y_true head:\n", y_true[:10])  # numpy-style indexing



y_arima_test shape: 365
y_arima_test head:
 0    20760.978172
1    20564.393152
2    20126.785727
3    20723.464566
4    21686.559038
5    22152.921755
6    21488.629178
7    21091.996541
8    20703.784173
9    20855.574112
dtype: float64
y_true shape: 182500
y_true head:
 [18 36 38 43 66 53 24 81 49 30]


In [84]:
results = pd.DataFrame({
    "Model": ["ARIMA", "LinearRegression", "RandomForest", "XGBoost"],
    "MAE": [
        mean_absolute_error(y_true, y_arima_test),
        mean_absolute_error(y_true, y_reg_test),
        mean_absolute_error(y_true, y_rf_test),
        mean_absolute_error(y_true, y_xgb_test)
    ],
    "RMSE": [
        np.sqrt(mean_squared_error(y_true, y_arima_test)),
        np.sqrt(mean_squared_error(y_true, y_reg_test)),
        np.sqrt(mean_squared_error(y_true, y_rf_test)),
        np.sqrt(mean_squared_error(y_true, y_xgb_test))
    ]
})
print(results)

ValueError: Found input variables with inconsistent numbers of samples: [182500, 365]

In [85]:
# Aggregate y_true to date-level
test_df['y_true'] = y_true
y_true_agg = test_df.groupby('date')['y_true'].sum().sort_index().values

# Now compare with y_arima_test
print(len(y_true_agg), len(y_arima_test))  # Both should be 365


365 365


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['y_true'] = y_true
