# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# --- 1. Data Loading and Exploration ---
# Read the three input tables from the working directory.
train_data = pd.read_csv("train.csv")
features_data = pd.read_csv("features.csv")
stores_data = pd.read_csv("stores.csv")

# Left joins keep every row of train.csv, even when features.csv or
# stores.csv has no matching row (the joined columns are then NaN).
data = pd.merge(train_data, features_data, on=['Store', 'Date', 'IsHoliday'], how='left')
data = pd.merge(data, stores_data, on=['Store'], how='left')

# Parse the date column once, after merging, so later grouping, sorting and
# indexing operate on real timestamps rather than strings.
data['Date'] = pd.to_datetime(data['Date'])

print(data.head())
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
data.info()
print(data.isnull().sum())

# Histogram of the individual weekly sales values (50 bins).
hist_fig, hist_ax = plt.subplots(figsize=(12, 6))
sns.histplot(data['Weekly_Sales'], bins=50, ax=hist_ax)
hist_ax.set_title('Distribution of Weekly Sales')
plt.show()

# Line plot of sales summed over all rows that share the same date.
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
total_sales_by_date = data.groupby('Date')['Weekly_Sales'].sum()
total_sales_by_date.plot(ax=trend_ax)
trend_ax.set_title('Weekly Sales Over Time')
trend_ax.set_ylabel('Total Weekly Sales')
plt.show()

# --- 2. Data Preprocessing ---
# Forward-fill missing values down the rows. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill') spelling and fills exactly the same cells.
# NOTE(review): the fill runs over the whole frame in row order, so a gap at
# the start of one (Store, Dept) series is filled from the last row of the
# series stored above it — confirm that is acceptable for the feature columns.
data.ffill(inplace=True)

def test_stationarity(timeseries):
    """Run an Augmented Dickey-Fuller test on ``timeseries`` and print the outcome.

    The lag length is chosen by ``adfuller`` with ``autolag='AIC'``. Three
    items of the returned tuple are printed: the test statistic (item 0),
    the p-value (item 1) and the critical values (item 4). Nothing is
    returned.
    """
    adf_output = adfuller(timeseries, autolag='AIC')
    adf_statistic, p_value, critical_values = adf_output[0], adf_output[1], adf_output[4]
    print('ADF Statistic:', adf_statistic)
    print('p-value:', p_value)
    print('Critical Values:', critical_values)

# Check stationarity on one example series: the sales of Store 1, Dept 1.
is_store_1 = data['Store'] == 1
is_dept_1 = data['Dept'] == 1
store_dept_sales = data.loc[is_store_1 & is_dept_1, 'Weekly_Sales']
test_stationarity(store_dept_sales)

# --- 3. Feature Engineering ---
# Lag_k is the Weekly_Sales value observed k rows earlier in the same
# (Store, Dept) series. Sorting by Date before grouping makes "k rows earlier"
# mean "k observations earlier in time" whatever the row order of the file;
# the shifted values are aligned back onto `data` by index label, so the row
# order of `data` itself is left untouched.
sales_by_series = (
    data.sort_values('Date', kind='stable')
    .groupby(['Store', 'Dept'])['Weekly_Sales']
)
for lag in range(1, 5):
    data[f'Lag_{lag}'] = sales_by_series.shift(lag)

# The first k rows of every series have no k-th predecessor and are left as
# NaN on purpose. A frame-wide forward fill would copy the tail of the
# previous (Store, Dept) series into them, which is exactly what the grouped
# shift is meant to prevent. Drop or impute these rows explicitly before
# feeding the lag columns to a model that cannot handle NaN.

# --- 4. Model Selection and Training ---
# Isolate a single (Store, Dept) series, indexed and ordered by date.
store_id = 1
dept_id = 1
series_mask = (data['Store'] == store_id) & (data['Dept'] == dept_id)
store_dept_data = data[series_mask].copy().set_index('Date').sort_index()

# The last four rows are held out for evaluation; all earlier rows are used
# for fitting.
train_data = store_dept_data.iloc[:-4]
test_data = store_dept_data.iloc[-4:]

# Options for the pmdarima order search on the training sales.
auto_arima_options = dict(
    seasonal=False,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
)
model = pm.auto_arima(train_data['Weekly_Sales'], **auto_arima_options)

print(model.summary())

# Fit a statsmodels ARIMA on the same training sales, reusing the (p, d, q)
# order selected by the search.
best_order = model.order
arima_model = ARIMA(train_data['Weekly_Sales'], order=best_order)
arima_model_fit = arima_model.fit()

# --- 5. Model Evaluation ---
# Predict the positions immediately after the training sample, one per
# held-out row.
n_train, n_test = len(train_data), len(test_data)
raw_forecast = arima_model_fit.predict(start=n_train, end=n_train + n_test - 1)

# Re-label the predictions with the held-out dates. statsmodels only returns a
# date-indexed forecast when it can attach a frequency to the training index;
# otherwise the result is indexed by integer position and would be drawn at
# the wrong x positions in the plot below.
forecast = pd.Series(np.asarray(raw_forecast), index=test_data.index, name='Forecast')

# Error metrics of the forecast against the held-out sales.
mae = mean_absolute_error(test_data['Weekly_Sales'], forecast)
mse = mean_squared_error(test_data['Weekly_Sales'], forecast)
rmse = np.sqrt(mse)

print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')

# Training history, held-out actuals and forecast on one shared date axis.
plt.figure(figsize=(12, 6))
plt.plot(train_data['Weekly_Sales'], label='Train')
plt.plot(test_data['Weekly_Sales'], label='Test')
plt.plot(forecast, label='Forecast')
plt.legend()
plt.title('Weekly Sales Forecast')
plt.show()

# --- 6. Forecasting (Future Predictions) ---
# Refit the selected order on the full history before looking ahead. The
# evaluation model was trained without the last four observations, so asking
# it for the periods after the full series would produce 5- to 8-step-ahead
# forecasts that ignore the most recent data.
n_future_weeks = 4
final_model_fit = ARIMA(store_dept_data['Weekly_Sales'], order=best_order).fit()

# Continue the series in 7-day steps from the last observed date. A plain
# 'W' frequency is anchored on Sundays and would move every future date to
# the following Sunday instead of keeping the weekday of the data.
future_dates = pd.date_range(
    start=store_dept_data.index[-1] + pd.DateOffset(weeks=1),
    periods=n_future_weeks,
    freq='7D',
    name='Date',
)

# Attach the future dates directly to the forecast values so the result does
# not depend on whatever index statsmodels put on its output.
future_forecast = pd.Series(
    np.asarray(final_model_fit.forecast(steps=n_future_weeks)),
    index=future_dates,
    name='Forecast',
)

# One 'Forecast' column indexed by 'Date'.
future_forecast_df = future_forecast.to_frame()

print(future_forecast_df)

plt.figure(figsize=(12, 6))
plt.plot(store_dept_data['Weekly_Sales'], label='Historical')
plt.plot(future_forecast_df['Forecast'], label='Future Forecast')
plt.legend()
plt.title('Weekly Sales Forecast (Including Future)')
plt.show()