In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the dataset: read the CSV, parse the 'Date' column into datetimes,
# and promote it to the frame's index in one chained expression.
data = (
    pd.read_csv(r"E:\FABIZ\MASTER II\data analysis project\retail_store_inventory.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Column to predict, followed by the explanatory columns fed to both models.
target = 'Units Sold'
features = [
    'Store ID',
    'Product ID',
    'Category',
    'Region',
    'Inventory Level',
    'Units Ordered',
    'Demand Forecast',
    'Price',
    'Discount',
    'Weather Condition',
    'Holiday/Promotion',
    'Competitor Pricing',
    'Seasonality',
]

# Diagnostics first: drop any configured feature that is absent from the frame
# BEFORE touching the columns. Indexing data[col] for a missing column raises
# KeyError, so this check has to run ahead of the cleaning loop to be useful.
missing_cols = [col for col in features if col not in data.columns]
if missing_cols:
    print(f"Missing columns: {missing_cols}")
    features = [col for col in features if col not in missing_cols]  # Drop missing columns
else:
    print("All feature columns are present.")

# Make every remaining feature column numeric and free of missing values.
for col in features:
    if data[col].isnull().all():
        # Completely empty column (of any dtype): nothing to learn from, use 0.
        data[col] = 0
    elif not pd.api.types.is_numeric_dtype(data[col]):
        # Text/categorical column: encode each distinct value as an integer code.
        # A dtype == 'object' comparison would miss text columns stored with
        # pandas' dedicated string dtype, which would then be coerced to NaN.
        # NOTE: pd.factorize encodes missing entries as -1.
        data[col] = pd.factorize(data[col])[0]
    else:
        # Numeric column: coerce defensively, then fill gaps with the median.
        data[col] = pd.to_numeric(data[col], errors='coerce')
        data[col] = data[col].fillna(data[col].median())  # Fill NaN values with median

print(data[features].isnull().sum())  # Check for NaN in each column

# Final safety net: any feature still holding NaN or +/-inf is reported and
# has those entries replaced with 0.
for col in features:
    has_nan = data[col].isnull().any()
    if has_nan or np.isinf(data[col]).any():
        print(f"Warning: {col} contains NaN or infinite values! Fixing...")
        # NaN first, then both infinities, each mapped to 0.
        data[col] = data[col].fillna(0).replace([np.inf, -np.inf], 0)

print("Data is clean and ready for modeling!")

# Train-Test Split (first 80% of rows for training, remaining 20% for testing)
# The split is positional, so it is only a valid "past vs. future" split when
# the rows are in chronological order. Nothing above guarantees the CSV is
# sorted, so sort by the date index when needed. The sort is stable, keeping
# rows that share a date in their original relative order, and is skipped
# entirely when the index is already non-decreasing.
ordered = data if data.index.is_monotonic_increasing else data.sort_index(kind="stable")

train_size = int(len(ordered) * 0.8)
train = ordered.iloc[:train_size]
test = ordered.iloc[train_size:]

X_train, X_test = train[features], test[features]
y_train, y_test = train[target], test[target]

# SARIMAX Forecast
# Non-seasonal order (1, 1, 1) plus a seasonal (1, 1, 1) component with
# period 12; the feature matrix is supplied as exogenous regressors.
sarimax_model = SARIMAX(
    endog=y_train,
    exog=X_train,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
)
sarimax_fit = sarimax_model.fit(disp=False)  # disp=False silences optimizer output

# Forecast one step per held-out row, passing the matching exogenous values.
sarimax_forecast = sarimax_fit.forecast(steps=X_test.shape[0], exog=X_test)

# Machine Learning Model (Random Forest)
# 100 trees with a fixed seed; fit() returns the estimator itself, so
# construction and training are chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_forecast = rf_model.predict(X_test)

# Model Evaluation (SARIMAX): mean absolute error and root mean squared error
# of the forecast against the held-out target values.
mae_sarimax = mean_absolute_error(y_test, sarimax_forecast)
mse_sarimax = mean_squared_error(y_test, sarimax_forecast)
rmse_sarimax = np.sqrt(mse_sarimax)
print(f"SARIMAX Model MAE: {mae_sarimax:.2f}, RMSE: {rmse_sarimax:.2f}")

# Model Evaluation (Random Forest): same two metrics for the tree ensemble.
mae_rf = mean_absolute_error(y_test, rf_forecast)
mse_rf = mean_squared_error(y_test, rf_forecast)
rmse_rf = np.sqrt(mse_rf)
print(f"Random Forest Model MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}")

# Visualization: overlay the actual target and both forecasts on the
# test-period index.
plt.figure(figsize=(14, 7))
for series, series_label, series_marker in (
    (y_test, "Actual Units Sold", 'o'),
    (sarimax_forecast, "SARIMAX Forecast", 'x'),
    (rf_forecast, "Random Forest Forecast", 's'),
):
    plt.plot(test.index, series, label=series_label, marker=series_marker)
plt.title("Actual vs Forecasted Units Sold")
plt.xlabel("Date")
plt.ylabel("Units Sold")
plt.legend()
plt.show()


All feature columns are present.
Store ID              0
Product ID            0
Category              0
Region                0
Inventory Level       0
Units Ordered         0
Demand Forecast       0
Price                 0
Discount              0
Weather Condition     0
Holiday/Promotion     0
Competitor Pricing    0
Seasonality           0
dtype: int64
Data is clean and ready for modeling!


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
