# 1. Library Import

# In [None]:
# Integrated Retail Analytics for Store Optimization
#
#
# 1. Problem Statement
# The primary objective of this project is to build and compare multiple machine learning models to accurately predict weekly sales for a retail store chain. The models will use a combination of historical sales data, store information, and external factors. The insights gained can be used for inventory management and strategic planning.
#
#
# 2. Data Exploration & Data Cleaning
#
# 1. Data Loading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb
import shap
import joblib

# Load the datasets
# Three CSV inputs: the sales records, the features table and the stores table.
sales_df = pd.read_csv('sales data-set.csv')
features_df = pd.read_csv('Features data set.csv')
stores_df = pd.read_csv('stores data-set.csv')

print("Sales Data Info:")
sales_df.info()
print("\nFeatures Data Info:")
features_df.info()
print("\nStores Data Info:")
stores_df.info()

# Merge the datasets
# Both joins are left joins so every sales row is kept. validate='many_to_one'
# makes pandas raise a MergeError if the right-hand table contains duplicate
# join keys; without it such duplicates would silently multiply sales rows and
# inflate the training data.
combined_df = pd.merge(
    sales_df,
    features_df,
    on=['Store', 'Date', 'IsHoliday'],
    how='left',
    validate='many_to_one',
)
final_df = pd.merge(
    combined_df,
    stores_df,
    on='Store',
    how='left',
    validate='many_to_one',
)

print("\nFinal Merged Data Info:")
final_df.info()


# 2. Handling Missing Values
# MarkDown1..MarkDown5 contain many NaN values. A missing markdown is treated
# as "no markdown event", so those gaps are filled with 0.
markdown_cols = [f'MarkDown{idx}' for idx in range(1, 6)]
final_df[markdown_cols] = final_df[markdown_cols].fillna(0)

# Whatever is still missing after the fill is assumed to come from the merge;
# any row that still has a NaN in any column is removed in place.
final_df.dropna(inplace=True)

print("\nData after handling missing values:")
final_df.info()


# 3. Data Type Conversion and Categorical Encoding
# Parse 'Date' from day/month/year strings into datetime values.
parsed_dates = pd.to_datetime(final_df['Date'], format='%d/%m/%Y')
final_df['Date'] = parsed_dates

# Replace the 'Type' labels with their integer category codes and turn the
# 'IsHoliday' flag into an integer column.
store_type_categories = final_df['Type'].astype('category')
final_df['Type'] = store_type_categories.cat.codes
final_df['IsHoliday'] = final_df['IsHoliday'].astype(int)

final_df.info()


# 3. Feature Engineering
# Derive calendar features from 'Date'. The columns are added in a fixed order
# (Year, Month, Week, DayOfWeek) because column order carries through to the
# feature matrix.
date_parts = final_df['Date'].dt
final_df['Year'] = date_parts.year
final_df['Month'] = date_parts.month
final_df['Week'] = date_parts.isocalendar().week.astype(int)  # ISO week number
final_df['DayOfWeek'] = date_parts.dayofweek

# The raw datetime column is not passed to the models, so remove it.
final_df = final_df.drop(columns=['Date'])

final_df.head()


# 4. Model Creation & Training
# Target is 'Weekly_Sales'; every other remaining column is used as a feature.
target_column = 'Weekly_Sales'
y = final_df[target_column]
X = final_df.drop(columns=[target_column])

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# NOTE(review): this is a random split, not a chronological one - confirm that
# is the intended evaluation scheme for weekly sales data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the three regressors, then fit them one after another on the same
# training data (LightGBM, Random Forest, XGBoost - in that order).
print("Training models...")
lgb_model = lgb.LGBMRegressor(random_state=42)
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

for estimator in (lgb_model, rf_model, xgb_model):
    estimator.fit(X_train, y_train)
print("Training complete.")


# 5. Model Evaluation
def _score_on_test_set(label, model):
    """Predict on X_test, print MAE and RMSE for `label`, and return them.

    Returns a (predictions, mae, rmse) tuple. RMSE is the square root of the
    mean squared error against y_test.
    """
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f"{label} MAE: {mae:.2f}")
    print(f"{label} RMSE: {rmse:.2f}")
    return predictions, mae, rmse


# Score every trained model on the held-out test set.
print("\n--- Model Evaluation ---")

lgb_pred, lgb_mae, lgb_rmse = _score_on_test_set("LightGBM", lgb_model)
rf_pred, rf_mae, rf_rmse = _score_on_test_set("Random Forest", rf_model)
xgb_pred, xgb_mae, xgb_rmse = _score_on_test_set("XGBoost", xgb_model)


# Determine the best model based on MAE
# The lowest test-set MAE wins. Candidates are checked in the order LightGBM,
# Random Forest; if neither matches the minimum, XGBoost is selected.
best_mae = min(lgb_mae, rf_mae, xgb_mae)
mae_candidates = (
    (lgb_mae, lgb_model, "LightGBM"),
    (rf_mae, rf_model, "Random Forest"),
)
best_model, best_model_name = next(
    (
        (model, label)
        for score, model, label in mae_candidates
        if score == best_mae
    ),
    (xgb_model, "XGBoost"),
)

print(f"\nBest performing model is: {best_model_name}")


# 6. Model Explainability
# Explain the best performing model using SHAP.
print(f"\n--- Feature Importance for {best_model_name} (using SHAP) ---")
explainer = shap.TreeExplainer(best_model)

# Computing SHAP values for every test row can be very slow for large tree
# ensembles, so the explained rows are capped. A fixed seed keeps the sample
# (and therefore the plots) reproducible. Test sets at or below the cap are
# used unchanged.
SHAP_MAX_ROWS = 2000
if len(X_test) > SHAP_MAX_ROWS:
    X_shap = X_test.sample(n=SHAP_MAX_ROWS, random_state=42)
else:
    X_shap = X_test
shap_values = explainer.shap_values(X_shap)

# Plot the SHAP summary plots: first the bar chart of mean absolute impact
# per feature, then the default summary plot.
shap.summary_plot(shap_values, X_shap, plot_type="bar")
shap.summary_plot(shap_values, X_shap)


# 7. Future Work
# 1. Persist the best-performing model with joblib so it can be deployed.
model_path = 'weekly_sales_model.joblib'
print(f"\nSaving the best model ({best_model_name}) to disk.")
joblib.dump(best_model, model_path)
print("Model saved successfully.")


# 2. Reload the saved model and run a few predictions as a sanity check.
# NOTE(review): joblib files are pickle-based; only load files you trust.
print("\nLoading the saved model for a sanity check.")
loaded_model = joblib.load(model_path)

# Use the first 5 rows of the test set as the sample to predict on.
sample_data = X_test.iloc[:5]
sample_predictions = loaded_model.predict(sample_data)

print("\nSample Predictions:")
print(sample_predictions)

# Print the true targets for the same 5 rows for side-by-side comparison.
print("\nActual Values:")
print(y_test.iloc[:5].values)


# Congrats! Your model has been successfully created and is ready for deployment on a live server for real user interaction!


# Conclusion
# This project successfully developed and evaluated multiple regression models to predict weekly sales. The LightGBM, Random Forest, and XGBoost models were trained and compared, with the best-performing model identified by its MAE score (RMSE was reported alongside for reference). The feature importance of the optimal model was visualized using SHAP, providing valuable insights into which factors most significantly influence sales. This predictive tool can be a powerful asset for retail management, enabling more informed decision-making regarding inventory, staffing, and marketing strategies.
