# =========================================================================
# Restaurant Visitor Demand Forecasting
# =========================================================================
# Goal: Predict daily restaurant visitors using historical attendance,
# calendar effects (day of week, holidays), and restaurant attributes
# (genre, location, area).
# Key Tasks:
#   1. Load & clean data from air_visit_data, air_store_info, date_info.
#   2. Fix mojibake (garbled Japanese place names) in text columns.
#   3. Explore and visualize visitors over time and by key categories.
#   4. Engineer time-based and lag features (previous-day visitors, etc.).
#   5. Build preprocessing pipelines for numeric + categorical variables.
#   6. Train and compare multiple models (Linear, Ridge, Random Forest).
#   7. Evaluate on a time-based validation split and create diagnostics.
# Tools: Python, pandas, numpy, matplotlib, seaborn, scikit-learn, joblib
# Dataset: Kaggle (Restaurant Visitor dataset)

# In [None]:
# ---------- IMPORT LIBRARIES ----------
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# In [None]:
# ---------- PROJECT SETUP ----------
# Every artifact folder is a direct child of the current working directory.
project_dir = Path.cwd()

fig_dir = project_dir / "figures"
models_dir = project_dir / "models"
outputs_dir = project_dir / "outputs"

# Create the folders up front; a folder that already exists is left alone.
for artifact_dir in (fig_dir, models_dir, outputs_dir):
    artifact_dir.mkdir(exist_ok=True)

def save_plot(filename, width=8, height=5, dpi=300):
    """Resize the current Matplotlib figure and save it into figures/.

    Args:
        filename: Name of the image file, joined onto ``fig_dir``.
        width: Figure width in inches, applied before saving.
        height: Figure height in inches, applied before saving.
        dpi: Resolution forwarded to ``plt.savefig``.
    """
    current_fig = plt.gcf()
    current_fig.set_size_inches(width, height)
    target = fig_dir / filename
    plt.savefig(target, dpi=dpi, bbox_inches="tight")

def save_output(df, filename):
    """Write *df* as CSV (without the index) into outputs/ and return the path.

    Args:
        df: DataFrame to serialize.
        filename: Name of the CSV file, joined onto ``outputs_dir``.

    Returns:
        The full path the CSV was written to.
    """
    destination = outputs_dir / filename
    df.to_csv(destination, index=False)
    print(f"Saved output: {destination}")
    return destination

# -------------------- PLOT STYLE --------------------
sns.set(style="whitegrid")

# Font-size overrides per rc group, applied after the seaborn call above
# and in the same order as the individual plt.rc(...) calls they replace.
for rc_group, rc_settings in (
    ("font", {"size": 12}),
    ("axes", {"labelsize": 12, "titlesize": 13}),
    ("legend", {"fontsize": 10}),
    ("xtick", {"labelsize": 10}),
    ("ytick", {"labelsize": 10}),
):
    plt.rc(rc_group, **rc_settings)

# In [None]:
# -------------------- LOAD DATA --------------------
# The three input CSVs are expected directly inside the project directory.
air_visit_path = project_dir / "air_visit_data.csv"
air_store_path = project_dir / "air_store_info.csv"
date_info_path = project_dir / "date_info.csv"

air_visit = pd.read_csv(air_visit_path)
air_store = pd.read_csv(air_store_path)
date_info = pd.read_csv(date_info_path)

loaded_frames = (
    ("air_visit_data", air_visit),
    ("air_store_info", air_store),
    ("date_info", date_info),
)

# Quick look at the first rows of every table.
for label, frame in loaded_frames:
    print(f"{label} head:")
    print(frame.head(), "\n")

# Column dtypes / non-null counts of every table.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" after each report).
for label, frame in loaded_frames:
    print(f"{label} info:")
    frame.info()
    print()

# In [None]:
# ---------- FIX MOJIBAKE (GARBLED TEXT IN AREA/GENRE NAMES) ----------
def fix_mojibake(text):
    """
    Fix UTF-8 text that was incorrectly decoded as Latin-1.

    The text is re-encoded to its Latin-1 bytes and those bytes are decoded
    as UTF-8. If that round trip is not possible, the original value is
    returned unchanged.

    Args:
        text: Value to repair. Anything that is not a ``str`` is returned
            as-is.

    Returns:
        The repaired string, or ``text`` itself when it is not a ``str``,
        contains characters outside Latin-1, or its Latin-1 bytes are not
        valid UTF-8.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode("latin1").decode("utf8")
    except UnicodeError:
        # Covers both UnicodeEncodeError (character not representable in
        # Latin-1) and UnicodeDecodeError (bytes are not valid UTF-8):
        # in either case the text is not mojibake of the expected kind.
        return text

# Repair garbled text in both descriptive columns of the store table.
for text_col in ("air_area_name", "air_genre_name"):
    air_store[text_col] = air_store[text_col].astype(str).apply(fix_mojibake)

print("Unique air_area_name samples after mojibake fix:")
print(air_store["air_area_name"].drop_duplicates().head(15), "\n")

# ---------- MERGE DATASETS ----------
# Convert dates so both tables join on real datetime values
air_visit["visit_date"] = pd.to_datetime(air_visit["visit_date"])
date_info["calendar_date"] = pd.to_datetime(date_info["calendar_date"])

# Rename calendar_date to visit_date so it matches the join key in air_visit
date_info_renamed = date_info.rename(columns={"calendar_date": "visit_date"})

# Merge: visit data + store info + calendar.
# Left joins keep every visit row even when store or calendar info is missing.
df = (
    air_visit
    .merge(air_store, on="air_store_id", how="left")
    .merge(date_info_renamed, on="visit_date", how="left")
)

# Chronological order within each store; the lag features built later rely on it.
df = df.sort_values(["air_store_id", "visit_date"]).reset_index(drop=True)

print("Merged dataset head:")
print(df.head(), "\n")
print("Merged dataset info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" after the report).
df.info()
print()

# In [None]:
# ---------- EXPLORATORY DATA ANALYSIS (EDA) ----------
print("Descriptive statistics for visitors:")
print(df["visitors"].describe(), "\n")

# --- Histogram of visitors (zoom to 99th percentile) ---
# The x-range is capped at the 99th percentile so the long right tail
# does not squash the bulk of the distribution.
upper_vis = df["visitors"].quantile(0.99)

plt.figure()
plt.hist(df["visitors"], bins=40, range=(0, upper_vis), edgecolor="black")
plt.title("Distribution of Daily Visitors (0–99th percentile)")
plt.xlabel("Visitors (count per day)")
plt.ylabel("Frequency")
plt.tight_layout()
save_plot("visitors_histogram.png", width=12, height=5, dpi=300)
plt.show()

# --- Mean visitors per weekday, shown Monday -> Sunday ---
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]
mean_by_weekday = df.groupby("day_of_week")["visitors"].mean()
visitors_by_dow = mean_by_weekday.reindex(weekday_order)

plt.figure()
visitors_by_dow.plot.bar()
plt.title("Average Visitors by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Average Visitors")
plt.tight_layout()
save_plot("visitors_by_day_of_week.png")
plt.show()

# --- Mean visitors split by the holiday flag ---
plt.figure()
mean_by_holiday = df.groupby("holiday_flg")["visitors"].mean()
mean_by_holiday.plot.bar()
plt.title("Average Visitors: Holiday vs Non-Holiday")
plt.xlabel("holiday_flg (0 = Non-Holiday, 1 = Holiday)")
plt.ylabel("Average Visitors")
plt.tight_layout()
save_plot("visitors_by_holiday.png")
plt.show()

# --- Ten genres with the most visit records ---
genre_counts = df["air_genre_name"].value_counts()
top_genres = genre_counts.head(10)

plt.figure()
top_genres.plot.bar()
plt.title("Top 10 Restaurant Genres (Count of Records)")
plt.xlabel("Genre")
plt.ylabel("Count")
plt.tight_layout()
save_plot("top_genres_count.png", width=9, height=5, dpi=300)
plt.show()

# --- Per-date mean of visitors over all stores, as a time series ---
daily_visitors = df.groupby("visit_date")["visitors"].mean()

plt.figure()
daily_visitors.plot()
plt.title("Average Daily Visitors Over Time (All Stores)")
plt.xlabel("Date")
plt.ylabel("Average Visitors")
plt.tight_layout()
save_plot("avg_daily_visitors_over_time.png", width=10, height=5, dpi=300)
plt.show()

# In [None]:
# ---------- FEATURE ENGINEERING ----------
# Calendar parts derived from the visit date.
df["year"] = df["visit_date"].dt.year
df["month"] = df["visit_date"].dt.month
df["day"] = df["visit_date"].dt.day
df["dayofweek_num"] = df["visit_date"].dt.dayofweek
# dayofweek is 0 = Monday ... 6 = Sunday, so 5 and 6 are Saturday/Sunday.
df["is_weekend"] = df["dayofweek_num"].isin([5, 6]).astype(int)

# Lag features: 1-row and 7-row lags, plus a rolling mean of the previous
# 7 rows, all computed per store.
# NOTE(review): shift() moves by rows, not by calendar days, so these are
# "previous recorded visit" lags if a store has gaps in its dates — confirm
# that this is the intended meaning.
df = df.sort_values(["air_store_id", "visit_date"])

store_visitors = df.groupby("air_store_id")["visitors"]

df["visitors_lag1"] = store_visitors.shift(1)
df["visitors_lag7"] = store_visitors.shift(7)

# The shift AND the rolling window must both run inside each store's group.
# groupby(...).shift(1) returns a plain Series, so chaining .rolling() onto it
# would let the window of a store's first rows reach back into the last rows
# of the previous store in the sorted frame.
df["visitors_roll7"] = store_visitors.transform(
    lambda s: s.shift(1).rolling(window=7, min_periods=1).mean()
)

feature_cols = [
    "visit_date",
    "air_store_id",
    "visitors",
    "year", "month", "day", "dayofweek_num", "is_weekend",
    "holiday_flg",
    "latitude", "longitude",
    "visitors_lag1", "visitors_lag7", "visitors_roll7",
    "air_genre_name", "air_area_name", "day_of_week"
]

# Rows without a full lag history (NaN lags) are removed here.
df_model = df[feature_cols].dropna().reset_index(drop=True)

print("Modeling dataset shape after lag feature creation and dropna:", df_model.shape, "\n")

# In [None]:
# ---------- TRAIN / VALIDATION SPLIT (TIME-BASED) ----------
# The cut-off is the 0.8 quantile of visit_date: rows on or before it are
# used for training, rows after it for validation.
split_date = df_model["visit_date"].quantile(0.8)
train_mask = df_model["visit_date"] <= split_date
valid_mask = df_model["visit_date"] > split_date

train_df = df_model.loc[train_mask].copy()
valid_df = df_model.loc[valid_mask].copy()

print("Train date range:", train_df["visit_date"].min(), "->", train_df["visit_date"].max())
print("Valid date range:", valid_df["visit_date"].min(), "->", valid_df["visit_date"].max(), "\n")

target_col = "visitors"

# Identifier columns and the target itself are not model inputs.
drop_cols = ["visit_date", "air_store_id", target_col]

X_train, y_train = train_df.drop(columns=drop_cols), train_df[target_col]
X_valid, y_valid = valid_df.drop(columns=drop_cols), valid_df[target_col]

# Columns routed to the numeric branch of the preprocessing step.
numeric_features = [
    "year",
    "month",
    "day",
    "dayofweek_num",
    "is_weekend",
    "holiday_flg",
    "latitude",
    "longitude",
    "visitors_lag1",
    "visitors_lag7",
    "visitors_roll7",
]

# Columns routed to the categorical branch of the preprocessing step.
categorical_features = ["air_genre_name", "air_area_name", "day_of_week"]

# In [None]:
# ---------- PREPROCESSING PIPELINE ----------
# Numeric columns are standardized.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are one-hot encoded into a dense array; categories not
# seen during fit are ignored at transform time instead of raising.
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_transformer = Pipeline(steps=[("onehot", onehot_encoder)])

# Route each column group to its transformer; any other column is dropped.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessing = ColumnTransformer(transformers=column_branches, remainder="drop")

def rmse(y_true, y_pred):
    """Return the root mean squared error between *y_true* and *y_pred*."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# In [None]:
# ---------- MODEL TRAINING & EVALUATION ----------
# Candidate regressors, each trained behind the same preprocessing recipe.
models = [
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge(alpha=1.0, random_state=None)),
    ("RandomForest", RandomForestRegressor(
        n_estimators=150,
        max_depth=12,
        random_state=42,
        n_jobs=-1
    ))
]

results = []            # one metrics dict per model
fitted_pipelines = {}   # model name -> (fitted pipeline, validation predictions)

for name, model in models:
    print(f"\nTraining {name}...")
    # Give every pipeline its own unfitted copy of the preprocessing step.
    # Sharing the single `preprocessing` instance would mean that refitting
    # any one stored pipeline also changes the transformer used by the others.
    pipeline = Pipeline(steps=[
        ("preprocess", clone(preprocessing)),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_valid_pred = pipeline.predict(X_valid)

    train_rmse = rmse(y_train, y_train_pred)
    valid_rmse = rmse(y_valid, y_valid_pred)
    valid_r2 = r2_score(y_valid, y_valid_pred)

    print(f"{name} - Train RMSE: {train_rmse:.3f}")
    print(f"{name} - Valid RMSE: {valid_rmse:.3f}")
    print(f"{name} - Valid R²:   {valid_r2:.3f}")

    results.append({
        "model": name,
        "train_rmse": train_rmse,
        "valid_rmse": valid_rmse,
        "valid_r2": valid_r2
    })

    # Keep the validation predictions alongside the pipeline so later
    # diagnostics do not need to call predict() again.
    fitted_pipelines[name] = (pipeline, y_valid_pred)

results_df = pd.DataFrame(results)
print("\nModel performance summary:")
print(results_df, "\n")

save_output(results_df, "model_performance_summary.csv")

# In [None]:
# ---------- DIAGNOSTIC PLOTS FOR BEST MODEL ----------
# The "best" model is the one with the lowest validation RMSE.
best_idx = results_df["valid_rmse"].idxmin()
best_row = results_df.loc[best_idx]
best_model_name = best_row["model"]
best_pipeline, best_valid_pred = fitted_pipelines[best_model_name]

print(f"Best model based on validation RMSE: {best_model_name}")

# Predicted vs True scatter (zoomed)
# Both axes are clipped to the same fixed range; the dashed red diagonal
# marks where predicted == true.
axis_limits = (0, 150)

plt.figure()
plt.scatter(y_valid, best_valid_pred, alpha=0.3)
plt.xlim(*axis_limits)
plt.ylim(*axis_limits)
plt.plot(list(axis_limits), list(axis_limits), "r--", linewidth=2)
plt.xlabel("True Visitors")
plt.ylabel("Predicted Visitors")
plt.title(f"{best_model_name} - Predicted vs True Visitors (Validation, Zoomed)")
plt.tight_layout()
save_plot(f"{best_model_name}_pred_vs_true_zoomed.png", width=7, height=6, dpi=300)
plt.show()

# --- Residual histogram (zoom to 99th percentile of |residuals|) ---
residuals = y_valid - best_valid_pred
abs_residuals = np.abs(residuals)
res_cap = np.quantile(abs_residuals, 0.99)

plt.figure()
plt.hist(residuals, bins=40, range=(-res_cap, res_cap), edgecolor="black")
plt.title(f"{best_model_name} - Residuals Distribution (Validation, ±99th pct)")
plt.xlabel("Residual (True - Predicted visitors)")
plt.ylabel("Frequency")
plt.tight_layout()
save_plot(f"{best_model_name}_residuals_hist.png", width=10, height=6, dpi=300)
plt.show()

# Time plot of actual vs predicted (validation period, averaged across stores)
# assign() returns a new frame, so valid_df itself stays untouched.
valid_df_plot = valid_df.assign(predicted_visitors=best_valid_pred)
by_date = valid_df_plot.groupby("visit_date")

daily_actual = by_date["visitors"].mean()
daily_pred = by_date["predicted_visitors"].mean()

plt.figure()
daily_actual.plot(label="Actual", linewidth=2)
daily_pred.plot(label="Predicted", linewidth=2)
plt.title(f"{best_model_name} - Average Visitors Over Time (Validation)")
plt.xlabel("Date")
plt.ylabel("Average Visitors")
plt.legend()
plt.tight_layout()
save_plot(f"{best_model_name}_daily_actual_vs_pred.png", width=10, height=5, dpi=300)
plt.show()

# In [None]:
# ---------- FIT BEST MODEL ON FULL DATA ----------
# Refit the selected pipeline in place on every modelling row
# (training and validation periods together).
y_full = df_model[target_col]
X_full = df_model.drop(columns=drop_cols)

best_pipeline.fit(X_full, y_full)

# --- SAVE BEST MODEL ---
# The whole pipeline (preprocessing + regressor) is persisted as one object.
model_path = models_dir / "restaurant_visitors_best_model.pkl"
joblib.dump(best_pipeline, model_path)
print(f"\nSaved best model to: {model_path}")

# In [None]:
# ---------- DEMO PREDICTIONS ----------
# NOTE(review): when this runs after the full-data refit of best_pipeline,
# these validation rows were part of the data the model was fitted on, so the
# numbers below illustrate usage rather than out-of-sample accuracy.
demo_sample = X_valid.head(5)
demo_preds = best_pipeline.predict(demo_sample)
demo_actual = y_valid.head(5).values

print("\nDemo predictions (first 5 validation rows):")
print("Predicted:", np.round(demo_preds, 2))
print("Actual   :", np.round(demo_actual, 2))

# -------------------- PIPELINE COMPLETE --------------------
# Recap of the workflow steps, printed one bullet per line.
summary_lines = [
    "- Project setup (figures/, outputs/, models/ directories)",
    "- Data loading & validation (air_visit_data, air_store_info, date_info)",
    "- Text cleaning (mojibake fix for area/genre names)",
    "- Dataset merge (visits + store metadata + calendar)",
    "- Exploratory analysis (visitor distributions, seasonality, holidays, genres)",
    "- Feature engineering (date parts, weekend flag, lag & rolling features)",
    "- Time-based train/validation split",
    "- Preprocessing pipeline (StandardScaler + OneHotEncoder)",
    "- Model training & comparison (Linear, Ridge, Random Forest)",
    "- Evaluation on validation set (RMSE, R², diagnostics plots)",
    "- Best model refit on full dataset",
    f"- Final regression pipeline saved as {model_path.name}",
    "- All artifacts saved to /figures, /outputs, and /models",
]

print("\nPipeline complete:")
for summary_line in summary_lines:
    print(summary_line)