In [1]:
# =============================================================================
# FINAL SUBMISSION FOR IITM MLP T3-2025 KAGGLE CONTEST: CINEMA AUDIENCE FORECASTING!
# =============================================================================

import pandas as pd
import numpy as np
from sklearn.metrics import r2_score


# =============================================================================
# 1. LOAD AND PREPARE DATA
#    - Load main visits and calendar information
#    - Clean extreme outliers per theater
#    - Add basic date-related features
# =============================================================================

print("\n[1/6] Loading data...")

base_path = "/kaggle/input/Cinema_Audience_Forecasting_challenge/"

# Load historical audience data
df = pd.read_csv(
    f"{base_path}booknow_visits/booknow_visits.csv",
    parse_dates=["show_date"]
)

# Load date information (holidays etc.)
date_info = pd.read_csv(
    f"{base_path}date_info/date_info.csv",
    parse_dates=["show_date"]
)

# Sort by theater and date
df = df.sort_values(["book_theater_id", "show_date"])

# Remove extreme outliers within each theater (1st–99th percentile clipping)
for theater in df["book_theater_id"].unique():
    mask = df["book_theater_id"] == theater
    p99 = df.loc[mask, "audience_count"].quantile(0.99)
    p1 = df.loc[mask, "audience_count"].quantile(0.01)
    df.loc[mask, "audience_count"] = df.loc[mask, "audience_count"].clip(p1, p99)

# Basic date features
df["dow"] = df["show_date"].dt.dayofweek
df["month"] = df["show_date"].dt.month
df["day"] = df["show_date"].dt.day

# Merge with holiday/date metadata
df = df.merge(date_info, on="show_date", how="left")

# Weekend and holiday indicators
df["is_weekend"] = df["dow"].isin([5, 6]).astype(int)
df["is_holiday"] = (df.get("holiday_flg", pd.Series(0, index=df.index)) == 1).astype(int)

print(f"  Loaded {len(df)} records")

# =============================================================================
# 2. FEATURE ENGINEERING
#    - Generate lag, rolling, and EMA features per theater
#    - Add simple momentum-based features
# =============================================================================

print("[2/6] Engineering features...")

df = df.sort_values(["book_theater_id", "show_date"])
grp = df.groupby("book_theater_id")["audience_count"]

# Lag features
df["lag_1"] = grp.shift(1)
df["lag_3"] = grp.shift(3)
df["lag_7"] = grp.shift(7)
df["lag_14"] = grp.shift(14)

# Rolling averages (shifted by 1 day to avoid leakage)
df["rolling_3"] = grp.rolling(3, min_periods=1).mean().shift(1).values
df["rolling_7"] = grp.rolling(7, min_periods=1).mean().shift(1).values
df["rolling_14"] = grp.rolling(14, min_periods=1).mean().shift(1).values
df["rolling_30"] = grp.rolling(30, min_periods=1).mean().shift(1).values

# Exponential moving averages
df["ema_7"] = grp.apply(lambda x: x.ewm(span=7, min_periods=1).mean().shift(1)) \
                 .reset_index(level=0, drop=True)
df["ema_14"] = grp.apply(lambda x: x.ewm(span=14, min_periods=1).mean().shift(1)) \
                  .reset_index(level=0, drop=True)

# Momentum-type features
df["diff_1"] = df["audience_count"] - df["lag_1"]
df["diff_7"] = df["audience_count"] - df["lag_7"]
df["momentum_7_14"] = df["rolling_7"] - df["rolling_14"]

# Month-start / end indicators
df["is_month_start"] = (df["day"] <= 7).astype(int)
df["is_month_end"] = (df["day"] >= 24).astype(int)

print("  Feature engineering completed.")

# =============================================================================
# 3. PROCESS BOOKINGS DATA
#    - Aggregate bookings into daily booking statistics
#    - Merge booking information with visits data
# =============================================================================

print("[3/6] Processing bookings...")

bookings = pd.read_csv(f"{base_path}booknow_booking/booknow_booking.csv")
bookings["show_date"] = pd.to_datetime(bookings["show_datetime"]).dt.date
bookings["show_date"] = pd.to_datetime(bookings["show_date"])

# Aggregate bookings per theater per date
book_agg = bookings.groupby(["book_theater_id", "show_date"])["tickets_booked"].agg([
    ("tickets_booked", "sum"),
    ("booking_count", "count"),
    ("avg_booking", "mean")
]).reset_index()

# Merge with main DataFrame
df = df.merge(book_agg, on=["book_theater_id", "show_date"], how="left")
df[["tickets_booked", "booking_count", "avg_booking"]] = \
    df[["tickets_booked", "booking_count", "avg_booking"]].fillna(0)

# Booking trend (7-day rolling)
df["tickets_rolling_7"] = df.groupby("book_theater_id")["tickets_booked"].transform(
    lambda x: x.shift(1).rolling(7, min_periods=1).mean()
).fillna(0)

print("  Bookings merged successfully.")

# =============================================================================
# 4. THEATER-LEVEL STATISTICS
#    - Compute long-term averages per theater per weekday/month
#    - Used as baseline predictors
# =============================================================================

print("[4/6] Computing theater statistics...")

theater_weekday_means = df.groupby(["book_theater_id", "dow"])["audience_count"].mean()
theater_month_means = df.groupby(["book_theater_id", "month"])["audience_count"].mean()
theater_means = df.groupby("book_theater_id")["audience_count"].mean()
global_mean = df["audience_count"].mean()

# Soft estimate of each theater’s effective capacity (95th percentile)
theater_capacity = df.groupby("book_theater_id")["audience_count"].quantile(0.95)

print(f"  Global mean audience: {global_mean:.1f}")

# =============================================================================
# 5. RULE-BASED PREDICTION FUNCTION
#    - Weighted combination of lag, rolling, booking and baseline theater stats
#    - Includes adjustments for holidays, weekends, month-end, and booking surges
# =============================================================================

weights = {
    "w_base_dow": 0.50,
    "w_base_month": 0.15,
    "w_rolling_7": 0.10,
    "w_rolling_14": 0.06,
    "w_rolling_30": 0.02,
    "w_lag_1": 0.05,
    "w_lag_7": 0.03,
    "w_lag_14": 0.02,
    "w_tickets": 0.30,
    "w_ema_7": 0.04,
    "w_ema_14": 0.02,
    "w_momentum": 0.02,
    "w_diff": 0.02,
    "w_holiday_boost": 0.08,
    "w_weekend_boost": 0.05
}

def predict(row):
    """Generate prediction using weighted statistical and temporal features."""
    
    key_dow = (row["book_theater_id"], row["dow"])
    key_month = (row["book_theater_id"], row["month"])

    # Baseline statistical predictions
    base_dow = theater_weekday_means.get(key_dow, theater_means.get(row["book_theater_id"], global_mean))
    base_month = theater_month_means.get(key_month, theater_means.get(row["book_theater_id"], global_mean))

    # Core weighted prediction
    pred = (
        weights["w_base_dow"] * base_dow +
        weights["w_base_month"] * base_month +
        weights["w_rolling_7"] * row["rolling_7"] +
        weights["w_rolling_14"] * row["rolling_14"] +
        weights["w_rolling_30"] * row["rolling_30"] +
        weights["w_lag_1"] * row["lag_1"] +
        weights["w_lag_7"] * row["lag_7"] +
        weights["w_lag_14"] * row["lag_14"] +
        weights["w_tickets"] * row["tickets_booked"] +
        weights["w_ema_7"] * row["ema_7"] +
        weights["w_ema_14"] * row["ema_14"] +
        weights["w_momentum"] * row["momentum_7_14"] +
        weights["w_diff"] * (row["diff_1"] + row["diff_7"]) / 2
    )

    # Holiday/Weekend effects
    if row["is_holiday"]:
        pred += weights["w_holiday_boost"] * base_dow

    if row["is_weekend"]:
        pred += weights["w_weekend_boost"] * row["rolling_7"]

    # Month-end spending effect
    if row["is_month_end"]:
        pred *= 1.03

    # Strong booking momentum
    if row["tickets_booked"] > row["tickets_rolling_7"] * 1.2:
        pred *= 1.02

    return pred

# =============================================================================
# 6. TRAINING EVALUATION
#    - Apply prediction function to historical data
#    - Compute training R²
# =============================================================================

print("[5/6] Training and evaluating...")

df_train = df.dropna(subset=[
    "lag_1", "rolling_7", "rolling_14", "lag_7", "lag_14",
    "ema_7", "ema_14"
])

df_train["predicted_audience_count"] = df_train.apply(predict, axis=1)

# Apply mild smoothing per theater
df_train["predicted_audience_count"] = df_train.groupby("book_theater_id")[
    "predicted_audience_count"
].transform(lambda x: x.rolling(2, min_periods=1).mean())

r2 = r2_score(df_train["audience_count"], df_train["predicted_audience_count"])

print(f"  Training R²: {r2:.4f}")
print(f"  Mean predicted: {df_train['predicted_audience_count'].mean():.1f}")
print(f"  Mean actual: {df_train['audience_count'].mean():.1f}")

# =============================================================================
# GENERATE TEST SET PREDICTIONS
# =============================================================================

print("[6/6] Generating test predictions...")

submission = pd.read_csv(f"{base_path}sample_submission/sample_submission.csv")

# Extract theater ID and date
submission[["book_theater_id", "show_date"]] = submission["ID"].str.rsplit("_", n=1, expand=True)
submission["show_date"] = pd.to_datetime(submission["show_date"])

# Recompute date-related features
submission["dow"] = submission["show_date"].dt.dayofweek
submission["month"] = submission["show_date"].dt.month
submission["day"] = submission["show_date"].dt.day
submission["is_weekend"] = submission["dow"].isin([5, 6]).astype(int)
submission["is_month_start"] = (submission["day"] <= 7).astype(int)
submission["is_month_end"] = (submission["day"] >= 24).astype(int)

# Holiday mapping
if "holiday_flg" in date_info.columns:
    holiday_map = date_info.set_index("show_date")["holiday_flg"]
    submission["is_holiday"] = submission["show_date"].map(holiday_map).fillna(0).astype(int)
else:
    submission["is_holiday"] = 0

# Map last known temporal features from training data
last = df.groupby("book_theater_id").last()

feature_cols = [
    "lag_1", "lag_3", "lag_7", "lag_14",
    "rolling_3", "rolling_7", "rolling_14", "rolling_30",
    "tickets_booked", "ema_7", "ema_14",
    "diff_1", "diff_7", "momentum_7_14",
    "tickets_rolling_7", "booking_count", "avg_booking"
]

for col in feature_cols:
    if col in last.columns:
        default_val = global_mean if ("diff" not in col and "momentum" not in col) else 0
        submission[col] = submission["book_theater_id"].map(last[col]).fillna(default_val)

# Compute predictions
submission["audience_count"] = submission.apply(predict, axis=1)

# Apply 2-day smoothing
submission = submission.sort_values(["book_theater_id", "show_date"])
submission["audience_count"] = submission.groupby("book_theater_id")["audience_count"] \
                                         .transform(lambda x: x.rolling(2, min_periods=1).mean())

# Final formatting
submission["audience_count"] = submission["audience_count"].clip(lower=1).round().astype(int)

final_submission = submission[["ID", "audience_count"]]
final_submission.to_csv("submission.csv", index=False)

print("Submission file generated (submission.csv)")
print(f"Total predictions: {len(final_submission)}")
print(f"Range: {final_submission['audience_count'].min()} – {final_submission['audience_count'].max()}")
print(f"Mean: {final_submission['audience_count'].mean():.1f}")
print(f"Median: {final_submission['audience_count'].median():.0f}")


[1/6] Loading data...


  34.    62.    72.    34.    32.    64.    94.    58.    52.    32.
  82.    36.    78.    76.    54.    50.    44.    38.    64.    84.
  62.    34.    48.    32.    22.    52.    66.    90.    80.    50.
  74.    90.    74.    70.    66.    76.    56.    54.    50.    82.
  90.   102.    40.    56.    68.    66.    54.   109.66  94.    86.
  42.    66.    38.    68.    68.   106.    68.    78.    78.    54.
  74.    76.    88.    78.    80.    48.    44.    64.    68.    88.
  56.    34.    52.    78.    48.    74.    74.    38.    34.    60.
  56.    56.    64.    94.    56.    74.    42.    30.    64.    58.
 104.    94.    50.    54.    56.    60.    72.    86.    58.    40.
  48.    32.    50.    56.    74.    48.    48.    58.    42.    38.
  68.    88.    44.    46.    40.    64.    50.    42.    88.    46.
  36.    22.    46.    46.    72.    68.    22.    52.    32.    52.
  42.    48.    72.    48.    56.    80.    38.    40.    72.    70.
  52.    54.    54.    42.    56. 

  Loaded 214046 records
[2/6] Engineering features...
  Feature engineering completed.
[3/6] Processing bookings...
  Bookings merged successfully.
[4/6] Computing theater statistics...
  Global mean audience: 41.4
[5/6] Training and evaluating...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["predicted_audience_count"] = df_train.apply(predict, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["predicted_audience_count"] = df_train.groupby("book_theater_id")[


  Training R²: 0.5585
  Mean predicted: 42.4
  Mean actual: 41.4
[6/6] Generating test predictions...
Submission file generated (submission.csv)
Total predictions: 38062
Range: 2 – 159
Mean: 42.4
Median: 38
