In [None]:
import numpy as np 
import pandas as pd 
import gc 
import warnings 
from dataclasses import dataclass 

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available dataset files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Seed NumPy's global RNG; RANDOM_SEED is also passed to DataFrame.sample
# in the validation helper so page sampling is reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Dataset directory (input) and submission file (output).
DATA_DIR = "/kaggle/input/web-traffic-time-series-forecasting"
OUT_PATH = "/kaggle/working/submission.csv"

# Zipped CSVs shipped with the competition: wide-format training tables,
# the key files mapping "<page>_<date>" to a submission Id, and sample submissions.
TRAIN_1 = os.path.join(DATA_DIR, "train_1.csv.zip")
TRAIN_2 = os.path.join(DATA_DIR, "train_2.csv.zip")
KEY_1 = os.path.join(DATA_DIR, "key_1.csv.zip")
KEY_2 = os.path.join(DATA_DIR, "key_2.csv.zip")
SAMPLE_1 = os.path.join(DATA_DIR, "sample_submission_1.csv.zip")
SAMPLE_2 = os.path.join(DATA_DIR, "sample_submission_2.csv.zip")

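The helper functions below compute the competition metric (SMAPE), parse the wide-format training files and the key files, and enforce unique `Id`s in the submission, as the competition requires.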

In [None]:
def smape(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-8) -> float:
    """
    Symmetric mean absolute percentage error.

    Per element: 2 * |y - yhat| / (|y| + |yhat| + eps), averaged over all
    elements. The result lies in [0, 2).

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values; both are converted to float64.
    eps : float
        Small constant added to the denominator so that a pair where both the
        actual and the predicted value are zero contributes 0 instead of 0/0.

    Returns
    -------
    float
        Mean SMAPE over all elements.
    """
    actual = np.asarray(y_true, dtype=np.float64)
    forecast = np.asarray(y_pred, dtype=np.float64)

    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps

    per_element = 2.0 * abs_error / scale
    return float(np.mean(per_element))
    
def detect_date_columns(df: pd.DataFrame, id_col: str = "Page") -> list[str]:
    """
    Return the date-named columns of a wide-format training frame, oldest first.

    The training files store one column per day, so every column other than
    ``id_col`` whose name parses as a date is treated as a date column.

    Parameters
    ----------
    df : pd.DataFrame
        Wide-format frame with one identifier column and one column per date.
    id_col : str
        Name of the identifier column to exclude.

    Returns
    -------
    list[str]
        Original column names that parse as dates, sorted chronologically.
        Empty if there are no such columns.
    """
    candidates = [c for c in df.columns if c != id_col]
    if not candidates:
        return []

    # Parse all names once; names that are not dates become NaT.
    parsed = pd.to_datetime(candidates, errors="coerce")

    # Reuse the parsed timestamps as the sort key instead of re-parsing every
    # column name a second time inside the sort.
    dated = [(ts, col) for col, ts in zip(candidates, parsed) if not pd.isna(ts)]
    dated.sort(key=lambda pair: pair[0])  # stable: ties keep column order
    return [col for _, col in dated]

def extract_date_from_page(key: pd.DataFrame) -> pd.DataFrame:
    """
    Split the key file's ``page`` column ("<page name>_<date>") into page and date.

    The split happens at the LAST underscore, because page names themselves
    contain underscores.

    Parameters
    ----------
    key : pd.DataFrame
        Frame with a ``page`` column. It is modified in place: ``page`` is
        replaced by the page name and a datetime ``date`` column is added.

    Returns
    -------
    pd.DataFrame
        The same ``key`` object, for convenience.

    Raises
    ------
    ValueError
        If any row has no parseable date suffix (including rows without any
        underscore). The input frame is left untouched in that case and the
        message shows up to three offending original rows.
    """
    # Not using expand=True: it yields a single column when no value contains
    # "_", which would turn the intended ValueError into a KeyError.
    parts = key["page"].str.rsplit("_", n=1)
    page_names = parts.str[0]
    date_text = parts.str[1]  # NaN where there was nothing after a "_"

    dates = pd.to_datetime(date_text, errors="coerce")

    # Validate before mutating so the error can show the original page values.
    unparsed = dates.isna()
    if unparsed.any():  # should not happen with well-formed key files
        bad = key.loc[unparsed].head(3)
        raise ValueError(
            "Some dates could not be parsed from Page column. Example rows:\n"
            + bad.to_string(index=False)
        )

    key["page"] = page_names
    key["date"] = dates
    return key

def read_key(path) -> pd.DataFrame:
    """
    Load a zipped key file and return it with columns ``id``, ``page``, ``date``.

    Column names are lower-cased first so the rest of the pipeline does not
    depend on the capitalisation used in the file. The ``page`` column is then
    split into page name and date by ``extract_date_from_page``.

    Raises
    ------
    ValueError
        If the file lacks an ``Id`` or ``Page`` column (case-insensitive), or
        if a date cannot be parsed from a page entry.
    """
    raw = pd.read_csv(path, compression="zip")

    # Normalise column-name casing, just in case.
    key = raw.rename(columns={name: name.lower() for name in raw.columns})

    present = key.columns
    if "id" not in present:
        raise ValueError(f"Key file at {path} did not contain an 'Id' column. Found: {list(key.columns)}")
    if "page" not in present:
        raise ValueError(f"Key file at {path} did not contain a 'Page' column. Found: {list(key.columns)}")

    parsed_key = extract_date_from_page(key)
    return parsed_key[["id", "page", "date"]]

def concat_and_dedupe_predictions(preds_list, keep="first", verbose=True):
    """
    Stack prediction frames and guarantee that every ``Id`` appears only once.

    key_1 and key_2 can contain overlapping Ids, so duplicates are dropped
    deterministically after concatenation.

    Parameters
    ----------
    preds_list : list of pd.DataFrame
        Prediction frames, each with an ``Id`` column.
    keep : str
        Passed to ``DataFrame.drop_duplicates``; decides which of the
        duplicated rows survives.
    verbose : bool
        If True, print a summary and a sample of the duplicated rows.

    Returns
    -------
    pd.DataFrame
        Concatenated predictions with unique Ids and a fresh 0..n-1 index.
    """
    submission = pd.concat(preds_list, axis=0, ignore_index=True)

    is_repeated = submission.duplicated(subset="Id", keep=False)
    if is_repeated.any():
        dups = submission[is_repeated].sort_values("Id")
        dup_ids = dups["Id"].unique()

        if verbose:
            print(f"[WARN] Found {len(dup_ids)} duplicated Id(s) after concat.")
            print("Sample duplicated rows:")
            print(dups.head(10))

        deduped = submission.drop_duplicates(subset="Id", keep=keep)
        submission = deduped.reset_index(drop=True)

        if verbose:
            print(f"[INFO] Deduplicated using keep='{keep}'. Final rows: {len(submission)}")

    # Final safety net: the submission must not contain repeated Ids.
    assert submission["Id"].is_unique, "Deduplication failed: still duplicated Ids."

    return submission

## **Crafting The Baseline Model**
I use a median-based baseline in log1p space: medians are robust to the spikes and noise in web traffic, and the log1p transform tames its heavy tail.
* Recent medians (last 28 days) estimate each page's overall level
* Weekday medians (last 56 days) capture weekly seasonality


In [None]:
@dataclass
class BaselineStats:
    """Per-page summary statistics (in log1p space) consumed by the baseline predictor."""
    pages: np.ndarray  # page identifiers as strings; position i matches row i of the arrays below
    last_date: pd.Timestamp  # most recent date column seen in the training frame
    last28_median: np.ndarray  # per-page median of log1p(visits) over the level lookback window
    weekday_median: np.ndarray  # shape (n_pages, 7): per-page median of log1p(visits) per weekday (0=Mon)
    global_median: float  # median of log1p(visits) over all pages and days; acts as a fallback

def build_baseline_stats(
    train_df: pd.DataFrame,
    id_col: str = "Page",
    lookback_weekday: int = 56,  # trailing days used to estimate weekday seasonality
    lookback_level: int = 28,  # trailing days used to estimate the overall level
) -> BaselineStats:
    """
    Compute the per-page medians (in log1p space) used by the baseline model.

    Parameters
    ----------
    train_df : pd.DataFrame
        Wide-format training frame: one row per page, one column per date.
    id_col : str
        Name of the page identifier column.
    lookback_weekday : int
        Number of most recent days used for the per-weekday medians.
    lookback_level : int
        Number of most recent days used for the overall level median.

    Returns
    -------
    BaselineStats
        Pages, last training date, level medians, weekday medians
        (shape ``(n_pages, 7)``, column 0 = Monday) and the global median.
        Medians are NaN for pages with no observed data in the window.

    Raises
    ------
    ValueError
        If a lookback is not positive, or the frame has fewer date columns
        than the larger lookback window.
    """
    if lookback_weekday < 1 or lookback_level < 1:
        raise ValueError(
            "lookback_weekday and lookback_level must be positive, got "
            f"{lookback_weekday} and {lookback_level}."
        )

    date_cols = detect_date_columns(train_df, id_col=id_col)
    if len(date_cols) < max(lookback_weekday, lookback_level):
        raise ValueError(
            f"Not enough date columns ({len(date_cols)}) for lookback windows."
        )

    pages = train_df[id_col].astype(str).values
    dates = pd.to_datetime(date_cols)
    last_date = dates.max()

    y = train_df[date_cols].to_numpy(dtype=np.float32, copy=True)

    # Negative visit counts are invalid; treat them as missing.
    y[y < 0] = np.nan

    y_log = np.log1p(y)

    # Level: median over the last `lookback_level` days.
    idx_level_start = len(date_cols) - lookback_level
    last_level_window = y_log[:, idx_level_start:]
    last28_median = np.nanmedian(last_level_window, axis=1)

    # Seasonality: median per weekday over the last `lookback_weekday` days.
    idx_wk_start = len(date_cols) - lookback_weekday
    recent_dates = dates[idx_wk_start:]
    recent_y = y_log[:, idx_wk_start:]  # all pages, only the recent columns

    weekday_median = np.full((recent_y.shape[0], 7), np.nan, dtype=np.float32)
    for wd in range(7):
        mask = (recent_dates.weekday == wd)
        if mask.sum() == 0:  # window too short to contain this weekday
            continue
        weekday_median[:, wd] = np.nanmedian(recent_y[:, mask], axis=1)

    # Global median of log-traffic across all pages and all days (fallback).
    global_median = float(np.nanmedian(y_log))

    # Release the large intermediates before returning.
    del y, y_log, last_level_window, recent_y
    gc.collect()

    return BaselineStats(
        pages=pages,
        last_date=last_date,
        last28_median=last28_median.astype(np.float32),
        weekday_median=weekday_median.astype(np.float32),
        global_median=global_median,
    )

def predict_with_baseline(
    stats: BaselineStats,
    key_df: pd.DataFrame,
    alpha_weekday: float = 0.7) -> pd.DataFrame:
    """
    Predict visits for every (page, date) row of a key frame.

    The prediction in log1p space is
    ``alpha_weekday * weekday_median + (1 - alpha_weekday) * last28_median``.
    Fallbacks: a missing weekday median is replaced by the level median, a
    missing level median by the global median, and pages that are not in
    ``stats`` get the global median directly.

    Parameters
    ----------
    stats : BaselineStats
        Statistics produced by ``build_baseline_stats``.
    key_df : pd.DataFrame
        Frame with columns ``id``, ``page`` and a datetime ``date``.
    alpha_weekday : float
        Weight of the weekday median in the blend.

    Returns
    -------
    pd.DataFrame
        Columns ``Id`` and ``Visits`` (non-negative, finite), one row per key row.
    """
    key = key_df.copy()

    # Map each key row to its row position in the stats arrays;
    # pages unseen in training map to NaN.
    row_lookup = pd.Series(np.arange(len(stats.pages)), index=stats.pages)
    key["idx"] = row_lookup.reindex(key["page"]).to_numpy()

    weekdays = key["date"].dt.weekday.to_numpy(dtype=np.int16)
    known = ~pd.isna(key["idx"].to_numpy())

    # Every row starts at the global fallback; known pages are overwritten below.
    log_pred = np.full(len(key), stats.global_median, dtype=np.float32)

    if known.any():
        rows = key.loc[known, "idx"].astype(np.int64).to_numpy()

        weekday_level = stats.weekday_median[rows, weekdays[known]]
        recent_level = stats.last28_median[rows]

        # Level: own median if available, else the global median.
        recent_level_safe = np.where(np.isfinite(recent_level), recent_level, stats.global_median)
        # Weekday: own median if available, else the (already filled) level.
        weekday_level_safe = np.where(np.isfinite(weekday_level), weekday_level, recent_level_safe)

        blended = alpha_weekday * weekday_level_safe + (1.0 - alpha_weekday) * recent_level_safe
        log_pred[known] = blended.astype(np.float32)

    # Back to the visit scale; non-finite values become 0 and negatives are clipped.
    visits = np.expm1(log_pred).astype(np.float32)
    visits = np.where(np.isfinite(visits), visits, 0.0)
    visits = np.clip(visits, 0.0, None)

    return pd.DataFrame({"Id": key["id"], "Visits": visits})

In [None]:
import matplotlib.pyplot as plt

def visualise_single_page(train_df: pd.DataFrame, page: str, lookback_weekday=56, lookback_level=28):
    """
    Plot one page's traffic and the statistics the baseline derives from it.

    Three stacked panels are drawn: raw visits, log1p(visits) with the
    level median marked, and the per-weekday medians.

    Parameters
    ----------
    train_df : pd.DataFrame
        Wide-format training frame with a "Page" column and one column per date.
    page : str
        Page identifier to plot.
    lookback_weekday : int
        Number of most recent days used for the weekday medians.
    lookback_level : int
        Number of most recent days used for the level median.

    Raises
    ------
    ValueError
        If ``page`` does not occur in ``train_df``.
    """
    date_cols = detect_date_columns(train_df)
    dates = pd.to_datetime(date_cols)

    page_rows = train_df.loc[train_df["Page"] == page, date_cols]
    if page_rows.empty:
        raise ValueError(f"Page {page!r} not found in train_df.")

    # Use the first matching row so the series always lines up with `dates`,
    # even if the page id were to appear more than once.
    y = page_rows.to_numpy(dtype=np.float32)[0]
    y_log = np.log1p(y)

    level_window = y_log[-lookback_level:]
    weekday_window = y_log[-lookback_weekday:]
    weekday_dates = dates[-lookback_weekday:]

    last28 = np.nanmedian(level_window)
    weekday_meds = {
        wd: np.nanmedian(weekday_window[weekday_dates.weekday == wd])
        for wd in range(7)
    }

    fig, axes = plt.subplots(3, 1, figsize=(12, 9), sharex=True)

    axes[0].plot(dates, y)
    axes[0].set_title(f"Raw visits — {page}")

    axes[1].plot(dates, y_log)
    # Labels follow the actual lookback arguments instead of hard-coded 28 / 56.
    axes[1].axhline(last28, color="red", linestyle="--", label=f"last-{lookback_level} median (log1p)")
    axes[1].legend()
    axes[1].set_title("log1p(visits)")

    axes[2].bar(range(7), [weekday_meds[w] for w in range(7)])
    axes[2].set_xticks(range(7))
    axes[2].set_xticklabels(["Mon","Tue","Wed","Thu","Fri","Sat","Sun"])
    axes[2].set_title(f"Weekday medians (log1p, last {lookback_weekday} days)")

    plt.tight_layout()
    plt.show()


def visualise_prediction_blend(stats: BaselineStats, page: str, weekday: int, alpha=0.7):
    """
    Print how the baseline blends the two medians for one page and weekday.

    Shows the weekday median, the level median, the blended log1p value
    ``alpha * weekday_median + (1 - alpha) * level_median`` and the resulting
    visit count. No fallbacks are applied here, so NaN medians print as NaN.

    Parameters
    ----------
    stats : BaselineStats
        Statistics produced by ``build_baseline_stats``.
    page : str
        Page identifier; an IndexError is raised if it is not in ``stats.pages``.
    weekday : int
        Weekday to inspect, 0 = Monday … 6 = Sunday.
    alpha : float
        Weight of the weekday median in the blend.
    """
    # Row position of the first entry in stats.pages equal to `page`.
    matching_rows = np.where(stats.pages == page)[0]
    row = matching_rows[0]

    wmed, lmed = stats.weekday_median[row, weekday], stats.last28_median[row]

    # Blend in log1p space, then map back to the visit scale.
    pred_log = alpha * wmed + (1 - alpha) * lmed
    pred = np.expm1(pred_log)

    print(f"Page: {page}")
    print(f"Weekday: {weekday} (0=Mon … 6=Sun)")
    print(f"weekday median (log1p): {wmed:.3f}")
    print(f"last-28 median (log1p): {lmed:.3f}")
    print(f"alpha_weekday: {alpha}")
    print(f"→ blended log1p prediction: {pred_log:.3f}")
    print(f"→ final Visits prediction: {pred:.1f}")

# Demo: load train_1 and keep only the 120 most recent date columns. That is
# more than the default 56-day lookback build_baseline_stats requires.
train_demo = pd.read_csv(TRAIN_1, compression="zip")
date_cols = detect_date_columns(train_demo, id_col="Page")
last_cols = date_cols[-120:]
train_demo = train_demo[["Page"] + last_cols]

# Use the first page in the file as the example.
page_demo = train_demo["Page"].iloc[0]

visualise_single_page(train_demo, page=page_demo)

# Show the blend for a Monday (weekday=0) with the default weight.
stats_demo = build_baseline_stats(train_demo, id_col="Page", lookback_weekday=56, lookback_level=28)
visualise_prediction_blend(stats_demo, page=page_demo, weekday=0, alpha=0.7)

# Free the demo data before the next cell reloads the training file.
del train_demo, stats_demo
gc.collect()

## **A quick validation**
To verify that the pipeline behaves as expected, I evaluate the baseline with SMAPE on a time-based holdout (the final days of the training period) for a random subset of pages.

In [None]:
def quick_time_validation(
    train_df: pd.DataFrame,
    n_pages: int = 5000,
    holdout_days: int = 60,
    id_col: str = "Page") -> float:
    """
    Quick SMAPE of the baseline on a page sample with a time-based holdout.

    The last ``holdout_days`` date columns are withheld, the baseline is
    fitted on the remaining columns, and its predictions for the withheld
    days are scored with SMAPE. Missing holdout values count as zero visits.

    Parameters
    ----------
    train_df : pd.DataFrame
        Wide-format training frame; it is not modified.
    n_pages : int
        Maximum number of pages to evaluate; a random sample (seeded with
        RANDOM_SEED) is drawn when the frame has more rows.
    holdout_days : int
        Number of trailing days to withhold; must be positive.
    id_col : str
        Name of the page identifier column.

    Returns
    -------
    float
        SMAPE over all sampled pages and holdout days.

    Raises
    ------
    ValueError
        If ``holdout_days`` is not positive, or too few date columns remain
        for the baseline's lookback windows.
    """
    if holdout_days < 1:
        raise ValueError(f"holdout_days must be a positive integer, got {holdout_days}.")

    date_cols = detect_date_columns(train_df, id_col=id_col)

    # Sample first: .sample() returns a new frame and nothing below mutates
    # the data, so copying the full training frame would only cost memory.
    df = train_df
    if len(df) > n_pages:
        df = df.sample(n=n_pages, random_state=RANDOM_SEED)

    train_cols = date_cols[:-holdout_days]
    hold_cols = date_cols[-holdout_days:]

    # Fit only on the columns before the holdout period.
    truncated = df[[id_col, *train_cols]]
    stats = build_baseline_stats(truncated, id_col=id_col)

    pages = df[id_col].astype(str).values
    hold_dates = pd.to_datetime(hold_cols)

    # Key-like frame in page-major order (all holdout dates of page 0, then
    # page 1, ...), matching the row-major flattening of the truth below.
    page_rep = np.repeat(pages, len(hold_dates))
    date_rep = np.tile(hold_dates, len(pages))

    key_like = pd.DataFrame({
        "id": np.arange(len(page_rep), dtype=np.int64),
        "page": page_rep,
        "date": date_rep,
    })

    pred_df = predict_with_baseline(stats, key_like)

    # Ground truth, flattened row by row to line up with key_like.
    y_true = df[hold_cols].to_numpy(dtype=np.float32).reshape(-1)
    y_pred = pred_df["Visits"].to_numpy(dtype=np.float32)

    # Missing observations are scored as zero visits.
    y_true = np.where(np.isfinite(y_true), y_true, 0.0)

    score = smape(y_true, y_pred)

    del df, truncated, key_like, pred_df
    gc.collect()

    return score

In [None]:
# Run the quick holdout validation on 1000 sampled pages from train_1.
train_val = pd.read_csv(TRAIN_1, compression="zip")
score = quick_time_validation(train_val, n_pages=1000, holdout_days=60)
print("Quick SMAPE (n=1000 pages, holdout=60d):", score)

# Free the validation frame before the full prediction run.
del train_val
gc.collect()

# **The prediction**

The training files are processed sequentially because loading them simultaneously would consume too much memory. The predictions for each train–key pair are merged afterwards.

In [None]:
def process_chunk(train_path: str, key_path: str, id_col: str = "Page") -> pd.DataFrame:
    """
    Produce predictions for one train/key file pair.

    Reads the key file, reads the training file, builds the baseline
    statistics from the training data and predicts every key row. The large
    intermediates are released before returning.

    Parameters
    ----------
    train_path : str
        Path to the wide-format training CSV.
    key_path : str
        Path to the zipped key CSV that belongs to it.
    id_col : str
        Name of the page identifier column in the training file.

    Returns
    -------
    pd.DataFrame
        Predictions with columns ``Id`` and ``Visits``.
    """
    print(f"Reading key: {os.path.basename(key_path)}")
    key_frame = read_key(key_path)
    print("Key shape:", key_frame.shape)

    print(f"Reading train: {os.path.basename(train_path)}")
    train_frame = pd.read_csv(train_path)
    print("Train shape:", train_frame.shape)

    print("Building baseline statistics...")
    baseline = build_baseline_stats(train_frame, id_col=id_col)

    print("Predicting key rows...")
    predictions = predict_with_baseline(baseline, key_frame)

    # Drop the big objects so the next chunk starts with memory freed.
    del train_frame, key_frame, baseline
    gc.collect()

    return predictions

# Predict each train/key pair separately so only one training file is in memory at a time.
preds_1 = process_chunk(TRAIN_1, KEY_1)
preds_2 = process_chunk(TRAIN_2, KEY_2)

# Merge both prediction sets; for overlapping Ids the first occurrence (from preds_1) is kept.
submission = concat_and_dedupe_predictions([preds_1, preds_2], keep="first", verbose=True)

# Defensive: make sure no NaN reaches the file, then write the submission CSV.
submission["Visits"] = submission["Visits"].fillna(0.0).astype(np.float32)
submission.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
print(submission.head())