# EDA Pipeline for RetailRocket Clickstream

## 1) Imports & configurations

In [14]:
# 1.1) Imports & environment

import os, json, math, time, random
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter, defaultdict, deque
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# Reproducibility
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [13]:
# 1.2) CONFIGURATION — kindly adjust as needed (all parameters centralised here)
from pathlib import Path

#  Paths
DATA_DIR            = Path("C:/Users/Admin/Documents/WBS/Dissertation/Submission Related files/data")  # folder that holds the CSVs
EVENTS_CSV          = DATA_DIR / "events.csv"      # expected columns: session_id*, visitorid*, itemid, timestamp, event
ITEMS1_CSV          = DATA_DIR / "item_properties_part1.csv"
ITEMS2_CSV          = DATA_DIR / "item_properties_part2.csv"   
# Core switches
HAS_SESSION_ID      = False     # if False, sessionize via visitorid and SESSION_GAP_SEC
SESSION_GAP_SEC     = 30*60    # 30 minutes inactivity defines a new session when HAS_SESSION_ID=False
TS_DIV              = 1000        # keep original UNIX timestamp unit (1 if already seconds; 1000 if ms). We retain raw UNIX as per dissertation rationale.
EVENT_TYPES         = ["view", "addtocart", "transaction"]   # expected events (others will be grouped under "other")

# Sampling & performance
USE_STRATIFIED_SAMPLING = False    
SAMPLE_MAX_SESSIONS     = 200_000  # cap for heavy operations (hour/day patterns, n-grams)
RANDOM_SEED             = 42

# Plots & outputs
OUT_DIR             = Path("C:/Users/Admin/Documents/WBS/Dissertation/Submission Related files/Notebooks/Modelling/EDA_outputs"); OUT_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR             = OUT_DIR / "figs"; FIG_DIR.mkdir(parents=True, exist_ok=True)
TAB_DIR             = OUT_DIR / "tables"; TAB_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR         = OUT_DIR / "reports"; REPORTS_DIR.mkdir(parents=True, exist_ok=True)

SAVE_DPI            = 150
FIG_EXT             = "png"           # png recommended for dashboards; use "svg" for vector
STYLE_USE_SEABORN   = False           # keep False for portability; can switch to True if seaborn is installed
PANDAS_FLOAT_FMT    = "{:,.4f}".format

# Feature thresholds
TOP_N               = 20              # for top products/categories/3-grams
OUTLIER_EPM_THRESH  = 120             # events per minute threshold indicating possible automation/ bot or web scrapping
OUTLIER_DUR_PCTL    = 99.5            # sessions above this duration percentile are flagged

# Temporal granularity
HOUR_OF_DAY         = list(range(24))
DAYS_MAP            = {0:"Mon",1:"Tue",2:"Wed",3:"Thu",4:"Fri",5:"Sat",6:"Sun"}

# Exports
EXPORT_SESSION_FEATURES = True
SESSION_FEATURES_CSV    = OUT_DIR / "eda" / "session_features_base.csv"

# Behavioural funnels
FUNNEL_ORDER        = ["view", "addtocart", "transaction"]


## 2) Defining utilities to support pipeline

In [15]:
# 2) Utilities — tic/toc, saving, safe plotting, sampling

from contextlib import contextmanager
from datetime import datetime

_TIC_STACK = []

def tic(msg=None):
    """Start a timer; optionally print a message."""
    if msg:
        print(f"[tic] {msg}")
    _TIC_STACK.append(time.time())

def toc(msg=None):
    """End the most recent timer and print elapsed time."""
    if not _TIC_STACK:
        print("[toc] Warning: called without matching tic()")
        return 0.0
    elapsed = time.time() - _TIC_STACK.pop()
    if msg:
        print(f"[toc] {msg}: {elapsed:,.2f}s")
    else:
        print(f"[toc] {elapsed:,.2f}s")
    return elapsed

def ensure_dir(path: Path):
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

def save_table(df: pd.DataFrame, path: Path):
    ensure_dir(path)
    df.to_csv(path, index=False)
    print(f"[saved] table -> {path} ({len(df):,} rows)")

def save_fig(fig, name: str):
    filename = FIG_DIR / f"{name}.{FIG_EXT}"
    ensure_dir(filename)
    fig.savefig(filename, dpi=SAVE_DPI, bbox_inches="tight")
    plt.close(fig)
    print(f"[saved] figure -> {filename}")

def maybe_sample_sessions(df: pd.DataFrame, session_col="session_id"):
    """Return df sampled by sessions if enabled and needed."""
    if not USE_STRATIFIED_SAMPLING:
        return df
    if "session_id" not in df.columns:
        return df
    sess_ids = df["session_id"].unique()
    if len(sess_ids) <= SAMPLE_MAX_SESSIONS:
        return df
    rng = np.random.default_rng(RANDOM_SEED)
    keep = rng.choice(sess_ids, size=SAMPLE_MAX_SESSIONS, replace=False)
    out = df[df["session_id"].isin(keep)].copy()
    print(f"[sample] sessions: {len(sess_ids):,} -> {len(keep):,}")
    return out

## 3) Data Loading

In [16]:
# 3) Data Loading — schema handling, timestamp normalization

REQUIRED_COLS_ANY = {"itemid", "timestamp", "event"}
OPTIONAL_COLS     = {"session_id", "visitorid"}

def read_events(path: Path) -> pd.DataFrame:
    tic(f"Reading events from {path}")
    df = pd.read_csv(path, usecols=None)
    # column name normalisation (lowercase)
    df.columns = [c.strip().lower() for c in df.columns]

    # Validating plausible column names
    colmap = {}
    # Mapping timestamp variants
    for cand in ["timestamp", "ts", "time", "eventtime", "event_time"]:
        if cand in df.columns:
            colmap["timestamp"] = cand; break
    # Mapping event variants
    for cand in ["event", "action", "type"]:
        if cand in df.columns:
            colmap["event"] = cand; break
    # Mapping itemid variants
    for cand in ["itemid", "item_id", "sku", "productid", "product_id"]:
        if cand in df.columns:
            colmap["itemid"] = cand; break
    # Mapping session/visitor variants
    for cand in ["session_id", "sessionid", "sid"]:
        if cand in df.columns:
            colmap["session_id"] = cand; break
    for cand in ["visitorid", "user_id", "userid", "uid"]:
        if cand in df.columns:
            colmap["visitorid"] = cand; break

    missing = [k for k in ["timestamp","event","itemid"] if k not in colmap]
    if missing:
        raise KeyError(f"Missing required columns: {missing}. Found columns = {list(df.columns)}")
    
    # Renaming in place to canonical names
    df = df.rename(columns={v:k for k,v in colmap.items()})
    # Retaining only the canonical columns
    keep_cols = ["timestamp","event","itemid"] + [c for c in ["session_id","visitorid"] if c in df.columns]
    df = df[keep_cols].copy()

    # Timestamp normalization, we retain UNIX epohs
    if TS_DIV != 1:
        df["timestamp"] = (df["timestamp"] // TS_DIV).astype("int64")
    else:
        df["timestamp"] = df["timestamp"].astype("int64")

    # Event canonicalisation
    df["event"] = df["event"].astype("string").str.lower().str.strip()
    df.loc[~df["event"].isin(EVENT_TYPES), "event"] = "other"

    # Adding datetime
    df["dt"] = pd.to_datetime(df["timestamp"], unit="s")

    toc("Loaded events")
    return df

# pulling events data
events = read_events(EVENTS_CSV)

def maybe_read_items(path):
    if not path:
        return None
    tic(f"Reading items metadata from {path}")
    df = pd.read_csv(path)
    # normalise column names
    df.columns = [c.strip().lower() for c in df.columns]
    # standardise a category column if present under common aliases
    if "category" not in df.columns:
        for cand in ("category_id", "cat", "categorycode", "category_code"):
            if cand in df.columns:
                df = df.rename(columns={cand: "category"})
                break

    toc("Reading items metadata")
    return df

# pulling items files 
items1 = maybe_read_items(ITEMS1_CSV)   # keep your variable names as-is
items2 = maybe_read_items(ITEMS2_CSV)  

# Combine items table
items = None
for _df in (items1, items2):
    if _df is None:
        continue
    items = _df if items is None else pd.concat([items, _df], ignore_index=True)

if items is not None:
    # optional: drop duplicate item ids
    # if key: items = items.drop_duplicates(subset=[key])
    pass
else:
    # No items metadata available
    items = None



# Sessionisation if needed
if not HAS_SESSION_ID:
    tic("Sessionising by visitorid + inactivity gap")
    events = events.sort_values(["visitorid","timestamp"]).reset_index(drop=True)
    # New session when gap > SESSION_GAP_SEC or visitor changes
    gap = events["timestamp"].diff().fillna(0)
    new_user = events["visitorid"].ne(events["visitorid"].shift(1))
    new_sess = (gap > SESSION_GAP_SEC) | (new_user)
    events["session_id"] = new_sess.cumsum().astype("int64")
    toc("Sessionised")

if "session_id" not in events.columns:
    raise KeyError("session_id is required (either provided or created via sessionisation).")

# Basic dataset facts
tic("Computing dataset overview")
n_rows = len(events)
n_sessions = events["session_id"].nunique()
n_users = events["visitorid"].nunique() if "visitorid" in events.columns else np.nan
n_items = events["itemid"].nunique()
t0, t1 = events["dt"].min(), events["dt"].max()
print(f"Rows={n_rows:,} | Sessions={n_sessions:,} | Users={n_users} | Items={n_items:,}")
print(f"Date range: {t0} -> {t1}")

overview = pd.DataFrame([{
    "rows": n_rows,
    "sessions": n_sessions,
    "users": n_users,
    "items": n_items,
    "date_start": t0,
    "date_end": t1
}])
save_table(overview, TAB_DIR / "dataset_overview.csv")
toc("Overview ready")

[tic] Reading events from C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\data\events.csv
[toc] Loaded events: 2.33s
[tic] Reading items metadata from C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\data\item_properties_part1.csv
[toc] Reading items metadata: 1.90s
[tic] Reading items metadata from C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\data\item_properties_part2.csv
[toc] Reading items metadata: 5.08s
[tic] Sessionising by visitorid + inactivity gap
[toc] Sessionised: 2.58s
[tic] Computing dataset overview
Rows=2,756,101 | Sessions=1,761,660 | Users=1407580 | Items=235,061
Date range: 2015-05-03 03:00:04 -> 2015-09-18 02:59:47
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\dataset_overview.csv (1 rows)
[toc] Overview ready: 0.23s


0.22786998748779297

## 4) Event distributions & proportions

In [17]:
# 4) Event distributions & proportions

tic("Event distribution")
evt_counts = events["event"].value_counts().rename_axis("event").reset_index(name="count")
evt_counts["prop"] = evt_counts["count"] / evt_counts["count"].sum()
save_table(evt_counts, TAB_DIR / "event_distribution.csv")

fig, ax = plt.subplots(figsize=(6,4))
ax.bar(evt_counts["event"], evt_counts["count"])
ax.set_xlabel("Event")
ax.set_ylabel("Count")
ax.set_title("Event Distribution (All)")
for i, v in enumerate(evt_counts["count"]):
    ax.text(i, v, f"{int(v):,}", ha="center", va="bottom", fontsize=8, rotation=0)
save_fig(fig, "eda_event_distribution_all")
toc("Event distribution ready")

[tic] Event distribution
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\event_distribution.csv (3 rows)
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_event_distribution_all.png
[toc] Event distribution ready: 0.99s


0.9925820827484131

## 5) Temporal patterns

In [18]:
# 5) Temporal patterns (hour-of-day, day-of-week)

df_time = maybe_sample_sessions(events)

# Hour-of-day temporal pattern
tic("Hour-of-day pattern")
hod = df_time.copy()
hod["hour"] = hod["dt"].dt.hour
hod_counts = hod.groupby(["hour","event"]).size().reset_index(name="count")
save_table(hod_counts, TAB_DIR / "temporal_hour_event_counts.csv")

fig, ax = plt.subplots(figsize=(7,4))
for e in EVENT_TYPES + (["other"] if "other" in hod["event"].unique() else []):
    sub = hod_counts[hod_counts["event"]==e]
    if not len(sub):
        continue
    ax.plot(sub["hour"], sub["count"], marker="o", label=e)
ax.set_xticks(HOUR_OF_DAY)
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Events")
ax.set_title("Hourly Event Volume")
ax.legend()
save_fig(fig, "eda_hourly_event_volume")
toc("Hour-of-day ready")

# Day of week temporal pattern 
tic("Day-of-week pattern")
dow = df_time.copy()
dow["dow"] = dow["dt"].dt.dayofweek
dow_counts = dow.groupby(["dow","event"]).size().reset_index(name="count")
dow_counts["dow_label"] = dow_counts["dow"].map(DAYS_MAP)
save_table(dow_counts, TAB_DIR / "temporal_dow_event_counts.csv")

fig, ax = plt.subplots(figsize=(7,4))
for e in EVENT_TYPES + (["other"] if "other" in dow["event"].unique() else []):
    sub = dow_counts[dow_counts["event"]==e]
    if not len(sub):
        continue
    ax.plot(sub["dow_label"], sub["count"], marker="o", label=e)
ax.set_xlabel("Day of Week")
ax.set_ylabel("Events")
ax.set_title("Weekly Event Volume")
ax.legend()
save_fig(fig, "eda_weekly_event_volume")
toc("Day-of-week ready")

[tic] Hour-of-day pattern
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\temporal_hour_event_counts.csv (72 rows)
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_hourly_event_volume.png
[toc] Hour-of-day ready: 2.53s
[tic] Day-of-week pattern
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\temporal_dow_event_counts.csv (21 rows)
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_weekly_event_volume.png
[toc] Day-of-week ready: 2.25s


2.247720956802368

## 6) Session-level features

In [19]:
# 6) Session-level features (duration, intensity, diversity, last-event flags)

tic("Computing session-level features")
g = events.groupby("session_id", sort=False)

sess_first = g["timestamp"].min()
sess_last  = g["timestamp"].max()
duration_s = (sess_last - sess_first).rename("duration_sec")
n_events   = g.size().rename("events")
items_per_sess = g["itemid"].nunique().rename("item_diversity")

# events per minute (avoid /0 by by adding a small negligable value, 1e-9 to denominator)
events_per_min = (n_events / (duration_s/60.0 + 1e-9)).rename("events_per_min")

# last event type flags
last_event = g["event"].last().rename("last_event")
last_evt_flags = pd.get_dummies(last_event, prefix="last_evt", dtype=int)

sess_df = pd.concat([duration_s, n_events, items_per_sess, events_per_min, last_event, last_evt_flags], axis=1).reset_index()

# Saving raw features
FEATURES_DIR = OUT_DIR / "eda"; FEATURES_DIR.mkdir(parents=True, exist_ok=True)
save_table(sess_df, FEATURES_DIR / "session_features_base.csv")

# Distributions (log-scale a they are helpful for skew)
def _hist(series, title, xlabel, name):
    fig, ax = plt.subplots(figsize=(6,4))
    ax.hist(series, bins=50)
    ax.set_title(title)
    ax.set_xlabel(xlabel); ax.set_ylabel("Frequency")
    save_fig(fig, name)

_hist(np.log1p(sess_df["duration_sec"]), "Session Duration (log1p seconds)", "log1p(seconds)", "eda_hist_duration_log1p")
_hist(np.log1p(sess_df["events"]), "Session Events (log1p)", "log1p(events)", "eda_hist_events_log1p")
_hist(np.log1p(sess_df["events_per_min"]), "Events per Minute (log1p)", "log1p(epm)", "eda_hist_epm_log1p")
_hist(np.log1p(sess_df["item_diversity"]), "Item Diversity (log1p unique items)", "log1p(unique items)", "eda_hist_itemdiv_log1p")

# Percentiles summary
pctls = sess_df[["duration_sec","events","events_per_min","item_diversity"]].quantile([0.5,0.9,0.95,0.99]).reset_index(names=["quantile"])
save_table(pctls, TAB_DIR / "session_feature_percentiles.csv")
toc("Session-level features ready")

[tic] Computing session-level features
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\eda\session_features_base.csv (1,761,660 rows)
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_hist_duration_log1p.png
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_hist_events_log1p.png
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_hist_epm_log1p.png
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_hist_itemdiv_log1p.png
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\session_feature_percentiles.csv (4 rows)
[toc] Session-level features ready: 12.45s

12.447735071182251

## 7) Behavioural funnel

In [20]:
# 7) Behavioural funnel (session-level presence of events in order)

tic("Computing funnel")
# Presence flags per session
presence = events.pivot_table(index="session_id", columns="event", values="timestamp", aggfunc="count", fill_value=0)
for e in FUNNEL_ORDER:
    if e not in presence.columns:
        presence[e] = 0
presence = presence[[e for e in FUNNEL_ORDER if e in presence.columns]].astype(int)

# Simple sequential funnel to identify proportion of sessions that reach each stage
total_sessions = presence.shape[0]
stage_props = []
running = total_sessions
mask = pd.Series([True]*total_sessions, index=presence.index)
for e in FUNNEL_ORDER:
    step = presence.loc[mask, e] > 0
    reached = int(step.sum())
    stage_props.append({"stage": e, "sessions_reached": reached, "prop_of_total": reached/total_sessions})
    # tighten mask for next stage: among those that reached current step
    mask = mask & (presence[e] > 0)

funnel_df = pd.DataFrame(stage_props)
save_table(funnel_df, TAB_DIR / "funnel_session_level.csv")

fig, ax = plt.subplots(figsize=(6,4))
ax.bar(funnel_df["stage"], funnel_df["sessions_reached"])
ax.set_title("Session-level Funnel")
ax.set_xlabel("Stage"); ax.set_ylabel("Sessions Reached")
for i, v in enumerate(funnel_df["sessions_reached"]):
    ax.text(i, v, f"{v:,}", ha="center", va="bottom", fontsize=8)
save_fig(fig, "eda_funnel_session_level")
toc("Funnel ready")

[tic] Computing funnel
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\funnel_session_level.csv (3 rows)
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_funnel_session_level.png
[toc] Funnel ready: 3.02s


3.017254114151001

# 8) Sequence patterns

In [21]:
# 8) Sequence patterns — event 3-grams and Markov transitions

# Prepare
seq_df = maybe_sample_sessions(events).sort_values(["session_id","timestamp"])

# 3-grams
tic("3-gram extraction")
def event_ngrams(seq, n=3):
    if len(seq) < n:
        return []
    return [tuple(seq[i:i+n]) for i in range(len(seq)-n+1)]

ngrams_counter = Counter()
for _, g in seq_df.groupby("session_id", sort=False):
    grams = event_ngrams(g["event"].tolist(), n=3)
    ngrams_counter.update(grams)

grams_df = pd.DataFrame(ngrams_counter.items(), columns=["ngram","count"]).sort_values("count", ascending=False)
grams_df["ngram_str"] = grams_df["ngram"].apply(lambda t: "->".join(t))
top_grams = grams_df.head(TOP_N).copy()
save_table(top_grams[["ngram_str","count"]], TAB_DIR / "top_event_3grams.csv")

fig, ax = plt.subplots(figsize=(8,5))
ax.barh(top_grams["ngram_str"][::-1], top_grams["count"][::-1])
ax.set_xlabel("Count"); ax.set_ylabel("3-gram"); ax.set_title("Top Event 3-grams")
save_fig(fig, "eda_top_3grams")
toc("3-grams ready")

# First-order Markov transition matrix among core events
tic("Transition matrix")
core = EVENT_TYPES  # ignore "other" for clarity
idx = {e:i for i,e in enumerate(core)}
m = np.zeros((len(core), len(core)), dtype=np.float64)

prev = None
prev_sess = None
for _, row in seq_df.iterrows():
    e = row["event"]
    s = row["session_id"]
    if e not in core:
        prev = None; prev_sess = s
        continue
    if prev is not None and prev_sess == s:
        m[idx[prev], idx[e]] += 1.0
    prev = e; prev_sess = s

# Row-normalise
row_sums = m.sum(axis=1, keepdims=True) + 1e-12
tm = m / row_sums

tm_df = pd.DataFrame(tm, index=core, columns=core)
save_table(tm_df.reset_index().rename(columns={"index":"from"}), TAB_DIR / "transition_matrix_core.csv")

fig, ax = plt.subplots(figsize=(5.5,4.5))
im = ax.imshow(tm, aspect="auto")
ax.set_xticks(range(len(core))); ax.set_xticklabels(core, rotation=45, ha="right")
ax.set_yticks(range(len(core))); ax.set_yticklabels(core)
ax.set_title("Markov Transition Probabilities (core events)")
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
save_fig(fig, "eda_transition_matrix_core")
toc("Transition matrix ready")

[tic] 3-gram extraction
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\top_event_3grams.csv (20 rows)
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_top_3grams.png
[toc] 3-grams ready: 160.96s
[tic] Transition matrix
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\transition_matrix_core.csv (3 rows)
[saved] figure -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\figs\eda_transition_matrix_core.png
[toc] Transition matrix ready: 189.15s


189.14519143104553

## 9) Outlier/bot heuristics and data quality checks

In [22]:
# 9) Outlier/bot heuristics and data quality checks

tic("Outlier heuristics & data quality")

# Outliers by events per minute and long duration
q = sess_df["duration_sec"].quantile(OUTLIER_DUR_PCTL/100.0)
outliers = sess_df[(sess_df["events_per_min"] > OUTLIER_EPM_THRESH) | (sess_df["duration_sec"] > q)].copy()
save_table(outliers, TAB_DIR / "potential_automation_or_outliers.csv")

# Duplicates & unknowns (data qulity checks)
dupe_rows = events.duplicated(subset=["session_id","timestamp","event","itemid"]).sum()
unknown_events = set(events.loc[~events["event"].isin(EVENT_TYPES), "event"].unique()) - {"other"}
dq = pd.DataFrame([
    {"check":"duplicate_rows", "value": int(dupe_rows)},
    {"check":"unknown_event_labels", "value": ",".join(sorted(map(str, unknown_events))) if unknown_events else ""}
])
save_table(dq, TAB_DIR / "data_quality_checks.csv")

toc("Outlier & quality checks ready")

[tic] Outlier heuristics & data quality
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\potential_automation_or_outliers.csv (1,389,829 rows)
[saved] table -> C:\Users\Admin\Documents\WBS\Dissertation\Submission Related files\Notebooks\Modelling\EDA_outputs\tables\data_quality_checks.csv (2 rows)
[toc] Outlier & quality checks ready: 8.81s


8.806152582168579