# 1) Imports & data

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

# paths
DATA = r"D:\MGA\job\data analyst\flight-price-analytics\data"
FACT = f"{DATA}/fares_fact.csv"

df = pd.read_csv(
    FACT,
    parse_dates=["search_date","depart_date","snapshot_date"]
)
df["route"] = df["origin"] + "-" + df["destination"]
df["lead_time_days"] = (df["depart_date"] - df["search_date"]).dt.days
df["dow"] = df["search_date"].dt.dayofweek  # Mon=0..Sun=6
df = df.dropna(subset=["price"])


# 2) Utilities (Welch t-test, Mann–Whitney, effect sizes)

In [3]:
def cohen_d(a, b):
    a, b = np.array(a, float), np.array(b, float)
    va, vb = a.var(ddof=1), b.var(ddof=1)
    n1, n2 = len(a), len(b)
    s = np.sqrt(((n1-1)*va + (n2-1)*vb) / (n1+n2-2))
    return (a.mean() - b.mean()) / s if s > 0 else np.nan

def cliffs_delta(a, b):
    # robust, nonparametric effect size
    a, b = np.array(a), np.array(b)
    gt = sum(x > y for x in a for y in b)
    lt = sum(x < y for x in a for y in b)
    return (gt - lt) / (len(a)*len(b))

def welch_t(a, b):
    return stats.ttest_ind(a, b, equal_var=False, nan_policy='omit')

def mw_u(a, b):
    return stats.mannwhitneyu(a, b, alternative="two-sided")


# 3) 95% CI for the mean difference (Welch)

In [4]:
from scipy import stats
import numpy as np

def welch_ci(a, b, alpha=0.05):
    a, b = np.asarray(a), np.asarray(b)
    a = a[~np.isnan(a)]
    b = b[~np.isnan(b)]
    if len(a) < 2 or len(b) < 2:
        return np.nan, np.nan

    ma, mb = a.mean(), b.mean()
    va, vb = a.var(ddof=1), b.var(ddof=1)
    na, nb = len(a), len(b)
    se = np.sqrt(va/na + vb/nb)

    # Welch–Satterthwaite df
    df = (va/na + vb/nb)**2 / ((va**2)/((na**2)*(na-1)) + (vb**2)/((nb**2)*(nb-1)))
    if not np.isfinite(df) or se == 0:
        return np.nan, np.nan

    tcrit = stats.t.ppf(1 - alpha/2, df)
    # diff defined like your tables: weekend_mean - weekday_mean, etc.
    diff = mb - ma
    lo, hi = diff - tcrit*se, diff + tcrit*se
    return lo, hi


# 4) Multiple testing control (FDR/BH)

In [5]:
# --- FDR (Benjamini–Hochberg) helper ---
import numpy as np

def add_fdr(df, p_col="welch_p", out_col="welch_q"):
    p = df[p_col].to_numpy()
    m = len(p)
    order = np.argsort(p)
    ranks = np.empty_like(order)
    ranks[order] = np.arange(1, m+1)
    # raw BH
    q_raw = p * m / ranks
    # enforce monotone non-increasing when sorted by p
    q_mono = np.minimum.accumulate(q_raw[order][::-1])[::-1]
    df[out_col] = np.empty_like(p, dtype=float)
    df.loc[df.index[order], out_col] = q_mono
    return df


# 5)Weekend vs Weekday (price levels)

In [6]:
def test_weekend_weekday(g):
    wkdy = g.loc[g["dow"].between(0, 4), "price"]   # Mon–Fri
    wknd = g.loc[g["dow"].between(5, 6), "price"]   # Sat–Sun
    if len(wkdy) < 10 or len(wknd) < 10:
        return None
    t = welch_t(wkdy, wknd)
    u = mw_u(wkdy, wknd)
    ci_lo, ci_hi = welch_ci(wknd, wkdy)             # weekend - weekday CI

    return {
        "n_weekday": len(wkdy),
        "n_weekend": len(wknd),
        "weekday_mean": wkdy.mean(),
        "weekend_mean": wknd.mean(),
        "diff_mean": wknd.mean() - wkdy.mean(),
        "diff_ci_lo": ci_lo,
        "diff_ci_hi": ci_hi,
        "welch_t": t.statistic, "welch_p": t.pvalue,
        "cohen_d": cohen_d(wknd, wkdy),
        "mw_U": u.statistic, "mw_p": u.pvalue,
        "cliffs_delta": cliffs_delta(wknd, wkdy),
    }

# Safety check to avoid the earlier None issue
import pandas as pd
assert isinstance(df, pd.DataFrame), "df is not a DataFrame. Re-run the Imports & data cell."

wk_results = []
for r, g in df.groupby("route"):
    out = test_weekend_weekday(g)
    if out:
        out["route"] = r
        wk_results.append(out)

wk_results = pd.DataFrame(wk_results).sort_values("welch_p")
wk_results = add_fdr(wk_results, "welch_p", "welch_q")
wk_results.head()


Unnamed: 0,n_weekday,n_weekend,weekday_mean,weekend_mean,diff_mean,diff_ci_lo,diff_ci_hi,welch_t,welch_p,cohen_d,mw_U,mw_p,cliffs_delta,route,welch_q
85,1196,538,422.672241,431.150558,8.478317,-12.716605,-4.240029,-3.926004,9.3e-05,0.216517,288582.5,0.00059,0.103012,TYO-NYC,0.00818
82,1112,518,298.27518,303.760618,5.485438,-9.47912,-1.491756,-2.695474,0.007152,0.146472,264107.5,0.006898,0.082986,TYO-LAX,0.206368
74,526,253,444.536122,458.873518,14.337396,-25.262512,-3.41228,-2.578277,0.01021,0.195754,58953.5,0.009903,0.114001,SYD-LAX,0.206368
81,1878,803,245.921725,245.083437,-0.838288,0.195855,1.480721,2.559675,0.01058,-0.111297,801159.5,0.010086,-0.062522,TYO-DXB,0.206368
14,1440,582,395.286806,393.187285,-2.09952,0.467759,3.731282,2.524707,0.011725,-0.125203,440618.0,0.069321,-0.051494,DXB-NYC,0.206368


# 6) Short (≤14d) vs Long lead time (>14d)

In [7]:
def test_lead_time(g, cutoff=14):
    short = g.loc[g["lead_time_days"] <= cutoff, "price"]
    long  = g.loc[g["lead_time_days"] >  cutoff, "price"]
    if len(short) < 10 or len(long) < 10:
        return None

    t = welch_t(short, long)         # from your utilities
    u = mw_u(short, long)
    ci_lo, ci_hi = welch_ci(short, long)   # if you added the CI helper

    return {
        "n_short": len(short), "n_long": len(long),
        "short_mean": short.mean(), "long_mean": long.mean(),
        "diff_mean": short.mean() - long.mean(),   # (+) => short > long
        "welch_t": t.statistic, "welch_p": t.pvalue,
        "cohen_d": cohen_d(short, long),
        "mw_U": u.statistic, "mw_p": u.pvalue,
        "cliffs_delta": cliffs_delta(short, long),
        "diff_ci_lo": ci_lo,          # optional, if you added CI
        "diff_ci_hi": ci_hi,          # optional, if you added CI
    }

# ---- run for all routes ----
lt_rows = []                                       # <- list, not DataFrame
for r, g in df.groupby("route"):
    out = test_lead_time(g, cutoff=14)
    if out:
        out["route"] = r
        lt_rows.append(out)

lt_results = pd.DataFrame(lt_rows).sort_values("welch_p")
lt_results = add_fdr(lt_results, "welch_p", "welch_q")   # your BH helper
lt_results.head()



Unnamed: 0,n_short,n_long,short_mean,long_mean,diff_mean,welch_t,welch_p,cohen_d,mw_U,mw_p,cliffs_delta,diff_ci_lo,diff_ci_hi,route,welch_q
76,379,742,644.263852,601.109164,43.154688,11.439329,1.880567e-28,0.669105,197119.0,3.030941e-28,0.401895,-50.558209,-35.751167,SYD-NYC,1.6360929999999998e-26
39,51,1371,356.72549,322.342815,34.382675,14.941028,2.2103530000000003e-23,1.049968,60682.5,4.109835e-19,0.735745,-38.972976,-29.792373,LHR-NYC,9.615034e-22
33,142,638,479.809859,429.128527,50.681333,10.574113,6.149132e-22,0.835203,66214.0,7.044221e-18,0.461742,-60.119746,-41.242919,LAX-SYD,1.783248e-20
82,171,728,340.859649,310.940934,29.918715,9.867193,7.704756e-20,0.794967,89520.0,4.2931729999999997e-19,0.438211,-35.888051,-23.949379,TYO-LHR,1.675784e-18
21,158,731,351.85443,315.930233,35.924198,9.64085,1.122552e-18,0.850066,83940.5,3.5328519999999994e-19,0.45354,-43.266362,-28.582033,HND-LHR,1.9532400000000002e-17


# 7) Export tidy tables for Power BI

In [9]:
out_weekend = f"{DATA}/hyp_weekend_vs_weekday.csv"
out_lead    = f"{DATA}/hyp_short_vs_long.csv"

wk_results.to_csv(out_weekend, index=False)
lt_results.to_csv(out_lead, index=False)
print("Saved:", out_weekend, "and", out_lead)


Saved: D:\MGA\job\data analyst\flight-price-analytics\data/hyp_weekend_vs_weekday.csv and D:\MGA\job\data analyst\flight-price-analytics\data/hyp_short_vs_long.csv
