In [None]:
!pip install powerlaw
!pip install dataclass

Collecting powerlaw
  Downloading powerlaw-1.5-py3-none-any.whl.metadata (9.3 kB)
Downloading powerlaw-1.5-py3-none-any.whl (24 kB)
Installing collected packages: powerlaw
Successfully installed powerlaw-1.5
[31mERROR: Could not find a version that satisfies the requirement dataclass (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for dataclass[0m[31m
[0m

In [None]:
# heavy_tail_inspector.py
# Minimal, modular heavy-tail diagnostics for numeric columns in a dataset.

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

try:
    # Optional: for power-law vs alternatives (Clauset-Shalizi-Newman style)
    from powerlaw import Fit as PowerLawFit
    _HAS_POWERLAW = True
except Exception:
    _HAS_POWERLAW = False

try:
    from scipy.stats import genpareto
    _HAS_SCIPY = True
except Exception:
    _HAS_SCIPY = False


@dataclass
class HeavyTailResult:
    """
    Per-column outcome of the heavy-tail diagnostics run by HeavyTailInspector.

    Float fields are NaN and Optional fields are None when the corresponding
    diagnostic was not computed (for example when the column has too few
    positive values, in which case `verdict` is "insufficient_data").
    """
    n: int                                # number of finite, non-NaN values (any sign)
    positive_fraction: float              # share of those n values that are > 0
    ccdf_tail_slope: float                # slope on log-log CCDF over top tail_frac
    ccdf_tail_r2: float                   # R^2 of that log-log least-squares fit
    hill_alpha: Optional[float]           # None if cannot compute
    hill_k: Optional[int]                 # number of top order statistics used for Hill; None when hill_alpha is None
    quantile_ratio: float                 # (Q99.9 - Q99) / (Q99 - Q90)
    gpd_xi: Optional[float]               # > 0 suggests heavy tail
    powerlaw_vs_exp_R: Optional[float]    # first value returned by powerlaw's distribution_compare('power_law', 'exponential')
    powerlaw_vs_exp_p: Optional[float]    # second value returned by that comparison (used as a p-value in the verdict)
    xmin: Optional[float]                 # xmin chosen by the powerlaw fit
    verdict: str                          # "heavy_tailed" | "probably_heavy_tailed" | "borderline" | "not_heavy_tailed" | "insufficient_data"
    notes: str                            # human-readable list of the diagnostics that fired


class HeavyTailInspector:
    """
    Screen numeric DataFrame columns for heavy right tails.

    Each column is reduced to its finite, strictly positive values and scored
    by up to five diagnostics: a least-squares fit of the log-log CCDF tail,
    the Hill estimator, a quantile-spread ratio, the shape parameter of a
    generalized Pareto fit (needs scipy), and a power-law vs exponential
    comparison (needs the `powerlaw` package). The number of diagnostics that
    fire decides the verdict.

    Usage:
        hti = HeavyTailInspector(tail_frac=0.1, min_positive=1000)
        res = hti.inspect(df, columns=["fare_amount","trip_distance"])
    """
    def __init__(self, tail_frac: float = 0.1, min_positive: int = 500, seed: int = 0):
        """
        tail_frac: fraction of largest observations used for CCDF slope & Hill (0.05–0.2 is typical);
                   also sets the threshold quantile for the GPD fit
        min_positive: minimum number of positive values required to run tests
        seed: seed for the numpy Generator stored on `self.rng`; none of the
              diagnostics in this class draw from it

        Raises ValueError when tail_frac is outside [0.01, 0.5].
        """
        if not (0.01 <= tail_frac <= 0.5):
            raise ValueError("tail_frac should be in [0.01, 0.5].")
        self.tail_frac = tail_frac
        self.min_positive = min_positive
        self.rng = np.random.default_rng(seed)

    # ---------------- Core API ----------------

    def inspect(self, df: pd.DataFrame, columns: List[str]) -> Dict[str, "HeavyTailResult"]:
        """
        Run the diagnostics on every column named in `columns`.

        Values are coerced with pd.to_numeric(errors="coerce"), so entries that
        cannot be parsed become NaN and are dropped. Returns a dict mapping each
        column name to its HeavyTailResult. A name that is not a column of `df`
        raises KeyError from the `df[col]` lookup.
        """
        results = {}
        for col in columns:
            s = pd.to_numeric(df[col], errors="coerce").dropna()
            results[col] = self._inspect_series(s, col)
        return results

    # ------------- Helpers per column -------------

    def _inspect_series(self, s: pd.Series, colname: str) -> "HeavyTailResult":
        """
        Compute all diagnostics for one numeric series and combine them into a verdict.

        s: numeric series; ±inf and NaN are removed here
        colname: column label (currently not used by the computation)

        Returns a HeavyTailResult. When there are no positive values, or fewer
        than `min_positive` of them, the verdict is "insufficient_data" and no
        diagnostic is run.
        """
        s = s.replace([np.inf, -np.inf], np.nan).dropna()
        n_all = len(s)
        # Most tail methods require positive support; filter to positive part
        s_pos = s[s > 0]
        n_pos = len(s_pos)
        pos_frac = 0.0 if n_all == 0 else n_pos / n_all

        # n_pos == 0 is rejected explicitly: with min_positive <= 0 an empty
        # array would otherwise reach np.quantile below.
        if n_pos == 0 or n_pos < self.min_positive:
            return HeavyTailResult(
                n=n_all, positive_fraction=pos_frac,
                ccdf_tail_slope=np.nan, ccdf_tail_r2=np.nan,
                hill_alpha=None, hill_k=None,
                quantile_ratio=np.nan, gpd_xi=None,
                powerlaw_vs_exp_R=None, powerlaw_vs_exp_p=None,
                xmin=None, verdict="insufficient_data",
                notes=f"Need at least {self.min_positive} positive values; got {n_pos}."
            )

        x = np.sort(s_pos.values)
        # Reasons a diagnostic could not be computed; appended to the notes so
        # a missing value is distinguishable from "diagnostic did not fire".
        diagnostics = []

        # ---- Quantile ratio
        quant_ratio = self._quantile_ratio(x)
        if np.isnan(quant_ratio):
            diagnostics.append("Quantile ratio undefined (Q99 == Q90).")

        # ---- CCDF tail slope on log-log
        slope, r2 = self._ccdf_tail_fit(x)  # slope ≈ -alpha for a pure power law

        # ---- Hill estimator on top-k (returns None when k >= len(x))
        k = max(50, int(self.tail_frac * len(x)))
        hill_alpha = self._hill_alpha(x, k)

        # ---- GPD fit for exceedances over high threshold
        xi, gpd_error = self._gpd_xi(x)
        if gpd_error is not None:
            diagnostics.append(gpd_error)

        # ---- powerlaw model comparison (optional)
        R, p, xmin, powerlaw_error = self._powerlaw_vs_exponential(x)
        if powerlaw_error is not None:
            diagnostics.append(powerlaw_error)

        # ---- Verdict logic (simple rules; you can tweak thresholds)
        flags = 0
        notes = []

        # 1) CCDF tail reasonably linear and negative slope
        #    (comparisons with NaN are False, so a failed fit never fires).
        # NOTE(review): this requires the slope to be steeper than -1.5, so a
        # tail with |slope| <= 1.5 does not fire this rule even though rule 2
        # treats a smaller alpha as heavier — confirm the threshold is intended.
        if (r2 > 0.95) and (slope < -1.5):
            flags += 1
            notes.append("CCDF tail ~linear on log-log with steep slope.")

        # 2) Hill alpha suggests heaviness (alpha < 4 often used; < 2 very heavy)
        if (hill_alpha is not None) and (hill_alpha < 4.0):
            flags += 1
            notes.append(f"Hill α≈{hill_alpha:.2f} (<4).")

        # 3) Quantile spread ratio large (NaN never fires)
        if quant_ratio > 10:
            flags += 1
            notes.append(f"Quantile ratio={quant_ratio:.1f} (>10).")

        # 4) GPD xi > 0 implies heavy tail
        if (xi is not None) and (xi > 0):
            flags += 1
            notes.append(f"GPD ξ≈{xi:.3f} (>0).")

        # 5) powerlaw beats exponential
        if (R is not None) and (p is not None) and (R > 0) and (p < 0.05):
            flags += 1
            notes.append("Power-law >> exponential (LR test).")

        if flags >= 3:
            verdict = "heavy_tailed"
        elif flags == 2:
            verdict = "probably_heavy_tailed"
        elif flags == 1:
            verdict = "borderline"
        else:
            verdict = "not_heavy_tailed"

        notes_text = "; ".join(notes) if notes else "No strong heavy-tail signals."
        if diagnostics:
            notes_text += " [diagnostics: " + "; ".join(diagnostics) + "]"

        return HeavyTailResult(
            n=n_all,
            positive_fraction=pos_frac,
            ccdf_tail_slope=float(slope),
            ccdf_tail_r2=float(r2),
            hill_alpha=None if hill_alpha is None else float(hill_alpha),
            hill_k=None if hill_alpha is None else int(k),
            quantile_ratio=float(quant_ratio),
            gpd_xi=None if xi is None else float(xi),
            powerlaw_vs_exp_R=None if R is None else float(R),
            powerlaw_vs_exp_p=None if p is None else float(p),
            xmin=None if xmin is None else float(xmin),
            verdict=verdict,
            notes=notes_text
        )

    def _ccdf_tail_fit(self, x_sorted_pos: np.ndarray) -> Tuple[float, float]:
        """Fit log(CCDF) against log(x) over the top `tail_frac` of an ascending array; return (slope, R^2)."""
        n = len(x_sorted_pos)
        tail_start_idx = int((1 - self.tail_frac) * n)
        tail = x_sorted_pos[tail_start_idx:]
        # CCDF for sorted x: P(X > x_i) = 1 - i/n  (i is the 1-based rank)
        ranks = np.arange(tail_start_idx, n) + 1
        ccdf_tail = 1 - ranks / n
        # Drop zeros in log (the largest observation always has CCDF 0)
        mask = (tail > 0) & (ccdf_tail > 0)
        return self._ols_slope_r2(np.log(tail[mask]), np.log(ccdf_tail[mask]))

    def _gpd_xi(self, x_sorted_pos: np.ndarray) -> Tuple[Optional[float], Optional[str]]:
        """
        Fit a generalized Pareto distribution to exceedances over the
        (1 - tail_frac) quantile.

        Returns (xi, error): xi is the fitted shape or None; error is a short
        message when the fit raised, otherwise None. Nothing is fitted when
        scipy is unavailable or there are fewer than 50 exceedances.
        """
        if not _HAS_SCIPY:
            return None, None
        u = np.quantile(x_sorted_pos, 1 - self.tail_frac, method="linear")
        excess = x_sorted_pos[x_sorted_pos > u] - u
        if len(excess) < 50:
            return None, None
        try:
            # Fit (xi, loc, scale); we passed excess so loc is fixed at 0
            xi, _, _ = genpareto.fit(excess, floc=0)
        except Exception as exc:
            # Best-effort diagnostic: report the failure instead of hiding it.
            return None, f"GPD fit failed ({type(exc).__name__}: {exc})."
        return xi, None

    @staticmethod
    def _powerlaw_vs_exponential(
        x_sorted_pos: np.ndarray,
    ) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[str]]:
        """
        Compare a power-law fit against an exponential using the `powerlaw` package.

        Returns (R, p, xmin, error). All of R, p, xmin are None when the package
        is unavailable or the fit raised; in the latter case error holds a short
        message.
        """
        if not _HAS_POWERLAW:
            return None, None, None, None
        try:
            # verbose=False keeps the library's progress messages out of the output.
            fit = PowerLawFit(x_sorted_pos, discrete=False, verbose=False)
            xmin = float(fit.xmin)
            # Power-law vs exponential
            R, p = fit.distribution_compare('power_law', 'exponential')
        except Exception as exc:
            # Best-effort diagnostic: report the failure instead of hiding it.
            return None, None, None, f"powerlaw fit failed ({type(exc).__name__}: {exc})."
        return R, p, xmin, None

    # ------------- Internal utilities -------------

    @staticmethod
    def _quantile_ratio(x_sorted_pos: np.ndarray) -> float:
        """
        Return (Q99.9 - Q99) / (Q99 - Q90), or NaN when Q99 - Q90 is not positive.

        A zero denominator (e.g. heavily tied or discrete data) makes the ratio
        undefined; NaN is returned so that it cannot trigger the ">10" rule.
        """
        q90, q99, q999 = np.quantile(x_sorted_pos, [0.90, 0.99, 0.999], method="linear")
        denom = q99 - q90
        if not denom > 0:
            return np.nan
        return float((q999 - q99) / denom)

    @staticmethod
    def _ols_slope_r2(x: np.ndarray, y: np.ndarray) -> Tuple[float, float]:
        """
        Least-squares fit of y = a + b x; return (b, R^2).

        Returns (nan, nan) when there are fewer than 3 points or when all x are
        identical (the slope is not identifiable). R^2 is nan when y is constant.
        """
        if len(x) < 3:
            return (np.nan, np.nan)
        if np.min(x) == np.max(x):
            return (np.nan, np.nan)
        X = np.vstack([np.ones_like(x), x]).T
        beta, *_ = np.linalg.lstsq(X, y, rcond=None)
        yhat = X @ beta
        ss_res = np.sum((y - yhat) ** 2)
        ss_tot = np.sum((y - y.mean()) ** 2)
        r2 = 1 - ss_res / ss_tot if ss_tot > 0 else np.nan
        slope = beta[1]
        return slope, r2

    @staticmethod
    def _hill_alpha(x_sorted_pos: np.ndarray, k: int) -> Optional[float]:
        """
        Hill estimate of the tail index from the top k values of an ascending,
        positive array.

        Returns None when k < 5, when the array has k or fewer elements, or when
        the mean log-excess over the (k+1)-th largest value is not positive.
        """
        # x_sorted_pos must be ascending; take top k tail points
        if len(x_sorted_pos) <= k or k < 5:
            return None
        tail = x_sorted_pos[-k:]
        xk1 = x_sorted_pos[-(k+1)]
        logs = np.log(tail) - np.log(xk1)
        Hk_inv = np.mean(logs)
        if Hk_inv <= 0:
            return None
        return 1.0 / Hk_inv


# ---------------- Example usage ----------------
if __name__ == "__main__":
    # Synthetic demo: a lognormal bulk mixed with a scaled Pareto tail.
    # A seeded Generator is used so that re-running the cell reproduces the
    # same sample and therefore the same printed result.
    demo_rng = np.random.default_rng(0)
    df = pd.DataFrame({
        "x": np.r_[demo_rng.lognormal(2, 0.6, 10_000),
                   (demo_rng.pareto(2.2, 5000) + 1) * 50]
    })
    inspector = HeavyTailInspector(tail_frac=0.1, min_positive=500)
    results = inspector.inspect(df, columns=["x"])
    for col, res in results.items():
        print(f"\nColumn: {col}")
        print(res)


Calculating best minimal value for power law fit

Column: x
HeavyTailResult(n=15000, positive_fraction=1.0, ccdf_tail_slope=-2.1711463657878705, ccdf_tail_r2=0.9983781583033131, hill_alpha=2.1608826379507677, hill_k=1500, quantile_ratio=2.700411756953102, gpd_xi=0.465106052759132, powerlaw_vs_exp_R=755.0861245977169, powerlaw_vs_exp_p=3.1059324695325984e-19, xmin=50.01204654059562, verdict='heavy_tailed', notes='CCDF tail ~linear on log-log with steep slope.; Hill α≈2.16 (<4).; GPD ξ≈0.465 (>0).; Power-law >> exponential (LR test).')


In [None]:
import pandas as pd

# Load the trip records from the parquet file in the working directory into `df`
# (the file name suggests green-taxi data for 2025-08).
df = pd.read_parquet("green_tripdata_2025-08.parquet")


In [None]:
import pandas as pd

# HeavyTailInspector comes from the cell that defines it; `df` is the frame
# loaded in the preceding cell.

# Columns to screen for heavy tails.
cols = [
    "fare_amount",
    "trip_distance",
    "tip_amount",
    "total_amount",
    "congestion_surcharge",
    "cbd_congestion_fee",
]

hti = HeavyTailInspector(min_positive=2000, tail_frac=0.10)
out = hti.inspect(df, cols)

# One plain-text report per inspected column.
for c in out:
    r = out[c]
    print("\n", c)
    print("verdict:", r.verdict)
    print("notes  :", r.notes)
    print(f"n={r.n}, pos%={r.positive_fraction:.2%}")
    print(f"CCDF slope={r.ccdf_tail_slope:.2f}, R2={r.ccdf_tail_r2:.3f}")
    print(f"Hill α={r.hill_alpha}, k={r.hill_k}, Q-ratio={r.quantile_ratio:.1f}")
    print(f"GPD xi={r.gpd_xi}, LR(R)={r.powerlaw_vs_exp_R}, p={r.powerlaw_vs_exp_p}, xmin={r.xmin}")



 fare_amount
verdict: heavy_tailed
notes  : CCDF tail ~linear on log-log with steep slope.; Hill α≈2.16 (<4).; GPD ξ≈0.142 (>0).
n=46306, pos%=99.09%
CCDF slope=-2.65, R2=0.979
Hill α=2.1570278796919684, k=4588, Q-ratio=1.5
GPD xi=0.1415588539304969, LR(R)=None, p=None, xmin=None

 trip_distance
verdict: probably_heavy_tailed
notes  : Hill α≈2.57 (<4).; GPD ξ≈0.329 (>0).
n=46306, pos%=96.84%
CCDF slope=-1.82, R2=0.693
Hill α=2.5723018403409514, k=4484, Q-ratio=1.5
GPD xi=0.32862527417289994, LR(R)=None, p=None, xmin=None

 tip_amount
verdict: heavy_tailed
notes  : CCDF tail ~linear on log-log with steep slope.; Hill α≈2.43 (<4).; GPD ξ≈0.267 (>0).
n=46306, pos%=61.14%
CCDF slope=-2.68, R2=0.993
Hill α=2.4340867678928424, k=2831, Q-ratio=2.2
GPD xi=0.2669920567735478, LR(R)=None, p=None, xmin=None

 total_amount
verdict: heavy_tailed
notes  : CCDF tail ~linear on log-log with steep slope.; Hill α≈2.46 (<4).; GPD ξ≈0.156 (>0).
n=46306, pos%=99.53%
CCDF slope=-2.92, R2=0.986
Hill α=2.460

In [None]:
# describe() statistics with one row per column of `df`; the two lpep_*
# datetime columns are excluded, so `df` must be the frame that has them.
# Only rows whose mean exceeds their first quartile ('25%') are kept.
summary = (
    df.describe()
    .drop(columns=['lpep_dropoff_datetime', 'lpep_pickup_datetime'])
    .T
    .loc[lambda stats: stats['mean'] > stats['25%']]
)

In [None]:
import gzip
import shutil

file_path = 'usa_00002.csv.gz'

# Stream-decompress the gzip archive into usa_00002.csv in the working
# directory; copyfileobj copies between the two open file objects.
with gzip.open(file_path, 'rb') as f_in, open('usa_00002.csv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
import pandas as pd

# Read the CSV written by the decompression cell.
# NOTE: this rebinds `df`, which an earlier cell bound to the parquet frame;
# cells that rely on the lpep_* columns must run before this one.
df = pd.read_csv("usa_00002.csv")

In [None]:
# Display the column labels of the frame currently bound to `df`.
df.columns

Index(['YEAR', 'SAMPLE', 'SERIAL', 'CBSERIAL', 'HHWT', 'CLUSTER', 'STRATA',
       'GQ', 'HHINCOME'],
      dtype='object')

In [None]:
import pandas as pd

# HeavyTailInspector comes from the cell that defines it; `df` is the frame
# read from usa_00002.csv.
cols = ["HHINCOME", "HHWT"]

# NOTE(review): IPUMS USA documents HHINCOME = 9999999 as the "N/A" code.
# If the code is left in, it is treated as a real nine-million-dollar income
# and dominates every tail statistic; the previously recorded run is consistent
# with that (Q-ratio=0.0, i.e. Q99.9 == Q99, and Hill α≈0.52 for HHINCOME).
# Confirm the code against this extract's codebook.
HHINCOME_NA_CODE = 9_999_999
income_df = df.assign(
    HHINCOME=df["HHINCOME"].mask(df["HHINCOME"] == HHINCOME_NA_CODE)
)

hti = HeavyTailInspector(tail_frac=0.10, min_positive=2000)
# inspect() drops NaN per column, so masking HHINCOME does not remove HHWT rows.
out = hti.inspect(income_df, columns=cols)

for c, r in out.items():
    print("\n", c)
    print("verdict:", r.verdict)
    print("notes  :", r.notes)
    print(f"n={r.n}, pos%={r.positive_fraction:.2%}")
    print(f"CCDF slope={r.ccdf_tail_slope:.2f}, R2={r.ccdf_tail_r2:.3f}")
    print(f"Hill α={r.hill_alpha}, k={r.hill_k}, Q-ratio={r.quantile_ratio:.1f}")
    print(f"GPD xi={r.gpd_xi}, LR(R)={r.powerlaw_vs_exp_R}, p={r.powerlaw_vs_exp_p}, xmin={r.xmin}")


Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
xmin progress: 99%
 HHINCOME
verdict: heavy_tailed
notes  : Hill α≈0.52 (<4).; GPD ξ≈2.168 (>0).; Power-law >> exponential (LR test).
n=3405809, pos%=99.22%
CCDF slope=-0.47, R2=0.478
Hill α=0.5151137161512463, k=337915, Q-ratio=0.0
GPD xi=2.1680188162446465, LR(R)=3235307.5697062323, p=0.0, xmin=60000.0

 HHWT
verdict: heavy_tailed
notes  : CCDF tail ~linear on log-log with steep slope.; Hill α≈2.60 (<4).; GPD ξ≈0.036 (>0).; Power-law >> exponential (LR test).
n=3405809, pos%=100.00%
CCDF slope=-3.24, R2=0.963
Hill α=2.6004088038339823, k=340580, Q-ratio=1.1
GPD xi=0.03619342149294566, LR(R)=71.71443280434283, p=0.00016633806846291178, xmin=570.0
