In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pandas as pd

# ---------------- Config ----------------
DATA_ROOT = Path("data")
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

SECTION = "aggregated"  # we only handle 'aggregated' in this script
SAVE_FORMAT = "csv"     # 'csv' or 'parquet'

# Accept plural/singular folder names and normalize
TYPE_ALIASES = {
    "transaction": "transaction",
    "transactions": "transaction",
    "user": "user",
    "users": "user",
    "insurance": "insurance",
    "insurances": "insurance",
}

# -------------- Helpers ---------------
def _safe_get(d: dict, *keys, default=None):
    cur = d
    for k in keys:
        if not isinstance(cur, dict) or k not in cur:
            return default
        cur = cur[k]
    return cur

def detect_type_folders() -> Dict[str, str]:
    """
    Returns mapping of actual folder name -> normalized type ('transaction'|'user'|'insurance')
    under data/aggregated/* .
    """
    m: Dict[str, str] = {}
    root = DATA_ROOT / SECTION
    if not root.exists():
        return m
    for child in root.iterdir():
        if child.is_dir():
            norm = TYPE_ALIASES.get(child.name.lower())
            if norm:
                m[child.name] = norm
    return m

def parse_path(p: Path, norm_type_by_folder: Dict[str, str]) -> Optional[Dict[str, Optional[str]]]:
    """
    Supports (corrected state path):
      data/aggregated/<typeFolder>/country/india/<year>/<q>.json
      data/aggregated/<typeFolder>/country/india/state/<state>/<year>/<q>.json
    where <typeFolder> can be singular or plural; normalized to {'transaction','user','insurance'}.
    """
    try:
        parts = p.parts
        if "data" not in parts or "aggregated" not in parts:
            return None
        i = parts.index("data")
        if parts[i+1].lower() != "aggregated":
            return None

        type_folder = parts[i+2]
        typ = norm_type_by_folder.get(type_folder)
        if not typ:
            return None

        if parts[i+3].lower() != "country" or parts[i+4].lower() != "india":
            return None

        if not p.name.endswith(".json"):
            return None
        quarter = p.stem
        if quarter not in {"1","2","3","4"}:
            return None

        remainder = parts[i+5:-1]  # after 'india' up to file
        if len(remainder) == 1:
            year = remainder[0]
            geo_level = "country"
            state = None
        elif len(remainder) == 3 and remainder[0].lower() == "state":
            state = remainder[1]
            year = remainder[2]
            geo_level = "state"
        else:
            return None

        if not year.isdigit():
            return None

        return {
            "section": "aggregated",
            "type": typ,
            "geo_level": geo_level,
            "country": "india",
            "state": state,
            "year": int(year),
            "quarter": int(quarter),
            "source_path": str(p),
        }
    except Exception:
        return None

def iter_json_files() -> List[Tuple[Path, Dict[str, Optional[str]]]]:
    files = []
    type_map = detect_type_folders()
    # if nothing detected, still try known folders
    if not type_map:
        for fallback in ["transaction", "transactions", "user", "users", "insurance"]:
            type_map[fallback] = TYPE_ALIASES.get(fallback)
    for type_folder, norm_type in type_map.items():
        # Country
        for p in (DATA_ROOT / SECTION / type_folder / "country" / "india").glob("*/*.json"):
            meta = parse_path(p, type_map)
            if meta:
                files.append((p, meta))
        # State
        for p in (DATA_ROOT / SECTION / type_folder / "country" / "india" / "state").glob("*/*/*.json"):
            meta = parse_path(p, type_map)
            if meta:
                files.append((p, meta))
    return files

# -------------- Normalizers ---------------
def normalize_transaction_or_insurance(payload: dict, base: Dict) -> pd.DataFrame:
    rows = []
    items = _safe_get(payload, "data", "transactionData", default=[]) or []
    for item in items:
        name = item.get("name")
        instruments = item.get("paymentInstruments") or []
        total = next((pi for pi in instruments if (pi.get("type") or "").upper() == "TOTAL"), None)
        count = None if total is None else total.get("count")
        amount = None if total is None else total.get("amount")
        rows.append({
            **base,
            "category": name,
            "count": count,
            "amount": amount,
        })
    return pd.DataFrame(rows)

def normalize_user(payload: dict, base: Dict) -> pd.DataFrame:
    rows = []
    agg = _safe_get(payload, "data", "aggregated", default={}) or {}
    rows.append({
        **base,
        "brand": None,
        "registeredUsers": agg.get("registeredUsers"),
        "appOpens": agg.get("appOpens"),
        "brand_count": None,
        "brand_percentage": None,
    })
    by_device = _safe_get(payload, "data", "usersByDevice", default=[]) or []
    for d in by_device:
        rows.append({
            **base,
            "brand": d.get("brand"),
            "registeredUsers": None,
            "appOpens": None,
            "brand_count": d.get("count"),
            "brand_percentage": d.get("percentage"),
        })
    return pd.DataFrame(rows)

# -------------- Build ----------------
def collect_frames() -> Dict[Tuple[str, str], pd.DataFrame]:
    buckets: Dict[Tuple[str, str], List[pd.DataFrame]] = {}
    matched = iter_json_files()

    # quick report
    print("[i] Files matched:", len(matched))
    sample = [m[0] for m in matched[:10]]
    for s in sample:
        print("   •", s)

    for path, meta in matched:
        base = {
            "section": meta["section"],
            "type": meta["type"],
            "geo_level": meta["geo_level"],
            "country": meta["country"],
            "state": meta["state"],
            "year": meta["year"],
            "quarter": meta["quarter"],
            "period": f'{meta["year"]}-Q{meta["quarter"]}',
            "source_path": meta["source_path"],
        }
        try:
            with open(path, "r", encoding="utf-8") as f:
                payload = json.load(f)
        except Exception as e:
            print(f"[WARN] Failed to read {path}: {e}")
            continue

        if meta["type"] in ("transaction", "insurance"):
            df = normalize_transaction_or_insurance(payload, base)
        else:
            df = normalize_user(payload, base)

        buckets.setdefault((meta["type"], meta["geo_level"]), []).append(df)

    out: Dict[Tuple[str, str], pd.DataFrame] = {}
    for key, frames in buckets.items():
        if not frames:
            continue
        df = pd.concat(frames, ignore_index=True)
        common = ["section","type","geo_level","country","state","year","quarter","period","source_path"]
        optionals = ["category","count","amount","brand","registeredUsers","appOpens","brand_count","brand_percentage"]
        for c in optionals:
            if c not in df.columns:
                df[c] = pd.Series([None]*len(df))
        df = df[common + optionals]
        out[key] = df.sort_values(
            ["type","geo_level","state","year","quarter","category","brand"],
            na_position="last"
        ).reset_index(drop=True)
    return out

def save_outputs(dfs: Dict[Tuple[str, str], pd.DataFrame]) -> None:
    wanted = [
        ("insurance","country"),
        ("insurance","state"),
        ("transaction","country"),
        ("transaction","state"),
        ("user","country"),
        ("user","state"),
    ]
    for typ, geo in wanted:
        df = dfs.get((typ, geo), pd.DataFrame())
        fname = OUTPUT_DIR / f"aggregated_{typ}_{geo}.{ 'parquet' if SAVE_FORMAT=='parquet' else 'csv' }"
        if df.empty:
            print(f"[WARN] No rows for ({typ}, {geo}). Writing an empty file header to {fname}.")
            if SAVE_FORMAT == "parquet":
                # Parquet doesn't like purely empty w/o schema; write CSV header instead
                df.to_csv(fname.with_suffix(".csv"), index=False)
            else:
                df.to_csv(fname, index=False)
            continue
        if SAVE_FORMAT == "parquet":
            df.to_parquet(fname, index=False)
        else:
            df.to_csv(fname, index=False)
        print(f"[OK] Wrote {fname}  ({len(df):,} rows)")

def list_state_folders() -> List[str]:
    names = set()
    root = DATA_ROOT / SECTION
    if not root.exists():
        return []
    for type_folder in root.iterdir():
        if not type_folder.is_dir():
            continue
        if TYPE_ALIASES.get(type_folder.name.lower()) is None:
            continue
        state_root = type_folder / "country" / "india" / "state"
        if state_root.exists():
            for s in state_root.iterdir():
                if s.is_dir():
                    names.add(s.name)
    return sorted(names)

# -------------- Main ----------------
if __name__ == "__main__":
    print("[i] Scanning repository for aggregated JSON ...")
    dfs = collect_frames()
    save_outputs(dfs)
    states = list_state_folders()
    if states:
        print(f"[i] Found {len(states)} state folders:")
        for s in states:
            print("   -", s)
    else:
        print("[i] No state folders found under aggregated/*/country/india/state/*")


[i] Scanning repository for aggregated JSON ...
[i] Files matched: 2775
   • data/aggregated/transaction/country/india/2022/1.json
   • data/aggregated/transaction/country/india/2022/2.json
   • data/aggregated/transaction/country/india/2022/3.json
   • data/aggregated/transaction/country/india/2022/4.json
   • data/aggregated/transaction/country/india/2024/1.json
   • data/aggregated/transaction/country/india/2024/2.json
   • data/aggregated/transaction/country/india/2024/3.json
   • data/aggregated/transaction/country/india/2024/4.json
   • data/aggregated/transaction/country/india/2023/1.json
   • data/aggregated/transaction/country/india/2023/2.json
[OK] Wrote outputs/aggregated_insurance_country.csv  (19 rows)
[OK] Wrote outputs/aggregated_insurance_state.csv  (682 rows)
[OK] Wrote outputs/aggregated_transaction_country.csv  (140 rows)
[OK] Wrote outputs/aggregated_transaction_state.csv  (5,034 rows)
[OK] Wrote outputs/aggregated_user_country.csv  (215 rows)
[OK] Wrote outputs/agg