# Crowdless — Crowdness Demo

End-to-end notebook for training, exploring, and predicting a **crowdness score** from generic per-area time-series JSON files stored under `./areas_output`.

**Input JSON expectation**
- Each file: `{ "area": "<name>", "items": [ { "datetime": "...", "<field_a>": <number>, "<field_b>": <number> }, ... ] }`
- Use `FIELD_MAP` below to map your JSON field names to `feature_x` and `feature_y`. Default mapping aligns with the current test files.

**Outputs**
- `areas_output/metrics_database.pkl` — processed DataFrame for fast lookup
- `areas_output/crowdness_model.json` — simple, tunable scoring rule parameters


## Setup

In [None]:
# If running locally and you need packages, uncomment:
# %pip install pandas numpy matplotlib


## Imports and Paths

In [None]:
from pathlib import Path
import json
from typing import Dict, Any

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Directory that holds the per-area input JSON files and receives the outputs.
AREAS_DIR = Path("./areas_output")

# Generated artifacts, written next to the input files.
DB_FILE = AREAS_DIR.joinpath("metrics_database.pkl")
MODEL_FILE = AREAS_DIR.joinpath("crowdness_model.json")

# Translation from the model's generic feature names (keys) to the numeric
# field names found in your JSON items (values). The defaults match the
# current test data; change the values, not the keys, to suit your files.
FIELD_MAP = dict(
    feature_x="temperature_celsius",
    feature_y="relative_humidity_percent",
)

# Echo the configuration as the cell's output.
AREAS_DIR, DB_FILE, MODEL_FILE, FIELD_MAP

## Build database from JSON files (Train)

In [None]:
def build_database(areas_dir: Path, db_path: Path, field_map: Dict[str, str]) -> pd.DataFrame:
    """Collect per-area JSON time series into one DataFrame and pickle it.

    Every ``*.json`` file in ``areas_dir`` is expected to look like
    ``{"area": <name>, "items": [{"datetime": ..., <field>: <number>, ...}]}``.
    Files or items that do not fit that shape are reported and skipped instead
    of aborting the whole build.

    Args:
        areas_dir: Directory scanned (non-recursively) for ``*.json`` files.
        db_path: Destination of the pickled DataFrame.
        field_map: Maps ``"feature_x"`` / ``"feature_y"`` to the item keys that
            hold the corresponding numeric values.

    Returns:
        DataFrame indexed by UTC ``datetime`` (sorted ascending) with columns
        ``area``, ``feature_x`` and ``feature_y``; one row per
        ``(area, datetime)`` pair.

    Raises:
        FileNotFoundError: If ``areas_dir`` contains no JSON files at all.
        KeyError: If ``field_map`` lacks ``"feature_x"`` or ``"feature_y"``.
        RuntimeError: If no usable row could be extracted.
    """
    # Sorted so that "first duplicate wins" below does not depend on the
    # filesystem's directory order.
    json_files = sorted(areas_dir.glob("*.json"))
    if not json_files:
        raise FileNotFoundError(f"No JSON files found in {areas_dir.resolve()}.")

    # Outputs of this notebook that may live in the same directory.
    skip_files = {db_path.name, MODEL_FILE.name, "crowdness_scores.json"}
    fx_key = field_map["feature_x"]
    fy_key = field_map["feature_y"]
    rows = []

    for f in json_files:
        if f.name in skip_files:
            continue
        try:
            with open(f, "r", encoding="utf-8") as fh:
                data = json.load(fh)
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping {f.name}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Skipping {f.name}: top-level JSON value is not an object")
            continue

        area = data.get("area")
        if not area:
            print(f"Skipping {f.name}: missing 'area'")
            continue

        items = data.get("items") or []  # tolerate a missing key or "items": null
        if not isinstance(items, list):
            print(f"Skipping {f.name}: 'items' is not a list")
            continue

        malformed = 0
        for item in items:
            if not isinstance(item, dict):
                malformed += 1
                continue
            dt = item.get("datetime")
            fx = item.get(fx_key)
            fy = item.get(fy_key)
            if not dt or fx is None or fy is None:
                continue
            try:
                rows.append({
                    "area": area,
                    "datetime": pd.to_datetime(dt, utc=True),
                    "feature_x": float(fx),
                    "feature_y": float(fy),
                })
            except (TypeError, ValueError, OverflowError):
                # Unparseable timestamp or non-numeric feature value.
                malformed += 1
        if malformed:
            print(f"{f.name}: ignored {malformed} malformed item(s)")

    if not rows:
        raise RuntimeError("No valid rows found in JSONs for the configured FIELD_MAP.")

    df = (
        pd.DataFrame(rows)
        .dropna()
        .drop_duplicates(subset=["area", "datetime"])
        .set_index("datetime")
        # Nearest-timestamp lookups need a monotonic index; mergesort is
        # stable, so rows sharing a timestamp keep their relative order.
        .sort_index(kind="mergesort")
    )
    if df.empty:
        # Every candidate row was dropped as NaN/NaT.
        raise RuntimeError("No valid rows found in JSONs for the configured FIELD_MAP.")

    df.to_pickle(db_path)
    return df

def default_model() -> Dict[str, float]:
    """Return a fresh dict holding the default scoring-rule parameters.

    Keys, in order: ``ideal_x``, ``range_x``, ``ideal_y``, ``range_y``,
    ``weight_x``, ``weight_y``.
    """
    # feature_x: triangular score, 1.0 at ideal_x and 0.0 at ideal_x ± range_x.
    x_shape = {"ideal_x": 22.0, "range_x": 15.0}

    # feature_y: only penalized above ideal_y; 1.0 when <= ideal_y and
    # 0.0 at ideal_y + range_y.
    y_shape = {"ideal_y": 45.0, "range_y": 50.0}

    # Contribution of each partial score to the final score.
    weights = {"weight_x": 0.7, "weight_y": 0.3}

    return {**x_shape, **y_shape, **weights}

def save_model(model: Dict[str, float], path: Path) -> None:
    """Write the scoring parameters to ``path`` as indented JSON.

    The model is serialized to a string *before* the file is opened, so a
    model that cannot be encoded raises without truncating or partially
    overwriting an existing file.

    Args:
        model: Scoring parameters to persist.
        path: Destination file; created or overwritten.

    Raises:
        TypeError: If ``model`` contains a value JSON cannot encode.
        OSError: If the file cannot be written.
    """
    payload = json.dumps(model, indent=2)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)

# Train step: build the pickled database, then write the default model.
# Any failure is reported as a one-line message rather than a traceback.
try:
    df_trained = build_database(AREAS_DIR, DB_FILE, FIELD_MAP)
    model_rules = default_model()
    save_model(model_rules, MODEL_FILE)
    print(
        f"Saved {len(df_trained)} rows to {DB_FILE}",
        f"Saved model to {MODEL_FILE}",
        sep="\n",
    )
except Exception as exc:
    print(f"Train step: {exc}")

## Explore database

In [None]:
def load_db(db_path: Path) -> pd.DataFrame:
    """Load and return the pickled object stored at ``db_path``."""
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    frame = pd.read_pickle(db_path)
    return frame

def coverage_by_area(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize each area's time span, row count, and feature means.

    Args:
        df: Frame with a ``datetime`` index and ``area``, ``feature_x`` and
            ``feature_y`` columns.

    Returns:
        One row per area (sorted by area) with columns ``start``, ``end``,
        ``rows``, ``mean_x`` and ``mean_y``.
    """
    # Output column -> (source column, aggregation).
    summary_spec = {
        "start": ("datetime", "min"),
        "end": ("datetime", "max"),
        "rows": ("datetime", "count"),
        "mean_x": ("feature_x", "mean"),
        "mean_y": ("feature_y", "mean"),
    }
    flat = df.reset_index()  # expose the datetime index as a regular column
    per_area = flat.groupby("area").agg(**summary_spec)
    return per_area.sort_values("area")

# Show per-area coverage once the train cell has produced the database.
if not DB_FILE.exists():
    print("Database not found. Run the train cell.")
else:
    db = load_db(DB_FILE)
    cov = coverage_by_area(db)
    display(cov)

## Scoring model

In [None]:
def load_model(path: Path) -> Dict[str, float]:
    """Read the UTF-8 JSON file at ``path`` and return its parsed content."""
    with open(path, "r", encoding="utf-8") as fh:
        raw = fh.read()
    return json.loads(raw)

def _linear_falloff(distance: float, width: float) -> float:
    """Return 1.0 at distance 0, dropping linearly to 0.0 at ``width`` and beyond.

    A non-positive ``width`` is treated as a hard cutoff (1.0 only at distance
    0) instead of dividing by zero or producing values above 1.0.
    """
    if width <= 0:
        return 1.0 if distance == 0 else 0.0
    return max(0.0, 1.0 - distance / width)


def score_from_features(feature_x: float, feature_y: float, model: Dict[str, float]) -> float:
    """Combine both feature scores into a single weighted score.

    ``feature_x`` is scored with a triangle centred on ``ideal_x`` that reaches
    zero at ``ideal_x ± range_x``. ``feature_y`` scores 1.0 at or below
    ``ideal_y`` and falls linearly to zero at ``ideal_y + range_y``.

    Args:
        feature_x: Value of the first feature.
        feature_y: Value of the second feature.
        model: Mapping with ``ideal_x``, ``range_x``, ``ideal_y``, ``range_y``,
            ``weight_x`` and ``weight_y``.

    Returns:
        ``(x_score * weight_x + y_score * weight_y) * 100`` as a float; this
        spans 0–100 when the two weights sum to 1.

    Raises:
        KeyError: If ``model`` lacks one of the required parameters.
    """
    # feature_x: symmetric penalty on either side of the ideal.
    x_score = _linear_falloff(abs(feature_x - model["ideal_x"]), model["range_x"])

    # feature_y: one-sided penalty, applied only above the ideal.
    y_ideal = model["ideal_y"]
    if feature_y > y_ideal:
        y_score = _linear_falloff(feature_y - y_ideal, model["range_y"])
    else:
        y_score = 1.0

    weighted = x_score * model["weight_x"] + y_score * model["weight_y"]
    return float(weighted * 100.0)

# Print the stored scoring parameters, if the train cell has written them.
if not MODEL_FILE.exists():
    print("Model file not found. Run the train cell.")
else:
    mdl = load_model(MODEL_FILE)
    print(mdl)

### Score vs. feature_x

In [None]:
# Sweep feature_x over [-5, 45] while holding feature_y at its ideal value.
if not MODEL_FILE.exists():
    print("Model file not found.")
else:
    mdl = load_model(MODEL_FILE)
    y_fixed = mdl["ideal_y"]
    xs = np.linspace(-5, 45, 300)
    scores = [score_from_features(x, y_fixed, mdl) for x in xs]

    plt.figure()
    plt.plot(xs, scores)
    plt.title("Score vs feature_x (feature_y fixed at ideal_y)")
    plt.xlabel("feature_x")
    plt.ylabel("Score (0–100)")
    plt.show()

### Score vs. feature_y

In [None]:
# Sweep feature_y over [0, 100] while holding feature_x at its ideal value.
if not MODEL_FILE.exists():
    print("Model file not found.")
else:
    mdl = load_model(MODEL_FILE)
    x_fixed = mdl["ideal_x"]
    ys = np.linspace(0, 100, 300)
    scores = [score_from_features(x_fixed, y, mdl) for y in ys]

    plt.figure()
    plt.plot(ys, scores)
    plt.title("Score vs feature_y (feature_x fixed at ideal_x)")
    plt.xlabel("feature_y")
    plt.ylabel("Score (0–100)")
    plt.show()

### Score heatmap (feature_x × feature_y)

In [None]:
# Evaluate the score on a 101 × 101 grid and render it as a heatmap.
if not MODEL_FILE.exists():
    print("Model file not found.")
else:
    mdl = load_model(MODEL_FILE)
    xs = np.linspace(-5, 45, 101)
    ys = np.linspace(0, 100, 101)
    # Rows follow feature_y and columns follow feature_x, which matches
    # imshow's (row, column) layout with origin='lower'.
    grid = np.array(
        [[score_from_features(x, y, mdl) for x in xs] for y in ys],
        dtype=float,
    )

    plt.figure()
    plt.imshow(
        grid,
        origin='lower',
        aspect='auto',
        extent=[xs.min(), xs.max(), ys.min(), ys.max()],
    )
    plt.colorbar(label="Score (0–100)")
    plt.title("Score heatmap")
    plt.xlabel("feature_x")
    plt.ylabel("feature_y")
    plt.show()

## Predict for an area and time

In [None]:
def predict_area_time(area: str, dt: str, db: pd.DataFrame, model: Dict[str, float]) -> dict:
    """Score the record of ``area`` whose timestamp is closest to ``dt``.

    Args:
        area: Area name, matched case-insensitively against ``db['area']``.
        dt: Timestamp in any format ``pd.to_datetime`` accepts; it is
            interpreted as, or converted to, UTC.
        db: Frame indexed by datetime with ``area``, ``feature_x`` and
            ``feature_y`` columns.
        model: Scoring parameters passed through to ``score_from_features``.

    Returns:
        Dict with the inputs, the matched timestamp, its distance from the
        requested time, the stored area name, both feature values and the
        resulting score.

    Raises:
        ValueError: If ``db`` holds no rows for ``area``.
        RuntimeError: If no nearest timestamp could be determined.
    """
    input_dt = pd.to_datetime(dt, utc=True)
    area_db = db[db['area'].str.lower() == area.lower()]
    if area_db.empty:
        # dropna() keeps sorted() from failing on a mix of str and NaN.
        known_areas = sorted(set(db['area'].dropna()))
        raise ValueError(f"No data for area '{area}'. Available: {known_areas}")

    # get_indexer(method='nearest') only works on a unique, monotonic index.
    # The slice may be unsorted, and the case-insensitive match can merge
    # areas that share timestamps, so normalise it before the lookup.
    area_db = area_db[~area_db.index.duplicated(keep="first")].sort_index()

    idx = area_db.index.get_indexer([input_dt], method='nearest')[0]
    if idx == -1:
        raise RuntimeError("Could not find nearest timestamp.")

    row = area_db.iloc[idx]
    matched_time = row.name  # the row label is the matched timestamp
    fx = float(row['feature_x'])
    fy = float(row['feature_y'])
    score = score_from_features(fx, fy, model)

    return {
        "input_area": area,
        "input_time": input_dt.isoformat(),
        "matched_time": matched_time.isoformat(),
        "time_diff": str(abs(matched_time - input_dt)),
        "area_canonical": row['area'],
        "feature_x": fx,
        "feature_y": fy,
        "score": score,
    }

# Example prediction. Failures are reported as a one-line message.
try:
    if DB_FILE.exists() and MODEL_FILE.exists():
        db = pd.read_pickle(DB_FILE)
        mdl = load_model(MODEL_FILE)
        example = predict_area_time("Syntagma", "2025-07-01T13:00:00Z", db, mdl)
        # A bare expression is only echoed when it is the last top-level
        # statement of a cell; inside try/if it must be displayed explicitly.
        display(example)
    else:
        print("Run training first.")
except Exception as e:
    print(e)

### Batch scoring

In [None]:
def batch_predict(pairs, db: pd.DataFrame, model: Dict[str, float]):
    """Run ``predict_area_time`` for every ``(area, dt)`` pair.

    A pair whose prediction raises does not stop the batch: it produces a row
    holding the inputs and the error message instead.

    Args:
        pairs: Iterable of ``(area, dt)`` tuples.
        db: Database frame forwarded to ``predict_area_time``.
        model: Scoring parameters forwarded to ``predict_area_time``.

    Returns:
        DataFrame with one row per pair, in input order.
    """
    def _predict_or_error(area, dt):
        # Convert any per-pair failure into an error record.
        try:
            return predict_area_time(area, dt, db, model)
        except Exception as exc:
            return {"input_area": area, "input_time": dt, "error": str(exc)}

    records = [_predict_or_error(area, dt) for area, dt in pairs]
    return pd.DataFrame(records)

# Score a few sample (area, time) pairs and show the results as a table.
try:
    if not (DB_FILE.exists() and MODEL_FILE.exists()):
        print("Run training first.")
    else:
        db = pd.read_pickle(DB_FILE)
        mdl = load_model(MODEL_FILE)
        sample_pairs = [
            ("Syntagma", "2025-07-01T10:00:00Z"),
            ("Syntagma", "2025-07-01T18:00:00Z"),
            ("Kallithea", "2025-08-15T09:00:00Z"),
        ]
        batch_df = batch_predict(sample_pairs, db, mdl)
        display(batch_df)
except Exception as exc:
    print(exc)

## Notes

- Adjust `FIELD_MAP` to point to the numeric fields you want to use as `feature_x` and `feature_y`.
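- Tune the parameters stored in `MODEL_FILE` (`crowdness_model.json`) to change the shape and weights of the scoring function. Note that re-running the train cell overwrites this file with the defaults from `default_model()`.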
