In [1]:
"""
Stage 4 — Evolution Differencing between successive tags.

Inputs (from Stage 3; a same-named .parquet file is read instead when it exists):
  data/<repo_slug>/curated/versions.csv
  data/<repo_slug>/curated/modules.csv
  data/<repo_slug>/curated/edges.csv
  data/<repo_slug>/curated/metrics.csv

Outputs (curated):
  changes_modules.csv  (version_from,version_to,tag_from,tag_to, change_type, module)
  changes_edges.csv    (version_from,version_to,tag_from,tag_to, change_type, src_module, dst_module)
  changes_metrics.csv  (version_from,version_to,tag_from,tag_to, module, fan_in_delta, fan_out_delta, cyclomatic_delta, centrality_delta)
  drift_summary.csv    (per tag pair rollups: counts + cycles + spike indicators)
"""
from __future__ import annotations
from pathlib import Path
from dataclasses import dataclass
from typing import Tuple, Set, Dict, List
import pandas as pd
import networkx as nx
from tqdm import tqdm

# ------------------- CONFIG -------------------
# Repository whose curated tables are read; the same directory receives the Stage 4 outputs.
REPO_SLUG = "fastapi"
# Relative path, so it is resolved against the kernel's working directory at run time.
CURATED_DIR = Path(f"../data/{REPO_SLUG}/curated")

# Spike thresholds (tune as needed)
# metrics_spike_flags raises a flag when EITHER the absolute OR the relative threshold is met;
# the relative test is only applied when the previous value is > 0.
FAN_IN_SPIKE_ABS = 10         # absolute increase
FAN_IN_SPIKE_REL = 0.5        # +50% relative
CYCLO_SPIKE_ABS = 10          # absolute cyclomatic increase
CYCLO_SPIKE_REL = 0.5         # +50% relative
# NOTE(review): not referenced by any cell visible here; cycle changes are approximated via SCCs instead.
MAX_CYCLE_ENUM = 1000         # cap on cycles enumeration to keep runtime sane

In [2]:
# ------------------- IO -------------------
def load_df(name: str) -> pd.DataFrame:
    """Read the curated table called ``name`` from ``CURATED_DIR``.

    ``<name>.parquet`` is used when that file exists; otherwise
    ``<name>.csv`` is read.
    """
    parquet_file = CURATED_DIR / (name + ".parquet")
    if not parquet_file.exists():
        return pd.read_csv(CURATED_DIR / (name + ".csv"))
    return pd.read_parquet(parquet_file)

def save_csv(df: pd.DataFrame, name: str):
    """Write ``df`` as CSV (index column omitted) to ``CURATED_DIR / name``.

    Returns the path that was written.
    """
    destination = CURATED_DIR.joinpath(name)
    df.to_csv(destination, index=False)
    return destination

# ------------------- UTILS -------------------
def build_graph(edges_df: pd.DataFrame) -> nx.DiGraph:
    """Build a directed graph with one edge per (src_module, dst_module) row.

    An empty frame produces an empty graph; the columns are not touched in
    that case.
    """
    graph = nx.DiGraph()
    if edges_df.empty:
        return graph
    # Pair the two columns positionally: one (source, target) tuple per row.
    edge_pairs = zip(edges_df["src_module"], edges_df["dst_module"])
    graph.add_edges_from(edge_pairs)
    return graph

def count_cycles_delta(G1: nx.DiGraph, G2: nx.DiGraph) -> Tuple[int,int]:
    """
    Compare the cyclic regions of two graphs, using SCCs as a proxy for cycles.

    Every strongly-connected component with more than one node counts as one
    "cycle-like" group and is identified by its exact node set, so a component
    that gains or loses a node is reported as one removed and one new group.
    Single-node components are ignored.

    Returns (#groups present in G2 but not in G1, #groups present in G1 but not in G2).
    """
    def cyclic_groups(graph):
        groups = set()
        for component in nx.strongly_connected_components(graph):
            if len(component) > 1:
                groups.add(frozenset(component))
        return groups

    before = cyclic_groups(G1)
    after = cyclic_groups(G2)
    return len(after - before), len(before - after)

def metrics_spike_flags(prev: pd.Series, curr: pd.Series) -> Dict[str, bool]:
    """Flag fan-in and cyclomatic spikes between two metric rows of one module.

    A metric spikes when its increase reaches the absolute threshold, or when
    the previous value is positive and the relative increase reaches the
    relative threshold. Metrics missing from a row are read as 0.

    Returns a dict with the keys "fan_in_spike" and "cyclomatic_spike".
    """
    def spiked(before, after, abs_threshold, rel_threshold):
        increase = after - before
        # The `before > 0` guard keeps the division away from a zero baseline.
        return (increase >= abs_threshold) or (before > 0 and increase / before >= rel_threshold)

    return {
        "fan_in_spike": spiked(
            prev.get("fan_in", 0), curr.get("fan_in", 0),
            FAN_IN_SPIKE_ABS, FAN_IN_SPIKE_REL,
        ),
        "cyclomatic_spike": spiked(
            prev.get("cyclomatic", 0), curr.get("cyclomatic", 0),
            CYCLO_SPIKE_ABS, CYCLO_SPIKE_REL,
        ),
    }


In [3]:
# ------------------- MAIN -------------------
# Load the Stage 3 tables; versions are ordered by date so that neighbouring
# rows are successive tags.
versions = load_df("versions").sort_values("date").reset_index(drop=True)
modules, edges, metrics = (load_df(table) for table in ("modules", "edges", "metrics"))

# Row accumulators, one list per output table
changes_modules_rows, changes_edges_rows = [], []
changes_metrics_rows, drift_summary_rows = [], []

# Each version row paired with the row that follows it
version_rows = list(versions.itertuples(index=False))
pairs = list(zip(version_rows[:-1], version_rows[1:]))


In [None]:
# Reset the accumulators in this cell so that re-running it does not append a
# second copy of every row to lists created by an earlier cell.
changes_modules_rows = []
changes_edges_rows   = []
changes_metrics_rows = []
drift_summary_rows   = []

# Stand-in for "module has no metrics row for this tag": every .get() on it
# falls back to the supplied default. The explicit dtype avoids the pandas 1.x
# deprecation warning raised by a bare pd.Series().
empty_metrics = pd.Series(dtype="float64")

for prev_row, curr_row in tqdm(pairs, desc="Stage 4 — tag pairs"):
    v0, v1 = int(prev_row.id), int(curr_row.id)
    t0, t1 = str(prev_row.tag), str(curr_row.tag)

    # Columns shared by every row emitted for this tag pair
    pair_key = {"version_from": v0, "version_to": v1, "tag_from": t0, "tag_to": t1}

    # Slice per tag (read-only views are enough; nothing below mutates them)
    m0 = modules[modules["tag"] == t0]
    m1 = modules[modules["tag"] == t1]
    e0 = edges[edges["tag"] == t0]
    e1 = edges[edges["tag"] == t1]
    # NOTE(review): assumes at most one metrics row per (tag, module); with duplicates
    # .loc[mod] would return a DataFrame instead of a Series. TODO confirm upstream.
    x0 = metrics[metrics["tag"] == t0].set_index("module")
    x1 = metrics[metrics["tag"] == t1].set_index("module")

    # 1) Modules added/removed
    set_m0 = set(m0["module"])
    set_m1 = set(m1["module"])
    added_modules   = sorted(set_m1 - set_m0)
    removed_modules = sorted(set_m0 - set_m1)

    for mod in added_modules:
        changes_modules_rows.append({**pair_key, "change_type": "module_added", "module": mod})
    for mod in removed_modules:
        changes_modules_rows.append({**pair_key, "change_type": "module_removed", "module": mod})

    # 2) Edges added/removed
    set_e0 = set(e0[["src_module","dst_module"]].itertuples(index=False, name=None))
    set_e1 = set(e1[["src_module","dst_module"]].itertuples(index=False, name=None))
    added_edges   = sorted(set_e1 - set_e0)
    removed_edges = sorted(set_e0 - set_e1)

    for s, d in added_edges:
        changes_edges_rows.append({**pair_key, "change_type": "edge_added", "src_module": s, "dst_module": d})
    for s, d in removed_edges:
        changes_edges_rows.append({**pair_key, "change_type": "edge_removed", "src_module": s, "dst_module": d})

    # 3) Metrics deltas and spike counts (only modules present in both tags).
    #    One pass: each module's metric rows are looked up once and reused.
    common = sorted(set_m0 & set_m1)
    spikes_fan_in = 0
    spikes_cyclo  = 0
    for mod in common:
        prev_metrics = x0.loc[mod] if mod in x0.index else empty_metrics
        curr_metrics = x1.loc[mod] if mod in x1.index else empty_metrics

        changes_metrics_rows.append({
            **pair_key, "module": mod,
            "fan_in_delta":     int(curr_metrics.get("fan_in", 0)) - int(prev_metrics.get("fan_in", 0)),
            "fan_out_delta":    int(curr_metrics.get("fan_out", 0)) - int(prev_metrics.get("fan_out", 0)),
            "cyclomatic_delta": int(curr_metrics.get("cyclomatic", 0)) - int(prev_metrics.get("cyclomatic", 0)),
            "centrality_delta": float(curr_metrics.get("centrality_degree", 0.0)) - float(prev_metrics.get("centrality_degree", 0.0)),
        })

        flags = metrics_spike_flags(prev_metrics, curr_metrics)
        spikes_fan_in += int(flags["fan_in_spike"])
        spikes_cyclo  += int(flags["cyclomatic_spike"])

    # 4) Cycle/Drift summary (approx, via strongly-connected components)
    G0 = build_graph(e0)
    G1 = build_graph(e1)
    new_cycles, removed_cycles = count_cycles_delta(G0, G1)

    drift_summary_rows.append({
        **pair_key,
        "modules_added": len(added_modules),
        "modules_removed": len(removed_modules),
        "edges_added": len(added_edges),
        "edges_removed": len(removed_edges),
        "new_cycles": new_cycles,
        "removed_cycles": removed_cycles,
        "modules_with_fan_in_spike": spikes_fan_in,
        "modules_with_cyclomatic_spike": spikes_cyclo
    })

Stage 4 — tag pairs: 100%|██████████| 24/24 [00:01<00:00, 12.26it/s]


In [6]:
# Turn the accumulated rows into frames with a fixed column order, then persist them.
pair_cols = ["version_from","version_to","tag_from","tag_to"]

changes_modules = pd.DataFrame(
    changes_modules_rows,
    columns=pair_cols + ["change_type","module"],
)
changes_edges = pd.DataFrame(
    changes_edges_rows,
    columns=pair_cols + ["change_type","src_module","dst_module"],
)
changes_metrics = pd.DataFrame(
    changes_metrics_rows,
    columns=pair_cols + ["module","fan_in_delta","fan_out_delta","cyclomatic_delta","centrality_delta"],
)
drift_summary = pd.DataFrame(
    drift_summary_rows,
    columns=pair_cols + [
        "modules_added","modules_removed","edges_added","edges_removed",
        "new_cycles","removed_cycles",
        "modules_with_fan_in_spike","modules_with_cyclomatic_spike",
    ],
)

# File name -> frame, in the order the files are written and reported
outputs = {
    "changes_modules.csv": changes_modules,
    "changes_edges.csv": changes_edges,
    "changes_metrics.csv": changes_metrics,
    "drift_summary.csv": drift_summary,
}
for filename, frame in outputs.items():
    save_csv(frame, filename)

print("OK ✅ Stage 4 complete")
for filename in outputs:
    print(f" - {filename}")

OK ✅ Stage 4 complete
 - changes_modules.csv
 - changes_edges.csv
 - changes_metrics.csv
 - drift_summary.csv
