In [28]:
import hashlib
import json
import os
import shutil
import tempfile
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
from dateutil.tz import tzutc
from git import Repo as GitRepo
from pydriller import Repository
from tqdm import tqdm

# ---------- Config ----------
# Everything a reader might want to tune lives here.
REPO_SLUG = "fastapi"
REPO_PATH = Path("../external/fastapi")  # local clone to analyse; change if needed

# Output layout: ../data/<slug>/{raw,curated,logs}
OUT_DIR = Path("../data") / REPO_SLUG
RAW_DIR = OUT_DIR / "raw"
CURATED_DIR = OUT_DIR / "curated"
LOGS_DIR = OUT_DIR / "logs"
for _out_dir in (CURATED_DIR, RAW_DIR, LOGS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# How many of the most recent tags to process (None processes every tag).
MAX_TAGS = 25

# ---------- Helpers ----------
def atomic_write_csv(df: pd.DataFrame, path: Path):
    """Write ``df`` to ``path`` as CSV (without the index) via a temp file.

    The frame is first written to a temporary ``.csv`` file created in the
    destination directory and then moved over ``path`` with ``Path.replace``,
    so readers never observe a half-written file at ``path``.

    Args:
        df: Frame to serialise; written with ``index=False``.
        path: Final destination. Missing parent directories are created.

    Raises:
        Whatever ``DataFrame.to_csv`` or the final rename raises. In that case
        the temporary file is removed and any existing ``path`` is left as is.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = None
    try:
        # Only reserve a unique name here; the handle is closed before pandas
        # opens the file by name, so the file is never open twice.
        with tempfile.NamedTemporaryFile(
            "w", delete=False, dir=str(path.parent), suffix=".csv"
        ) as tmp:
            tmp_path = Path(tmp.name)
        df.to_csv(tmp_path, index=False)
        tmp_path.replace(path)
    except BaseException:
        # Do not leave stray temp files in the output directory on failure.
        if tmp_path is not None and tmp_path.exists():
            tmp_path.unlink()
        raise

def sha256_bytes(b: bytes) -> str:
    """Return the SHA-256 digest of ``b`` as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(b)
    return digest.hexdigest()

def now_iso() -> str:
    """Return the current time in UTC as an ISO 8601 string (``+00:00`` offset)."""
    # datetime.utcnow() is deprecated since Python 3.12 and yields a naive
    # value that then has to be patched with a tzinfo; ask for an aware UTC
    # timestamp directly instead. The formatted string is unchanged.
    return datetime.now(timezone.utc).isoformat()

def get_tags_sorted(repo_path: Path):
    """List the repository's tags ordered by the commit date of their target.

    Args:
        repo_path: Path to a local git working copy.

    Returns:
        A list of ``(tag_name, commit_sha, commit_datetime)`` tuples, oldest
        first, where ``commit_datetime`` is the tagged commit's
        ``committed_datetime``.
    """
    # Use the repo as a context manager so GitPython releases its resources
    # (it keeps helper git processes alive until the Repo is closed).
    with GitRepo(str(repo_path)) as repo:
        # Resolve each tag's commit once and reuse it for sorting and output.
        resolved = [(str(tag), tag.commit) for tag in repo.tags]
        resolved.sort(key=lambda pair: pair[1].committed_datetime)  # oldest→newest
        # Materialise plain values while the repo is still open.
        return [
            (name, commit.hexsha, commit.committed_datetime)
            for name, commit in resolved
        ]

def build_versions_df(tag_tuples, repo_slug):
    """Build the ``versions`` table: one row per tag, numbered from 1.

    Args:
        tag_tuples: Iterable of ``(tag, commit_sha, commit_datetime)`` tuples.
        repo_slug: Value stored in the ``repo_slug`` column of every row.

    Returns:
        DataFrame with columns ``id`` (int64), ``repo_slug``, ``tag``,
        ``commit`` (pandas ``string`` dtype) and ``date`` (timezone-aware,
        UTC). The dtypes are the same for empty input.
    """
    records = list(tag_tuples)
    n = len(records)
    return pd.DataFrame({
        "id": pd.Series(range(1, n + 1), dtype="int64"),
        "repo_slug": pd.Series([repo_slug] * n, dtype="string"),
        "tag": pd.Series([tag for tag, _, _ in records], dtype="string"),
        "commit": pd.Series([sha for _, sha, _ in records], dtype="string"),
        # utc=True converts aware datetimes to UTC and treats naive ones as
        # already being UTC, so neither kind of input raises; it also gives
        # the column a UTC datetime dtype when there are no rows.
        "date": pd.to_datetime([dt for _, _, dt in records], utc=True),
    })

def build_tag_commit_df(tag_tuples, repo_path):
    """Build the ``commits`` table: one row per TAGGED COMMIT (not full history).

    Only the information already present in ``tag_tuples`` is used: the
    author/committer name and email columns are left missing, both date
    columns carry the tuple's datetime, and ``message`` is a synthetic
    ``"Tagged release <tag>"`` string.

    Args:
        tag_tuples: Iterable of ``(tag, commit_sha, commit_datetime)`` tuples.
        repo_path: Accepted for signature compatibility; not used here.

    Returns:
        DataFrame with pandas ``string`` columns and timezone-aware UTC
        ``authored_date`` / ``committed_date`` columns. The dtypes are the
        same for empty input.
    """
    records = list(tag_tuples)
    n = len(records)

    def text(values):
        # Nullable string column; None becomes <NA>.
        return pd.Series(values, dtype="string")

    # utc=True converts aware datetimes to UTC and treats naive ones as UTC,
    # and yields a UTC datetime dtype even when there are no rows.
    stamps = pd.to_datetime([dt for _, _, dt in records], utc=True)
    missing = [None] * n

    return pd.DataFrame({
        "repo_slug": text([REPO_SLUG] * n),
        "tag": text([tag for tag, _, _ in records]),
        "commit": text([sha for _, sha, _ in records]),
        "author_name": text(missing),
        "author_email": text(missing),
        "authored_date": stamps,
        "committer_name": text(missing),
        "committer_email": text(missing),
        "committed_date": stamps,
        "message": text([f"Tagged release {tag}" for tag, _, _ in records]),
    })

def build_files_changed_df(tag_tuples, repo_path):
    """Build the ``files_changed`` table for the tagged commits only.

    For each tag, PyDriller is asked for that single commit and one row is
    emitted per entry in the commit's ``modified_files``. This stays light
    (no full history walk).

    Args:
        tag_tuples: Iterable of ``(tag, commit_sha, commit_datetime)`` tuples;
            the datetime is ignored here.
        repo_path: Path to the local git repository.

    Returns:
        DataFrame with ``string`` columns ``repo_slug``, ``tag``, ``commit``,
        ``filename``, ``change_type`` and ``int64`` columns ``added_lines``,
        ``deleted_lines``. The dtypes are applied even when there are no rows,
        so the schema does not depend on the data.
    """
    schema = {
        "repo_slug": "string",
        "tag": "string",
        "commit": "string",
        "filename": "string",
        "change_type": "string",
        "added_lines": "int64",
        "deleted_lines": "int64",
    }
    rows = []
    for tag, sha, _ in tqdm(tag_tuples, desc="Files for tagged commits"):
        # single=<sha> restricts the traversal to exactly that commit.
        for commit in Repository(path_to_repo=str(repo_path), single=sha).traverse_commits():
            for m in commit.modified_files:
                change = m.change_type
                rows.append({
                    "repo_slug": REPO_SLUG,
                    "tag": tag,
                    "commit": commit.hash,
                    # new_path is falsy for deletions, old_path for additions.
                    "filename": m.new_path or m.old_path or m.filename,
                    # Enum members expose .name; fall back to str() otherwise.
                    "change_type": str(getattr(change, "name", change)),
                    "added_lines": int(m.added_lines or 0),
                    "deleted_lines": int(m.deleted_lines or 0),
                })
    df = pd.DataFrame(rows, columns=list(schema))
    # Cast unconditionally: an empty frame would otherwise keep object dtypes.
    return df.astype(schema)

def write_run_metadata(tag_tuples):
    """Write ``run_metadata.json`` for this extraction run into ``RAW_DIR``.

    The JSON records the repo slug, the resolved repo path, the current UTC
    timestamp, the number of tags processed and the configured tag limit.
    It is written to a temp file in the same directory and then moved into
    place, so a partially written file is never left at the final path.

    Args:
        tag_tuples: The tag tuples that were processed; only their count is
            recorded.

    Raises:
        Whatever serialisation or the final rename raises. In that case the
        temporary file is removed.
    """
    meta = {
        "repo_slug": REPO_SLUG,
        "repo_path": str(REPO_PATH.resolve()),
        "timestamp_utc": now_iso(),
        "tag_count": len(tag_tuples),
        "limit": MAX_TAGS,
        "notes": "Stage 2 extraction (tags only; single tagged commit per tag)."
    }
    out = RAW_DIR / "run_metadata.json"
    # Same guarantee as atomic_write_csv: the destination directory exists.
    out.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "w", delete=False, dir=str(out.parent), suffix=".json", encoding="utf-8"
        ) as tmp:
            tmp_path = Path(tmp.name)
            json.dump(meta, tmp, indent=2, default=str)
        tmp_path.replace(out)
    except BaseException:
        # Do not leave stray temp files next to the metadata on failure.
        if tmp_path is not None and tmp_path.exists():
            tmp_path.unlink()
        raise


In [29]:
# 1) Discover and (optionally) trim tags
tag_tuples_all = get_tags_sorted(REPO_PATH)

if MAX_TAGS is None:
    tag_tuples = tag_tuples_all
elif MAX_TAGS < 0:
    raise ValueError(f"MAX_TAGS must be None or >= 0, got {MAX_TAGS}")
else:
    # Keep the most recent N. Slice from an explicit start index rather than
    # [-MAX_TAGS:], because -0 == 0 would make MAX_TAGS = 0 select *every*
    # tag instead of none.
    tag_tuples = tag_tuples_all[max(len(tag_tuples_all) - MAX_TAGS, 0):]

In [30]:
# 2) Build dataframes
versions_df = build_versions_df(tag_tuples, REPO_SLUG)
commits_df  = build_tag_commit_df(tag_tuples, REPO_PATH)
files_df    = build_files_changed_df(tag_tuples, REPO_PATH)

# 3) Integrity checks
assert versions_df["tag"].is_unique, "Duplicate tags in versions."
assert set(commits_df["tag"]) == set(versions_df["tag"]), "Commits/versions tag mismatch."
if not files_df.empty:
    assert set(files_df["tag"]).issubset(set(versions_df["tag"])), "Files refer to unknown tags."

# 4) Save curated CSVs (atomic) and
# 5) Parquet sidecars for speed (Stage 3 reads).
# Both formats are written from the same sorted frame so their row order matches.
curated_outputs = {
    "versions": versions_df.sort_values("date"),
    "commits": commits_df.sort_values("committed_date"),
    "files_changed": files_df.sort_values(["tag", "filename"]) if not files_df.empty else files_df,
}
for stem, frame in curated_outputs.items():
    atomic_write_csv(frame, CURATED_DIR / f"{stem}.csv")
    parquet_path = CURATED_DIR / f"{stem}.parquet"
    if frame.empty:
        # No sidecar is written for an empty frame; remove one left over from
        # a previous run so Stage 3 cannot pick up stale data.
        if parquet_path.exists():
            parquet_path.unlink()
    else:
        frame.to_parquet(parquet_path, index=False)

# 6) Raw log/meta
write_run_metadata(tag_tuples)

print(f"OK ✅  Wrote {len(versions_df)} versions, {len(commits_df)} tagged commits,"
        f" {len(files_df)} file-change rows.")

Files for tagged commits: 100%|██████████| 25/25 [00:13<00:00,  1.81it/s]

OK ✅  Wrote 25 versions, 25 tagged commits, 48 file-change rows.



