In [None]:
from pydriller import Repository
import pandas as pd, re
from pathlib import Path

# 0) Flask checkout to mine; the default assumes the clone lives at ~/flask
repo_path = str((Path.home() / "flask").resolve())

# 1) simple keyword rule to tag bug-fix commits
# Keywords are matched as whole words (plus common inflections such as
# "fixes", "fixed", "bugfix", "patched", "errors") so that unrelated words
# that merely contain a keyword -- "prefix", "debug", "dispatch",
# "errorhandler" -- do not get a commit labelled as a bug fix.
BUG_PATTERN = re.compile(
    r"\b((?:bug|hot)?fix(?:e[sd]|ing)?|bugs?|issues?|patch(?:e[sd]|ing)?|errors?)\b",
    re.IGNORECASE,
)

# helpers for different PyDriller versions
def added(m):
    """Return the number of lines added in modified file *m*.

    Reads ``m.added`` when that attribute exists, otherwise ``m.added_lines``.
    A missing attribute or a falsy value (such as ``None``) is reported as 0.
    """
    fallback = getattr(m, "added_lines", 0)
    count = getattr(m, "added", fallback)
    return count if count else 0
def removed(m):
    """Return the number of lines removed in modified file *m*.

    Reads ``m.removed`` when that attribute exists, otherwise
    ``m.deleted_lines``. A missing attribute or a falsy value (such as
    ``None``) is reported as 0.
    """
    fallback = getattr(m, "deleted_lines", 0)
    count = getattr(m, "removed", fallback)
    return count if count else 0

# 2) walk the history and record one feature row per commit, stopping after
#    300 commits to keep the run fast
rows = []
history = Repository(repo_path).traverse_commits()
for seen, commit in enumerate(history, start=1):
    message = commit.msg or ""              # missing message -> empty string
    touched = commit.modified_files or []   # missing file list -> no files
    row = {
        "commit_hash": commit.hash,
        "files_changed": len(touched),
        "lines_added": sum(map(added, touched)),
        "lines_deleted": sum(map(removed, touched)),
        "message_length": len(message),
        # 1 when the message matches the bug-fix keyword rule, else 0
        "label_bugfix": int(BUG_PATTERN.search(message) is not None),
    }
    rows.append(row)

    # progress report every 50 commits
    if seen % 50 == 0:
        print(f"Processed {seen} commits...")
    if seen >= 300:
        break

# Build the feature table: one row per collected commit.
df = pd.DataFrame(rows)

# 3) save to ./data/commits_small.csv, relative to the current working
#    directory (i.e. /Projects/bug-risk/data when run from that project root)
Path("data").mkdir(exist_ok=True)
out = Path("data/commits_small.csv")
df.to_csv(out, index=False)

# Report the absolute output path and row count. The check mark was stored
# as mis-decoded bytes ("âœ…") and is restored to the intended character.
print("✅ Saved:", out.resolve(), "rows:", len(df))
df.head()  # notebook preview of the first rows
