In [1]:
#1
# imports and paths
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put the current working directory at the front of the import path (once),
# so the `src.*` imports that follow can be resolved from it.
# NOTE(review): the data paths in this notebook are relative to "..", so confirm
# that the working directory really is the folder that contains `src`.
project_root = os.getcwd()
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Utils
from src.data_utils import parse_list_col
from src.model_utils import (
    build_priors, build_global_prior, recommend_with_backoff, recommend_reviewers, hit_at_k,
    prepare_features, evaluate_with_threshold, train_logistic_baseline, evaluate_multilabel, FeatureBuilder
)

# Location of the cleaned dataset, in both on-disk formats the loader accepts.
CLEAN_DIR = "../data/clean"
PARQ_PATH, CSV_PATH = (
    os.path.join(CLEAN_DIR, file_name)
    for file_name in ("prs_clean.parquet", "prs_clean.csv")
)


In [2]:
#2
# Load the cleaned PR dataset: Parquet is tried first, CSV is the fallback.
if os.path.exists(PARQ_PATH):
    df = pd.read_parquet(PARQ_PATH)
    print(f"Loaded Parquet ✓ rows={len(df)}")
elif os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH, parse_dates=["created_at"])
    # Re-parse the list-valued columns with parse_list_col when they are present.
    for list_col in ("reviewers_list", "labels_list"):
        if list_col in df.columns:
            df[list_col] = parse_list_col(df[list_col])
    print(f"Loaded CSV ✓ rows={len(df)} (lists parsed)")
else:
    # Neither file exists: the cleaning step has to be run first.
    raise FileNotFoundError("No cleaned dataset found. Run data_load_and_clean first.")

# Quick look at the first rows and the available columns.
display(df.head(3))
print("Columns:", list(df.columns))


Loaded Parquet ✓ rows=53


Unnamed: 0,id,number,title,author,body,created_at,state,labels,reviewers,labels_list,reviewers_list,body_clean,keyphrases
0,2801771679,55290,Fixed rows_processor in SQLInsertRowsOperator,dabla,While testing the new providers in Airflow 3.0...,2025-09-05 07:43:33+00:00,closed,area:providers;provider:common-sql,eladkal,"[area:providers, provider:common-sql]",[eladkal],While testing the new providers in Airflow 3.0...,the new providers | Airflow 3.0.6 | I | an iss...
1,2799915276,55269,Clear ti_summaries cache for a dagrun when a t...,tirkarthi,closes #55251,2025-09-04 15:35:43+00:00,closed,area:UI,pierrejeambrun,[area:ui],[pierrejeambrun],closes #55251,
2,2799094678,55261,Display the correct shared project while runni...,amoghrajesh,"<img width=""1179"" height=""549"" alt=""image"" src...",2025-09-04 11:43:18+00:00,closed,area:dev-tools;backport-to-v3-0-test,potiuk,"[area:dev-tools, backport-to-v3-0-test]",[potiuk],That isn't right ^ --- ^ Add meaningful descri...,That | meaningful description | more informati...


Columns: ['id', 'number', 'title', 'author', 'body', 'created_at', 'state', 'labels', 'reviewers', 'labels_list', 'reviewers_list', 'body_clean', 'keyphrases']


In [3]:
#3
# Chronological train/test split.
# Parse timestamps as UTC (unparseable values become NaT and are dropped),
# then order the PRs from oldest to newest with a fresh 0..n-1 index.
df = (
    df.assign(created_at=pd.to_datetime(df["created_at"], utc=True, errors="coerce"))
    .dropna(subset=["created_at"])
    .sort_values("created_at")
    .reset_index(drop=True)
)

# The oldest 80% of PRs form the training set; the newest 20% form the test set.
cut_idx = int(0.8 * len(df))
df_tr, df_te = df.iloc[:cut_idx].copy(), df.iloc[cut_idx:].copy()

def set_of_reviewers(frame: pd.DataFrame) -> set:
    """Collect every distinct reviewer named in ``frame["reviewers_list"]``.

    Each cell may be a list, a tuple or a NumPy array of reviewer names (list
    columns come back as arrays after a Parquet round-trip). Cells holding
    ``None`` or a float placeholder such as NaN are treated as "no reviewers".

    Parameters
    ----------
    frame : pd.DataFrame
        Frame with a ``reviewers_list`` column.

    Returns
    -------
    set
        The union of all reviewer names found in the column.
    """
    reviewers = set()
    for entry in frame["reviewers_list"]:
        # Do not truth-test the cell (`entry or []`): a NumPy array with more than
        # one element raises "truth value of an array ... is ambiguous".
        if entry is None or isinstance(entry, float):
            continue
        reviewers.update(entry)
    return reviewers

# Reviewer populations on each side of the split, and the test-only reviewers
# (people who never appear in the training PRs).
tr_revs, te_revs = set_of_reviewers(df_tr), set_of_reviewers(df_te)
unseen = te_revs.difference(tr_revs)

print(f"Train PRs: {len(df_tr)} | Test PRs: {len(df_te)}")
print(f"Unique reviewers — train: {len(tr_revs)} | test: {len(te_revs)}")
# max(1, ...) keeps the percentage defined when the test split has no reviewers.
print(f"Unseen reviewers in test: {len(unseen)} ({(len(unseen)/max(1,len(te_revs))):.1%})")


Train PRs: 42 | Test PRs: 11
Unique reviewers — train: 12 | test: 8
Unseen reviewers in test: 2 (25.0%)


In [4]:
#4
# Heuristic baseline: priors are estimated on the training PRs only and then
# scored with Hit@K on the held-out PRs, passing the global prior as `p_global`.
pA_tr, pL_tr = build_priors(df_tr)
pG_tr = build_global_prior(df_tr)

# Equal weighting of the author-based and label-based priors.
heuristic_kwargs = dict(p_global=pG_tr, w_author=0.5, w_labels=0.5)

for K in (1, 3, 5, 10):
    score = hit_at_k(df_te, pA_tr, pL_tr, k=K, **heuristic_kwargs)
    print(f"Heuristic Hit@{K}: {score:.3f}")


Heuristic Hit@1: 0.364
Heuristic Hit@3: 0.455
Heuristic Hit@5: 0.455
Heuristic Hit@10: 0.545


In [5]:
#5
# Single-PR recommendation: take the first test PR (or the first training PR
# when the test split is empty) and compare both recommender entry points.
row0 = (df_te if len(df_te) else df_tr).iloc[0]
author, labels = row0["author"], row0["labels_list"]

print("Author:", author)
print("Labels:", labels)
# Same priors and k for both calls; only the backoff variant receives the global prior.
print("New API:", recommend_with_backoff(author, labels, pA_tr, pL_tr, pG_tr, k=5))
print("Legacy  :", recommend_reviewers(author, labels, pA_tr, pL_tr, k=5))


Author: uranusjr
Labels: ['kind:documentation' 'area:task-sdk']
New API: ['potiuk', 'jedcunningham', 'jason810496', 'kaxil']
Legacy  : ['potiuk', 'jedcunningham', 'jason810496', 'kaxil']


In [6]:
#6
# Build the feature/target matrices with one FeatureBuilder shared by both splits.
# The builder is fitted on the FULL frame so the reviewer label space covers
# reviewers that only appear in the test PRs.
# NOTE(review): this also exposes test-set text to whatever vocabulary / TF-IDF
# statistics `fit` learns — confirm that is acceptable for the reported metrics.
fb = FeatureBuilder(use_body=True, use_keyphrases=True)
fb.fit(df)

# Train is transformed first, then test, both with the fitted builder.
(X_tr, Y_tr), (X_te, Y_te) = fb.transform(df_tr), fb.transform(df_te)

print("X_tr:", X_tr.shape, "X_te:", X_te.shape)
print("Y_tr:", Y_tr.shape, "Y_te:", Y_te.shape)
# Targets must stay row-aligned with the test frame they were derived from.
assert Y_te.index.equals(df_te.index)


X_tr: (42, 420) X_te: (11, 420)
Y_tr: (42, 14) Y_te: (11, 14)


In [7]:
# 7
# Train the logistic-regression baseline on the training matrices and score it
# on the held-out PRs with the evaluation helper's built-in threshold.
clf = train_logistic_baseline(X_tr, Y_tr)
metrics = evaluate_multilabel(clf, X_te, Y_te)  # threshold is fixed inside utils (0.2 per the original note — TODO confirm it still matches the label printed below)
print("Baseline @threshold=0.2:", metrics)

Baseline @threshold=0.2: {'micro_f1': 0.21739130434782608, 'micro_precision': 0.13513513513513514, 'micro_recall': 0.5555555555555556}


In [8]:
#8
# Sweep the decision threshold to expose the precision/recall trade-off.
# NOTE(review): the sweep is scored on the test split, so a threshold picked
# from it is tuned on the evaluation data.
for t in (0.05, 0.1, 0.2, 0.3):
    sweep_metrics = evaluate_with_threshold(clf, X_te, Y_te, thr=t)
    print(t, sweep_metrics)

0.05 {'micro_f1': 0.140625, 'micro_precision': 0.07563025210084033, 'micro_recall': 1.0}
0.1 {'micro_f1': 0.13953488372093023, 'micro_precision': 0.07792207792207792, 'micro_recall': 0.6666666666666666}
0.2 {'micro_f1': 0.21739130434782608, 'micro_precision': 0.13513513513513514, 'micro_recall': 0.5555555555555556}
0.3 {'micro_f1': 0.3225806451612903, 'micro_precision': 0.22727272727272727, 'micro_recall': 0.5555555555555556}


In [9]:
#9
# Summary: gather the headline metrics into one dict (metric name -> value
# rounded to 3 decimals).

# Heuristic Hit@K on the test PRs, using the training priors.
summary = {
    f"Heuristic_Hit@{K}": round(hit_at_k(df_te, pA_tr, pL_tr, k=K, p_global=pG_tr), 3)
    for K in (1, 3, 5, 10)
}

# Thresholded ML metrics.
# NOTE(review): 0.30 is the value that scored best in the threshold sweep on this
# same test split, so these numbers are likely optimistic.
thr_best = 0.30
ml_metrics = evaluate_with_threshold(clf, X_te, Y_te, thr=thr_best)
summary.update({f"ML@thr={thr_best}_{k}": round(v, 3) for k, v in ml_metrics.items()})

# Per-class probabilities for top-K decoding: one column per element returned by
# predict_proba, taking column 1 of 2-D elements and using 1-D elements as-is.
raw_probs = clf.predict_proba(X_te)
probs = np.column_stack([p[:, 1] if p.ndim == 2 else p for p in raw_probs])
class_names = clf.output_labels_

def hit_at_k_from_probs(Y_true, probs, class_names, k):
    """Fraction of rows whose top-``k`` scored classes contain a true label.

    Parameters
    ----------
    Y_true : pandas.DataFrame
        Indicator frame; a truthy cell marks the column label as true for that row.
    probs : array-like, indexable by row position
        Per-row class scores; position ``j`` in a row corresponds to ``class_names[j]``.
    class_names : sequence
        Class label for each score position.
    k : int
        Number of highest-scoring classes treated as the prediction.

    Returns
    -------
    float
        Hits divided by the number of rows; ``0.0`` when there are no rows.
        Rows without any true label stay in the denominator and never count as hits.
    """
    n_rows = Y_true.shape[0]
    if n_rows == 0:
        return 0.0

    n_hits = 0
    for row_pos in range(n_rows):
        row = Y_true.iloc[row_pos]
        actual = set(row.index[row.values.astype(bool)])
        if not actual:
            # Nothing to hit for this row, but it still counts in the denominator.
            continue
        ranked = np.argsort(-probs[row_pos])  # highest score first
        suggested = {class_names[col] for col in ranked[:k]}
        n_hits += bool(actual & suggested)
    return n_hits / n_rows

# ML Hit@K: decode the K highest-probability reviewers per test PR and record
# how often at least one true reviewer is among them.
for K in (1, 3, 5, 10):
    summary[f"ML_Top{K}_Hit@{K}"] = round(hit_at_k_from_probs(Y_te, probs, class_names, K), 3)

# Render the collected metrics once as a two-column table (Metric, Value).
# The cell previously held a second, verbatim copy of the summary code, which
# recomputed every metric, redefined hit_at_k_from_probs and displayed the same
# table twice. It also re-imported pandas, which the imports cell already provides.
summary_df = pd.DataFrame(summary, index=[0]).T.reset_index()
summary_df.columns = ["Metric", "Value"]
display(summary_df)


Unnamed: 0,Metric,Value
0,Heuristic_Hit@1,0.364
1,Heuristic_Hit@3,0.455
2,Heuristic_Hit@5,0.455
3,Heuristic_Hit@10,0.545
4,ML@thr=0.3_micro_f1,0.323
5,ML@thr=0.3_micro_precision,0.227
6,ML@thr=0.3_micro_recall,0.556
7,ML_Top1_Hit@1,0.0
8,ML_Top3_Hit@3,0.273
9,ML_Top5_Hit@5,0.364


Unnamed: 0,Metric,Value
0,Heuristic_Hit@1,0.364
1,Heuristic_Hit@3,0.455
2,Heuristic_Hit@5,0.455
3,Heuristic_Hit@10,0.545
4,ML@thr=0.3_micro_f1,0.323
5,ML@thr=0.3_micro_precision,0.227
6,ML@thr=0.3_micro_recall,0.556
7,ML_Top1_Hit@1,0.0
8,ML_Top3_Hit@3,0.273
9,ML_Top5_Hit@5,0.364
