In [1]:
import pandas as pd
import requests
import time
import os
from tqdm import tqdm

# Enable tqdm for pandas
tqdm.pandas()

In [None]:
# =============================================================================
# CONFIGURATION: GITHUB TOKEN
# =============================================================================
# Read the token from the GITHUB_TOKEN environment variable so that the secret
# never has to be written into (and accidentally committed with) the notebook.
# The "INSERT-TOKEN" placeholder is kept as the fallback value.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip() or "INSERT-TOKEN"

if GITHUB_TOKEN == "INSERT-TOKEN":
    # Warn early: with the placeholder, the API calls further down cannot authenticate.
    print("⚠️ GITHUB_TOKEN is not set. Export it in your environment before running the API cells.")

# Headers sent with every GitHub API request made by this notebook.
headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}


In [18]:
# =============================================================================
# LOAD DATA
# =============================================================================

# Reading from the output of Step 4
path_to_file = 'output_files/prs_reviews.csv'

if not os.path.exists(path_to_file):
    print(f"❌ Error: File not found at {path_to_file}")
    print("Please check if you ran the previous step (Step 4) successfully.")
    # Stop here: continuing would leave `closed_prs` undefined (or stale from an
    # earlier run of this kernel) and the following cells would fail or mislead.
    raise FileNotFoundError(path_to_file)

closed_prs = pd.read_csv(path_to_file)
print(f"✅ File loaded: {len(closed_prs)} rows.")

✅ File loaded: 9052 rows.


In [19]:
# =============================================================================
# FILTER
# =============================================================================

# Keep only PRs that actually have review data.
# 'reviews_data' is compared by string length: an empty list "[]" has length 2,
# so anything longer carries at least one review. Missing values compare False.
has_review_data = closed_prs['reviews_data'].str.len() > 2

# Drop PRs whose 'agent' is 'Human'.
is_agent_pr = closed_prs['agent'] != 'Human'

# Apply both conditions in one selection and copy ONCE at the end, so that
# `has_reviews` is an independent frame. (Copying before a second boolean
# filter would make the final result a derived selection again and bring back
# the "SettingWithCopyWarning" when new columns are assigned later.)
has_reviews = closed_prs[has_review_data & is_agent_pr].copy()

print(f"PRs with reviews found: {len(has_reviews)}")
has_reviews.head(2)

PRs with reviews found: 2230


Unnamed: 0,id,number,user,user_id,agent,repo_url,html_url,reviews_url,reviews_data,review_counts_map
1674,3214876564,5490,wtfsayo,82053242,Claude_Code,https://api.github.com/repos/elizaOS/eliza,https://github.com/elizaOS/eliza/pull/5490,https://api.github.com/repos/elizaOS/eliza/pul...,[{'user': 'copilot-pull-request-reviewer[bot]'...,"{'copilot-pull-request-reviewer[bot]': 3, 'cur..."
1676,3164503419,40,hjanuschka,2891702,Claude_Code,https://api.github.com/repos/amantus-ai/vibetu...,https://github.com/amantus-ai/vibetunnel/pull/40,https://api.github.com/repos/amantus-ai/vibetu...,"[{'user': 'coderabbitai[bot]', 'state': 'COMME...",{'coderabbitai[bot]': 2}


In [20]:
# --- 1. Column Preparation ---
# Create the two result columns up front, but only when they are absent,
# so that values already present in the frame are never overwritten.
for result_column in ('closed_by_user', 'closing_method'):
    if result_column not in has_reviews.columns:
        has_reviews[result_column] = None

# Rows without a recorded closer are the only ones that still need an API call
# (this keeps re-runs cheap).
still_missing = has_reviews['closed_by_user'].isna()
prs_to_process = has_reviews[still_missing]

print(f"Processing {len(prs_to_process)} pending PRs...")

Processing 2230 pending PRs...


In [21]:
# =============================================================================
# API EXECUTION
# =============================================================================

def _rate_limit_wait(response):
    """
    Return the number of seconds to wait before retrying the request that
    produced `response`, or None when the response is not a rate-limit rejection.

    A 403/429 only counts as a rate limit when it carries a numeric
    `Retry-After` header, or when `X-RateLimit-Remaining` is '0' and
    `X-RateLimit-Reset` is present. The mere presence of `X-RateLimit-Reset`
    is not sufficient, because a plain "forbidden" answer can carry it too.
    """
    if response.status_code not in (403, 429):
        return None

    retry_after = response.headers.get('Retry-After')
    if retry_after is not None and str(retry_after).isdigit():
        return int(retry_after) + 1  # +1s safety buffer

    if (response.headers.get('X-RateLimit-Remaining') == '0'
            and 'X-RateLimit-Reset' in response.headers):
        reset_time = int(response.headers['X-RateLimit-Reset'])
        return reset_time - time.time() + 10  # +10s safety buffer

    return None


def _github_get(url, label, params=None, max_retries=3):
    """
    GET `url` with the notebook-level `headers`, sleeping through rate limits.

    The request is retried at most `max_retries` times; after that the last
    response is returned as-is so the caller can never loop forever.
    `label` is only used in the progress message printed before sleeping.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        response = requests.get(url, headers=headers, params=params, timeout=10)
        wait = _rate_limit_wait(response)
        if wait is None or attempt == attempts - 1:
            return response
        if wait > 0:
            print(f"\n⛔ Rate Limit{label}! Sleeping {wait/60:.1f} min...")
            time.sleep(wait)
        # wait <= 0 means the reset moment has already passed: retry right away.


def get_pr_closure_info(row):
    """
    Fetches who closed the PR and how (Merged vs Closed).
    Handles Rate Limits automatically by sleeping (bounded number of retries).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'repo_url' (API URL of the repository) and 'number'
        (the PR number); `row.get('number')` is used for error reporting.

    Returns
    -------
    tuple
        (login, method) where method is 'Merged' or 'Closed'. The login is
        'Ghost/Deleted' when the matching event has no actor.
        (None, None) when nothing could be determined or a request failed.

    Notes
    -----
    Exceptions are not propagated: they are printed and (None, None) is
    returned, so one failing PR does not abort a long batch run.
    """
    url_pr = f"{row['repo_url']}/pulls/{row['number']}"

    try:
        # ATTEMPT 1: PR Endpoint
        response = _github_get(url_pr, " reached")

        if response.status_code == 200:
            data = response.json()

            # Case 1: Merged by someone
            merged_by = data.get('merged_by')
            if merged_by:
                return merged_by['login'], 'Merged'

            # Case 2: Closed manually without merge
            # NOTE(review): the PR payload may not carry 'closed_by' at all; in
            # that case this branch is skipped and the events fallback decides.
            closed_by = data.get('closed_by')
            if data.get('state') == 'closed' and closed_by:
                # Sometimes it says closed but was actually merged (double check)
                method = 'Merged' if data.get('merged_at') else 'Closed'
                return closed_by['login'], method

        # ATTEMPT 2: Timeline Fallback (if PR endpoint is ambiguous or missing data)
        url_events = f"{row['repo_url']}/issues/{row['number']}/events"
        # Ask for the largest page so close/merge events are less likely to be
        # cut off on PRs with many events (only the first page is inspected).
        resp_events = _github_get(url_events, " (Timeline)", params={'per_page': 100})

        if resp_events.status_code == 200:
            events = resp_events.json()
            # A merge event takes precedence over a plain close event.
            for wanted, method in (('merged', 'Merged'), ('closed', 'Closed')):
                for event in events:
                    if event.get('event') == wanted:
                        actor = event.get('actor')
                        return (actor.get('login') if actor else 'Ghost/Deleted'), method

    except Exception as e:
        print(f"\nError in PR {row.get('number')}: {e}")

    return None, None

# --- EXECUTION LOOP ---
print("Starting processing...")

# iterrows() yields (index label, row); the label addresses the same row in
# has_reviews, so results are written back by label rather than by position.
pending_rows = prs_to_process.iterrows()
for row_label, pr_row in tqdm(pending_rows, total=len(prs_to_process)):
    closer, how_closed = get_pr_closure_info(pr_row)

    # Nothing resolved for this PR: leave both result columns untouched.
    if not closer:
        continue

    has_reviews.loc[row_label, 'closed_by_user'] = closer
    has_reviews.loc[row_label, 'closing_method'] = how_closed

print("\nFinished!")

Starting processing...


100%|███████████████████████████████████████| 2230/2230 [36:34<00:00,  1.02it/s]


Finished!





In [22]:
def _closed_by_human(login):
    """Return True only for a string login that lacks the '[bot]' marker."""
    if not isinstance(login, str):
        # Non-string values (e.g. a missing closer) never count as human.
        return False
    return '[bot]' not in login


# Flag every PR whose recorded closer is a non-bot login
has_reviews['has_human_closing_user'] = has_reviews['closed_by_user'].apply(_closed_by_human)

# Show the first rows of the relevant columns as a sanity check
preview_columns = ['number', 'closed_by_user', 'closing_method', 'has_human_closing_user']
has_reviews[preview_columns].head()

Unnamed: 0,number,closed_by_user,closing_method,has_human_closing_user
1674,5490,wtfsayo,Merged,True
1676,40,steipete,Closed,True
1681,1727,sanity,Merged,True
1682,5828,jdx,Merged,True
1685,86,mark14wu,Merged,True


In [23]:
# =============================================================================
# SAVE RESULTS
# =============================================================================

output_parquet = 'output_files/prs_reviews_with_closure.parquet'
output_csv = 'output_files/prs_reviews_with_closure.csv'

# CSV first: it acts as the safety copy and is written without the index
has_reviews.to_csv(output_csv, index=False)
print(f"✅ CSV saved to: {output_csv}")

# Parquet is the main file for the next steps; a failure here is reported
# but does not stop the notebook, since the CSV copy already exists.
try:
    has_reviews.to_parquet(output_parquet, index=False)
except Exception as parquet_error:
    print(f"⚠️ Error saving Parquet: {parquet_error}")
else:
    print(f"✅ Parquet saved to: {output_parquet}")

✅ CSV saved to: output_files/prs_reviews_with_closure.csv
✅ Parquet saved to: output_files/prs_reviews_with_closure.parquet
