In [None]:
import os
import time
from collections import Counter

import pandas as pd
import requests
from tqdm import tqdm


In [None]:
# GitHub credentials for the REST API calls made later in this notebook.
# The token is read from the GITHUB_TOKEN environment variable so a real
# secret never has to be pasted into (and committed with) the notebook.
# "INSERT_TOKEN" remains the fallback placeholder when the variable is
# unset or empty; requests sent with the placeholder are not authenticated.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or "INSERT_TOKEN"

# Headers sent with every GitHub request: token auth + v3 JSON media type.
headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

In [None]:
# Input table of PRs with their review data.
# os.path.join uses the platform separator: on Windows this is the same
# 'output_files\prs_reviews.csv' string as before, while on Linux/macOS it
# becomes a real relative path instead of a file name containing a backslash.
path_to_file = os.path.join('output_files', 'prs_reviews.csv')
closed_prs = pd.read_csv(path_to_file)

In [None]:
# Keep only rows whose 'reviews_data' string is longer than two characters
# (presumably an empty serialized list "[]" marks a PR without reviews —
# TODO confirm against the CSV producer).
# .copy() makes has_reviews an independent frame: later cells add and fill
# columns on it, and doing that on a boolean-mask slice of closed_prs raises
# SettingWithCopyWarning and may not behave as intended.
has_reviews = closed_prs[closed_prs['reviews_data'].str.len() > 2].copy()
has_reviews

In [None]:
# --- 1. Column Preparation ---
# Create the result columns up front (filled with None) when they are not
# already present, so later per-row assignments have a target.
for result_column in ('closed_by_user', 'closing_method'):
    if result_column not in has_reviews.columns:
        has_reviews[result_column] = None

# Work list: only the rows still lacking a closing user, so a re-run does
# not repeat API calls for rows that were already resolved.
still_missing = has_reviews['closed_by_user'].isna()
prs_para_processar = has_reviews.loc[still_missing].copy()

print(f"Processing {len(prs_para_processar)} PRs...")

In [None]:
# --- SMART FUNCTION WITH AUTO-SLEEP ---
def _rate_limit_wait_seconds(response):
    """Return how long to wait before retrying, or None if not rate limited.

    A response counts as rate limited only when its status is 403/429 AND it
    either carries a ``Retry-After`` header or reports
    ``X-RateLimit-Remaining: 0``. GitHub attaches ``X-RateLimit-Reset`` to
    ordinary responses too, so its mere presence on a 403 is not evidence of
    a rate limit (a plain "forbidden" must not trigger a long sleep).
    """
    if response.status_code not in (403, 429):
        return None

    retry_after = response.headers.get('Retry-After')
    if retry_after is not None:
        try:
            return max(float(retry_after), 0.0)
        except (TypeError, ValueError):
            # Non-numeric Retry-After: fall back to a one-minute pause.
            return 60.0

    reset_at = response.headers.get('X-RateLimit-Reset')
    if reset_at is not None and response.headers.get('X-RateLimit-Remaining') == '0':
        # Reset is a UTC epoch timestamp; +10s safety buffer.
        return max(int(reset_at) - time.time(), 0) + 10

    return None


def _get_with_rate_limit_retry(url, limit_label, params=None, max_retries=3):
    """GET ``url`` with the notebook's auth headers, sleeping through rate limits.

    Retries at most ``max_retries`` times (iteratively, so a persistent limit
    cannot recurse without bound) and returns the last response received.
    """
    retries_done = 0
    while True:
        response = requests.get(url, headers=headers, params=params, timeout=10)
        wait_seconds = _rate_limit_wait_seconds(response)
        if wait_seconds is None or retries_done >= max_retries:
            return response
        retries_done += 1
        print(f"\n⛔ {limit_label}! Sleeping {wait_seconds/60:.1f} min...")
        time.sleep(wait_seconds)


def _closure_from_pull(data):
    """Extract (login, method) from a pull-request payload, or None."""
    merged_by = data.get('merged_by')
    if merged_by:
        return merged_by['login'], 'Merged'

    closed_by = data.get('closed_by')
    if data.get('state') == 'closed' and closed_by:
        return closed_by['login'], ('Merged' if data.get('merged_at') else 'Closed')

    return None


def _closure_from_events(events):
    """Extract (login, method) from an issue-events list, or None.

    A 'merged' event takes priority over a 'closed' one; an event without an
    actor is reported as 'Ghost/Deleted'.
    """
    for event_name, method in (('merged', 'Merged'), ('closed', 'Closed')):
        for event in events:
            if event.get('event') == event_name:
                actor = event.get('actor')
                return (actor.get('login') if actor else 'Ghost/Deleted'), method
    return None


def get_pr_closure_info(row, max_retries=3):
    """Find out who closed or merged a pull request, and how.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'repo_url'
            (API base URL of the repository) and 'number' (the PR number).
        max_retries: Maximum number of rate-limit retries per HTTP request.

    Returns:
        A ``(login, method)`` tuple where ``method`` is 'Merged' or 'Closed',
        or ``(None, None)`` when neither endpoint yields the information or
        an error occurs (the error is printed, not raised, so a batch run
        continues with the next PR).
    """
    try:
        # ATTEMPT 1: PR Endpoint
        url_pr = f"{row['repo_url']}/pulls/{row['number']}"
        response = _get_with_rate_limit_retry(
            url_pr, "Rate Limit reached", max_retries=max_retries
        )
        if response.status_code == 200:
            closure = _closure_from_pull(response.json())
            if closure:
                return closure

        # ATTEMPT 2: Timeline Fallback
        # per_page=100 (the API maximum) instead of the default page size, so
        # the close/merge event is less likely to sit on an unfetched page.
        url_events = f"{row['repo_url']}/issues/{row['number']}/events"
        resp_events = _get_with_rate_limit_retry(
            url_events,
            "Rate Limit (Timeline)",
            params={'per_page': 100},
            max_retries=max_retries,
        )
        if resp_events.status_code == 200:
            closure = _closure_from_events(resp_events.json())
            if closure:
                return closure

    except Exception as e:
        # Best-effort per row: report and fall through to the "unknown" result.
        print(f"\nError in PR {row.get('number')}: {e}")

    return None, None

# --- LOOP EXECUTION ---
print("Starting processing with Auto-Sleep enabled...")

# Rows with no closing user yet are the only ones worth an API lookup
needs_lookup = has_reviews['closed_by_user'].isna()
prs_para_processar = has_reviews.loc[needs_lookup]

progress = tqdm(prs_para_processar.iterrows(), total=len(prs_para_processar))
for idx, row in progress:
    user, method = get_pr_closure_info(row)

    # Nothing resolved for this PR: leave its cells empty and move on
    if not user:
        continue

    # Label-based scalar writes straight into the main frame
    has_reviews.loc[idx, 'closed_by_user'] = user
    has_reviews.loc[idx, 'closing_method'] = method

print("\nFinished!")
has_reviews.head()

In [None]:
def _is_human_login(login):
    """True when the value is a string login without the '[bot]' marker."""
    if not isinstance(login, str):
        return False
    return '[bot]' not in login


# Flag rows whose closing user is a string and does not contain '[bot]'
has_reviews['has_human_closing_user'] = has_reviews['closed_by_user'].apply(_is_human_login)

# Show the frame with the new flag column
has_reviews

In [None]:
# If you want to save:
# os.path.join uses the platform separator, so this is the same
# 'output_files\prs_reviews.parquet' target on Windows and a real relative
# path (inside output_files/) on Linux/macOS instead of a file whose name
# contains a backslash.
has_reviews.to_parquet(os.path.join('output_files', 'prs_reviews.parquet'), index=False)