In [8]:
import pandas as pd
import requests
import time
import os
from tqdm import tqdm
from collections import Counter

# Enable tqdm for pandas to show progress bars
tqdm.pandas()

In [None]:
# =============================================================================
# CONFIGURATION: GITHUB TOKEN
# =============================================================================
# The GitHub Personal Access Token is read from the GITHUB_TOKEN environment
# variable so that a real credential never has to be saved inside the notebook.
# If the variable is unset (or empty) the 'INSERT-TOKEN' placeholder is used;
# you may still replace that placeholder by hand, but do not commit the result.
# Without a valid token the API calls below will be rejected or rate-limited.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or "INSERT-TOKEN"

if GITHUB_TOKEN == "INSERT-TOKEN":
    # Warn loudly: the fetch helper turns failed requests into empty results,
    # so a forgotten token would otherwise go unnoticed.
    print("Warning: GITHUB_TOKEN is not set; API requests will use a placeholder token and fail.")

# Headers sent with every GitHub API request made by this notebook.
headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

In [3]:
# =============================================================================
# LOAD DATA
# =============================================================================

# Location of the input CSV. A relative path is resolved against the notebook's
# working directory; expanduser() additionally allows a '~'-prefixed path.
path_to_file = os.path.expanduser('output_files/fix_prs_revision.csv')

# Fail fast: every later cell needs `prs`, so carrying on without the file
# would only resurface further down as a confusing NameError.
if not os.path.exists(path_to_file):
    raise FileNotFoundError(
        f"❌ Error: File not found at {os.path.abspath(path_to_file)} "
        f"(working directory: {os.getcwd()})"
    )

# Read CSV directly
prs = pd.read_csv(path_to_file)
print(f"✅ Data loaded successfully: {len(prs)} rows.")

# These columns are used to filter the PRs and to build the reviews URL, so a
# missing one is an error rather than a warning.
required_cols = ['repo_url', 'number', 'state']
missing = [c for c in required_cols if c not in prs.columns]
if missing:
    raise ValueError(f"Missing required columns in {path_to_file}: {missing}")

✅ Data loaded successfully: 9052 rows.


In [4]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def count_reviews_per_user(reviews_list):
    """
    Tally the reviews in `reviews_list` by reviewer.

    Each entry is expected to be a dict; entries whose 'user' value is missing
    or falsy are skipped. Returns a plain dict mapping user -> review count,
    or an empty dict when `reviews_list` is empty or None.
    """
    tally = Counter()

    for entry in reviews_list or ():
        login = entry.get('user')
        if login:
            tally[login] += 1

    return dict(tally)

def _rate_limit_wait_seconds(response, default=65):
    """
    Return how many seconds to pause before retrying a rate-limited request.

    Prefers the server's 'Retry-After' header, then the time remaining until
    'X-RateLimit-Reset' when the quota is exhausted, and otherwise `default`.
    """
    retry_after = response.headers.get("Retry-After", "")
    if retry_after.isdigit():
        return max(int(retry_after), 1)

    if response.headers.get("X-RateLimit-Remaining") == "0":
        reset_epoch = response.headers.get("X-RateLimit-Reset", "")
        if reset_epoch.isdigit():
            # +1 so the retry lands just after the quota window has reset.
            return max(int(reset_epoch) - int(time.time()) + 1, 1)

    return default


def get_pr_reviewers(url):
    """
    Fetches all reviews of a pull request from the GitHub API.

    Follows the paginated 'next' links so PRs with more reviews than fit on one
    page are not truncated. When the API answers 403 or 429 the call sleeps for
    the duration the response headers indicate (65 seconds if they give none)
    and retries that page once.

    Parameters
    ----------
    url : str
        The pull request's '.../pulls/<number>/reviews' API endpoint.

    Returns
    -------
    list of dict
        One dict per review that has a non-null user, with the keys 'user'
        (login), 'state', 'body' and 'submitted_at'. Any non-200 answer or
        exception ends the fetch and returns the reviews collected so far,
        which is an empty list when the first page fails.
    """
    reviewers_list = []
    page_url = url
    # Ask for the largest page size to keep the number of requests low.
    query = {"per_page": 100}

    try:
        while page_url:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(page_url, headers=headers, params=query, timeout=30)

            # Rate Limit Handling
            if response.status_code in (403, 429):
                wait_seconds = _rate_limit_wait_seconds(response)
                print(f"Rate limit hit at {page_url}. Sleeping for {wait_seconds} seconds...")
                time.sleep(wait_seconds)
                response = requests.get(page_url, headers=headers, params=query, timeout=30)

            if response.status_code != 200:
                # Silence 404s or other errors to keep the loop going
                break

            for review in response.json():
                # Check if 'user' object exists (sometimes deleted users are null)
                if review.get('user'):
                    reviewers_list.append({
                        'user': review['user']['login'],
                        # .get(): a missing field should not discard the whole PR
                        'state': review.get('state'),
                        'body': review.get('body'),
                        'submitted_at': review.get('submitted_at')
                    })

            # The 'next' link already carries the query string, so drop `query`.
            page_url = response.links.get("next", {}).get("url")
            query = None

    except Exception as e:
        print(f"Error in URL {url}: {e}")

    return reviewers_list

In [None]:
# =============================================================================
# EXECUTION
# =============================================================================

# Keep every PR that is not open, cast the PR number to text and derive the
# reviews endpoint from the repository API URL.
closed_prs = (
    prs.loc[prs['state'] != 'open']
    .assign(number=lambda frame: frame['number'].astype(str))
    .assign(
        reviews_url=lambda frame: (
            frame['repo_url'] + '/pulls/' + frame['number'] + '/reviews'
        )
    )
)

print(f"Starting API calls for {len(closed_prs)} PRs...")
print("This might take a while... (Use the bar below to track progress)")

# One API lookup per PR, wrapped in a plain tqdm iterator rather than pandas'
# .progress_apply(), which was reported to fail with an '_is_builtin_func' error.
reviews_data_list = list(map(get_pr_reviewers, tqdm(closed_prs['reviews_url'])))

# Attach the raw reviews, then derive the per-user counts locally.
closed_prs['reviews_data'] = reviews_data_list
closed_prs['review_counts_map'] = closed_prs['reviews_data'].apply(count_reviews_per_user)

print("Processing complete.")

Starting API calls for 9052 PRs...
This might take a while... (Use the bar below to track progress)



  0%|                                                  | 0/9052 [00:00<?, ?it/s][A
  0%|                                        | 1/9052 [00:00<1:52:41,  1.34it/s][A
  0%|                                        | 2/9052 [00:01<1:46:23,  1.42it/s][A
  0%|                                        | 3/9052 [00:02<1:41:42,  1.48it/s][A
  0%|                                        | 4/9052 [00:02<1:36:06,  1.57it/s][A
  0%|                                        | 5/9052 [00:03<1:47:16,  1.41it/s][A
  0%|                                        | 6/9052 [00:04<1:50:17,  1.37it/s][A
  0%|                                        | 7/9052 [00:04<1:39:15,  1.52it/s][A
  0%|                                        | 8/9052 [00:05<1:44:59,  1.44it/s][A
  0%|                                        | 9/9052 [00:06<1:36:32,  1.56it/s][A
  0%|                                       | 10/9052 [00:07<1:52:08,  1.34it/s][A
  0%|                                       | 11/9052 [00:07<1:44:14,  1.45

In [9]:
# Inspect the enriched frame; the bare expression is rendered as the cell's output.
closed_prs

Unnamed: 0,id,number,user,user_id,agent,title,body,state,created_at,closed_at,merged_at,repo_url,html_url,reviews_url,reviews_data,review_counts_map
0,2438086945,88748,iamrajjoshi,33237075,Human,:bug: fix: update how we fetch workflow_id and...,i realized i made a mistake for how i fetch th...,closed,2025-04-03T21:36:59Z,2025-04-04T15:10:57Z,2025-04-04T15:10:57Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/88748,https://api.github.com/repos/getsentry/sentry/...,"[{'user': 'GabeVillalobos', 'state': 'COMMENTE...",{'GabeVillalobos': 2}
1,2265431531,83085,ArthurKnaus,7033940,Human,fix(org-stats): Require project membership,### Problem\r\n\r\nIf the user is not member o...,closed,2025-01-08T07:47:13Z,2025-01-08T08:49:40Z,2025-01-08T08:49:40Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/83085,https://api.github.com/repos/getsentry/sentry/...,"[{'user': 'obostjancic', 'state': 'APPROVED', ...",{'obostjancic': 1}
2,2622011651,94465,bukzor,640328,Human,fix(dev): mktemp: too few X's in template,"For maximum compatibility, busybox mktemp requ...",closed,2025-06-26T18:54:10Z,2025-06-26T19:57:23Z,2025-06-26T19:57:23Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/94465,https://api.github.com/repos/getsentry/sentry/...,"[{'user': 'joshuarli', 'state': 'APPROVED', 'b...",{'joshuarli': 1}
3,2565399631,92785,dashed,139499,Human,fix(billing): Update calculateCategoryPrepaidU...,Closes https://linear.app/getsentry/issue/BIL-...,closed,2025-06-03T22:22:51Z,2025-06-05T18:13:54Z,2025-06-05T18:13:54Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/92785,https://api.github.com/repos/getsentry/sentry/...,"[{'user': 'isabellaenriquez', 'state': 'APPROV...",{'isabellaenriquez': 1}
4,2374801945,86438,brendanhsentry,171613822,Human,fix: copy updates to checkout page,closes https://github.com/getsentry/getsentry/...,closed,2025-03-05T22:39:12Z,2025-03-06T16:57:20Z,2025-03-06T16:57:20Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/86438,https://api.github.com/repos/getsentry/sentry/...,"[{'user': 'isabellaenriquez', 'state': 'APPROV...","{'isabellaenriquez': 1, 'dashed': 1}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9047,3260452571,1542,devin-ai-integration[bot],158243242,Devin,Fix CSS color assertions in test_lambdas.py,# Fix CSS color assertions in test_lambdas.py\...,closed,2025-07-24T16:14:38Z,2025-07-25T11:29:31Z,2025-07-25T11:29:31Z,https://api.github.com/repos/reflex-dev/reflex...,https://github.com/reflex-dev/reflex-web/pull/...,https://api.github.com/repos/reflex-dev/reflex...,"[{'user': 'greptile-apps[bot]', 'state': 'COMM...","{'greptile-apps[bot]': 1, 'Lendemor': 1}"
9048,2857103111,2151,devin-ai-integration[bot],158243242,Devin,fix: Initialize storage in StringKnowledgeSource,Fixes #2150\n\n## Issue\nStringKnowledgeSource...,closed,2025-02-17T08:16:22Z,2025-02-25T16:39:20Z,,https://api.github.com/repos/crewAIInc/crewAI,https://github.com/crewAIInc/crewAI/pull/2151,https://api.github.com/repos/crewAIInc/crewAI/...,[],{}
9049,2857279950,8459,devin-ai-integration[bot],158243242,Devin,Add missing OpenSSL TLSEXT status response codes,Fixes the build failure in the OpenSSL coexist...,closed,2025-02-17T09:32:13Z,2025-02-17T09:34:11Z,,https://api.github.com/repos/wolfSSL/wolfssl,https://github.com/wolfSSL/wolfssl/pull/8459,https://api.github.com/repos/wolfSSL/wolfssl/p...,[],{}
9050,2857942945,2,devin-ai-integration[bot],158243242,Devin,fix: improve dark mode input focus and toggle ...,# UI Improvements: Dark Mode Input Focus and T...,closed,2025-02-17T13:57:22Z,2025-02-17T14:40:12Z,2025-02-17T14:40:12Z,https://api.github.com/repos/jina-ai/deepsearc...,https://github.com/jina-ai/deepsearch-ui/pull/2,https://api.github.com/repos/jina-ai/deepsearc...,[],{}


In [10]:
# Persist the same column subset twice: once as CSV and once as Parquet.
export_columns = [
    'id', 'number', 'user', 'user_id', 'agent',
    'repo_url', 'html_url',
    'reviews_url', 'reviews_data',
    'review_counts_map',
]
prs_reviews = closed_prs[export_columns]

prs_reviews.to_csv('output_files/prs_reviews.csv', index=False)
prs_reviews.to_parquet('output_files/prs_reviews.parquet', index=False)
