In [None]:
import os
import time
from collections import Counter

import pandas as pd
import requests
from tqdm import tqdm


In [None]:
# Read the GitHub token from the GITHUB_TOKEN environment variable so the
# secret never has to be pasted into (and saved with) the notebook.
# The original placeholder is kept as the fallback, so editing it by hand
# still works exactly as before.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "INSERT_TOKEN")

# Headers sent with every GitHub API request made by this notebook.
headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

In [None]:
# Forward slashes are accepted on every OS. The previous raw string contained
# a literal double backslash, which is only treated as a path separator on
# Windows; this also matches the output paths used at the end of the notebook.
path_to_file = 'output_files/fix_prs_with_issues_and_files_and_tests.parquet'
prs = pd.read_parquet(path_to_file)

In [None]:
def count_reviews_per_user(reviews_list):
    """Tally how many reviews each user submitted.

    Args:
        reviews_list: Sequence of review dicts, each holding the reviewer's
            login under the 'user' key, e.g.
            [{'user': 'devA', ...}, {'user': 'devA', ...}, {'user': 'devB', ...}].
            A falsy value (empty list, None) is accepted.

    Returns:
        dict mapping login -> number of reviews, e.g. {'devA': 2, 'devB': 1};
        an empty dict when there is nothing to count.
    """
    if reviews_list:
        tally = Counter()
        for entry in reviews_list:
            # Only the login matters here; the other review fields are ignored.
            tally[entry['user']] += 1
        return dict(tally)

    # Empty or missing input: nothing to count.
    return {}

def _rate_limit_wait_seconds(response, default=60):
    """Return how many seconds to pause before retrying a rate-limited request.

    Prefers the ``Retry-After`` header, then ``X-RateLimit-Reset`` when the
    remaining quota is reported as 0, and otherwise falls back to ``default``.
    """
    retry_after = response.headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(float(retry_after), 1.0)
        except ValueError:
            # Header present but not a plain number of seconds.
            return default

    if response.headers.get("X-RateLimit-Remaining") == "0":
        try:
            reset_epoch = float(response.headers.get("X-RateLimit-Reset"))
        except (TypeError, ValueError):
            return default
        # +1s so the retry lands after the reset instant, not on it.
        return max(reset_epoch - time.time(), 0.0) + 1.0

    return default


def get_pr_reviewers(url, timeout=30, max_retries=3):
    """Fetch every review of a pull request from the GitHub API.

    Follows the pagination ``Link`` header so PRs with more reviews than fit
    on one page are returned in full, and pauses/retries when the API answers
    with a rate-limit status (403 or 429).

    Args:
        url: Reviews endpoint of the PR (``.../pulls/<number>/reviews``).
        timeout: Seconds to wait for each HTTP request before giving up.
        max_retries: Maximum rate-limit retries per page.

    Returns:
        list of dicts with keys 'user' (login), 'state', 'body' and
        'submitted_at', one per review that has a user attached.
        An empty list is returned on any error or non-200 response, so a
        single failing PR never aborts a long-running batch.
    """
    reviewers_list = []
    page_url = url
    # Ask for the largest page size to minimise the number of requests.
    params = {"per_page": 100}

    try:
        while page_url:
            response = requests.get(
                page_url, headers=headers, params=params, timeout=timeout
            )

            retries = 0
            while response.status_code in (403, 429) and retries < max_retries:
                wait = _rate_limit_wait_seconds(response)
                print(f"Rate limit reached. Pausing {wait:.0f}s...")
                time.sleep(wait)
                response = requests.get(
                    page_url, headers=headers, params=params, timeout=timeout
                )
                retries += 1

            if response.status_code != 200:
                # 404 or any other error: report it instead of failing silently,
                # and discard partial pages so the result is never truncated.
                print(f"Status {response.status_code} for URL {page_url}; returning no reviews.")
                return []

            for review in response.json():
                # Check if 'user' object exists (sometimes bots or deleted users return null)
                if review.get('user'):
                    reviewers_list.append({
                        'user': review['user']['login'],
                        'state': review['state'],
                        'body': review['body'],
                        'submitted_at': review['submitted_at']  # When the review was submitted
                    })

            # The "next" link already carries the paging query string.
            page_url = response.links.get("next", {}).get("url")
            params = None

        return reviewers_list

    except Exception as e:
        # Deliberate best-effort: log and keep the batch going.
        print(f"Error in URL {url}: {e}")
        return []

In [None]:
# Keep only PRs that are no longer open. The explicit .copy() makes this an
# independent frame, so the column assignments below do not write into a
# slice of `prs` (which triggers pandas' SettingWithCopyWarning).
closed_prs = prs[prs['state'] != 'open'].copy()

# The PR number is concatenated into a URL below, so it must be a string.
closed_prs['number'] = closed_prs['number'].astype(str)

# Build the URL used to fetch the reviews of each PR
closed_prs['reviews_url'] = (
    closed_prs['repo_url']
    + '/pulls/'
    + closed_prs['number']
    + '/reviews'
)

In [None]:
# Apply function to the column containing raw review data
closed_prs['reviews_data'] = closed_prs['reviews_url'].apply(get_pr_reviewers)
closed_prs['review_counts_map'] = closed_prs['reviews_data'].apply(count_reviews_per_user)

# --- Results Visualization ---

# Display first rows to check
closed_prs[['number', 'review_counts_map']].head()

# Example of how to access specific data:
# If you want to know how many times user 'x' reviewed in the first row of the DF:
# print(closed_prs.iloc[0]['review_counts_map'].get('username', 0))

In [None]:
# Columns persisted for the review analysis (same selection for both files).
export_columns = [
    'id', 'number', 'user', 'user_id', 'agent',
    'repo_url', 'html_url',
    'reviews_url', 'reviews_data',
    'review_counts_map',
]
reviews_export = closed_prs[export_columns]

# Write the selection once as CSV and once as Parquet.
reviews_export.to_csv('output_files/prs_reviews.csv', index=False)
reviews_export.to_parquet('output_files/prs_reviews.parquet', index=False)
