In [None]:
import ast
import os
import time
from collections import Counter

import pandas as pd
import requests
from datasets import load_dataset

In [None]:
# Load the PR review dataset and keep only the columns the rest of the
# notebook works with.
REVIEW_COLUMNS = [
    'id',
    'number',
    'agent',
    'repo_url',
    'html_url',
    'closing_method',
    'reviews_data',
    'review_counts_map',
    'has_human_closing_user',
]
reviews = pd.read_parquet('output_files/prs_reviews.parquet')
reduced_reviews = reviews[REVIEW_COLUMNS]

In [None]:
# Narrow the PRs one condition at a time; every intermediate frame keeps its
# own name so it can be inspected separately.
#
# 'reviews_data' is handled as text in this notebook (it is parsed with
# ast.literal_eval further down), so a length above 2 skips the empty-list
# text "[]" -- presumably the marker for "no reviews"; confirm against the
# parquet schema.
prs_with_review = reduced_reviews[reduced_reviews['reviews_data'].str.len() > 2]
rejected_prs_with_review = prs_with_review[prs_with_review['closing_method'] == 'Closed']
# .eq(True) keeps the original "== True" semantics: missing values count as False.
rejected_prs_with_review_with_human_closing = rejected_prs_with_review[
    rejected_prs_with_review['has_human_closing_user'].eq(True)
]
# Explicit .copy(): later cells add columns to this frame, and assigning to a
# boolean-indexed slice raises pandas' SettingWithCopyWarning.
agentic_rejected_prs_with_review_with_human_closing = rejected_prs_with_review_with_human_closing[
    rejected_prs_with_review_with_human_closing['agent'] != 'Human'
].copy()

In [None]:
# Derive, for every PR, the two URLs the comment fetchers will request:
#   <repo_url>/pulls/<number>/comments   -> stored in 'review_comment_url'
#   <repo_url>/issues/<number>/comments  -> stored in 'pr_comment_url'
# NOTE(review): presumably 'repo_url' is the GitHub API repository URL -- confirm.
for url_column, api_segment in (
    ('review_comment_url', 'pulls'),
    ('pr_comment_url', 'issues'),
):
    agentic_rejected_prs_with_review_with_human_closing[url_column] = (
        agentic_rejected_prs_with_review_with_human_closing['repo_url']
        + f'/{api_segment}/'
        + agentic_rejected_prs_with_review_with_human_closing['number'].astype(str)
        + '/comments'
    )

agentic_rejected_prs_with_review_with_human_closing

In [None]:
# GitHub API request headers.
# The token is read from the GITHUB_TOKEN environment variable so that no
# credential is ever saved inside the notebook. A token only raises the API
# rate limit, so when the variable is unset the Authorization header is left
# out and requests go unauthenticated instead of carrying a placeholder
# credential.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
headers = {"Accept": "application/vnd.github.v3+json"}
if GITHUB_TOKEN:
    headers["Authorization"] = f"token {GITHUB_TOKEN}"


In [None]:
def _fetch_all_pages(url):
    """
    Downloads every page of a GitHub list endpoint and returns the raw JSON items.

    Follows the "next" link advertised in the response's Link header until no
    further page is announced, so results are not cut off at the first page.

    Returns an empty list when the URL is missing, when any page answers with
    a non-200 status, when a payload is not a JSON list, or when the request
    itself fails (the problem is printed in each case).
    """
    # pd.isna goes first: evaluating `not url` on pd.NA would raise.
    if pd.isna(url) or not url:
        return []

    items = []
    next_url = url
    # Request the largest page size on the first call only; the "next" links
    # returned by the server already carry their own query string.
    params = {"per_page": 100}
    try:
        while next_url:
            response = requests.get(next_url, headers=headers, params=params, timeout=30)
            if response.status_code != 200:
                print(f"Error {response.status_code} at URL: {next_url}")
                return []
            page = response.json()
            if not isinstance(page, list):
                print(f"Unexpected payload at URL: {next_url}")
                return []
            items.extend(page)
            next_url = response.links.get("next", {}).get("url")
            params = None
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Connection error: {e}")
        return []
    return items


def _base_comment_fields(comment):
    """
    Extracts the fields shared by both comment kinds from one raw comment dict.
    """
    # "user" may be present but null, so fall back to {} on any falsy value
    # instead of relying on the .get() default.
    user_data = comment.get('user') or {}
    return {
        'user_login': user_data.get('login'),
        'user_type': user_data.get('type'),
        'body': comment.get('body'),
        'author_association': comment.get('author_association'),
    }


def fetch_review_comments(url):
    """
    Fetches review comments and extracts: user(login, type), body, author_association

    Returns a list with one dict per comment (keys: user_login, user_type,
    body, author_association); the list is empty when the URL is missing or
    the download fails.
    """
    return [_base_comment_fields(c) for c in _fetch_all_pages(url)]


def fetch_issue_comments(url):
    """
    Fetches issue/PR comments and extracts fields above + performed_via_github_app

    Returns a list with one dict per comment (keys: user_login, user_type,
    body, author_association, app_slug, app_name); the list is empty when the
    URL is missing or the download fails.
    """
    extracted_data = []
    for c in _fetch_all_pages(url):
        # performed_via_github_app can be None/null, in which case both app
        # fields stay None.
        app_data = c.get('performed_via_github_app') or {}
        extracted_data.append({
            **_base_comment_fields(c),
            'app_slug': app_data.get('slug'),
            'app_name': app_data.get('name'),
        })
    return extracted_data

In [None]:
# Download the comments for every PR: each fetcher is applied to its URL
# column and the returned lists are stored in a new column of the same frame.
for label, source_column, target_column, fetcher in (
    ("Review Comments", 'review_comment_url', 'review_comments_data', fetch_review_comments),
    ("PR/Issue Comments", 'pr_comment_url', 'pr_comments_data', fetch_issue_comments),
):
    print(f"Starting {label} extraction...")
    agentic_rejected_prs_with_review_with_human_closing[target_column] = (
        agentic_rejected_prs_with_review_with_human_closing[source_column].apply(fetcher)
    )

# Signal that both downloads are done
print("Finished!")

In [None]:
# Display the PR frame, which at this point also carries the fetched
# 'review_comments_data' and 'pr_comments_data' columns.
agentic_rejected_prs_with_review_with_human_closing

In [None]:
# Column subsets of the enriched PR frame. Both share the PR identity columns;
# review_comments additionally keeps the review-comment URL, while
# comments_dataset keeps both fetched comment columns.
_pr_identity_columns = ['id', 'number', 'agent', 'html_url']

review_comments = agentic_rejected_prs_with_review_with_human_closing[
    _pr_identity_columns + ['reviews_data', 'review_comment_url', 'review_comments_data']
]
comments_dataset = agentic_rejected_prs_with_review_with_human_closing[
    _pr_identity_columns + ['reviews_data', 'review_comments_data', 'pr_comments_data']
]


In [None]:
# Turn the textual 'reviews_data' column into one row per review.
# .copy() so that adding the parsed column below does not write into a slice
# of comments_dataset (SettingWithCopyWarning).
all_reviews_type_subset = comments_dataset[['id','number','agent','html_url','reviews_data']].copy()
# 'reviews_data' holds a Python-literal string; missing or blank values become
# an empty list.
all_reviews_type_subset["reviews_data_parsed"] = all_reviews_type_subset["reviews_data"].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) and x.strip() != "" else []
)

exploded_all_reviews_type_subset = (
    all_reviews_type_subset
        .explode("reviews_data_parsed")
        # An empty list explodes to NaN, which json_normalize cannot handle;
        # drop those rows before the index is reset.
        .dropna(subset=["reviews_data_parsed"])
        .reset_index(drop=True)
        # Expand each review dict into columns, aligned row by row with the PR columns.
        .pipe(lambda d: pd.concat(
            [d.drop(columns=["reviews_data_parsed"]),
             pd.json_normalize(d["reviews_data_parsed"])],
            axis=1
        ))
        # Non-inplace drop keeps the cell idempotent when it is re-run.
        .drop(columns=['reviews_data', 'submitted_at'])
)

# Keep only reviews that actually carry text: a null body is treated the same
# as an empty one.
with_body_all_reviews_type_subset = exploded_all_reviews_type_subset[
    exploded_all_reviews_type_subset['body'].notna()
    & (exploded_all_reviews_type_subset['body'] != '')
]
# Discard reviews whose user name contains the literal "[bot]" marker.
final_all_reviews_type_subset = with_body_all_reviews_type_subset[
    ~with_body_all_reviews_type_subset['user'].str.contains('[bot]', regex=False, na=False)
]
final_all_reviews_type_subset

In [None]:
# One row per fetched review comment.
review_comment_subset = comments_dataset[['id','number','agent','html_url','review_comments_data']]
exploded_review_comment_subset = (
    # 'review_comments_data' holds lists (the fetcher's return value), so
    # .str.len() is the number of comments. Keep every PR with at least one
    # comment: the previous "> 2" threshold silently dropped PRs that had only
    # one or two comments.
    review_comment_subset[review_comment_subset["review_comments_data"].str.len() > 0]
        .explode("review_comments_data")
        .reset_index(drop=True)
        # Expand each comment dict into columns, aligned row by row with the PR columns.
        .pipe(lambda d: pd.concat(
            [d.drop(columns=["review_comments_data"]),
             pd.json_normalize(d["review_comments_data"])],
            axis=1
        ))
)
final_review_comment_subset = exploded_review_comment_subset
final_review_comment_subset

In [None]:
# One row per fetched PR/issue comment.
pr_comment_subset = comments_dataset[['id','number','agent','html_url','pr_comments_data']]
exploded_pr_comments_data = (
    # 'pr_comments_data' holds lists (the fetcher's return value), so
    # .str.len() is the number of comments. Keep every PR with at least one
    # comment: the previous "> 2" threshold silently dropped PRs that had only
    # one or two comments.
    pr_comment_subset[pr_comment_subset["pr_comments_data"].str.len() > 0]
        .explode("pr_comments_data")
        .reset_index(drop=True)
        # Expand each comment dict into columns, aligned row by row with the PR columns.
        .pipe(lambda d: pd.concat(
            [d.drop(columns=["pr_comments_data"]),
             pd.json_normalize(d["pr_comments_data"])],
            axis=1
        ))
)

# The GitHub-app columns are not part of the exported dataset.
final_pr_comments_data = exploded_pr_comments_data.drop(columns=['app_slug', 'app_name'])
final_pr_comments_data

In [None]:
# Write the three comment datasets to CSV.
# Create the target folder first so the export also works on a fresh checkout
# where it does not exist yet.
os.makedirs('output_files/comments', exist_ok=True)

final_all_reviews_type_subset.to_csv(r'output_files/comments/comments_extracted_from_review_type.csv', index = False)
# For the two fetched datasets only comments whose author has user_type 'User' are exported.
final_review_comment_subset[final_review_comment_subset['user_type'] == 'User'].to_csv(r'output_files/comments/comments_extracted_from_review_comment.csv', index = False)
final_pr_comments_data[final_pr_comments_data['user_type'] == 'User'].to_csv(r'output_files/comments/comments_extracted_from_pr_comment.csv', index = False)