In [1]:
import pandas as pd
import os

In [2]:
# =============================================================================
# LOAD DATA
# =============================================================================

# All three input CSVs are read from this directory (path is relative to the
# notebook's working directory).
base_path = 'output_files/comments/'
file_review_type = f'{base_path}comments_extracted_from_review_type.csv'
file_review_comment = f'{base_path}comments_extracted_from_review_comment.csv'
file_pr_comment = f'{base_path}comments_extracted_from_pr_comment.csv'

# Check every input before reading anything, so a missing file is reported by name.
missing_files = [
    path
    for path in (file_review_type, file_review_comment, file_pr_comment)
    if not os.path.exists(path)
]

if missing_files:
    print("❌ Error: One or more input files are missing.")
    print("Please ensure you ran Step 6 successfully.")
    # Fail fast: the following cells need df_reviews / df_code_comments /
    # df_general_comments. Merely printing would let "Run All" continue into a
    # NameError, or silently reuse stale frames left in the kernel by an earlier run.
    raise FileNotFoundError(f"Missing input file(s): {', '.join(missing_files)}")

df_reviews = pd.read_csv(file_review_type)
df_code_comments = pd.read_csv(file_review_comment)
df_general_comments = pd.read_csv(file_pr_comment)
print("✅ All 3 comment files loaded successfully.")

✅ All 3 comment files loaded successfully.


In [3]:
# =============================================================================
# CONSOLIDATE UNIQUE PRS
# =============================================================================

# Only the PR identifier columns are kept; they are all that the manual-analysis
# list needs.
columns_of_interest = ['id', 'number', 'agent', 'html_url']

# Stack the identifier columns of every comment source so that a PR appearing in
# any one of them (review verdict, code comment, or general comment) is included.
comment_sources = (df_reviews, df_code_comments, df_general_comments)
all_prs_concat = pd.concat([source[columns_of_interest] for source in comment_sources])

# Rows that match on all four identifier columns collapse into one.
# The index labels inherited from the source frames are kept as-is.
unique_prs = all_prs_concat.drop_duplicates()

print(f"Total unique PRs with agent activity found: {len(unique_prs)}")
unique_prs.head()

Total unique PRs with agent activity found: 201


Unnamed: 0,id,number,agent,html_url
0,2914277614,29902,Claude_Code,https://github.com/geneontology/go-ontology/pu...
2,2959025892,24145,Claude_Code,https://github.com/apache/pulsar/pull/24145
3,3234200031,16803,Copilot,https://github.com/microsoft/vscode-jupyter/pu...
5,3190608451,16762,Copilot,https://github.com/microsoft/vscode-jupyter/pu...
6,3167223032,36397,Copilot,https://github.com/microsoft/playwright/pull/3...


In [4]:
# =============================================================================
# SAMPLING (BATCH 1)
# =============================================================================

# First batch for manual qualitative analysis: 50 randomly drawn PRs.
# The fixed seed makes the draw repeatable for the same unique_prs.
sample_size = 50
sample_seed = 50
sample_batch = unique_prs.sample(n=sample_size, random_state=sample_seed)

# Write the sample next to the input files, without the index column.
output_sample = f'{base_path}50_sample.csv'
sample_batch.to_csv(output_sample, index=False)
print(f"✅ Sample saved to: {output_sample}")

✅ Sample saved to: output_files/comments/50_sample.csv


In [5]:
# =============================================================================
# SAVE REMAINING PRS
# =============================================================================

# Everything in unique_prs that was NOT drawn into sample_batch, kept on disk so
# a later analysis round can continue from the unsampled PRs.

# Membership is decided by the 'id' column rather than by index labels.
sampled_ids = sample_batch['id'].tolist()
already_sampled = unique_prs['id'].isin(sampled_ids)
remaining_prs = unique_prs[~already_sampled]

# Write the leftover PRs next to the sample file, without the index column.
output_remaining = f'{base_path}remaining_prs.csv'
remaining_prs.to_csv(output_remaining, index=False)

print(f"✅ Remaining {len(remaining_prs)} PRs saved to: {output_remaining}")

✅ Remaining 151 PRs saved to: output_files/comments/remaining_prs.csv
