In [3]:
import os
import pandas as pd
from datasets import load_dataset
import re
import requests
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
token = "INSERT-TOKEN"
headers = {"Authorization": f"token {token}"}

In [5]:
# Specific Hash provided
REVISION_HASH = "eee0408a277826d88fc0ca5fa07d2fc325c96af1"
OUTPUT_CSV = r'output_files/fix_prs_revision.csv'

os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

if os.path.exists(OUTPUT_CSV):
    # --- PATH 1: FILE ALREADY EXISTS ---
    print(f"File {OUTPUT_CSV} found. Reading data from disk...")
    all_fix_prs = pd.read_csv(OUTPUT_CSV)

else:
    # --- PATH 2: FILE DOES NOT EXIST (DOWNLOADING SPECIFIC VERSION) ---
    print(f"Local file not found. Downloading dataset revision: {REVISION_HASH}")
    
    # Loading 'pull_request' with specific revision
    aidev_pop = load_dataset("hao-li/AIDev", "pull_request", revision=REVISION_HASH)
    pandas_aidev_pop = aidev_pop['train'].to_pandas()
    
    # Loading 'pr_task_type' with specific revision
    task_types = load_dataset("hao-li/AIDev", "pr_task_type", revision=REVISION_HASH)
    pandas_task_types = task_types['train'].to_pandas()
    pandas_task_types.rename(columns={'confidence': 'type_confidence'}, inplace=True)
    
    # Merge AIDev
    aidev_pop_with_types = pd.merge(pandas_aidev_pop, pandas_task_types[['id','type','type_confidence']], on='id')

    # Loading 'human_pull_request' with specific revision
    human_prs = load_dataset("hao-li/AIDev", "human_pull_request", revision=REVISION_HASH)
    pandas_human_prs = human_prs['train'].to_pandas()

    # Loading 'human_pr_task_type' with specific revision
    human_task_types = load_dataset("hao-li/AIDev", "human_pr_task_type", revision=REVISION_HASH)
    pandas_human_task_types = human_task_types['train'].to_pandas()
    pandas_human_task_types.rename(columns={'confidence': 'type_confidence'}, inplace=True)
    
    # Merge Human
    human_prs_with_types = pd.merge(pandas_human_prs, pandas_human_task_types[['id','type']], on='id')
    
    # Filtering only 'fix'
    fix_human_prs = human_prs_with_types[human_prs_with_types['type'] == 'fix']
    fix_agent_prs = aidev_pop_with_types[aidev_pop_with_types['type'] == 'fix']
    
    cols_to_keep = ['id','number','user','user_id','agent','title','body','state',
                    'created_at','closed_at','merged_at','repo_url','html_url']
    
    all_fix_prs = pd.concat([
        fix_human_prs[cols_to_keep],
        fix_agent_prs[cols_to_keep]
    ])
    
    try:
        all_fix_prs = all_fix_prs[all_fix_prs['state'] !='open']
        all_fix_prs.to_csv(OUTPUT_CSV, index=False)
        print(f"Success: Data from revision {REVISION_HASH} saved to {OUTPUT_CSV}")
    except Exception as e:
        print(f"ERROR: Failed to save CSV file. Error: {e}")

# Display final result stats
print(f"Total rows loaded: {len(all_fix_prs)}")
all_fix_prs.head()

Local file not found. Downloading dataset revision: eee0408a277826d88fc0ca5fa07d2fc325c96af1


Generating train split: 33596 examples [00:00, 115009.42 examples/s]
Generating train split: 33596 examples [00:00, 687331.23 examples/s]
Generating train split: 6618 examples [00:00, 28286.03 examples/s]
Generating train split: 6618 examples [00:00, 431197.44 examples/s]


Success: Data from revision eee0408a277826d88fc0ca5fa07d2fc325c96af1 saved to output_files/fix_prs_revision.csv
Total rows loaded: 9052


Unnamed: 0,id,number,user,user_id,agent,title,body,state,created_at,closed_at,merged_at,repo_url,html_url
2,2438086945,88748,iamrajjoshi,33237075,Human,:bug: fix: update how we fetch workflow_id and...,i realized i made a mistake for how i fetch th...,closed,2025-04-03T21:36:59Z,2025-04-04T15:10:57Z,2025-04-04T15:10:57Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/88748
3,2265431531,83085,ArthurKnaus,7033940,Human,fix(org-stats): Require project membership,### Problem\r\n\r\nIf the user is not member o...,closed,2025-01-08T07:47:13Z,2025-01-08T08:49:40Z,2025-01-08T08:49:40Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/83085
10,2622011651,94465,bukzor,640328,Human,fix(dev): mktemp: too few X's in template,"For maximum compatibility, busybox mktemp requ...",closed,2025-06-26T18:54:10Z,2025-06-26T19:57:23Z,2025-06-26T19:57:23Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/94465
16,2565399631,92785,dashed,139499,Human,fix(billing): Update calculateCategoryPrepaidU...,Closes https://linear.app/getsentry/issue/BIL-...,closed,2025-06-03T22:22:51Z,2025-06-05T18:13:54Z,2025-06-05T18:13:54Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/92785
18,2374801945,86438,brendanhsentry,171613822,Human,fix: copy updates to checkout page,closes https://github.com/getsentry/getsentry/...,closed,2025-03-05T22:39:12Z,2025-03-06T16:57:20Z,2025-03-06T16:57:20Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/86438
