In [None]:
from datasets import load_dataset
import requests
import pandas as pd
from tqdm import tqdm
import os
import json
import re
from urllib.parse import urlparse
import glob
import time
import numpy as np

In [None]:
# Read the GitHub token from the GITHUB_TOKEN environment variable so the real
# credential never has to be pasted into (and saved with) the notebook.
# An unset or empty variable falls back to the original placeholder, so
# replacing the literal by hand keeps working.
token = os.environ.get("GITHUB_TOKEN") or "INSERT_TOKEN"
# Authorization header in the "token <value>" form, built from the token above.
headers = {"Authorization": f"token {token}"}

In [None]:
# On-disk cache of the combined table of PRs whose task type is 'fix'.
OUTPUT_CSV = r'output_files/fix_prs.csv'

os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

if not os.path.exists(OUTPUT_CSV):
    # --- Cache miss: rebuild the table from the hao-li/AIDev dataset ---
    print(f"File {OUTPUT_CSV} not found.")

    # PRs from the "pull_request" config, joined on 'id' with their task types.
    aidev_pop = load_dataset("hao-li/AIDev", "pull_request")
    pandas_aidev_pop = aidev_pop['train'].to_pandas()

    task_types = load_dataset("hao-li/AIDev", "pr_task_type")
    pandas_task_types = task_types['train'].to_pandas().rename(
        columns={'confidence': 'type_confidence'}
    )
    aidev_pop_with_types = pandas_aidev_pop.merge(
        pandas_task_types[['id', 'type', 'type_confidence']], on='id'
    )

    # Same join for the "human_pull_request" config (only 'type' is carried over).
    human_prs = load_dataset("hao-li/AIDev", "human_pull_request")
    pandas_human_prs = human_prs['train'].to_pandas()

    human_task_types = load_dataset("hao-li/AIDev", "human_pr_task_type")
    pandas_human_task_types = human_task_types['train'].to_pandas().rename(
        columns={'confidence': 'type_confidence'}
    )
    human_prs_with_types = pandas_human_prs.merge(
        pandas_human_task_types[['id', 'type']], on='id'
    )

    # Keep only the rows labelled 'fix' in each population.
    fix_human_prs = human_prs_with_types.loc[human_prs_with_types['type'] == 'fix']
    fix_agent_prs = aidev_pop_with_types.loc[aidev_pop_with_types['type'] == 'fix']

    # Columns shared by both frames that are kept in the combined table.
    cols_to_keep = ['id', 'number', 'user', 'user_id', 'agent', 'title', 'body', 'state',
                    'created_at', 'closed_at', 'merged_at', 'repo_url', 'html_url']

    # Human rows first, then agent rows; original row labels are kept.
    all_fix_prs = pd.concat([fix_human_prs[cols_to_keep], fix_agent_prs[cols_to_keep]])

    # Best effort: a failed save is reported but the in-memory table stays usable.
    try:
        all_fix_prs.to_csv(OUTPUT_CSV, index=False)
    except Exception as e:
        print(f"ERROR: Failed to save CSV file. Error: {e}")
else:
    # --- Cache hit: reuse the table saved by a previous run ---
    print(f"File {OUTPUT_CSV} found. Reading data from disk...")
    all_fix_prs = pd.read_csv(OUTPUT_CSV)

all_fix_prs

In [None]:
# Function to extract possible issues from text
def extract_issue_numbers(text):
    """Return the numbers referenced in ``text`` as ``#123`` or ``owner/repo#123``.

    The optional ``owner/repo`` prefix is matched but not captured, so a
    cross-repository reference contributes only its number.

    Args:
        text: Text to scan. Anything that is not a non-empty ``str``
            (``None``, a pandas missing value such as float NaN, ...)
            yields an empty list.

    Returns:
        list[int]: Referenced numbers in order of appearance; repeated
        references are kept.
    """
    # A missing pandas cell is a float NaN, which is truthy: a plain
    # `if not text` would let it through and re.findall would raise TypeError.
    if not isinstance(text, str) or not text:
        return []
    # Looks for patterns like #123 or owner/repo#123
    pattern = r'(?:\b[\w-]+/[\w-]+)?#(\d+)\b'
    return [int(m) for m in re.findall(pattern, text)]

# Applying to dataframe
def _unique_issue_refs(row):
    """Scan "<title> <body>" of one PR row and drop repeated issue numbers."""
    found = extract_issue_numbers(str(row['title']) + ' ' + str(row['body']))
    # set() removes duplicates; the resulting order is whatever the set yields.
    return list(set(found)) if isinstance(found, list) else found


all_fix_prs['possible_issues'] = all_fix_prs.apply(_unique_issue_refs, axis=1)

In [None]:
# ----- MODIFIED HELPER FUNCTION -----
def _get_issue_type(url: str, session: requests.Session) -> str:
    """
    Makes a GET call to the /issues API and determines if
    it is a pure Issue, a Pull Request, or does not exist.
    Returns: 'issue', 'pull_request', 'not_found'
    """
    try:
        # CRUCIAL CHANGE: session.get() instead of session.head()
        # We need the JSON body to check for the 'pull_request' field
        response = session.get(url, timeout=10) 
        
        # 404 Not Found or 410 Gone = Does not exist
        if response.status_code in (404, 410):
            return 'not_found'
        
        # Handle Rate Limit (same as original)
        if response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers:
            if int(response.headers['X-RateLimit-Remaining']) == 0:
                print("Rate limit reached. Waiting 15 minutes...")
                time.sleep(15 * 60)
                return _get_issue_type(url, session) # Retry

        # Other errors (500, 403-Forbidden, etc.)
        if response.status_code != 200:
            return 'not_found' # Treat other errors as 'not_found'

        # --- SUCCESS (200 OK) ---
        data = response.json()
        
        # THE CHECK YOU REQUESTED:
        if 'pull_request' in data:
            return 'pull_request' # It's a PR
        else:
            return 'issue' # It's a pure Issue

    except requests.exceptions.RequestException:
        return 'not_found' # Network error
        

# ----- MODIFIED MAIN FUNCTION -----

def validar_issues_do_repo(df: pd.DataFrame, github_token: str) -> pd.DataFrame:
    """Keep, for every row, only the referenced numbers that are pure issues.

    Each distinct ``(repo_url, number)`` pair found in ``possible_issues`` is
    looked up once through ``_get_issue_type``; the answers are then mapped
    back onto the rows.

    Args:
        df: Frame with a ``repo_url`` column and a ``possible_issues`` column
            holding Python lists of ints. It is modified in place: a
            ``matched_issues`` column is added.
        github_token: Token sent in the ``Authorization`` header.

    Returns:
        The same ``df`` object, with ``matched_issues`` holding the numbers
        whose lookup returned ``'issue'`` (pull requests and failed lookups
        are left out).
    """
    print("Starting refactoring: Optimizing API calls...")

    # Lookup results: {(repo_url, issue_num): 'issue' | 'pull_request' | 'not_found'}
    validation_cache = {}

    # --- Unique (repo_url, number) pairs that need a lookup ---
    # Built as one chain so no intermediate frame is assigned into.
    checks_needed = (
        df.dropna(subset=['repo_url', 'possible_issues'])
        .explode('possible_issues')
        .dropna(subset=['possible_issues'])  # empty lists explode to NaN
        [['repo_url', 'possible_issues']]
        .astype({'possible_issues': int})
        .drop_duplicates()
    )

    print(f"Optimization: {len(df)} rows reduced to {len(checks_needed)} unique API calls.")

    # --- Execute the lookups ---
    # The context manager closes the session (and its pooled connections)
    # even if a lookup raises.
    with requests.Session() as session:
        session.headers.update({
            'Authorization': f'token {github_token}',
            'Accept': 'application/vnd.github.v3+json',
            'X-GitHub-Api-Version': '2022-11-28'
        })

        pairs = checks_needed.itertuples(index=False, name=None)
        for repo_url, issue_num in tqdm(pairs, total=len(checks_needed), desc="Validating issues via API"):
            cache_key = (repo_url, issue_num)

            # "#0" is never sent to the API; it is recorded as not found.
            if issue_num == 0:
                validation_cache[cache_key] = 'not_found'
                continue

            validation_cache[cache_key] = _get_issue_type(f"{repo_url}/issues/{issue_num}", session)

    # --- Map results back ---
    print("Mapping results back to DataFrame...")

    def find_matches(row):
        """Return the numbers of this row whose cached type is 'issue'."""
        repo = row['repo_url']
        issues = row['possible_issues']

        if not isinstance(issues, list) or pd.isna(repo):
            return []

        # Pairs that were never looked up count as 'not_found'.
        return [
            num for num in issues
            if validation_cache.get((repo, num), 'not_found') == 'issue'
        ]

    # Fast: only reads the cache.
    df['matched_issues'] = df.apply(find_matches, axis=1)

    print("Refactoring complete.")
    return df

In [None]:
# Cache file for the API-validated issue references.
# Built with os.path.join: a hard-coded backslash is a path separator only on
# Windows; elsewhere os.path.dirname() of such a path is '' and the directory
# creation before saving fails.
OUTPUT_PARQUET = os.path.join('output_files', 'possibles_issues.parquet')
if not os.path.exists(OUTPUT_PARQUET):
    # --- Cache miss: validate the references through the API, then save ---
    print(f"File {OUTPUT_PARQUET} not found.")
    print("Starting issue validation via API (may take a while)...")
    df_com_matches = validar_issues_do_repo(all_fix_prs, token)
    print(f"API processing complete. Saving results to {OUTPUT_PARQUET}...")
    # Best effort: a failed save is reported, the in-memory result is kept.
    try:
        os.makedirs(os.path.dirname(OUTPUT_PARQUET), exist_ok=True)
        df_com_matches.to_parquet(OUTPUT_PARQUET, index=False)
        print("File saved successfully.")
    except Exception as e:
        print(f"ERROR: Failed to save Parquet file. Error: {e}")
else:
    # --- Cache hit: reuse the result saved by a previous run ---
    print(f"File {OUTPUT_PARQUET} found. Reading data from disk...")

    df_com_matches = pd.read_parquet(OUTPUT_PARQUET)

    def clean_parquet_list(item):
        """Normalise a cell to a Python list (ndarray -> list, anything else -> [])."""
        if isinstance(item, np.ndarray):
            return item.tolist()
        return item if isinstance(item, list) else []

    # List-valued columns, when present, are converted cell by cell.
    list_cols = ['possible_issues', 'matched_issues']
    for col in list_cols:
        if col not in df_com_matches.columns:
            continue
        df_com_matches[col] = df_com_matches[col].map(clean_parquet_list)

df_com_matches[['repo_url', 'possible_issues', 'matched_issues']].head()

In [None]:
# Number of PR numbers grouped into one batch.
PR_BATCH_SIZE = 50
# Paths are built with os.path.join: a hard-coded backslash is a path
# separator only on Windows; elsewhere it becomes part of the file name and
# the files land outside output_files/.
PARQUET_FILE = os.path.join('output_files', 'linked_issues.parquet')
CHECKPOINT_FILE = os.path.join('output_files', 'linked_issues_checkpoint.json')

# --- HELPER FUNCTION 1: Sub-Query Generator (No changes) ---
def generate_pr_sub_query(pr_number: int) -> str:
    """Build the aliased GraphQL selection for one pull request.

    The alias is ``pr<number>`` (e.g. ``pr123: pullRequest(number: 123) {...}``)
    and the selection asks for the ``number`` of up to 10
    ``closingIssuesReferences`` nodes.
    """
    alias = f"pr{pr_number}"
    # Leading "" and trailing indent reproduce the surrounding blank line/indent.
    fragment_lines = [
        "",
        f"    {alias}: pullRequest(number: {pr_number}) {{",
        "      closingIssuesReferences(first: 10) {",
        "        nodes {",
        "          number",
        "        }",
        "      }",
        "    }",
        "    ",
    ]
    return "\n".join(fragment_lines)

# --- HELPER FUNCTION 2: Optimized API Function (No changes) ---
def get_linked_issues_batched(owner: str, repo: str, pr_numbers: list, headers: dict) -> dict:
    """Fetch the closing-issue references of several PRs of one repository.

    All PRs are queried in a single GraphQL call, one aliased sub-query per
    PR (see ``generate_pr_sub_query``).

    Args:
        owner: Repository owner.
        repo: Repository name.
        pr_numbers: PR numbers to query; an empty list returns ``{}`` without
            sending a request.
        headers: HTTP headers for the request (carries the authorization).

    Returns:
        dict: ``{pr_number: [issue_number, ...]}`` with one entry per
        requested PR. A PR that could not be resolved maps to ``[]``; if the
        request itself fails, every PR of the batch maps to ``[]``.
    """
    graphql_url = "https://api.github.com/graphql"

    # Dictionary to store {pr_number: [issue_list]}
    results = {}
    if not pr_numbers:
        return results

    # 1. Build dynamic query
    sub_queries = "\n".join(generate_pr_sub_query(n) for n in pr_numbers)
    query = f"""
    query ($owner: String!, $repo: String!) {{
      repository(owner: $owner, name: $repo) {{
        {sub_queries}
      }}
    }}
    """

    variables = {"owner": owner, "repo": repo}

    try:
        r = requests.post(graphql_url, json={"query": query, "variables": variables}, headers=headers, timeout=30)
        r.raise_for_status()
        data = r.json()

        # 2. Report GraphQL errors, but keep going: a response can carry
        #    'errors' for some aliases and valid 'data' for the others, and
        #    dropping the whole batch would lose those results.
        errors = data.get('errors')
        if errors:
            print(f"   GraphQL Error in {owner}/{repo}: {str(errors[0].get('message', ''))[:100]}...")

        repo_data = (data.get('data') or {}).get('repository')
        if not repo_data and not errors:
            raise Exception("'repository' not found in response.")

        # 3. Parse whatever part of the response is usable
        for pr_alias, pr_data in (repo_data or {}).items():
            pr_num = int(pr_alias[2:])  # strip the "pr" alias prefix
            # A null alias, a null connection or null nodes all mean "no links".
            refs = (pr_data or {}).get('closingIssuesReferences') or {}
            results[pr_num] = [n['number'] for n in (refs.get('nodes') or []) if n]

    except Exception as e:
        print(f"   Network/Timeout Exception in {owner}/{repo} (Batch starting with PR {pr_numbers[0]}): {e}")
        results.clear()  # a failed batch reports [] for every PR

    # Every requested PR gets an entry, even if it was absent from the response.
    for num in pr_numbers:
        results.setdefault(num, [])
    return results

# --- OPTIMIZED MAIN LOGIC (MODIFIED) ---

# 1. Check if PARQUET FILE exists
if not os.path.exists(PARQUET_FILE):
    print("Starting optimized search for linked issues...")

    # Resume from a previous, interrupted run if a checkpoint is present.
    if os.path.exists(CHECKPOINT_FILE):
        print(f"Loading progress from checkpoint: {CHECKPOINT_FILE}")
        with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
            checkpoint_data = json.load(f)
        # JSON object keys are always strings, so the int PR numbers written
        # by json.dump come back as "123". Convert them back, otherwise the
        # int lookup in map_results misses every checkpointed PR and silently
        # reports no linked issues for the resumed repositories.
        repo_results_map = {
            checkpoint_repo: {int(pr_key): issue_list for pr_key, issue_list in pr_map.items()}
            for checkpoint_repo, pr_map in checkpoint_data.items()
        }
    else:
        repo_results_map = {}

    # Only repositories without a checkpoint entry still need API calls.
    repos_para_processar = df_com_matches[
        ~df_com_matches['repo_url'].isin(repo_results_map.keys())
    ]
    grouped = repos_para_processar.groupby('repo_url')

    print(f"Total repositories to process: {len(grouped)}")
    pbar = tqdm(grouped, desc="Processing Repositories")

    for repo_url, group in pbar:
        try:
            # The last two path segments of the URL are taken as owner and repo.
            owner_repo = "/".join(repo_url.rstrip("/").split("/")[-2:])
            owner, repo = owner_repo.split("/")
            pbar.set_postfix_str(f"{owner}/{repo}")
        except Exception as e:
            print(f"Error parsing repo URL {repo_url}: {e}. Skipping...")
            # Recorded as processed (with no results) so it is not retried.
            repo_results_map[repo_url] = {}
            continue

        pr_numbers = group['number'].unique().tolist()
        repo_batch_results = {}

        # Query the repository's PRs in chunks of PR_BATCH_SIZE.
        for i in range(0, len(pr_numbers), PR_BATCH_SIZE):
            batch_pr_numbers = pr_numbers[i:i+PR_BATCH_SIZE]
            batch_results = get_linked_issues_batched(owner, repo, batch_pr_numbers, headers)
            repo_batch_results.update(batch_results)

        repo_results_map[repo_url] = repo_batch_results
        # Checkpoint after every repository; a failed write is only reported.
        try:
            with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as f:
                json.dump(repo_results_map, f, indent=2)
        except Exception as e:
            print(f"ALERT: Failed to save checkpoint: {e}")

    # --- END OF LOOP ---

    print("API processing complete. Mapping results back to DataFrame...")

    def map_results(row):
        """Return the linked issues recorded for this row's (repo_url, number), or []."""
        return repo_results_map.get(row['repo_url'], {}).get(row['number'], [])

    # Adds the column to df_com_matches itself.
    df_com_matches['linked_issues'] = df_com_matches.apply(map_results, axis=1)

    # 7. Save final PARQUET
    print(f"Saving results to {PARQUET_FILE}...")
    # Parquet saves list [1, 2] as a list, not as string "[1, 2]"
    df_com_matches.to_parquet(PARQUET_FILE, index=False)

    # The checkpoint is no longer needed once the final file is written.
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)

    df_com_issues_linkadas = df_com_matches.copy()
else:
    # --- Final file already exists: read it instead of calling the API ---
    print(f"File {PARQUET_FILE} already exists, reading from disk.")
    df_com_issues_linkadas = pd.read_parquet(PARQUET_FILE)

    def clean_parquet_list(item):
        """Normalise a cell to a Python list (ndarray -> list, anything else -> [])."""
        # Case 1: already a Python list
        if isinstance(item, list):
            return item

        # Case 2: NumPy array
        if isinstance(item, np.ndarray):
            return item.tolist()

        # Case 3: None, NaN, or anything else
        return []

    df_com_issues_linkadas['linked_issues'] = df_com_issues_linkadas['linked_issues'].apply(clean_parquet_list)

df_com_issues_linkadas

In [None]:
# Columns that identify a PR row in both frames.
key_cols = ['id', 'number', 'repo_url']

# Index both frames by the key so DataFrame.update aligns rows on it.
df_matched_indexed = df_com_matches.set_index(key_cols)
df_final = df_com_issues_linkadas.copy().set_index(key_cols)

# Overwrite the two list columns with the values held in df_com_matches.
df_final.update(df_matched_indexed[['possible_issues', 'matched_issues']])

df_final = df_final.reset_index()


def _merge_issue_lists(row):
    """Union of the API-validated and the GraphQL-linked issue numbers of a row."""
    return list(set(row['matched_issues']) | set(row['linked_issues']))


df_final['issues'] = df_final.apply(_merge_issue_lists, axis=1)

# PRs with at least one associated issue.
prs_com_issue = df_final.loc[df_final['issues'].str.len().gt(0)]
prs_com_issue

In [None]:
# Earlier result set; its 'linked_issues' column is exposed as 'issues'.
v1 = pd.read_csv("fixes_with_issues_linked_or_on_body.csv").rename(
    columns={'linked_issues': 'issues'}
)

In [None]:
# Distinct PR ids on each side of the comparison.
v1_keys = v1[['id']].drop_duplicates()
novo_keys = prs_com_issue[['id']].drop_duplicates()

print(f"Total unique PRs in v1 (Old): {len(v1_keys)}")
print(f"Total unique PRs in prs_com_issue (New): {len(novo_keys)}")
print("-" * 30)

# Outer join on 'id'; the '_merge' indicator column records which side(s)
# each id came from ('left_only' = v1, 'right_only' = new, 'both').
df_diff = v1_keys.merge(novo_keys, on='id', how='outer', indicator=True)

print("Difference Analysis (PR Count):")
contagem = df_diff['_merge'].value_counts()
print(contagem)
print("-" * 30)

# 1. NEW PRs (found in 'new' but NOT in 'v1')
novos_ids = df_diff.loc[df_diff['_merge'].eq('right_only'), 'id']
df_novos_encontrados = prs_com_issue.loc[prs_com_issue['id'].isin(novos_ids)]

# 2. LOST PRs (were in 'v1' but NOT in 'new')
perdidos_ids = df_diff.loc[df_diff['_merge'].eq('left_only'), 'id']
df_perdidos = v1.loc[v1['id'].isin(perdidos_ids)]

# 3. COMMON PRs (present in both)
comuns_ids = df_diff.loc[df_diff['_merge'].eq('both'), 'id']
df_comuns = prs_com_issue.loc[prs_com_issue['id'].isin(comuns_ids)]

In [None]:
# Show the key columns and the issue list of the PRs present in both v1 and the new result.
df_comuns[['id','number','repo_url','issues']]

In [None]:
# Columns that identify a PR row.
colunas_chave = ['id', 'number', 'repo_url']

print("Preparing source DataFrame (aggregating by keys)...")
# One row per key (first 'issues' value of each group), with the column
# renamed so it cannot clash with existing columns during the merge.
df_source = (
    df_comuns.groupby(colunas_chave, as_index=False)['issues']
    .first()
    .rename(columns={'issues': 'common_issues'})
)


# --- STEP 2: Execute Merge (Left Join) ---
# A left join keeps every row of 'df_com_issues_linkadas' and attaches
# 'common_issues' from 'df_source' where the keys match.

print("Executing merge (left join)...")
df_final = df_com_issues_linkadas.merge(df_source, on=colunas_chave, how='left')

# Where there was no match, the new 'common_issues' column will have 'NaN'.


# --- STEP 3: Clean NaNs (fillna with []) ---

def clean_cell_to_list(x):
    """Normalise one cell to a Python list.

    - a list is returned unchanged (same object)
    - a NumPy array is converted with ``tolist()``
    - anything else (NaN, None, ...) becomes ``[]``
    """
    if isinstance(x, list):
        return x
    return x.tolist() if isinstance(x, np.ndarray) else []

print("Cleaning new column (converting NaNs to [])...")
# Rows without a match carry NaN after the left join; turn every cell into a list.
df_final['common_issues'] = df_final['common_issues'].map(clean_cell_to_list)
# Flag rows that ended up with at least one issue.
df_final['has_issues'] = df_final.apply(lambda row: len(row['common_issues']) > 0, axis=1)
# 'common_issues' becomes the final 'issues' column; the intermediate list
# columns are removed.
df_final = df_final.rename(columns={'common_issues': 'issues'}).drop(
    columns=['possible_issues', 'matched_issues', 'linked_issues']
)
# --- Verification ---
print("Operation complete.")
print(f"Final DataFrame size: {len(df_final)}")
print("\nChecking rows that did NOT match (should have '[]'):")
display(df_final[df_final['issues'].apply(len) == 0].head())

print("\nChecking rows that DID match (should have lists):")
df_final[df_final['issues'].apply(len) > 0].head()

In [None]:
# Display the final table (one 'issues' list and a 'has_issues' flag per row).
df_final

In [None]:
# Persist the final table. The path is built with os.path.join because a
# hard-coded backslash is a path separator only on Windows; elsewhere it would
# become part of the file name and the file would land outside output_files/.
df_final.to_parquet(os.path.join('output_files', 'fix_PRs_with_issues.parquet'), index=False)

In [None]:
# Display the PRs present in the new result but absent from v1.
df_novos_encontrados