In [None]:
import ast
import glob
import json
import os
from urllib.parse import urlparse

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
# Read the GitHub token from the GITHUB_TOKEN environment variable so the real
# credential never has to be saved inside the notebook. The placeholder is kept
# as the fallback value, so editing it by hand still works as before.
token = os.environ.get("GITHUB_TOKEN", "INSERT_TOKEN")
# Authorization header passed to every GitHub API request made by this notebook.
headers = {"Authorization": f"token {token}"}

In [None]:
# Read the parquet file written by the previous step.
# The resulting frame is the table the later cells iterate over and extend.
all_fix_prs_with_issues = pd.read_parquet(
    'output_files/fix_PRs_with_issues.parquet',
)

In [None]:
def extract_owner_repo(url):
    """Return the ``(owner, repo)`` pair encoded in a GitHub repository URL.

    Accepted forms (surrounding whitespace and trailing slashes are ignored):

    * ``https://github.com/<owner>/<repo>[/...]``
    * ``https://api.github.com/repos/<owner>/<repo>[/...]``
    * the same with ``http://``, a ``www.`` host, or no scheme at all
    * the bare ``<owner>/<repo>`` shorthand

    A trailing ``.git`` on the repository name is removed, and any query
    string or fragment is ignored.

    Args:
        url: Repository URL (or ``owner/repo`` shorthand) as a string.

    Returns:
        Tuple ``(owner, repo)``.

    Raises:
        ValueError: If fewer than two path segments remain, i.e. the input
            does not contain both an owner and a repository name.
    """
    cleaned = url.strip()

    if "://" in cleaned:
        # Let urlparse separate host and path; this also drops ?query/#fragment.
        parsed = urlparse(cleaned)
        host = parsed.hostname or ""
        segments = [part for part in parsed.path.split("/") if part]
    else:
        # No scheme: either "github.com/owner/repo" or the "owner/repo" shorthand.
        host = ""
        segments = [part for part in cleaned.split("/") if part]
        if segments and segments[0].lower() in ("github.com", "www.github.com", "api.github.com"):
            host = segments.pop(0).lower()

    # REST API URLs carry a leading "repos" segment before the owner.
    if host == "api.github.com" and segments[:1] == ["repos"]:
        segments = segments[1:]

    if len(segments) < 2:
        raise ValueError(f"Cannot extract owner/repo from URL: {url!r}")

    owner, repo = segments[0], segments[1]
    if repo.endswith(".git"):
        # Clone URLs end in ".git", which is not part of the repository name.
        repo = repo[: -len(".git")]
    return owner, repo

In [None]:
# --- Function to fetch modified files (with status and patch)
def get_pr_files_detailed(owner, repo, pr_number, headers, timeout=30):
    """Fetch the files changed by a pull request from the GitHub REST API.

    Pages through ``GET /repos/{owner}/{repo}/pulls/{pr_number}/files`` with
    100 entries per page and follows the ``Link: rel="next"`` response header,
    so no extra request is spent on a trailing empty page.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        pr_number: Pull request number.
        headers: HTTP headers sent with every request (e.g. Authorization).
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list with one dict per file, holding the keys ``filename``,
        ``status``, ``additions``, ``deletions``, ``changes`` and ``patch``.
        A key missing from the API response is stored as ``None``.
        If a page answers with a non-200 status, a message is printed and the
        files collected so far (possibly none) are returned.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    fields = ("filename", "status", "additions", "deletions", "changes", "patch")
    files_info = []
    page = 1
    while True:
        url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files?page={page}&per_page=100"
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"‚ö†Ô∏è Error {response.status_code} in PR {pr_number} ({owner}/{repo})")
            break
        data = response.json()
        if not data:
            break

        # Keep only the fields of interest, in a fixed key order.
        files_info.extend({key: entry.get(key) for key in fields} for entry in data)

        # GitHub advertises further pages through the Link header; when there
        # is no rel="next" entry this was the last page.
        if "next" not in response.links:
            break
        page += 1
    return files_info

In [None]:
def safe_json_load(x):
    """Turn a parquet cell back into a Python list, tolerating bad input.

    Handling order:

    1. A value that is already a list/tuple, or a list-like object exposing
       ``tolist()`` (e.g. a numpy array), is returned as a plain list.
       Calling ``pd.isna`` on such a value yields an array, which cannot be
       used in an ``if`` test.
    2. A missing value (``None``, ``NaN``, ``pd.NA``) becomes ``[]``.
    3. A JSON string is decoded with ``json.loads`` and returned as parsed.
    4. Otherwise the value is tried as a Python literal (e.g. ``"['a', 'b']"``);
       the result is kept only if it is a list.
    5. Anything else prints a warning and becomes ``[]``.

    Args:
        x: The raw cell value.

    Returns:
        The decoded value (a list for well-formed data), or ``[]`` when the
        cell is missing or cannot be decoded.
    """
    # Already-deserialized containers: nothing to decode.
    if isinstance(x, (list, tuple)):
        return list(x)
    if pd.api.types.is_list_like(x) and hasattr(x, "tolist"):
        return list(x.tolist())

    if pd.isna(x):
        return []  # Empty/NaN cell

    try:
        # Standard format: the column was written with json.dumps
        return json.loads(x)
    except (json.JSONDecodeError, TypeError):
        pass

    try:
        # Fallback: the cell holds a Python literal instead of JSON
        result = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        # Neither JSON nor a literal: invalid data
        print(f"Warning: Failed to decode JSON/literal, treating as empty: {str(x)[:50]}...")
        return []

    # Only a list is an acceptable literal result
    return result if isinstance(result, list) else []

In [None]:
# --- Run settings ---
final_output_file = "output_files/prs_with_files_FINAL.parquet"
partial_file_prefix = "output_files/partial/prs_with_files_partial_"
final_column_name = "modified_files"  # column that receives the per-PR file list
save_interval = 50                    # number of processed rows between two partial saves

# --- Resume state read by the following cells ---
results, start_index = [], 0
df_final_found = False  # set to True once the final file has been loaded

# ---------------------------------------------------
# Step 0: reuse the final output of an earlier run, if there is one
# ---------------------------------------------------
if os.path.exists(final_output_file):
    print(f"üéâ Found final file: {final_output_file}")
    print("Loading final results and skipping loop...")

    try:
        df_complete = pd.read_parquet(final_output_file)

        # The result column is stored as JSON text; turn each cell back into a list
        df_complete[final_column_name] = df_complete[final_column_name].apply(safe_json_load)

        # Align the resume state with the loaded frame so the fetch loop has nothing left to do
        results, start_index = df_complete[final_column_name].tolist(), len(df_complete)

        # From here on the loaded frame replaces the input frame
        all_fix_prs_with_issues, df_final_found = df_complete, True

        print(f"‚úÖ Final results loaded. {len(results)} rows.")

    except Exception as e:
        # Best effort: an unreadable final file only means the partial-resume path is used
        print(f"‚ö†Ô∏è Error loading final file {final_output_file}: {e}")
        print("Ignoring and continuing with partial resume logic...")

In [None]:
# ===================================================
# 1. SEARCH FOR PARTIAL FILE (if final was not loaded)
# ===================================================
if not df_final_found:  # Only runs if final load failed or didn't exist
    # Newest checkpoint first. Older ones are kept as fallbacks so that a single
    # unreadable file (e.g. an interrupted write) does not throw away all progress.
    partial_files = sorted(
        glob.glob(f"{partial_file_prefix}*.parquet"),
        key=os.path.getmtime,
        reverse=True,
    )

    if not partial_files:
        print("No partial save found. Starting from scratch.")

    resumed = False
    for latest_partial_file in partial_files:
        print(f"Found partial save: {latest_partial_file}")

        try:
            df_partial = pd.read_parquet(latest_partial_file)

            # Deserialize the JSON-encoded result column
            loaded_results = df_partial[final_column_name].apply(safe_json_load).tolist()

            # A checkpoint with more rows than the input cannot belong to this input
            if len(loaded_results) > len(all_fix_prs_with_issues):
                raise ValueError(
                    f"partial save has {len(loaded_results)} rows but the input has "
                    f"only {len(all_fix_prs_with_issues)}"
                )
        except Exception as e:
            print(f"Error loading or processing {latest_partial_file}: {e}. Trying an older partial save...")
            continue

        # Only commit the resume state once the checkpoint loaded completely
        results = loaded_results
        start_index = len(results)

        print(f"Loaded {start_index} previous results (from parquet).")
        print(f"Resuming process from index {start_index}...")
        resumed = True
        break

    if partial_files and not resumed:
        # Every checkpoint failed to load: fall back to a clean start
        print("No usable partial save. Starting from scratch.")
        results = []
        start_index = 0

In [None]:
# Make sure the checkpoint directory exists before any API call is made:
# to_parquet does not create missing parent directories, so without this the
# first partial save could fail after `save_interval` PRs were already fetched.
partial_dir = os.path.dirname(partial_file_prefix)
if partial_dir:
    os.makedirs(partial_dir, exist_ok=True)

# Iterate only over the rows that do not have a result yet
df_remaining = all_fix_prs_with_issues.iloc[start_index:]

print(f"Processing {len(df_remaining)} remaining PRs (from {start_index} to {len(all_fix_prs_with_issues)})...")

# `start=start_index` keeps `i` equal to the row position in the full frame,
# so `i + 1` is always the number of entries in `results`.
for i, (_, row) in enumerate(tqdm(df_remaining.iterrows(), total=len(df_remaining)), start=start_index):
    owner, repo = extract_owner_repo(row["repo_url"])
    pr_number = row["number"]

    try:
        files = get_pr_files_detailed(owner, repo, pr_number, headers)
    except Exception as e:
        # Best effort: a failing PR is recorded as "no files" and the run goes on
        print(f"Error in PR {pr_number} ({owner}/{repo}): {e}")
        files = []  # Error value must be an empty list

    # Results stay plain Python lists in memory; they are serialized only when saving
    results.append(files)

    # Partial save every `save_interval` processed rows
    if (i + 1) % save_interval == 0:
        print(f"\nStarting partial save (processed {i+1})...")

        # Rows processed so far, with the result column stored as JSON strings
        # (the format the resume cells decode with safe_json_load)
        df_partial = all_fix_prs_with_issues.iloc[:i+1].copy()
        df_partial[final_column_name] = [json.dumps(file_list) for file_list in results]

        partial_path = f"{partial_file_prefix}{i+1}.parquet"
        df_partial.to_parquet(partial_path, index=False)
        print(f"‚úÖ Partial save: {partial_path}")


# --- Final save
print("\nLoop finished. Preparing final save...")

# Attach all results, serialized to JSON strings exactly like the partial saves.
# A length mismatch between `results` and the frame raises here.
all_fix_prs_with_issues[final_column_name] = [json.dumps(file_list) for file_list in results]

all_fix_prs_with_issues.to_parquet(final_output_file, index=False)
print(f"‚úÖ Final save complete: {final_output_file}")

In [None]:
# ===================================================
# CLEANUP: delete the partial checkpoint files
# ===================================================
print(f"\nCleaning partial files with prefix: '{partial_file_prefix}'...")
try:
    # Checkpoints are written as '<prefix><count>.parquet', so one glob matches them all
    pattern_to_clean = partial_file_prefix + "*.parquet"
    files_to_delete = glob.glob(pattern_to_clean)

    if files_to_delete:
        for f in files_to_delete:
            os.remove(f)
        print(f"‚úÖ {len(files_to_delete)} partial files removed.")
    else:
        print("No partial files to clean.")

except Exception as e:
    # Cleanup is best effort: a failure is reported but does not stop the notebook
    print(f"‚ö†Ô∏è Error during partial file cleanup: {e}")
# ===================================================
# END OF CLEANUP
# ===================================================

print("\nProcess finished.")

In [None]:
# 460 human  (NOTE(review): original author's note; its meaning is not evident from this notebook)
# Export the table, including the JSON-serialized modified-files column.
# The path uses forward slashes like every other path in this notebook: a
# backslash is a directory separator only on Windows, elsewhere it would create
# a file literally named 'output_files\...' in the working directory.
# `index=False` is deliberately NOT passed: the author noted that it
# "may generate errors in modified files".
all_fix_prs_with_issues.to_parquet('output_files/fix_prs_with_issues_and_files.parquet')
all_fix_prs_with_issues