In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import json
from token_manager import TokenManager
from scraping_utils import get_paginated_data, parse_data, save_data_to_json

In [24]:
# --- Configuration: output folder, GitHub API tokens, input dataset ---

# Folder that receives the pull-request JSON files.
directory = "../../data/pull_request"

if not os.path.exists(directory):
    # exist_ok=True keeps this safe if the folder appears between the check
    # above and this call (e.g. another process creating it concurrently).
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' created successfully.")
else:
    print(f"Directory '{directory}' already exists.")

# Load environment variables (python-dotenv reads them from a .env file).
load_dotenv()

token1 = os.getenv('TOKEN1')
token2 = os.getenv('TOKEN2')

# os.getenv returns None for an unset variable. Report which ones are missing
# instead of silently passing None entries on as if they were credentials.
missing_token_vars = [
    name for name, value in (("TOKEN1", token1), ("TOKEN2", token2)) if not value
]
if missing_token_vars:
    print(f"Warning: environment variable(s) not set: {', '.join(missing_token_vars)}.")

# Keep only the tokens that are actually configured.
tokens = [token for token in (token1, token2) if token]
if not tokens:
    raise RuntimeError(
        "No GitHub token configured: set TOKEN1 and/or TOKEN2 in the environment or in a .env file."
    )

# Not referenced by the code visible in this notebook; kept in case other
# cells rely on the name.
index_list_token = 0

# The filtered dataset is a semicolon-delimited CSV.
df = pd.read_csv("../../dataset/dataset_filtrado.csv", delimiter=';')

Directory '../../data/pull_request' already exists.


In [None]:
# Collect size statistics (additions, deletions, changed files) for every pull
# request found in each repository's closed-issues file. Results are written to
# one JSON file per repository after every new PR, so an interrupted run can be
# resumed without re-fetching PRs that were already saved.

# Initialize the TokenManager
token_manager = TokenManager(tokens)

# Read the filtered dataset
df = pd.read_csv("../../dataset/dataset_filtrado.csv", delimiter=';')

# Iterate over the repository URLs in the dataset
for url in df["URL"]:
    try:
        # Parse the URL to get owner and repo
        parts = url.split("https://github.com/")[1].split("/")
        owner = parts[0]
        repo = parts[1]
    except (IndexError, AttributeError):
        # IndexError: the GitHub prefix or the repository segment is missing.
        # AttributeError: the value is not a string (an empty CSV field is
        # read by pandas as a float NaN, which has no .split method).
        print(f"Invalid URL format: {url}")
        continue

    print(f"\nProcessing repository: {owner}/{repo}")

    # The issues file is the source of PR numbers; without it there is
    # nothing to process for this repository.
    issues_file = f"../../data/issues/closed_issues_{owner}_{repo}.json"
    if not os.path.exists(issues_file):
        print(f"Issues file for {owner}/{repo} not found.")
        continue  # Skip to next repository

    with open(issues_file, "r") as f:
        try:
            issues_data = json.load(f)
        except json.JSONDecodeError:
            print(f"Corrupted issues file '{issues_file}'. Skipping repository.")
            continue

    # Load previously saved PR entries (if any) so they are not fetched again.
    repo_filename = f"{directory}/pull_files_{owner}_{repo}.json"
    existing_pr_data = []
    existing_pull_numbers = set()
    if os.path.exists(repo_filename):
        with open(repo_filename, "r") as f:
            try:
                loaded_pr_data = json.load(f)
                if not isinstance(loaded_pr_data, list):
                    raise TypeError("expected a JSON list of pull request entries")
                loaded_pull_numbers = {pr["pull_number"] for pr in loaded_pr_data}
            except (json.JSONDecodeError, KeyError, TypeError):
                # Unparseable JSON, or parseable JSON that is not a list of
                # entries with a "pull_number" key: either way it cannot be
                # resumed from, so start over for this repository.
                print(f"Corrupted JSON file '{repo_filename}'. Starting fresh.")
            else:
                existing_pr_data = loaded_pr_data
                existing_pull_numbers = loaded_pull_numbers
                print(f"Loaded {len(existing_pull_numbers)} existing PRs from '{repo_filename}'.")
    else:
        print(f"No existing PR data found for {owner}/{repo}. Starting fresh.")

    # Iterate through the issues data and fetch pull request data
    for issue in issues_data:
        # Only entries carrying a "pull_request" key are treated as PRs.
        if "pull_request" not in issue:
            continue

        pull_number = issue["number"]

        if pull_number in existing_pull_numbers:
            print(f"PR #{pull_number} already scraped. Skipping.")
            continue

        # Fetch pull request files data
        # NOTE(review): the file list is only used as a gate below; none of
        # its content ends up in the saved entry. Confirm whether it should.
        api_url_files = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull_number}/files"
        print(f"Fetching files for pull request #{pull_number}...")
        pr_files_data = get_paginated_data(api_url_files, token_manager)

        if not pr_files_data:
            # The entry is only recorded when both requests return data, so
            # skip the second request when the first one already came back empty.
            print(f"No data fetched for pull request #{pull_number}.")
            continue

        # Fetch main pull request data
        api_url_main = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull_number}"
        print(f"Fetching main data for pull request #{pull_number}...")
        pr_main_data_list = get_paginated_data(api_url_main, token_manager)

        if not pr_main_data_list:
            print(f"No data fetched for pull request #{pull_number}.")
            continue

        # Presumably get_paginated_data wraps the single PR object in a list;
        # verify against scraping_utils.
        pr_main_data = pr_main_data_list[0]  # Extract the dictionary from the list

        # Prepare the pull request data entry (missing counters default to 0)
        pr_entry = {
            "pull_number": pull_number,
            "additions": pr_main_data.get("additions", 0),
            "deletions": pr_main_data.get("deletions", 0),
            "changed_files": pr_main_data.get("changed_files", 0),
        }
        print(pr_entry)

        existing_pr_data.append(pr_entry)
        existing_pull_numbers.add(pull_number)
        print(f"Data for pull request #{pull_number} fetched and added.")

        # Save after every PR so progress survives an interruption.
        save_data_to_json(existing_pr_data, repo_filename)
        print(f"PR #{pull_number} data saved to '{repo_filename}'.")

    print(f"Completed processing for repository: {owner}/{repo}")

print("\nAll data fetching completed.")