In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import requests
import json
import time

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
diretorio = "../data/pull_request"

if not os.path.exists(diretorio):
    os.makedirs(diretorio)
    print(f"Diretório '{diretorio}' criado com sucesso.")
else:
    print(f"Diretório '{diretorio}' já existe.")

load_dotenv()

token1 = os.getenv('TOKEN1')
token2 = os.getenv('TOKEN2')

tokens = [token1, token2]

index_list_token = 0

df = pd.read_csv("../dataset/dataset_filtrado.csv", delimiter=';')

Diretório '../data/pull_request' já existe.


In [3]:
class TokenManager:
    def __init__(self, tokens):
        self.tokens = tokens
        self.index = 0
        self.rate_limits = [{} for _ in tokens]
        self.current_token = tokens[self.index]

    def get_headers(self):
        return {
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {self.current_token}",
            "X-GitHub-Api-Version": "2022-11-28",
        }

    def update_rate_limit(self, response_headers):
        self.rate_limits[self.index] = {
            "remaining": int(response_headers.get("X-RateLimit-Remaining", 0)),
            "reset": int(response_headers.get("X-RateLimit-Reset", 0)),
        }

    def switch_token(self):
        self.index = (self.index + 1) % len(self.tokens)
        self.current_token = self.tokens[self.index]
        print(f"Switched to token {self.index + 1}")
        self.print_rate_limit()

    def wait_for_rate_limit_reset(self):
        reset_times = [rl.get("reset", int(time.time()) + 3600) for rl in self.rate_limits]
        earliest_reset = min(reset_times)
        sleep_time = max(earliest_reset - int(time.time()), 0) + 10
        print(f"All tokens rate limited. Waiting for {sleep_time} seconds.")
        time.sleep(sleep_time)
        self.index = 0
        self.current_token = self.tokens[self.index]
        print("Resuming with first token after waiting.")
        self.print_rate_limit()

    def print_rate_limit(self):
        remaining = self.rate_limits[self.index].get("remaining", "Unknown")
        reset = self.rate_limits[self.index].get("reset", "Unknown")
        reset_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(reset)) if reset != "Unknown" else "Unknown"
        print(f"Current Token: {self.index + 1} | Remaining Rate Limit: {remaining} | Rate Limit Resets At: {reset_time}")

In [4]:
def get_paginated_data(url, token_manager):
    data = []
    while url:
        headers = token_manager.get_headers()
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            response_data = response.json()
            data.extend(parse_data(response_data))
            link_header = response.headers.get("Link")
            if link_header and 'rel="next"' in link_header:
                next_url = [
                    url_part.split(";")[0].strip().strip("<>")
                    for url_part in link_header.split(",")
                    if 'rel="next"' in url_part
                ]
                url = next_url[0] if next_url else None
            else:
                url = None
            token_manager.update_rate_limit(response.headers)
            token_manager.print_rate_limit()
            if token_manager.rate_limits[token_manager.index].get("remaining", 1) <= 10:
                token_manager.switch_token()
        else:
            if response.status_code == 403 and "rate limit exceeded" in response.text.lower():
                token_manager.update_rate_limit(response.headers)
                token_manager.switch_token()
                if token_manager.index == 0:
                    token_manager.wait_for_rate_limit_reset()
                continue  # Retry with new token
            else:
                print(f"Error: {response.status_code}, {response.text}")
                break
    return data

def parse_data(response_data):
    if isinstance(response_data, list):
        return response_data
    elif isinstance(response_data, dict):
        return [response_data]
    return []

def save_data_to_json(data, filename="data.json"):
    with open(filename, "w") as json_file:
        json.dump(data, json_file, indent=4)

In [None]:
token_manager = TokenManager(tokens)

for url in df["URL"]:
    try:
        parts = url.split("https://github.com/")[1].split("/")
        owner = parts[0]
        repo = parts[1]
    except IndexError:
        print(f"Invalid URL format: {url}")
        continue

    print(f"\nProcessing repository: {owner}/{repo}")

    issues_file = f"../data/issues/closed_issues_{owner}_{repo}.json"
    if os.path.exists(issues_file):
        with open(issues_file, "r") as f:
            try:
                issues_data = json.load(f)
            except json.JSONDecodeError:
                print(f"Corrupted issues file '{issues_file}'. Skipping repository.")
                continue
    else:
        print(f"Issues file for {owner}/{repo} not found.")
        continue  # Skip to next repository

    repo_filename = f"{diretorio}/pull_files_{owner}_{repo}.json"

    if os.path.exists(repo_filename):
        with open(repo_filename, "r") as f:
            try:
                existing_pr_data = json.load(f)
                existing_pull_numbers = {pr["pull_number"] for pr in existing_pr_data}
                print(f"Loaded {len(existing_pull_numbers)} existing PRs from '{repo_filename}'.")
            except json.JSONDecodeError:
                print(f"Corrupted JSON file '{repo_filename}'. Starting fresh.")
                existing_pr_data = []
                existing_pull_numbers = set()
    else:
        existing_pr_data = []
        existing_pull_numbers = set()
        print(f"No existing PR data found for {owner}/{repo}. Starting fresh.")

    for issue in issues_data:
        if "pull_request" in issue:
            pull_number = issue["number"]

            if pull_number in existing_pull_numbers:
                print(f"PR #{pull_number} already scraped. Skipping.")
                continue 

            api_url_files = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull_number}/files"
            print(f"Fetching files for pull request #{pull_number}...")

            pr_files_data = get_paginated_data(api_url_files, token_manager)

            api_url_main = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull_number}"
            print(f"Fetching main data for pull request #{pull_number}...")
            pr_main_data_list = get_paginated_data(api_url_main, token_manager)

            if pr_files_data and pr_main_data_list:
                pr_main_data = pr_main_data_list[0]  # Extract the dictionary from the list

                additions = pr_main_data.get("additions", 0)
                deletions = pr_main_data.get("deletions", 0)
                changed_files = pr_main_data.get("changed_files", 0)

                pr_entry = {
                    "pull_number": pull_number,
                    "additions": additions,
                    "deletions": deletions,
                    "changed_files": changed_files
                }
                print(pr_entry)

                existing_pr_data.append(pr_entry)
                existing_pull_numbers.add(pull_number)
                print(f"Data for pull request #{pull_number} fetched and added.")

                save_data_to_json(existing_pr_data, repo_filename)
                print(f"PR #{pull_number} data saved to '{repo_filename}'.")
            else:
                print(f"No data fetched for pull request #{pull_number}.")

    print(f"Completed processing for repository: {owner}/{repo}")

print("\nAll data fetching completed.")


Processing repository: mirumee/saleor
Loaded 10968 existing PRs from '../data/pull_request/pull_files_mirumee_saleor.json'.
PR #16718 already scraped. Skipping.
PR #16716 already scraped. Skipping.
PR #16705 already scraped. Skipping.
PR #16704 already scraped. Skipping.
PR #16703 already scraped. Skipping.
PR #16702 already scraped. Skipping.
PR #16700 already scraped. Skipping.
PR #16699 already scraped. Skipping.
PR #16698 already scraped. Skipping.
PR #16697 already scraped. Skipping.
PR #16695 already scraped. Skipping.
PR #16694 already scraped. Skipping.
PR #16693 already scraped. Skipping.
PR #16692 already scraped. Skipping.
PR #16691 already scraped. Skipping.
PR #16690 already scraped. Skipping.
PR #16689 already scraped. Skipping.
PR #16688 already scraped. Skipping.
PR #16687 already scraped. Skipping.
PR #16686 already scraped. Skipping.
PR #16685 already scraped. Skipping.
PR #16683 already scraped. Skipping.
PR #16682 already scraped. Skipping.
PR #16681 already scrape