In [1]:
import os
import pandas as pd
import json
from glob import glob
from datetime import datetime

In [2]:
def load_json_data(file_path):
    """Open ``file_path`` as UTF-8 text and return its parsed JSON content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def load_issues(file_path_issues, file_path_prs=None, include_pr_stats=False):
    """Load a JSON dump of GitHub issues into a DataFrame, one row per issue.

    Parameters
    ----------
    file_path_issues : str or path-like
        JSON file containing a list of issue objects. Its base name without
        the extension is stored in the ``repo_name`` column.
    file_path_prs : str or path-like, optional
        JSON file containing a list of pull-request objects, each carrying a
        ``number`` key. It is only read when ``include_pr_stats`` is true, so
        callers that only need issue data can omit it.
    include_pr_stats : bool, default False
        When true, four extra columns (``pr_commits``, ``pr_additions``,
        ``pr_deletions``, ``pr_changed_files``) are appended. Rows whose issue
        has a truthy ``pull_request`` entry and a matching PR number get the
        PR's values (0 for a key the PR object lacks); every other row gets
        ``None``.

    Returns
    -------
    pandas.DataFrame
        Columns ``repo_name``, ``issue_number``, ``title``, ``user``,
        ``state``, ``created_at``, ``closed_at``, ``labels``,
        ``is_pull_request``, ``comments`` and ``body`` (plus the PR columns
        when requested). The columns are present even when the issues file
        holds an empty list.

    Raises
    ------
    KeyError
        If an issue lacks ``number``, ``title``, ``state`` or ``created_at``.
    ValueError
        If a timestamp does not match ``%Y-%m-%dT%H:%M:%SZ``.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"

    # Output column -> key inside a pull-request object.
    pr_columns = {
        "pr_commits": "commits",
        "pr_additions": "additions",
        "pr_deletions": "deletions",
        "pr_changed_files": "changed_files",
    }
    columns = [
        "repo_name",
        "issue_number",
        "title",
        "user",
        "state",
        "created_at",
        "closed_at",
        "labels",
        "is_pull_request",
        "comments",
        "body",
    ]

    # The PR file is parsed only when its statistics are actually requested;
    # otherwise reading and indexing it would be wasted work.
    prs_by_number = {}
    if include_pr_stats:
        columns.extend(pr_columns)
        if file_path_prs is not None:
            prs_by_number = {pr["number"]: pr for pr in load_json_data(file_path_prs)}

    repo_name = os.path.splitext(os.path.basename(file_path_issues))[0]

    rows = []
    for issue in load_json_data(file_path_issues):
        closed_raw = issue.get("closed_at")
        user = issue.get("user")
        row = {
            "repo_name": repo_name,
            "issue_number": issue["number"],
            "title": issue["title"],
            "user": user["login"] if user else None,
            "state": issue["state"],
            "created_at": datetime.strptime(issue["created_at"], time_format),
            # Open issues have no closing timestamp.
            "closed_at": datetime.strptime(closed_raw, time_format) if closed_raw else None,
            "labels": issue.get("labels", []),
            "is_pull_request": "pull_request" in issue,
            "comments": issue.get("comments", 0),
            # The key may be present with a JSON null, which a plain
            # .get(key, "") would pass through as None.
            "body": issue.get("body") or "",
        }
        if include_pr_stats:
            pr = prs_by_number.get(issue["number"]) if issue.get("pull_request") else None
            for column, key in pr_columns.items():
                row[column] = pr.get(key, 0) if pr else None
        rows.append(row)

    # Passing the column list keeps the schema stable for an empty issues file.
    return pd.DataFrame(rows, columns=columns)


In [4]:
# Semicolon-separated dataset file.
df = pd.read_csv(
    "../dataset/dataset_filtrado.csv",
    sep=";",
)

In [None]:
# For each repository URL in the dataset, derive the owner and repository
# name, build the path of that repository's JSON dump and pass it to
# load_issues.
for url in df['URL']:
    # The text after the 'https://github.com/' prefix is split on '/'; the
    # first two pieces are taken as owner and repository. A URL without that
    # prefix raises IndexError here.
    parts = url.split('https://github.com/')[1].split('/')
    owner = parts[0]
    repo = parts[1]

    # NOTE(review): this path points into the commits folder, while
    # load_issues reads issue fields such as "title" and "state" — confirm
    # this is the intended input file.
    filename = f"../data/commits/commits_{owner}_{repo}.json"

    # NOTE(review): only one path is passed (no pull-request file) and the
    # returned DataFrame is discarded — confirm both are intended.
    load_issues(filename)    

    print() # only to format the output