In [2]:
import asyncio
import json
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import requests
from dotenv import load_dotenv

In [4]:
# Repositories to process; the CSV is semicolon-delimited and its 'URL' column
# is what the cells below iterate over.
df = pd.read_csv("../dataset/dataset_filtrado.csv", delimiter=';')
# Collects one DataFrame per repository; concatenated into a single frame later.
dfs = []

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

token1 = os.getenv('TOKEN1')
token2 = os.getenv('TOKEN2')
token3 = os.getenv('TOKEN3')
token4 = os.getenv('TOKEN4')

# Keep only the tokens that are actually configured: an unset variable yields
# None, which would otherwise be sent as the literal header "Bearer None".
list_tokens = [token for token in (token1, token2, token3, token4) if token]
if not list_tokens:
    # Fail fast instead of issuing requests that cannot authenticate.
    raise RuntimeError(
        "No GitHub token configured: set TOKEN1..TOKEN4 in the environment or in a .env file."
    )

# Position in list_tokens of the token currently used for requests.
index_list_token = 0

# Requests counted against the current token (the counter starts at 1).
api_call = 1

def get_paginated_data(url):
    """Fetch every page of a paginated GitHub API listing into a single list.

    Starting at ``url``, each page is requested with the current token from
    ``list_tokens`` and its items (normalized by ``parse_data``) are appended
    to the result. The next page is taken from the ``rel="next"`` entry of the
    ``Link`` response header; iteration stops when there is none.

    Rate-limit handling relies on the module-level ``api_call`` counter: once
    it reaches 4998 the counter is reset and the next token is selected. After
    the last token has been used, the rotation wraps to the first token and
    the function blocks for one hour before continuing.

    Args:
        url: Address of the first page to request.

    Returns:
        list: Items gathered from all pages fetched successfully. If a request
        returns a status other than 200, the error is printed and the items
        collected so far are returned.
    """
    global api_call
    global index_list_token
    global list_tokens

    data = []
    while url:
        if api_call >= 4998:
            api_call = 1
            index_list_token += 1
            # ``>=`` rather than ``>``: an index equal to len(list_tokens) is
            # already out of range and would raise IndexError below.
            if index_list_token >= len(list_tokens):
                index_list_token = 0
                timeout = 3600
                # Blocking sleep on purpose: asyncio.sleep() merely creates a
                # coroutine and pauses nothing unless it is awaited.
                time.sleep(timeout)

        headers = {
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {list_tokens[index_list_token]}",
            "X-GitHub-Api-Version": "2022-11-28"
        }
        print(api_call, end=", ")
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            response_data = response.json()
            data.extend(parse_data(response_data))

            # The Link header is a comma-separated list of `<url>; rel="..."`
            # entries; keep the URL of the entry tagged rel="next".
            link_header = response.headers.get('Link')
            if link_header and 'rel="next"' in link_header:
                next_url = [url_part.split(";")[0].strip().strip("<>")
                            for url_part in link_header.split(",")
                            if 'rel="next"' in url_part]
                url = next_url[0] if next_url else None
            else:
                url = None
        else:
            print(f"Erro: {response.status_code}, {response.text}")
            break
        api_call = api_call + 1
    return data


def parse_data(response_data):
    """Normalize a decoded API payload into a list of items.

    Args:
        response_data: The JSON-decoded response body.

    Returns:
        list: ``response_data`` itself when it is already a list; for a dict,
        the first value that is a list (a wrapped listing); otherwise an empty
        list. An empty dict or a dict without any list value yields ``[]``.
    """
    if isinstance(response_data, list):
        return response_data
    if isinstance(response_data, dict):
        # Wrapped listings may place scalar metadata (e.g. a total count)
        # before the array, and an empty dict has no first key at all, so look
        # for the first list-valued entry instead of indexing the first key.
        for value in response_data.values():
            if isinstance(value, list):
                return value
    return []


def save_data_to_json(data, filename="data.json"):
    """Write ``data`` to ``filename`` as JSON indented with four spaces.

    The parent directory of ``filename`` is created when it does not exist,
    so saving into a fresh output folder does not fail. An existing file is
    overwritten.

    Args:
        data: Any JSON-serializable object.
        filename: Destination path; defaults to ``data.json`` in the current
            working directory.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty for a bare file name such as the default
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)

In [5]:
# Show the 'URL' column of the dataset frame (the repository addresses the loops below iterate over).
df['URL']

0                     https://github.com/mirumee/saleor
1                 https://github.com/cfpb/hmda-platform
2                https://github.com/golastmile/rasa_nlu
3                      https://github.com/samvera/hyrax
4                https://github.com/mesosphere/marathon
5     https://github.com/openstates/openstates-scrapers
6                https://github.com/letsencrypt/boulder
7              https://github.com/deviantony/docker-elk
8                       https://github.com/akka/alpakka
9         https://github.com/andryyy/mailcow-dockerized
10                  https://github.com/nextcloud/docker
Name: URL, dtype: object

In [None]:
# For every repository URL in the dataset, download its pull-request listing
# from the GitHub API and store it as one JSON file per repository.
for url in df['URL']:

    # "https://github.com/<owner>/<repo>" -> ["<owner>", "<repo>"]
    parts = url.split('https://github.com/')[1].split('/')

    owner = parts[0]
    repo = parts[1]

    # per_page=100 asks for the largest page the endpoint serves, so far fewer
    # requests are spent against the per-token rate limit than with the default.
    # NOTE(review): with no `state` parameter this endpoint lists only open
    # pull requests - confirm that is the intended selection.
    url_final = f"https://api.github.com/repos/{owner}/{repo}/pulls?per_page=100"

    print(url_final)

    data = get_paginated_data(url_final)

    # NOTE(review): the file is prefixed "commits_" although it holds pull
    # requests; the name is kept because other code may read it.
    filename = f"../data/pulls/commits_{owner}_{repo}.json"

    save_data_to_json(data, filename)

    print()  # only to format the output

In [None]:
# Build one DataFrame per repository from its saved closed-issues JSON file,
# enriching the entries that are pull requests with statistics fetched from the
# pull-request endpoint. Each frame is appended to `dfs`.

# Restart the per-token request counter for this pass.
api_call = 1

for url in df['URL']:
    # "https://github.com/<owner>/<repo>" -> ["<owner>", "<repo>"]
    parts = url.split('https://github.com/')[1].split('/')
    owner = parts[0]
    repo = parts[1]

    filename = f"../data/issues/closed_issues_{owner}_{repo}.json"
    print(f"Read file: {filename}")

    with open(filename, 'r', encoding='utf-8') as file:
        issues = json.load(file)

    if issues:
        issues_data = []
        for issue in issues:
            # `or {}` also covers an explicit null "pull_request" value, which
            # a plain .get(..., {}) default would not.
            pull_request_url = (issue.get('pull_request') or {}).get('url')
            if api_call >= 4998:
                api_call = 1
                index_list_token += 1
                # ``>=`` rather than ``>``: an index equal to len(list_tokens)
                # is already out of range and would raise IndexError below.
                if index_list_token >= len(list_tokens):
                    index_list_token = 0
                    timeout = 3600
                    # Blocking sleep on purpose: asyncio.sleep() merely creates
                    # a coroutine and pauses nothing unless it is awaited.
                    time.sleep(timeout)

            headers = {
                "Accept": "application/vnd.github+json",
                "Authorization": f"Bearer {list_tokens[index_list_token]}",
                "X-GitHub-Api-Version": "2022-11-28"
            }
            print(api_call, end=", ")

            # Pull-request statistics stay None for plain issues and for
            # pull requests whose details could not be fetched.
            comments = review_comments = maintainer_can_modify = commits = additions = deletions = changed_files = None

            if pull_request_url:
                is_pull_request = True
                response = requests.get(pull_request_url, headers=headers)
                # Count every request sent; without this the counter never
                # advances and the token rotation above can never trigger.
                api_call += 1
                if response.status_code == 200:
                    pr_data = response.json()
                    comments = pr_data.get('comments', None)
                    review_comments = pr_data.get('review_comments', None)
                    maintainer_can_modify = pr_data.get('maintainer_can_modify', None)
                    commits = pr_data.get('commits', None)
                    additions = pr_data.get('additions', None)
                    deletions = pr_data.get('deletions', None)
                    changed_files = pr_data.get('changed_files', None)
                else:
                    # Report the failure instead of silently leaving the
                    # statistics empty; the row is still recorded.
                    print(f"Erro: {response.status_code}, {response.text}")
            else:
                is_pull_request = False

            issues_data.append({
                'repo_name': repo,
                'issue_number': issue["number"],
                'title': issue['title'],
                'user': issue['user']['login'] if issue.get('user') else None,
                'state': issue['state'],
                'labels': issue.get('labels', []),
                'url': issue['html_url'],
                'is_pull_request': is_pull_request,
                'created_at': issue['created_at'],
                'closed_at': issue.get('closed_at'),
                'comments': issue['comments'],
                'body': issue.get('body', ''),
                'pull_request_url': pull_request_url,
                'pr_comments': comments,
                'pr_review_comments': review_comments,
                'pr_maintainer_can_modify': maintainer_can_modify,
                'pr_commits': commits,
                'pr_additions': additions,
                'pr_deletions': deletions,
                'pr_changed_files': changed_files
            })

        df_issues = pd.DataFrame(issues_data)
        dfs.append(df_issues)

    print("Done!")

In [None]:
# Merge the per-repository frames into one table with a fresh 0..n-1 index.
# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when no repository produced any rows.
df_final = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
