# Setup

In [1]:
from ipywidgets import FileUpload, interact

import requests
import pandas as pd

# para coletar dados adicionais
from urllib.parse import urlparse, parse_qs

# Autenticação na API do GitHub

In [None]:
@interact(files=FileUpload())
def set_token(files={}):
    """Read a GitHub token from an uploaded file and store it in ``GITHUB_TOKEN``.

    Args:
        files: Value of the ``FileUpload`` widget. Two shapes are handled:
            a tuple of upload records (Jupyter Notebook) or a dict of upload
            records (Google Colab). Only the first uploaded file is used.

    Raises:
        TypeError: If ``files`` is non-empty but is neither a tuple nor a dict.

    Side effects:
        Sets the module-level ``GITHUB_TOKEN`` to the stripped UTF-8 content of
        the file and prints a confirmation. Does nothing while ``files`` is empty.
    """
    global GITHUB_TOKEN
    if not files:
        return

    if isinstance(files, tuple):  # Jupyter Notebook
        # The content is converted with .tobytes() before decoding.
        raw_content = files[0]['content'].tobytes()
    elif isinstance(files, dict):  # Google Colab
        # The content is decoded directly, without .tobytes().
        raw_content = next(iter(files.values()))['content']
    else:
        # Never report success when nothing could be read.
        raise TypeError(
            "Unsupported FileUpload value type: {}".format(type(files).__name__)
        )

    # Both branches must end up in the global; the tuple branch used to assign
    # a throw-away local, leaving GITHUB_TOKEN undefined.
    GITHUB_TOKEN = raw_content.decode("utf-8").strip()
    print("Token Loaded!")

interactive(children=(FileUpload(value={}, description='Upload'), Output()), _dom_classes=('widget-interact',)…

In [None]:
# Base URL of the GitHub REST API.
SITE = "https://api.github.com"


def token_auth(request):
    """Add the GitHub authentication headers to an outgoing request.

    Meant to be passed as the ``auth=`` callable of ``requests``.

    Args:
        request: The request object being prepared; its ``headers`` mapping
            is modified in place.

    Returns:
        The same ``request`` object, with ``User-Agent`` and ``Authorization`` set.
    """
    headers = request.headers
    headers["User-Agent"] = "MetodosPesquisa"  # Required
    headers["Authorization"] = "token {}".format(GITHUB_TOKEN)
    return request


# Smoke test: hit the API root with the token and display the status code.
response = requests.get(SITE, auth=token_auth)
response.status_code

200

# I - Leitura dos dados

In [2]:
# Load both input tables (forked repos first, then common repos).
forked_repos_df, common_repos_df = map(
    pd.read_csv,
    ('/content/forked_repos.csv', '/content/common_repos.csv'),
)
# Display the (rows, columns) shape of each table.
forked_repos_df.shape, common_repos_df.shape

((1, 259), (174, 64))

# II - Coletando dados adicionais

## A) Coletando número de commits

In [None]:
def get_github_repo_commit_count(owner_name: str, repo_name: str, timeout: float = 30.0) -> int:
    """Return the number of commits listed by the GitHub commits endpoint for a repository.

    The endpoint is queried with ``per_page=1``, so the page number found in the
    ``last`` pagination link equals the number of commits. When the response has
    no ``last`` link, everything fits on one page and the length of the returned
    JSON list is the count.

    Args:
        owner_name: Repository owner (user or organization login).
        repo_name: Repository name.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The commit count as an integer.

    Raises:
        requests.HTTPError: If the API answers with an error status, or with any
            status other than 200 (previously such responses silently returned
            ``None``).
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://api.github.com/repos/{owner_name}/{repo_name}/commits?per_page=1"
    response = requests.get(url, auth=token_auth, timeout=timeout)

    # Fail loudly on 4xx/5xx first ...
    response.raise_for_status()
    # ... and also on unexpected non-error statuses, instead of returning None.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Unexpected status code {response.status_code} for {url}",
            response=response,
        )

    last_link = response.links.get('last')
    if last_link is None:
        # Single page: the payload itself holds every commit.
        return len(response.json())

    # One commit per page, so the index of the last page is the total.
    query_params = parse_qs(urlparse(last_link['url']).query)
    return int(query_params['page'][0])

In [None]:
# Build one commit count per row of common_repos_df, in row order.
commits_count = []
for _, row in common_repos_df.iterrows():
    # Reuse a count that is already present (e.g. when this cell is re-run after
    # the column was added). On a freshly loaded frame the column does not exist
    # yet; row.get() returns None then instead of raising KeyError, which used to
    # be swallowed below and turned every entry into None.
    known_count = row.get("commits_count")
    if pd.notna(known_count) and known_count > 0:
        commits_count.append(known_count)
        continue

    owner_name, repo_name = row["full_name"].split("/")
    try:
        total_commits = get_github_repo_commit_count(owner_name, repo_name)
        commits_count.append(total_commits)
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the list aligned
        # with the DataFrame rows by recording a missing value.
        print(f"Erro: {e}")
        commits_count.append(None)

In [None]:
# Store the collected counts as a column; the list was built by iterating the
# rows of common_repos_df in order, so positions line up with the rows.
common_repos_df["commits_count"] = commits_count

In [None]:
# Sanity check: print the (rows, columns) shape after adding the column.
print(common_repos_df.shape)
# Optional checkpoint of the intermediate result (left disabled):
# common_repos_df.to_csv("common_repos_df_commits.csv", index=False)

(174, 65)


In [None]:
# Number of repositories per distinct commit count, ordered by the count value.
common_repos_df["commits_count"].value_counts().sort_index()

Unnamed: 0_level_0,count
commits_count,Unnamed: 1_level_1
4.0,5
7.0,3
8.0,2
9.0,2
10.0,5
...,...
67957.0,1
666593.0,1
702054.0,1
702056.0,1


## B) Coletando número de contribuidores

In [None]:
# Build one contributor count per row of common_repos_df, in row order.
contr_count = []
for _, row in common_repos_df.iterrows():
    try:
        # Ask for one contributor per page: the page number of the "last"
        # pagination link is then the total. Counting the default first page
        # (as before) caps the result at the API page size.
        repo_response_raw = requests.get(
            row["contributors_url"],
            params={"per_page": 1},
            auth=token_auth,
            timeout=30,
        )
        # Error payloads are JSON objects; without this check their number of
        # keys would be recorded as a contributor count.
        repo_response_raw.raise_for_status()

        last_link = repo_response_raw.links.get("last")
        if last_link is not None:
            query_params = parse_qs(urlparse(last_link["url"]).query)
            total_contributors = int(query_params["page"][0])
        elif repo_response_raw.status_code == 204 or not repo_response_raw.content:
            # No body to parse (the API documents 204 for an empty repository).
            total_contributors = 0
        else:
            # Single page: the payload itself lists every contributor.
            total_contributors = len(repo_response_raw.json())
    except Exception as e:
        # Best-effort, consistent with the commit-count loop: report the failure
        # and keep the list aligned with the DataFrame rows.
        print(f"Erro: {e}")
        total_contributors = None
    contr_count.append(total_contributors)

In [None]:
# Store the collected contributor counts as a column; the list was built by
# iterating the rows of common_repos_df in order.
common_repos_df["contr_count"] = contr_count

# Sanity check: print the (rows, columns) shape after adding the column.
print(common_repos_df.shape)
# Optional checkpoint of the intermediate result (left disabled):
# common_repos_df.to_csv("common_repos_df_commits_v2.csv", index=False)

(174, 66)


## C) Coletando número de downloads [NOT WORKING]

In [None]:
# downloads_count = []
# for i, row in common_repos_df.iterrows():
#     repo_response_raw = requests.get(row["downloads_url"], auth=token_auth)
#     response_data = repo_response_raw.json()
#     downloads_count.append(len(response_data))

In [None]:
# downloads_count = []
# for i, row in common_repos_df.iterrows():
#     owner_name, repo_name = row["full_name"].split("/")

#     url = f"https://api.github.com/repos/{owner_name}/{repo_name}/releases"
#     response = requests.get(url, auth=token_auth)
#     response.raise_for_status()
#     releases_data = response.json()

#     total_downloads = 0
#     for release in releases_data:
#         # print(f"Release: {release.get('name') or release.get('tag_name')}")
#         for asset in release.get('assets', []):
#             # print(f"  Asset: {asset.get('name')}, Downloads: {asset.get('download_count')}")
#             total_downloads += asset.get('download_count', 0)
#     downloads_count.append(total_downloads)
#     # print(f"\nTotal downloads across all releases: {total_downloads}")

In [None]:
# common_repos_df["downloads_count"] = downloads_count

# III - Salvando resultados

In [None]:
# Write the enriched table to CSV without the DataFrame index.
common_repos_df.to_csv("common_repos_increased.csv", index=False) # presumably replaces the earlier "common_repos_df_commits_v2.csv" checkpoint name - TODO confirm