## Importações

In [2]:
import os
import csv
import requests
try:
    from github import Github
except:
    !pip install PyGithub
    from github import Github
from pprint import pprint
import pandas as pd
import math

## Preparação

In [3]:
# Columns collected for every repository, in the order they appear in the CSV.
# Candidates that are currently disabled: has_issues, commits_count,
# last_year_commits_count, top_5_contributors.
fieldnames = [
    # identity
    'id', 'full_name', 'name', 'owner',
    # descriptive metadata
    'description', 'topics',
    # popularity and forks
    'stargazers_count', 'subscribers_count', 'fork', 'forks_count',
    # languages
    'language', 'languages',
    # activity
    'open_issues_count', 'contributors_count',
    # timestamps
    'pushed_at', 'created_at', 'updated_at',
]

# The PyGithub library is used instead of a web crawler because it makes it
# easier to obtain the repository attributes.
# However, the GitHub API has a limit of 5000 calls per hour for authenticated
# accounts, so several searches spaced hours apart are needed to gather data
# for thousands of repositories, given that each repository results in
# multiple API calls.
# Another important point: GitHub returns only 1000 results per search, so
# getting >1000 repositories requires multiple searches.

def _read_token(path):
    """Return the access token stored in `path`, stripped of surrounding whitespace."""
    # `with` closes the file deterministically; strip() drops the trailing
    # newline most editors append, which must not become part of the token.
    with open(path, 'r') as token_file:
        return token_file.read().strip()

# Instantiate the GitHub API wrapper using an access token.
# Two access tokens, one for each search.
key1 = _read_token('key1.txt')
key2 = _read_token('key2.txt')

# Alternate client built from the first token:
# g = Github(key1)
g = Github(key2)

## Realizar pesquisas na API

### Preparação

In [3]:
def search_api(query, github):
    """
    Run a repository search on the GitHub API and flatten each hit into a dict.

    Args:
        query: search expression forwarded to ``github.search_repositories``.
        github: client object exposing ``search_repositories(query=...)``;
            this notebook passes a PyGithub ``Github`` instance.

    Returns:
        A list with one dict per repository, in the order the search yields
        them. Each dict has the keys id, full_name, name, owner, description,
        topics, stargazers_count, subscribers_count, fork, forks_count,
        language, languages, open_issues_count, contributors_count,
        pushed_at, created_at and updated_at. Topics and language names are
        stored as ", "-joined strings.

    A progress line is printed after every repository.
    """

    results = github.search_repositories(query=query)

    print(f'{results.totalCount} repositórios encontrados para "{query}".')
    print('iniciando construção da lista de resultados')

    results_dicts = []
    for n, repo in enumerate(results, start=1):
        # Extra per-repository lookups (presumably one API request each, so
        # they count against the rate limit — TODO confirm).
        topics = repo.get_topics()
        contributors = repo.get_contributors()
        languages = repo.get_languages()

        results_dicts.append({
            'id': repo.id,
            'full_name': repo.full_name,
            'name': repo.name,
            'owner': repo.owner.login,
            'description': repo.description,
            'topics': ", ".join(topics),
            'stargazers_count': repo.stargazers_count,
            'subscribers_count': repo.subscribers_count,
            'fork': repo.fork,
            'forks_count': repo.forks_count,
            'language': repo.language,
            # only the language names are kept, not the values mapped to them
            'languages': ", ".join(languages.keys()),
            'open_issues_count': repo.open_issues_count,
            'contributors_count': contributors.totalCount,
            'pushed_at': repo.pushed_at,
            'created_at': repo.created_at,
            'updated_at': repo.updated_at,
        })

        total = results.totalCount
        # Integer arithmetic on purpose: math.floor((n / total) * 100) reports
        # one point too low whenever the float product lands just below a
        # whole number (e.g. 29/100 -> 28.999999999999996 -> 28%).
        percent = n * 100 // total
        print(f'{n}/{total}: {repo.full_name} ({percent}%)')

    print('finalizada construção dos resultados')
    return results_dicts

def write_csv_from_results(filename, dict_list, fieldnames, csv):
    """
    Write a list of dicts to a CSV file, preceded by a header row.

    Args:
        filename: path of the file to create; it is opened in 'w' mode, so an
            existing file is overwritten.
        dict_list: iterable of row dicts keyed by column name.
        fieldnames: column names, in output order.
        csv: the ``csv`` module (or any object providing ``DictWriter`` and
            ``QUOTE_MINIMAL``). Note that it shadows the module name inside
            this function.

    Prints a confirmation line once the file has been written.
    """

    # newline='' hands line-ending control to the csv writer, as the csv
    # module documentation requires; without it, platforms that translate
    # '\n' on write (Windows) produce '\r\r\n' row endings, i.e. blank rows.
    with open(filename, mode='w', encoding="utf-8", newline='') as data_api:
        writer = csv.DictWriter(
            data_api,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
            fieldnames=fieldnames,
        )
        writer.writeheader()
        writer.writerows(dict_list)
    print(f'escrito {filename}')

### Como serão feitas várias chamadas à API, é importante usar filtros para diminuir o número de chamadas
### Também é melhor separar o processo em múltiplas etapas; assim, se o limite for alcançado, podemos fazer outra pesquisa depois e juntar os resultados
### De qualquer forma, como o GitHub retorna no máximo 1000 resultados por pesquisa, serão necessárias múltiplas pesquisas

In [9]:
%%time

# First search: repositories matching "blockchain" with 50 to 120 stars.
results1 = search_api(query='blockchain stars:50..120', github=g)

# Persist this batch on its own so it can be merged with the other search later.
write_csv_from_results(
    filename='data-api-1.csv',
    dict_list=results1,
    fieldnames=fieldnames,
    csv=csv,
)

: Ashlar/cryptometa (23%)
158/668: XTRABYTES/XTRABYTES (23%)
159/668: ontio/ontology-ts-sdk (23%)
160/668: Ice-Storm/structure-and-interpretation-of-blockchain (23%)
161/668: imfly/blockchain-on-nodejs (24%)
162/668: simpleapples/py-blockchain-cli (24%)
163/668: web3labs/blk-explorer-free (24%)
164/668: cynthiablee/blockchain-to-spreadsheet (24%)
165/668: Turing-Chain/Honeypots-on-Blockchain (24%)
166/668: holgern/beem (24%)
167/668: blockchain-certificates/cert-viewer (25%)
168/668: hyperledger-archives/iroha-android (25%)
169/668: mailchain/mailchain (25%)
170/668: airalab/robonomics_contracts (25%)
171/668: saber-hq/stable-swap-program (25%)
172/668: hitripod/awesome-blockchain (25%)
173/668: tronprotocol/tron-contracts (25%)
174/668: blockchainedindia/resources (26%)
175/668: GetScatter/eos-sharp (26%)
176/668: ArkEcosystem/explorer (26%)
177/668: CoinAlpha/fund-protocol (26%)
178/668: input-output-hk/symphony-2 (26%)
179/668: steemit/hivemind (26%)
180/668: CortexFoundation/Cortex

In [4]:
%%time

# Second search: repositories matching "blockchain" with more than 120 stars.
results2 = search_api(query='blockchain stars:>120', github=g)

# Persist this batch on its own so it can be merged with the other search later.
write_csv_from_results(
    filename='data-api-2.csv',
    dict_list=results2,
    fieldnames=fieldnames,
    csv=csv,
)

rnity/protocol (30%)
230/754: cosme12/SimpleCoin (30%)
231/754: inoutcode/ethereum_book (30%)
232/754: U-Network/UNetwork (30%)
233/754: aviaviavi/legion (30%)
234/754: aitos-io/BoAT-X-Framework (31%)
235/754: zack-bitcoin/amoveo (31%)
236/754: w3f/polkadot-wiki (31%)
237/754: stellar/go (31%)
238/754: bitquest/bitquest (31%)
239/754: hyperledger/fabric-ca (31%)
240/754: tyrchen/unchained (31%)
241/754: AplaProject/go-apla (31%)
242/754: Azure/coco-framework (32%)
243/754: ConsenSys/smart-contract-best-practices (32%)
244/754: khipu-io/khipu (32%)
245/754: bitshares/bitshares-ui (32%)
246/754: okTurtles/blockchainid (32%)
247/754: julienr/ipynb_playground (32%)
248/754: anders94/public-private-key-demo (32%)
249/754: nanocurrency/nano-node (33%)
250/754: yep/eth-tweet (33%)
251/754: unlock-protocol/unlock (33%)
252/754: input-output-hk/cardano-node (33%)
253/754: 15Dkatz/cryptochain (33%)
254/754: blockchain-jd-com/jdchain (33%)
255/754: everitoken/evt (33%)
256/754: ethereum/solidity 

## Combinando resultados

In [5]:
# Load the CSV written by each search run, one DataFrame per file.
files = [pd.read_csv(csv_path) for csv_path in ('data-api-1.csv', 'data-api-2.csv')]

# Stack the frames, order the rows by repository id and renumber the index
# from 0, then save the result without the index column.
dataset = pd.concat(files, ignore_index=True)
dataset = dataset.sort_values('id').reset_index(drop=True)
dataset.to_csv('dataset.csv', index=False)

In [6]:
# Show the combined frame as this cell's output.
dataset

Unnamed: 0,id,full_name,name,owner,description,topics,stargazers_count,subscribers_count,fork,forks_count,language,languages,open_issues_count,contributors_count,pushed_at,created_at,updated_at
0,2304420,bitcoin-dot-org/Bitcoin.org,Bitcoin.org,bitcoin-dot-org,Bitcoin.org Website,"bitcoin, blockchain, cryptocurrency, p2p",1150,228,False,1931,HTML,"HTML, SCSS, JavaScript, Less, Ruby, Shell, Mak...",70,334,2021-06-23 11:51:52,2011-08-31 21:58:01,2021-06-24 04:36:53
1,2724167,ripple/rippled,rippled,ripple,Decentralized cryptocurrency blockchain daemon...,"xrpl, xrp-ledger, xrp, cplusplus, c-plus-plus,...",3877,506,False,1229,C++,"C++, C, CMake, Shell, JavaScript, Java, Sage, ...",294,89,2021-06-23 09:11:32,2011-11-07 04:40:15,2021-06-25 17:08:35
2,2877991,mhanne/block_browser,block_browser,mhanne,Bitcoin Blockchain Browser,,51,7,False,22,Ruby,"Ruby, HTML, CoffeeScript, CSS, JavaScript",11,2,2016-07-21 15:07:16,2011-11-29 20:44:00,2019-09-26 01:05:39
3,3886965,bitcoin-wallet/bitcoin-wallet,bitcoin-wallet,bitcoin-wallet,Bitcoin Wallet app for your Android device. St...,"bitcoin, bitcoinj, bitcoin-wallet, android, ja...",2475,475,False,1660,Java,Java,86,27,2021-05-30 15:13:32,2012-03-31 17:06:47,2021-06-26 07:22:42
4,3900173,blockchain/unused-My-Wallet-iPhone,unused-My-Wallet-iPhone,blockchain,"For latest source, please see: https://github....",,128,30,False,66,Assembly,"Assembly, C, Objective-C, C++, HTML, Objective...",6,9,2015-09-01 21:20:50,2012-04-02 12:01:57,2021-02-15 00:40:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1417,363445257,guydavis/machinaris,machinaris,guydavis,An easy-to-use WebUI for Chia plotting and far...,"chia, plotting, webui, plotman, chia-network, ...",131,16,False,17,Python,"Python, HTML, Shell, PowerShell, CSS, Mako",19,3,2021-06-27 19:58:47,2021-05-01 15:36:50,2021-06-27 19:10:49
1418,365701873,dfinity/ic,ic,dfinity,Internet Computer blockchain source: the clien...,,653,41,False,49,Rust,"Rust, WebAssembly, Shell, Dockerfile, HTML, Mo...",0,1,2021-06-24 09:00:37,2021-05-09 08:16:37,2021-06-27 18:52:09
1419,367841354,ChainGreenOrg/chaingreen-blockchain,chaingreen-blockchain,ChainGreenOrg,,,50,7,False,21,Python,"Python, Shell, PowerShell, Dockerfile",24,71,2021-06-24 12:15:26,2021-05-16 09:53:27,2021-06-27 18:09:29
1420,376155268,OffcierCia/ultimate-defi-research-base,ultimate-defi-research-base,OffcierCia,Here we collect and discuss the best DeFI & Bl...,,164,11,False,19,,,0,7,2021-06-25 00:44:38,2021-06-11 22:27:41,2021-06-27 18:50:43
