In [1]:
import os
import io
import requests
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime, timedelta

from tqdm import tqdm

In [2]:
def query_and_save_data_using_perceval(PERCEVAL_LOCATION, FILE_SAVE_LOCATION, repository, GiHuB_API_KEY):
    '''
    Query one GitHub repository with the perceval CLI and save the result as a JSON file.

    args:
    PERCEVAL_LOCATION - shell command that must succeed before perceval is invoked
                        (it is joined to the perceval call with `&&`)
    FILE_SAVE_LOCATION - directory in which the <owner_name>_<repository_name>.json file is written
    repository - the owner_name/repository_name of the GitHub repository to query
    GiHuB_API_KEY - the GitHub API token handed to perceval through its -t option

    The function runs `perceval github <owner> <repo>` with --from-date 2021-12-01,
    --json-line and --sleep-for-rate, parses every non-blank line of the command's
    standard output as one JSON document and writes the resulting list, indented,
    to <FILE_SAVE_LOCATION>/<owner_name>_<repository_name>.json.

    returns:
    None - the result is only written to disk.

    raises:
    IndexError - if `repository` contains no '/'.
    json.JSONDecodeError - if a non-blank output line is not valid JSON.
    '''
    import shlex  # local import: only needed to quote the shell arguments below

    parts = repository.split('/')
    owner_name = parts[0]
    repo_name = parts[1]

    # PERCEVAL_LOCATION is itself a shell fragment, so it is deliberately left unquoted.
    # The remaining values are plain arguments and are quoted so that unusual characters
    # cannot alter the command line.
    # NOTE(review): the token is still part of the command string handed to the shell.
    COMMAND_STRING = PERCEVAL_LOCATION + ' && perceval github {0} {1} --from-date 2021-12-01 --json-line --sleep-for-rate -t {2} '.format(
        shlex.quote(owner_name), shlex.quote(repo_name), shlex.quote(GiHuB_API_KEY))

    # The with-block closes the pipe once the whole output has been read.
    with os.popen(COMMAND_STRING) as command_output:
        command_result = command_output.read()

    # One JSON document per line; blank lines are skipped because json.loads('') raises.
    records = [json.loads(line) for line in command_result.splitlines() if line.strip()]

    # os.path.join gives the same path whether or not FILE_SAVE_LOCATION ends with a separator.
    file_save_name = os.path.join(FILE_SAVE_LOCATION, '{0}_{1}.json'.format(owner_name, repo_name))
    with open(file_save_name, 'w') as outfile:
        json.dump(records, outfile, indent = 4)

In [3]:
# Shell command run before perceval; it sources a virtualenv activate script
# (presumably the environment where perceval is installed - confirm locally).
PERCEVAL_LOCATION = 'source ./../../venv/bin/activate' # perceval tool location
FILE_SAVE_LOCATION = r'./../Data/' # folder to save the query result

# The GitHub token is read from the environment so that it is never stored in the notebook.
# Export GITHUB_API_KEY before starting Jupyter. Any token that was previously committed
# in this cell must be treated as compromised and revoked.
GiHuB_API_KEY = os.environ.get('GITHUB_API_KEY', '')
if not GiHuB_API_KEY:
    # Only warn: the cells that merely read already-saved data do not need the token.
    print('WARNING: GITHUB_API_KEY is not set; set it before running the query cells.')

repository_list = ['diem/diem', 'paritytech/substrate', 'rust-lang/rust', 'servo/servo'] #repositories that needs to be queried

In [None]:
# Query the tool for each repository individually; every call writes one
# <owner>_<repo>.json file into FILE_SAVE_LOCATION.
# The function defined above is query_and_save_data_using_perceval; the shorter
# name used here before does not exist and raised NameError.
for repository in tqdm(repository_list):
    query_and_save_data_using_perceval(PERCEVAL_LOCATION, FILE_SAVE_LOCATION, repository, GiHuB_API_KEY)

In [5]:
# Load the saved rust-lang/rust query result; the with-block closes the file
# handle as soon as the JSON has been parsed.
with open(FILE_SAVE_LOCATION + r'rust-lang_rust.json', 'r') as saved_file:
    data = json.load(saved_file)

In [6]:
# Number of top-level entries in the JSON loaded into `data` (rust-lang_rust.json).
len(data) 

3564

## Query the repositories one at a time

In [4]:
# Restrict the run to a single repository; this rebinds repository_list.
repository_list = ['rust-lang/rust']
for repository in tqdm(repository_list):
    # Same call as for the full list, spelled out with keyword arguments.
    query_and_save_data_using_perceval(
        PERCEVAL_LOCATION=PERCEVAL_LOCATION,
        FILE_SAVE_LOCATION=FILE_SAVE_LOCATION,
        repository=repository,
        GiHuB_API_KEY=GiHuB_API_KEY,
    )

  0%|                                                     | 0/1 [00:00<?, ?it/s][2022-01-27 11:06:30,939] - Sir Perceval is on his quest.
[2022-01-27 11:21:56,528] - Rate limit exhausted. Waiting 2573 secs for rate limit reset.
[2022-01-27 12:04:49,755] - Rate limit exhausted. Waiting 0 secs for rate limit reset.
[2022-01-27 12:04:49,905] - Rate limit exhausted. Waiting 0 secs for rate limit reset.
[2022-01-27 12:04:50,054] - Rate limit exhausted. Waiting 0 secs for rate limit reset.
[2022-01-27 12:04:50,245] - Rate limit exhausted. Waiting 0 secs for rate limit reset.
[2022-01-27 12:04:50,420] - Rate limit exhausted. Waiting 0 secs for rate limit reset.
[2022-01-27 12:04:50,650] - Rate limit exhausted. Waiting 0 secs for rate limit reset.
[2022-01-27 12:04:50,831] - Rate limit exhausted. Waiting 0 secs for rate limit reset.
[2022-01-27 12:20:31,878] - Rate limit exhausted. Waiting 2659 secs for rate limit reset.
[2022-01-27 13:04:51,162] - Rate limit exhausted. Waiting 0 secs for rate

## Read the saved data

In [None]:
# Print every comment (body, author login, creation time) found in the saved
# diem/diem query result. `json` is imported in the first cell of the notebook.
#
# Fields read from each item:
#   data -> comments_data -> body
#   data -> comments_data -> user_data -> login
#   data -> comments_data -> created_at

# `file` holds the parsed JSON content; the with-block closes the handle even
# if parsing fails.
with open('./../Data/diem_diem.json') as f:
    file = json.load(f)

for item in file:
    try:
        for comment in item['data']['comments_data']:
            print(comment['body'])
            print(comment['user_data']['login'])
            print(comment['created_at'])
            print('#'*50)
    except (KeyError, TypeError):
        # A key is missing or an intermediate value is not subscriptable/iterable
        # (e.g. None). Only these lookup failures are reported as 'empty';
        # anything else, such as KeyboardInterrupt, now propagates.
        print('empty')
    print('-'*50)