In [1]:
import os
import io
import requests
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime, timedelta

from tqdm import tqdm

In [2]:
def query_and_save_data_using_perceval(PERCEVAL_LOCATION, FILE_SAVE_LOCATION, repository, GiHuB_API_KEY, from_date='2021-12-01'):
    '''
    Query one GitHub repository with the perceval tool and save the result as a .json file.

    args:
    PERCEVAL_LOCATION - shell command that is run before perceval (chained with `&&`),
                        e.g. the command activating the environment perceval is installed in
    FILE_SAVE_LOCATION - folder in which the queried data is saved, one .json file per repository
    repository - the owner_name/repository_name of the GitHub repository that we want to query
    GiHuB_API_KEY - the GitHub API key handed to perceval via -t
                    (the original notes mention a limit of 5000 queries per hour)
    from_date - value passed to perceval's --from-date option (defaults to 2021-12-01)

    perceval is run with --json-line, so every non-blank line of its output is parsed as one
    JSON document. The parsed documents are written as a single JSON list to
    <FILE_SAVE_LOCATION>/<owner_name>_<repository_name>.json.

    raises:
    IndexError - if `repository` does not contain a '/'
    json.JSONDecodeError - if a non-blank output line is not valid JSON
    '''
    repository_parts = repository.split('/')
    owner_name = repository_parts[0]
    repo_name = repository_parts[1]
    COMMAND_STRING = PERCEVAL_LOCATION + ' && perceval github {0} {1} --from-date {3} --json-line --sleep-for-rate -t {2} '.format(owner_name, repo_name, GiHuB_API_KEY, from_date)

    # The context manager closes the pipe once the whole output has been read.
    with os.popen(COMMAND_STRING) as command_output:
        command_result = command_output.read()

    # Blank lines are skipped: json.loads('') would raise and abort the whole export.
    parsed_items = [json.loads(line) for line in command_result.splitlines() if line.strip()]

    # os.path.join gives the right path whether or not the folder ends with a separator.
    file_save_name = os.path.join(FILE_SAVE_LOCATION, '{0}_{1}.json'.format(owner_name, repo_name))
    with open(file_save_name, 'w') as outfile:
        json.dump(parsed_items, outfile, indent=4)

In [3]:
PERCEVAL_LOCATION = 'source ./../../venv/bin/activate' # shell command run before perceval (activates the venv holding the tool)
FILE_SAVE_LOCATION = r'./../Data/' # folder to save the query result
# The GitHub token is read from the GITHUB_API_KEY environment variable so that it is
# never stored in the notebook.
# NOTE(review): a token was previously committed on this line in plain text; it should be revoked.
GiHuB_API_KEY = os.environ.get('GITHUB_API_KEY', '')
if not GiHuB_API_KEY:
    print('GITHUB_API_KEY is not set - export it before running the perceval queries')
repository_list = ['diem/diem', 'paritytech/substrate', 'rust-lang/rust', 'servo/servo'] #repositories that needs to be queried

In [None]:
# Query the tool for each repository individually; tqdm shows one progress step per repository.
# The function defined above is query_and_save_data_using_perceval - the previous call used
# the name query_using_perceval, which is not defined in this notebook.
for repository in tqdm(repository_list):
    query_and_save_data_using_perceval(PERCEVAL_LOCATION, FILE_SAVE_LOCATION, repository, GiHuB_API_KEY)

## read_data

In [30]:
# List the saved query results, leaving out Jupyter's checkpoint folder.
# Filtering (instead of list.remove) does not raise when the folder is absent.
files = [name for name in os.listdir('./../Data/') if name != ".ipynb_checkpoints"]
files

['paritytech_substrate.json',
 'servo_servo.json',
 'diem_diem.json',
 'rust-lang_rust.json']

In [31]:
# Flatten the comments stored in every saved JSON file into one CSV per file.
DATA_FRAME_COLUMNS = ['author', 'body', 'number', 'created_at', 'empty']

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('./../Data_frames/', exist_ok=True)

for file_name in files:
    print(file_name)

    # `with` closes the file even when json.load fails.
    with open('./../Data/' + file_name) as json_file:
        items = json.load(json_file)

    # Rows are collected in a list and turned into a DataFrame once per file:
    # DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call.
    rows = []
    for item in items:
        try:
            for comment in item['data']['comments_data']:
                body = comment['body']
                rows.append({'author': comment['user_data']['login'],
                             'body': body,
                             'number': item['data']['number'],
                             'created_at': comment['created_at'],
                             # flag comments whose body is shorter than two characters
                             'empty': 1 if len(body) < 2 else 0
                            })
        except (KeyError, TypeError):
            # The item has a missing key or a null value where comment data is expected;
            # rows already collected for this item are kept and the rest is skipped.
            print('empty')

    data_frame = pd.DataFrame(rows, columns=DATA_FRAME_COLUMNS)
    # splitext only strips the final extension, so names containing dots are not truncated.
    data_frame.to_csv('./../Data_frames/' + os.path.splitext(file_name)[0] + '.csv')
    print('-'*50)

paritytech_substrate.json
--------------------------------------------------
servo_servo.json
--------------------------------------------------
diem_diem.json
--------------------------------------------------
rust-lang_rust.json
--------------------------------------------------
