In [2]:
import pandas as pd
import math
from datetime import datetime
import time
import sys
import json
import os.path
import requests

In [3]:
# Base URL of the GitHub REST API; repository URLs are built on top of it.
api_root = 'https://api.github.com'

# Default HTTP headers for API requests (used as get_commits' default).
request_header = dict(Accept='application/vnd.github.v3+json')

In [4]:
"""list of all client_id and client_secret that we have"""
# Each entry is a (client_id, client_secret) tuple; get_commits indexes this
# list with its `client_number` argument to pick the credentials to send.
client_list = [
    ('client_id', 'client_secret'),
]

In [5]:
# (label written to the CSV, key in the API's assignee object), in output order.
_ASSIGNEE_FIELDS = (
    ('login_name', 'login'),
    ('login_id', 'id'),
    ('login_node_id', 'node_id'),
    ('login_type', 'type'),
)


def _format_assignees(assignees):
    """Serialise assignees as 'label:value,...' groups, one per assignee, joined by '|'."""
    if not isinstance(assignees, list):
        return ""
    groups = []
    for assignee in assignees:
        if not isinstance(assignee, dict):
            continue
        # The f-string converts every value to text. The previous code did
        # 'login_id:' + assignee['id'], which raised TypeError for a numeric id
        # and (through a bare except) silently blanked the whole column.
        groups.append(','.join(f"{label}:{assignee.get(key, '')}"
                               for label, key in _ASSIGNEE_FIELDS))
    return '|'.join(groups)


def _issue_to_record(issue):
    """Flatten one issue object of the API response into a flat dict (one CSV row).

    Missing keys become "" instead of raising, matching the previous behaviour.
    """
    if not isinstance(issue, dict):
        issue = {}
    user = issue.get('user')
    if not isinstance(user, dict):
        user = {}  # the author object can be absent/null

    return {
        'id': issue.get('id', ""),
        'node_id': issue.get('node_id', ""),
        'number': issue.get('number', ""),
        'title': issue.get('title', ""),
        'user_login_name': user.get('login', ""),
        'user_login_id': user.get('id', ""),
        'user_node_id': user.get('node_id', ""),
        'user_type': user.get('type', ""),
        'body': issue.get('body', ""),
        'labels': issue.get('labels', ""),
        'state': issue.get('state', ""),
        'assignees': _format_assignees(issue.get('assignees')),
        'created_at': issue.get('created_at', ""),
        'closed_at': issue.get('closed_at', ""),
    }


def _error_message(response):
    """Return a printable description of a failed response, even if its body is not JSON."""
    try:
        payload = response.json()
    except ValueError:
        return response.text
    if isinstance(payload, dict):
        return str(payload.get('message', payload))
    return str(payload)


def _append_records(df, records):
    """Return a new frame made of `df` followed by `records`; `df` itself is not modified."""
    if not records:
        return df
    new_rows = pd.DataFrame(records)
    if len(df) == 0:
        # Keep the caller's column order first, then any column it did not declare.
        ordered = list(df.columns) + [c for c in new_rows.columns if c not in df.columns]
        return new_rows.reindex(columns=ordered)
    return pd.concat([df, new_rows], ignore_index=True)


def get_commits(request_subject, df, client_list, client_number=0, request_header=request_header,
                upper_bound=math.inf, page_number=1, retry_delay=10, timeout=30):
    """Download the issues of one repository page by page and add them to `df`.

    Despite its name, this function queries ``<request_subject>/issues`` (all
    states, 100 per page) and produces one row per issue.

    Parameters
    ----------
    request_subject : str
        Repository URL, e.g. ``api_root + '/repos/<owner>/<repo>'``.
    df : pandas.DataFrame
        Frame the rows are added to. It is not modified; a new frame is returned.
    client_list : list of (client_id, client_secret)
        Credentials to authenticate with. When a request is rejected the next
        pair is tried. An empty list results in unauthenticated requests.
    client_number : int
        Index in `client_list` of the pair to start with.
    request_header : dict
        HTTP headers sent with every request.
    upper_bound : int or float
        Last page number to download (inclusive).
    page_number : int
        First page number to download.
    retry_delay : int or float
        Seconds to wait before retrying after a timeout / connection error.
    timeout : int or float
        Per-request timeout in seconds handed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        `df` followed by one row per downloaded issue. If every client is
        rejected for the same page, the rows collected so far are returned.
    """
    records = []                   # rows are collected here and added to df once
    n_clients = len(client_list)
    failed_clients = 0             # consecutive clients rejected for the current page
    url = request_subject + '/issues'

    while page_number <= upper_bound:
        if n_clients:
            client = client_list[client_number % n_clients]
            # Credentials go in HTTP Basic auth, not in the URL: query-parameter
            # authentication is no longer accepted by the GitHub API and would
            # also leak the secret into logs.
            auth = (client[0], client[1])
        else:
            auth = None

        try:
            response = requests.get(
                url,
                params={'state': 'all', 'per_page': 100, 'page': page_number},
                auth=auth,
                headers=request_header,
                timeout=timeout,
            )
        except requests.exceptions.Timeout as e:
            print("-------timeout-------")
            print(e)
            time.sleep(retry_delay)
            continue               # retry the same page, all arguments intact
        except requests.exceptions.ConnectionError as e:
            print("-------connection error-------")
            print(e)
            time.sleep(retry_delay)
            continue

        if not response.ok:
            # The current client was rejected (e.g. its rate limit is used up):
            # move on to the next one, but give up once all of them have failed
            # for this page instead of cycling forever.
            print('\n---' + str(response))
            print('\n---' + _error_message(response))
            failed_clients += 1
            if failed_clients >= max(n_clients, 1):
                print('all clients were rejected; returning the data collected so far')
                break
            client_number = (client_number + 1) % n_clients
            continue

        failed_clients = 0
        issues = response.json()
        if not issues:
            break                  # an empty page means there is nothing left
        records.extend(_issue_to_record(issue) for issue in issues)
        print(f'page {page_number} is added to data frame')
        page_number += 1

        try:
            remaining = int(response.headers["X-RateLimit-Remaining"])
        except (KeyError, ValueError):
            remaining = None       # header missing or malformed: nothing to act on
        if remaining is not None and remaining < 2:
            print("limit exceeded!!!!!!!!!!!!")
            delay = 60
            print('sleeping for ' + str(delay) + ' seconds...')
            print("current time:" + str(datetime.now()))
            time.sleep(delay)

    return _append_records(df, records)

In [6]:
""" we create a base url with repo and owner name, also create base dataframe which store data 
and in the end save the data we get from 'get_commits' function which are all the commits of that specific project """
def get_commits_of_repo(owner, repo, client_list, api_root=api_root):
    """Fetch the issues of ``owner/repo`` through get_commits and write them to a CSV file.

    The file is named ``<owner>_<repo>_issues_data.csv`` and is written to the
    current working directory without the index column. Nothing is returned.
    """
    column_names = ['id', 'node_id', 'number', 'title', 'user_login_name', 'user_login_id',
                    'user_node_id', 'body', 'labels', 'state', 'assignees', 'created_at',
                    'closed_at', 'user_type']

    # Repository endpoint that get_commits extends with its own path and query.
    repo_url = '/'.join([api_root, 'repos', owner, repo])

    # Start from an empty frame that only fixes the column layout.
    empty_frame = pd.DataFrame(columns=column_names)
    issues_frame = get_commits(repo_url, empty_frame, client_list=client_list)
    print(f'data frame shape is : {issues_frame.shape}')

    csv_name = '_'.join([owner, repo, 'issues_data.csv'])
    issues_frame.to_csv(csv_name, index=False)
    print(f"issues for {owner}/{repo} saved succesfully!!! ;)")

In [7]:
""" specifying all the owners and repos that we want their whole commits data """
# Each entry is an (owner, repo) tuple naming one repository to process.
sources = [
    ('owner', 'repo'),
]

In [None]:
# Process every configured repository, skipping those whose CSV already exists
# so that re-running the cell does not download the same data again.
for source in sources:
    owner, repo = source[0], source[1]
    csv_path = owner + '_' + repo + '_issues_data.csv'
    if os.path.exists(csv_path):
        continue
    print(f'getting issues for {owner}/{repo} is started')
    get_commits_of_repo(owner, repo, client_list, api_root)