In [1]:
import pandas as pd
import math
from datetime import datetime
import time
import sys
import json
import os.path
from bs4 import BeautifulSoup
import requests

In [2]:
# Root URL that every GitHub REST endpoint used in this notebook is built from.
api_root = "https://api.github.com"

# Headers sent with every request: ask GitHub for its v3 JSON media type.
request_header = {
    'Accept': 'application/vnd.github.v3+json',
}

In [3]:
"""list of all client_id and client_secret that we have"""
# (client_id, client_secret) pairs of the GitHub OAuth apps used for requests.
# Real values are read from the environment so they never have to be typed into
# (and saved with) the notebook; the placeholder strings are kept as fallback.
# Append more (client_id, client_secret) tuples to rotate between several apps.
client_list = [
    (
        os.environ.get('GITHUB_CLIENT_ID', 'client_id'),
        os.environ.get('GITHUB_CLIENT_SECRET', 'client_secret'),
    ),
]

In [4]:
def _append_commit_records(df, records):
    """Return ``df`` extended by one row per dict in ``records``."""
    if not records:
        return df
    new_rows = pd.DataFrame(records)
    if len(df.index) == 0:
        # Nothing to concatenate with: adopt df's column order (plus any
        # columns that only the new rows have).
        extra_cols = [col for col in new_rows.columns if col not in df.columns]
        return new_rows.reindex(columns=list(df.columns) + extra_cols)
    return pd.concat([df, new_rows], ignore_index=True)


def _fetch_commit_changes(commit, credentials, request_header, request_timeout):
    """Return ``(file_names, patches)`` for one commit, or ``('', '')`` on failure.

    This is deliberately best-effort: a commit whose detail request fails (for
    example because the rate limit was hit) still gets a row, with empty values.
    """
    try:
        commit_changes = requests.get(commit['url'], auth=credentials,
                                      headers=request_header, timeout=request_timeout)
        files = commit_changes.json()['files']
        changed_files_name = [changed_file['filename'] for changed_file in files]
        # GitHub omits 'patch' for some files (e.g. binary ones); use '' so the
        # two lists stay aligned instead of discarding the whole commit.
        changed_codes = [changed_file.get('patch', '') for changed_file in files]
        return changed_files_name, changed_codes
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        return '', ''


def get_commits(request_subject, df, client_list, client_number=0,request_header=request_header, upper_bound=math.inf, page_number=1,
                delay_conn=60, request_timeout=30):
    """Collect the commits of one repository, page by page, into a data frame.

    For every commit listed by ``<request_subject>/commits`` a second request is
    sent to the commit's own URL to read the names and patches of the files it
    changed. Credentials are sent with HTTP basic auth rather than in the URL.

    Parameters
    ----------
    request_subject : str
        API URL of the repository, e.g. ``api_root + '/repos/<owner>/<repo>'``.
    df : pandas.DataFrame
        Frame the new rows are added to. It is not modified; a new frame is returned.
    client_list : list of (client_id, client_secret)
        Credentials to rotate through. An empty list sends unauthenticated requests.
    client_number : int
        Index into ``client_list`` of the credentials to start with.
    request_header : dict
        HTTP headers sent with every request.
    upper_bound : int or float
        Last page number to fetch (inclusive).
    page_number : int
        First page number to fetch.
    delay_conn : int or float
        Seconds to wait before retrying a page after a timeout or connection error.
    request_timeout : int or float
        Timeout in seconds applied to every HTTP request.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one row (``sha``, ``changed_files_name``, ``changed_codes``)
        per commit fetched. If every client gets an error response in a row, the
        rows collected so far are returned.
    """
    records = []
    failed_clients = 0  # consecutive error responses, one per client tried
    while page_number <= upper_bound:
        credentials = tuple(client_list[client_number % len(client_list)]) if client_list else None
        try:
            response = requests.get(request_subject + '/commits',
                                    params={'per_page': 100, 'page': page_number},
                                    auth=credentials, headers=request_header,
                                    timeout=request_timeout)
        except requests.exceptions.Timeout as e:
            # Transient network problem: wait, then retry the same page.
            print("-------timeout-------")
            print(e)
            time.sleep(delay_conn)
            continue
        except requests.ConnectionError as e:
            print("-------connection error-------")
            print(e)
            time.sleep(delay_conn)
            continue

        if not response.ok:
            # Most likely this client's quota is used up: switch to the next one.
            try:
                message = response.json().get('message')
            except (ValueError, AttributeError):
                message = response.text
            print('\n---'+str(response))
            print('\n---'+str(message))
            failed_clients += 1
            if failed_clients >= max(len(client_list), 1):
                # Every client has failed on this page; stop instead of looping forever.
                print(f'all clients failed on page {page_number}, returning the commits collected so far')
                break
            client_number = (client_number + 1) % len(client_list)
            continue

        failed_clients = 0
        json_response = response.json()
        if not json_response:
            # An empty page means there are no more commits.
            break
        for commit in json_response:
            changed_files_name, changed_codes = _fetch_commit_changes(
                commit, credentials, request_header, request_timeout)
            records.append({
                'sha': commit['sha'],
                'changed_files_name': changed_files_name,
                'changed_codes': changed_codes,
            })
        print(f'page {page_number} is added to data frame')
        page_number += 1

        remaining = response.headers.get("X-RateLimit-Remaining")
        if remaining is not None and int(remaining) < 2:
            print("limit exceeded!!!!!!!!!!!!")
            delay = 60
            print('sleeping for '+str(delay)+' seconds...')
            print("current time:" + str(datetime.now()))
            time.sleep(int(delay))

    # Build the frame once at the end; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return _append_commit_records(df, records)

In [5]:
""" we create a base url with repo and owner name, also create base dataframe which store data 
and in the end save the data we get from 'get_commits' function which are all the commits of that specific project """
def get_commits_of_repo(owner, repo, client_list, api_root=api_root):
    """Fetch every commit of ``owner/repo`` and save the result as a CSV file.

    Parameters
    ----------
    owner : str
        Owner part of the repository path.
    repo : str
        Repository name.
    client_list : list of (client_id, client_secret)
        Credentials passed on to ``get_commits``.
    api_root : str
        Base URL of the API; the repository URL is ``api_root + '/repos/<owner>/<repo>'``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``<owner>_<repo>_commits_code_diff_data.csv``
        in the current working directory.
    """
    request_subject = api_root + '/repos/' + owner + '/' + repo

    # Empty frame with the expected columns; object dtype because the two
    # 'changed_*' columns hold Python lists.
    cols = ['sha', 'changed_files_name', 'changed_codes']
    df = pd.DataFrame(columns=cols, dtype=object)

    repo_dataframe = get_commits(request_subject, df, client_list=client_list)
    print(f'data frame shape is : {repo_dataframe.shape}')
    repo_dataframe.to_csv(owner+'_'+repo+'_commits_code_diff_data.csv', index=False)
    print(f"commits for {owner}/{repo} saved succesfully!!! ;)")
    return repo_dataframe

In [6]:
""" specifying all the owners and repos that we want their whole commits data """
# Each entry is an (owner, repo) pair naming one GitHub repository to download.
sources = [
    ('owner', 'repo'),
]

In [None]:
for source in sources:
    owner, repo = source[0], source[1]
    output_csv = owner + '_' + repo + '_commits_code_diff_data.csv'
    # A CSV already present in the working directory means this repository was
    # downloaded before, so it is skipped.
    if os.path.exists(output_csv):
        continue
    print(f'getting commits for {owner}/{repo} is started')
    get_commits_of_repo(owner, repo, client_list, api_root)