In [1]:
import pandas as pd
import math
from datetime import datetime
import time
import sys
import json
import os.path
from bs4 import BeautifulSoup
import requests

In [2]:
# Accept header sent with every request: asks for the GitHub v3 JSON media type.
request_header = dict(Accept='application/vnd.github.v3+json')
# Base URL that the REST endpoint paths are appended to.
api_root = 'https://api.github.com'

In [3]:
# Every (client_id, client_secret) pair that we have, in the order they are tried.
# NOTE(review): the pair below is a placeholder; avoid committing real secrets here.
client_list = [
    ('client_id', 'client_secret'),
]

In [4]:
# Seconds to wait before retrying after a timeout / connection error.
# (The original referenced an undefined name `delay_conn` here.)
_CONNECTION_RETRY_DELAY = 60
# Seconds to sleep when the current client's rate-limit budget is almost used up.
_RATE_LIMIT_DELAY = 60
# Per-request timeout in seconds, so a stalled connection cannot block forever.
_REQUEST_TIMEOUT = 30

# (output column, path inside one item of the commits response), in output order.
_COMMIT_FIELDS = (
    ('sha', None),  # mandatory, read without a fallback
    ('commit_author_name', ('commit', 'author', 'name')),
    ('commit_author_email', ('commit', 'author', 'email')),
    ('commit_author_date', ('commit', 'author', 'date')),
    ('commit_message', ('commit', 'message')),
    ('author_login', ('author', 'login')),
    ('author_id', ('author', 'id')),
    ('author_node_id', ('author', 'node_id')),
    ('author_type', ('author', 'type')),
    ('committer_login', ('committer', 'login')),
    ('committer_id', ('committer', 'id')),
    ('committer_date', ('committer', 'date')),
    ('committer_node_id', ('committer', 'node_id')),
    ('verification_verified', ('commit', 'verification', 'verified')),
    ('verification_reason', ('commit', 'verification', 'reason')),
    ('stats_additions', ('stats', 'additions')),
    ('stats_deletions', ('stats', 'deletions')),
    ('stats_total', ('stats', 'total')),
)


def _dig(obj, *keys):
    """Return ``obj[k1][k2]...``, or ``""`` if any level is missing or not subscriptable."""
    for key in keys:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            # TypeError covers intermediate values that are None (e.g. "author": null).
            return ""
    return obj


def _commit_record(item):
    """Flatten one item of the commits response into a {column: value} dict."""
    record = {}
    for column, path in _COMMIT_FIELDS:
        # 'sha' is required: a missing sha raises KeyError instead of being blanked.
        record[column] = item['sha'] if path is None else _dig(item, *path)
    if record['committer_date'] == "":
        # The top-level "committer" is a user object without a date; the commit
        # timestamp is stored under commit.committer.date, so fall back to it.
        record['committer_date'] = _dig(item, 'commit', 'committer', 'date')
    return record


def _error_message(response):
    """Return the 'message' field of an error response, or its raw text if not JSON."""
    try:
        return response.json()['message']
    except (ValueError, KeyError, TypeError):
        return response.text


def _wait_if_rate_limited(response):
    """Sleep when the response reports fewer than 2 remaining requests."""
    try:
        remaining = int(response.headers["X-RateLimit-Remaining"])
    except (KeyError, ValueError):
        # Header absent or malformed: nothing to act on.
        return
    if remaining < 2:
        print("limit exceeded!!!!!!!!!!!!")
        print('sleeping for '+str(_RATE_LIMIT_DELAY)+' seconds...')
        print("current time:" + str(datetime.now()))
        time.sleep(_RATE_LIMIT_DELAY)


def get_commits(request_subject, df, client_list, client_number=0,request_header=request_header, upper_bound=math.inf, page_number=1):
    """Fetch the commits of a repository page by page and append them to ``df``.

    Parameters
    ----------
    request_subject : str
        Repository URL; ``/commits`` is appended to it.
    df : pandas.DataFrame
        Frame the fetched commits are appended to (it is not modified in place).
    client_list : list of (client_id, client_secret)
        Credentials to rotate through. An empty list sends unauthenticated requests.
    client_number : int
        Index in ``client_list`` of the credentials to start with.
    request_header : dict
        HTTP headers sent with every request.
    upper_bound : int or float
        Last page number to request (inclusive).
    page_number : int
        First page number to request.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per fetched commit.

    Notes
    -----
    * Timeouts and connection errors are retried on the same page after a delay.
    * A non-OK response switches to the next client. Once every client has
      failed on the same page, fetching stops and the rows collected so far
      are returned.
    * Credentials are sent with HTTP basic auth rather than as query parameters.
    """
    records = []  # collected once and concatenated at the end (no per-row append)
    n_clients = len(client_list)
    failed_attempts = 0  # consecutive non-OK responses for the current page

    while page_number <= upper_bound:
        if n_clients:
            client = client_list[client_number]
            auth = (client[0], client[1])
        else:
            auth = None

        try:
            response = requests.get(
                request_subject + '/commits',
                params={'per_page': 100, 'page': page_number},
                auth=auth,
                headers=request_header,
                timeout=_REQUEST_TIMEOUT,
            )
        except (requests.exceptions.Timeout, requests.ConnectionError) as e:
            # Retry the same page in this loop; no recursion, so all state is kept.
            if isinstance(e, requests.exceptions.Timeout):
                print("-------timeout-------")
            else:
                print("-------connection error-------")
            print(e)
            time.sleep(_CONNECTION_RETRY_DELAY)
            continue

        if not response.ok:
            # Usually the current client's limit is used up: try the next client.
            print('\n---'+str(response))
            print('\n---'+str(_error_message(response)))
            failed_attempts += 1
            if failed_attempts >= max(n_clients, 1):
                print(f'all clients failed on page {page_number}, stopping')
                break
            client_number = (client_number + 1) % n_clients
            continue

        failed_attempts = 0
        json_response = response.json()
        if not json_response:
            # An empty page means there are no more commits.
            break

        records.extend(_commit_record(item) for item in json_response)
        print(f'page {page_number} is added to data frame')
        page_number += 1
        _wait_if_rate_limited(response)

    if not records:
        return df
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [5]:
def get_commits_of_repo(owner, repo, client_list, api_root=api_root):
    """Collect all commits of ``owner/repo`` and save them to a CSV file.

    The repository URL is built from ``api_root``, ``owner`` and ``repo``, an
    empty data frame with the expected columns is created, ``get_commits``
    fills it, and the result is written to ``<owner>_<repo>_commits_data.csv``
    (without the index). Nothing is returned.
    """
    # Columns that are explicitly cast to object dtype after the frame is created.
    object_columns = (
        'changed_files_name',
        'changed_codes',
        'comments_bodies',
        'comments_creation_date',
        'comment_updating_date',
        'comment_users_login',
        'comment_users_id',
    )
    column_names = [
        'sha',
        'commit_author_name',
        'commit_author_email',
        'commit_message',
        'author_login',
        'author_id',
        'author_node_id',
        'committer_login',
        'committer_id',
        'committer_date',
        'committer_node_id',
        'verification_verified',
        'verification_reason',
        *object_columns,
        'author_type',
        'stats_additions',
        'stats_deletions',
        'stats_total',
    ]

    # Empty frame that get_commits appends the commit rows to.
    empty_frame = pd.DataFrame(columns=column_names)
    for column in object_columns:
        empty_frame[column] = empty_frame[column].astype(object)

    repo_url = '/'.join((api_root, 'repos', owner, repo))
    repo_dataframe = get_commits(repo_url, empty_frame, client_list=client_list)
    print(f'data frame shape is : {repo_dataframe.shape}')

    csv_name = '_'.join((owner, repo, 'commits_data.csv'))
    repo_dataframe.to_csv(csv_name, index=False)
    print(f"commits for {owner}/{repo} saved succesfully!!! ;)")

In [6]:
# Every (owner, repo) pair whose complete commit data we want to download.
sources = [
    ('owner', 'repo'),
]

In [None]:
# Download commits only for repositories whose CSV file does not exist yet.
for source in sources:
    owner, repo = source[0], source[1]
    csv_name = owner + '_' + repo + '_commits_data.csv'
    if os.path.exists(csv_name):
        continue
    print(f'getting commits for {owner}/{repo} is started')
    get_commits_of_repo(owner, repo, client_list, api_root)