In [1]:
import pandas as pd
import requests
import csv
import numpy as np
import os
import time

In [19]:
def _pauseIfRateLimited(response):
    '''
    Sleeps for one hour when the GitHub rate limit reported by the response is (almost) exhausted.

    HTTP header values are strings, so the value is converted to an integer before comparing;
    comparing the raw header against the number 1 would never be true.

    PARAMS: - response: requests response object whose headers are inspected.
    '''
    remaining = response.headers.get('X-RateLimit-Remaining')
    try:
        exhausted = int(remaining) <= 1
    except (TypeError, ValueError):
        # Header missing or not numeric: nothing reliable to act on.
        exhausted = False
    if exhausted:
        time.sleep(3600)


def getCommitTablebyProject(projectfullname, updateissuetablename, commitCounter, headers):
    '''
    Gets a commit table for the project included as a parameter. Besides, it also counts the developers and
    commits of the project. Only commits whose author date lies strictly between the two hard-coded limits
    (2020-12-31T23:59 and 2022-12-31T23:59) are stored.
    Note that the date filter may need to be changed when crawling for commits before the follow-up period.

    A header row is appended to the CSV first, then one row per stored commit, and finally a row whose
    TOTAL_COMMITS column holds the commit counter. The file is opened in append mode, so calling the function
    twice on the same path adds a second header and a second set of rows.

    PARAMS: - projectfullname: full name ("owner/repo") used in the API calls.
            - updateissuetablename: path of the CSV file gathering the commit data.
            - commitCounter: starting value of the commit counter for the project.
            - headers: request headers with the necessary token information.

    RETURNS: - [0]: Number of distinct author logins seen (commits without a linked GitHub account
                    share the single value NaN, which counts as one entry).
             - [1]: commitCounter increased by the number of stored commits.
    '''
    # Author-date window of the follow-up period (both limits are excluded).
    earlylimitTimeStamp = pd.Timestamp("2020-12-31T23:59:0Z")
    lastlimitTimeStamp = pd.Timestamp("2022-12-31T23:59:0Z")

    theCommitQuery = f"https://api.github.com/repos/{projectfullname}/commits"
    theProjectQuery = f"https://api.github.com/repos/{projectfullname}"

    # Request for the project general details
    p_search = requests.get(theProjectQuery, headers=headers)
    _pauseIfRateLimited(p_search)

    project_info = p_search.json()
    try:
        project_fullName = project_info['full_name']
    except (KeyError, TypeError):
        # Error payloads carry no 'full_name'; fall back to the name we were given.
        project_fullName = projectfullname

    # Creation of the CSV header row
    commit_features = ['project_fullName', 'commit_sha', 'authorID', 'authorLOG', 'author_date', 'TOTAL_COMMITS']
    # newline='' lets the csv module control line endings (avoids blank rows on Windows).
    with open(updateissuetablename, 'a', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile, delimiter=',').writerow(commit_features)

    developers = set()  # Distinct author logins found in the stored commits
    params = {'per_page': 100}
    page = 1

    # API call loop: one request per result page until an empty page or an error payload arrives
    while True:
        params['page'] = page  # Current pagination
        print(projectfullname + ' ' + 'page ' + str(page))
        theResult = requests.get(theCommitQuery, headers=headers, params=params)  # Commit query

        # GitHub query control condition
        print(f"Remaining API call rate: {theResult.headers.get('X-RateLimit-Remaining')}")
        _pauseIfRateLimited(theResult)

        print(theResult.status_code)
        theItemListPerPage = theResult.json()

        if not theItemListPerPage:  # Means that there's no more commits to download
            break

        # A successful call returns a list of commits. A dict is an informative/error payload
        # (e.g. empty repository), so the project is ignored from here on.
        if isinstance(theItemListPerPage, dict):
            print('catch')
            break

        print(f"Commit amount: {len(theItemListPerPage)}, project: {project_fullName}")
        pageRows = []
        for item in theItemListPerPage:
            authorDate = item['commit']['author']['date']
            commitTime = pd.Timestamp(authorDate)
            # Commits at or before the follow-up start are already known, and commits at or after
            # the last follow-up date are not needed.
            if commitTime <= earlylimitTimeStamp or commitTime >= lastlimitTimeStamp:
                continue

            try:
                commitSha = item['sha']
            except KeyError:
                commitSha = np.nan
            try:
                authorID = item['author']['id']
                authorLOG = item['author']['login']
            except (KeyError, TypeError):
                # item['author'] is missing or None: store NaN for both fields.
                authorID = np.nan
                authorLOG = np.nan

            # Column order must match commit_features; TOTAL_COMMITS stays empty on commit rows.
            pageRows.append([project_fullName, commitSha, authorID, authorLOG, authorDate, ''])
            developers.add(authorLOG)
            commitCounter += 1

        # One file open per page instead of one per commit.
        if pageRows:
            with open(updateissuetablename, 'a', encoding='utf-8', newline='') as csvfile:
                csv.writer(csvfile, delimiter=',').writerows(pageRows)

        page += 1

    # Last row: only the TOTAL_COMMITS column is filled. It must be a list, because a set would
    # collapse the repeated empty strings and lose the column order.
    finalrow = ['', '', '', '', '', commitCounter]
    with open(updateissuetablename, 'a', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile, delimiter=',').writerow(finalrow)

    return len(developers), commitCounter

In [5]:
def nameMatching(githubProPath, SQProPath):
    '''
    A bit manual function since it only works for this scenario. A few apache projects appearing in SQ do not
    match the official names in github, so the SQ names that cannot be matched are replaced by hand-curated
    github names. The corrections are keyed by the position of the project in the list of unique SQ project
    names, so they are only valid for the SQ file they were prepared for.

    PARAMS: - githubProPath: CSV file path with projects from github (read through its "name" column).
            - SQProPath: CSV file path with projects from SQ (read through its "project_name" column).

    RETURN: - [0]: List of unique SQ project names where every unmatched name was replaced by its correction.
                   Matched names are returned as they appear in SQ (an 'apache_' prefix is kept).
            - [1]: Github project names, in file order, without the matched projects and without any of the
                   corrected names.

    RAISES: - KeyError: when an SQ project neither matches a github name nor has a registered correction.
    '''
    correctNames = {3:'pdfbox', 4:'hadoop', 5:'iotdb', 7:'causeway-site', 9:'dolphinscheduler', 10:'incubator-nemo', 229:'jmeter-site', 230:'jspwiki', 235:'daffodil', 241:'doris', 243:'camel-quarkus', 247:'jackrabbit-oak'}
    prefix = 'apache_'

    # List of projects appearing in the Github ASF repos.
    apacheProjects = pd.read_csv(githubProPath)
    listGithubProjects = list(apacheProjects.name)
    githubNames = set(listGithubProjects)  # Constant-time membership tests

    # List of unique project names appearing in sonarQube.
    projectsSQ = pd.read_csv(SQProPath)
    projectSQNames = list(projectsSQ.project_name.unique())

    # Matching process
    matchingPros = set()
    resolvedSQNames = []
    for position, sqName in enumerate(projectSQNames):
        # The prefix is ignored for the comparison only.
        bareName = sqName[len(prefix):] if sqName.startswith(prefix) else sqName
        if bareName in githubNames:
            matchingPros.add(bareName)
            resolvedSQNames.append(sqName)
            continue

        # Projects that didn't match differ from the name stored in github, so the name is corrected.
        if position not in correctNames:
            raise KeyError(f"No github name correction registered for SQ project {sqName!r} at position {position}")
        resolvedSQNames.append(correctNames[position])

    # The github list now works as the list of projects that weren't registered in SQ. Filtering (instead of
    # list.remove) tolerates names that are absent from the github list or that were matched more than once.
    excluded = matchingPros.union(correctNames.values())
    remainingGithubProjects = [name for name in listGithubProjects if name not in excluded]

    return resolvedSQNames, remainingGithubProjects

In [31]:
# GitHub credentials: the GITHUB_TOKEN environment variable wins over the inline fallback value.
personal_token = ""
token = os.getenv('GITHUB_TOKEN', personal_token)
headers = {'Authorization': f'token {token}'}

# Input tables: github project list and sonarQube (SQ) issue dates.
githubProPath = 'resultFiles/cleanRepos070223.csv'
SQProPath = 'sonarQubeData/sonarQubeIssueDatesFixed.csv'

# Name matching between SQ and Github: [0] holds the SQ projects, [1] the github projects not using SQ.
matchingSQlists = nameMatching(githubProPath, SQProPath)

# Tables meant to hold the per-project totals (currently left empty, see the disabled lines below).
colnames = ['projectname', 'commit_count', 'developer_count']
countsDFSQ = pd.DataFrame(columns=colnames)
countsDFnonSQ = pd.DataFrame(columns=colnames)

# Commit crawling for the apache projects that use SQ.
for i in matchingSQlists[0]:

    # The repository name in the API call must not carry the 'apache_' prefix.
    if i.startswith('apache_'):
        i = i[len('apache_'):]

    commitCounter = 0
    updateissuetablename = ''.join(('testCommits/commitsDuringFollowUp/SQprojects/', i, '.csv'))
    projectCounters = getCommitTablebyProject("apache/" + i, updateissuetablename, commitCounter, headers)
    # countsDFSQ.append({'projectname': i, 'commit_count': projectCounters[1], 'developer_count': projectCounters[0]}, ignore_index=True)

    # Short pause between projects.
    time.sleep(10)

# Commit crawling for the rest of the github projects, which do not appear in SQ.
for i in matchingSQlists[1]:

    commitCounter = 0
    updateissuetablename = ''.join(('testCommits/commitsDuringFollowUp/nonSQprojects/', i, '.csv'))
    projectCounters = getCommitTablebyProject('apache/' + i, updateissuetablename, commitCounter, headers)
    # countsDFnonSQ.append({'projectname': i, 'commit_count': projectCounters[1], 'developer_count': projectCounters[0]}, ignore_index=True)

    # Short pause between projects.
    time.sleep(10)

# Storing of total counts in csv files (disabled).
# countsDFSQ.to_csv('/home/mikel/Desktop/project1/githubProjectsIssueCounts/commitCountTables/ApacheSQPros.csv')
# countsDFnonSQ.to_csv('/home/mikel/Desktop/project1/githubProjectsIssueCounts/commitCountTables/ApachenonSQPros.csv')


apache/tapestry3 page 1
Remaining API call rate: 3679
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 2
Remaining API call rate: 3678
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 3
Remaining API call rate: 3677
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 4
Remaining API call rate: 3676
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 5
Remaining API call rate: 3675
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 6
Remaining API call rate: 3674
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 7
Remaining API call rate: 3673
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 8
Remaining API call rate: 3672
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 9
Remaining API call rate: 3671
200
Commit amount: 100, project: apache/tapestry3
apache/tapestry3 page 10
Remaining API call rate: 3670
200
Commi