## Import modules

In [76]:
#importing all the necessary packages
import requests
import csv
import pandas as pd
import matplotlib.pyplot as mp
import seaborn as sb
import numpy as np
import json
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import re

## Define a class for GitHub Repository

In [36]:
# Define a class to represent a GitHub Repository
class GitHubRepository:
    def __init__(self, repo_name, owner, description, homepage, license, forks, watchers, date_of_collection):
        self.repo_name = repo_name
        self.owner = owner
        self.description = description
        self.homepage = homepage
        self.license = license  # License could be another class with its own attributes
        self.forks = forks
        self.watchers = watchers
        self.date_of_collection = date_of_collection

    def __str__(self):
        return f"{self.owner}/{self.repo_name}: {self.description} ({self.watchers})"


In [39]:
url = 'https://api.github.com/repos/JabRef/jabref'

#http request to the URL and collecting it reponse
response = requests.get(url)

#the reponse data is converted json
data=response.json()

#print(json.dumps(data, indent=4))

repo_data = {
    "repo_name": data['name'],
    "owner": data['owner']['login'],
    "description": data['description'],
    "homepage": data['homepage'],
    "license": data['license'],
    "forks": data['forks'],
    "watchers": data['watchers'],
    "date_of_collection": datetime.now().timestamp()
}

my_repo = GitHubRepository(**repo_data)

# Access attributes of the repository object
#print(my_repo.forks)
#print(my_repo.license)
print(my_repo)

JabRef/jabref: Graphical Java application for managing BibTeX and biblatex (.bib) databases (3195)


## Define a class for PullRequests

In [4]:
# Define a class to represent a GitHub Repository
class PullRequest:
    def __init__(self, title, number, body, state, date_of_creation, closing_date, user):
        self.title = title
        self.number = number
        self.body = body
        self.state = state
        self.license = license  # License could be another class with its own attributes
        self.date_of_creation = date_of_creation
        self.closing_date = closing_date
        self.user = user

    def __str__(self):
        return f"{self.number}: {self.title})"


In [9]:
#saving jabref api in URl variable
url='https://api.github.com/search/issues?q=is:pr+repo:jabref/jabref'

#http request to the URL and collecting it reponse
response = requests.get(url)

#the reponse data is converted json
data=response.json()

#collecting title , pull request ,body , state, date of creation , closing date , user details of the repositories in respo.csv file
pullrequest_list = []
for i in data['items']: #collecting all responses from items key
    pr = PullRequest(i['title'],i['number'],i['body'],i['state'],i['created_at'],i['closed_at'],i['user']['login'])
    pullrequest_list.append(pr)

print(pullrequest_list[1])

10642: [Bot] Update CSL styles)


## From list of Pull Requests, get username and number of pull requests

In [15]:
# Function to extract user and pull request count
def extract_user_pull_request_count(pull_requests):
    user_pull_request_count = {}

    for pr in pull_requests:
        user = pr.user
        if user in user_pull_request_count:
            user_pull_request_count[user] += 1
        else:
            user_pull_request_count[user] = 1

    return user_pull_request_count

pullrequest_count = extract_user_pull_request_count(pullrequest_list)
print(pullrequest_count)

{'koppor': 7, 'dependabot[bot]': 12, 'JawadTUE': 1, 'Siedlerchr': 2, 'ThiloteE': 1, 'LenkaBuebnkova': 1, 'k3KAW8Pnf7mkmdSMPHz27': 2, 'JamesZhang11': 1, 'u7500248': 1, 'michalfarago': 1, 'lisongxuan': 1}


## Scrape data from the user profile page on GitHub

In [80]:
result = requests.get('https://github.com/koppor')
content = result.content
soup = BeautifulSoup(content, "html.parser")

#print(soup.prettify())

username = 'koppor'

# Extract the required information
repositories_count_element = soup.find('span', {'class': 'Counter'})
repositories_count = repositories_count_element.text.strip() if repositories_count_element else 'N/A'

followers_following = soup.find_all('span', {'class':"text-bold color-fg-default"})#.text.strip()
followers_count = followers_following[0].text.strip()
following_count = followers_following[1].text.strip()

contributions = soup.find('h2', {'class':"f4 text-normal mb-2"}).text.strip()
contributions= re.split("\s", contributions)[0]

# Print the results
print(f"Repositories: {repositories_count}")
print(f"Followers: {followers_count}")
print(f"Following: {following_count}")
print(f"Contributions: {contributions}")


Repositories: 309
Followers: 283
Following: 497
Contributions: 2,189


In [11]:
#saving jabref api in URl variable
url='https://api.github.com/search/issues?q=is:pr+repo:jabref/jabref'

#http request to the URL and collecting it reponse
response = requests.get(url)

#the reponse data is converted json
data=response.json()

#collecting title , pull request ,body , state, date of creation , closing date , user details of the repositories in respo.csv file
with open('repos.csv', 'w', newline='',encoding="utf-8") as file: #creating repos.csv file in write mode
    writer = csv.writer(file)
    writer.writerow(["title","number","body","state","date of creation","closing date","user"]) #creating column names
    j=0
    for i in data['items']: #collecting all responses from items key
        title,number,body,state,created_at,closed_at,user=(i['title'],i['number'],i['body'],i['state'],i['created_at'],i['closed_at'],i['user']['login'])
        writer.writerow([title,number,body,state,created_at,closed_at,user])
        j=j+1

#reading repos.cvs into pandas data frame
repos_df=pd.read_csv('repos.csv')
print(repos_df)

                                                title  number  \
0             [Bot] Update journal abbreviation lists   10645   
1                             [Bot] Update CSL styles   10642   
2                    Use clparse (instead of heylogs)   10641   
3   Bump org.mockito:mockito-core from 5.6.0 to 5.7.0   10638   
4   Bump org.fxmisc.richtext:richtextfx from 0.11....   10637   
5   Bump com.tngtech.archunit:archunit-junit5-engi...   10636   
6   Bump org.junit.platform:junit-platform-launche...   10635   
7   Bump io.github.classgraph:classgraph from 4.8....   10634   
8         Added parent field to Hayagriva YAML export   10633   
9                          Adapt tests to new heylogs   10631   
10                        Fix NPE in lookup identifer   10630   
11           Add minimum requirements to CHANGELOG.md   10628   
12                             Update to JavaFX 21.01   10627   
13         Bump org.jsoup:jsoup from 1.16.1 to 1.16.2   10625   
14  Bump org.junit.jupite

In [12]:
#reading repos.cvs into pandas data frame
repos_df=pd.read_csv('repos.csv')
print(repos_df)

                                                title  number  \
0             [Bot] Update journal abbreviation lists   10645   
1                             [Bot] Update CSL styles   10642   
2                    Use clparse (instead of heylogs)   10641   
3   Bump org.mockito:mockito-core from 5.6.0 to 5.7.0   10638   
4   Bump org.fxmisc.richtext:richtextfx from 0.11....   10637   
5   Bump com.tngtech.archunit:archunit-junit5-engi...   10636   
6   Bump org.junit.platform:junit-platform-launche...   10635   
7   Bump io.github.classgraph:classgraph from 4.8....   10634   
8         Added parent field to Hayagriva YAML export   10633   
9                          Adapt tests to new heylogs   10631   
10                        Fix NPE in lookup identifer   10630   
11           Add minimum requirements to CHANGELOG.md   10628   
12                             Update to JavaFX 21.01   10627   
13         Bump org.jsoup:jsoup from 1.16.1 to 1.16.2   10625   
14  Bump org.junit.jupite

In [30]:
#collecting number columns values into list
num_list=repos_df['number'].tolist()
print(num_list)

[10642, 10641, 10638, 10637, 10636, 10635, 10634, 10633, 10631, 10630, 10628, 10627, 10625, 10624, 10623, 10622, 10621, 10620, 10619, 10618, 10617, 10616, 10614, 10613, 10612, 10611, 10610, 10609, 10608, 10607]


In [31]:
#collecting owner , repo , number of commits , additions , deletions , changed files details in repos/owner-repo.csv files
for i in num_list:
    url='https://api.github.com/repos/JabRef/jabref/pulls/'+str(i) #api call for each user based on number of Github pulls
    response = requests.get(url) #http request to the URL and collecting it reponse
    data=response.json() #the reponse data is converted json
    owner=data['user']['login']
    repo=data['head']['repo']['name']
    commits=data['commits']
    additions=data['additions']
    deletions=data['deletions']
    changed_files=data['changed_files']
    file_name='repos'+owner+'-'+repo+'.csv'
    csvfile = open(file_name, 'w')
    rowWriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_NONE)
    rowWriter.writerow (["number of commits","additions","deletions","changed_files"]) #creating columns
    rowWriter.writerow([commits,additions,deletions,changed_files]) #writing data into csv file
    csvfile.close()

In [32]:
#collecting number of follower , followings , repos of a user into users.cvs file
l=list(set(repos_df['user']))
with open('users.csv', 'w', newline='',encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["user","number of repositories","number of followers","number of following"])
    for i in l:
        repo_url='https://api.github.com/users/'+str(i)+'/repos'
        followers_url='https://api.github.com/users/'+str(i)+'/followers'
        following_url='https://api.github.com/users/'+str(i)+'/following'
        response1= requests.get(repo_url) #http request to the URL and collecting it reponse
        response2 = requests.get(followers_url)#http request to the URL and collecting it reponse
        response3 = requests.get(following_url)#http request to the URL and collecting it reponse
        data1=response1.json()#the reponse data is converted json
        data2=response2.json()#the reponse data is converted json
        data3=response3.json()#the reponse data is converted json
        num_repos=len(data1)
        num_followers=len(data2)
        num_following=len(data3)
        writer.writerow([i,num_repos,num_followers,num_following]) #saving the data into cvs file

In [39]:
num_list=repos_df['number'].tolist()
print(num_list)
#collecting owner , repo number of commits addition , deletions in all repos.cvs file
with open('all_repos.csv', 'w', newline='',encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["user","repo_name","commits","additions","deletions","changed_files"])
    for i in num_list:
        url='https://api.github.com/repos/JabRef/jabref/pulls/'+str(i)
        response = requests.get(url)#http request to the URL and collecting it reponse
        data=response.json()#the reponse data is converted json
        owner=data['user']['login']
        repo=data['head']['repo']['name']
        commits=data['commits']
        additions=data['additions']
        deletions=data['deletions']
        changed_files=data['changed_files']
        writer.writerow([owner,repo,commits,additions,deletions,changed_files]) #saving data in cvs file

[10642, 10641, 10638, 10637, 10636, 10635, 10634, 10633, 10631, 10630, 10628, 10627, 10625, 10624, 10623, 10622, 10621, 10620, 10619, 10618, 10617, 10616, 10614, 10613, 10612, 10611, 10610, 10609, 10608, 10607]


KeyError: ignored

In [None]:
users_df=pd.read_csv('users.csv') #collecting cvs data into pandas data file
all_repos=pd.read_csv('all_repos1.csv') #collecting cvs data into pandas data file

In [None]:
all_repos['state']=np.where(all_repos['user']==repos_df['user'],repos_df['state'],'') #collecting state details in all repos data frame

In [None]:
#box plot that comapares closed vs open pulled requests in terms of number of commits
sb.set_style("whitegrid")











sb.boxplot(x = 'state', y = 'commits', data = all_repos)

In [None]:
#box plot that comapares closed vs open pulled requests in terms of additions
sb.set_style("whitegrid")
fig, ax = mp.subplots(figsize=(10, 6))
sb.boxplot(x = 'additions', y = 'commits', data = all_repos,ax=ax)

In [None]:
#box plot that comapares closed vs open pulled requests in terms of deletions
sb.set_style("whitegrid")
fig, ax = mp.subplots(figsize=(10, 6))
sb.boxplot(x = 'deletions', y = 'commits', data = all_repos,ax=ax)

In [None]:
#box plot that comapares closed vs open pulled requests in terms of deletions
sb.set_style("whitegrid")
fig, ax = mp.subplots(figsize=(10, 6))
sb.boxplot(x = 'user', y = 'changed_files', data = all_repos,ax=ax)

In [None]:
#a scatterplot that showws the relationship between additions and deletions
all_repos.plot.scatter(x='additions', y='deletions', title= "Scatter plot between two variables X and Y");

mp.show(block=True);

In [None]:
#line graph showing the total number of pull requests per day
repos_df.plot( 'date of creation' , 'number' )

In [None]:
#bars comparing the number of users per repo
users_df.plot(x="number of repositories", y="number of followers", kind="bar")

In [None]:
#corelation between all the numeric data in pull requests for a repo
print(all_repos.corr())

# plotting correlation heatmap
dataplot = sb.heatmap(all_repos.corr(), cmap="YlGnBu", annot=True)

# displaying heatmap
mp.show()

In [None]:
#corelation between all the numeric data in pull requests for a repo
print(users_df.corr())

# plotting correlation heatmap
dataplot = sb.heatmap(users_df.corr(), cmap="YlGnBu", annot=True)

# displaying heatmap
mp.show()