# Creating a representative set of GitHub Repos

In [1]:
from github import Github, GithubException
import json
import pandas as pd
import configparser
from tqdm import tqdm

In [2]:
def get_access_token(config_path='../config.cfg'):
    """Read the GitHub API access token from a config file.

    The file must be in INI format and provide a ``token`` entry inside an
    ``[ACCESS]`` section.

    Args:
        config_path (str): Path of the config file. Defaults to
            ``'../config.cfg'``, resolved against the current working
            directory.

    Returns:
        str: Access Token

    Raises:
        FileNotFoundError: If the config file is missing or cannot be read.
        KeyError: If the ``[ACCESS]`` section or its ``token`` entry is absent.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; an empty list means nothing was read.
    # Fail here with a clear message instead of a later, opaque KeyError.
    if not config.read(config_path):
        raise FileNotFoundError(
            f"Config file not found or unreadable: {config_path!r}"
        )
    return config['ACCESS']['token']

In [3]:
# Authenticated PyGithub client; the cells below issue all API requests through it.
g = Github(get_access_token())

## Southampton ePrints

Considering only repos referenced in Southampton ePrints within the last year. Problem: this yields only a few repos, and huge repos with more than 10,000 commits are missing.

In [4]:
# Load the mapping from ePrints PDF link to the GitHub repos referenced by it.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open("data/cleaned_urls_eprints.soton.ac.uk_2022-_github.com.json", "r", encoding="utf-8") as f:
    repo_dict = json.load(f)

In [65]:
# Column-wise accumulator for the per-repo statistics: every key maps to its
# own, initially empty, list of values.
df_dict = {
    column: []
    for column in (
        "pdf_link", "user_name", "repo_name", "stars", "watchers",
        "forks", "commits_no", "contributors_no", "size_kb",
    )
}

In [66]:
# Collect activity statistics for every GitHub repo referenced by an ePrints PDF.
# NOTE: this cell appends to df_dict, so re-running it without first
# re-initialising df_dict duplicates every row.
for pdf_link, repos in tqdm(repo_dict.items()):
    for repo_data in repos:
        repo = g.get_repo(f"{repo_data['user']}/{repo_data['repo']}")
        stars = repo.get_stargazers().totalCount
        watchers = repo.get_subscribers().totalCount
        forks = repo.get_forks().totalCount
        try:
            commits_no = repo.get_commits().totalCount
        except GithubException as exc:
            # GitHub answers the commit listing of an empty repository with
            # HTTP 409; count that as 0 commits. Any other API failure
            # (rate limit, permissions, ...) must not be recorded as 0.
            if exc.status != 409:
                raise
            commits_no = 0
        contributors_no = repo.get_contributors().totalCount

        # Append only after every lookup succeeded, so a failing request can
        # never leave the column lists with different lengths.
        row = {
            "pdf_link": pdf_link,
            "user_name": repo_data['user'],
            "repo_name": repo_data['repo'],
            "stars": stars,
            "watchers": watchers,
            "forks": forks,
            "commits_no": commits_no,
            "contributors_no": contributors_no,
            "size_kb": repo.size,
        }
        for column, value in row.items():
            df_dict[column].append(value)

100%|██████████| 17/17 [00:34<00:00,  2.04s/it]


In [67]:
# Turn the column-wise accumulator into a DataFrame (one row per PDF/repo pair).
df = pd.DataFrame(df_dict)

In [68]:
# Display the statistics collected for the ePrints repos.
df

Unnamed: 0,pdf_link,user_name,repo_name,stars,watchers,forks,commits_no,contributors_no,size_kb
0,https://eprints.soton.ac.uk/457919/1/CLPsych_S...,stuartemiddleton,uos_clpsych,1,3,1,88,3,2985
1,https://eprints.soton.ac.uk/473266/1/2211.1562...,paboyle,Grid,132,30,84,6868,37,63648
2,https://eprints.soton.ac.uk/458195/1/SIGIR_202...,TGMclustering,TGMclustering,0,1,0,8,1,571
3,https://eprints.soton.ac.uk/457426/1/ACMFACCT_...,seatgeek,fuzzywuzzy,8848,268,883,384,60,336
4,https://eprints.soton.ac.uk/454952/1/2201.1170...,JAEarly,MILLI,10,2,1,3,1,24795
5,https://eprints.soton.ac.uk/454808/1/JPAL_AAMA...,ilkaza,JPAL-HA,1,2,0,17,1,1320
6,https://eprints.soton.ac.uk/457013/1/EmeCom_at...,olipinski,rl_werewolf,1,1,0,362,2,5524
7,https://eprints.soton.ac.uk/467378/1/Multiobje...,Miya-Liu,equitable-ridesharing,0,1,0,7,1,90114
8,https://eprints.soton.ac.uk/467378/2/4.pdf,Miya-Liu,equitable-ridesharing,0,1,0,7,1,90114
9,https://eprints.soton.ac.uk/468206/1/Camera_re...,AntoniaMarcu,Data-Modification,1,1,0,2,1,194


## Overall GitHub

"Sampling" from GitHub, taking stars as indicator.

In [36]:
def parse_samples(slice):
    """Collect activity statistics for a sequence of GitHub repositories.

    Args:
        slice: Iterable of PyGithub ``Repository`` objects, e.g. a slice of a
            repository search result. The name shadows the ``slice`` builtin
            but is kept so existing keyword callers continue to work.

    Returns:
        pandas.DataFrame: One row per repository with the columns
        ``user_name``, ``repo_name``, ``stars``, ``watchers``, ``forks``,
        ``commits_no``, ``contributors_no`` and ``size_kb``. An empty input
        yields an empty frame that still has these columns.

    Raises:
        GithubException: For any API failure other than the HTTP 409 that the
            commit listing returns for an empty repository.
    """
    columns = [
        "user_name", "repo_name", "stars", "watchers",
        "forks", "commits_no", "contributors_no", "size_kb",
    ]
    records = []
    for s in slice:
        stars = s.get_stargazers().totalCount
        watchers = s.get_subscribers().totalCount
        forks = s.get_forks().totalCount
        try:
            commits_no = s.get_commits().totalCount
        except GithubException as exc:
            # Only an empty repository (HTTP 409) legitimately means
            # "0 commits"; other failures such as rate limiting would
            # otherwise be silently recorded as 0 and corrupt the sample.
            if exc.status != 409:
                raise
            commits_no = 0
        contributors_no = s.get_contributors().totalCount
        # Build the row as one record so a failing request can never leave
        # columns with different lengths.
        records.append({
            "user_name": s.owner.login,
            "repo_name": s.name,
            "stars": stars,
            "watchers": watchers,
            "forks": forks,
            "commits_no": commits_no,
            "contributors_no": contributors_no,
            "size_kb": s.size,
        })
    # Passing `columns` fixes the column order and keeps the schema for empty input.
    samples_df = pd.DataFrame(records, columns=columns)
    return samples_df

In [37]:
# Take the first 20 search hits from each star-count bucket.
# NOTE(review): adjacent ranges share their boundary value (e.g. 100 appears
# in both "1..100" and "100..1000") — presumably the ranges are inclusive, so
# a repo could fall into two buckets; verify.
# NOTE(review): no sort order is requested, and the sample displayed for
# "1000..10000" sits entirely near the upper bound, so the first 20 hits may
# not be representative of a bucket — confirm.
stars_intervals = ["<1", "1..100", "100..1000", "1000..10000", ">10000"]
samples = {}
for interval in tqdm(stars_intervals):
    query = f"stars:{interval} fork:false created:>2018-01-01"
    result = g.search_repositories(query)
    samples[interval] = parse_samples(result[:20])

100%|██████████| 5/5 [02:23<00:00, 28.75s/it]


In [38]:
# Inspect the sample drawn for the 1000..10000 stars interval.
samples["1000..10000"]

Unnamed: 0,user_name,repo_name,stars,watchers,forks,commits_no,contributors_no,size_kb
0,microsoft,pyright,9992,97,919,5954,87,58929
1,pwxcoo,chinese-xinhua,9978,309,2314,15,3,36271
2,kautukkundan,Awesome-Profile-README-templates,9978,165,7062,344,236,482
3,troyeguo,koodo-reader,9975,80,833,920,10,121224
4,skywind3000,awesome-cheatsheets,9975,274,1855,334,33,396
5,ajeetdsouza,zoxide,9958,39,358,397,52,1488
6,hoffstadt,DearPyGui,9954,139,526,2747,57,144351
7,qarmin,czkawka,9952,88,277,565,50,3317
8,PaddlePaddle,PaddleDetection,9951,184,2477,2128,134,424676
9,qeeqbox,social-analyzer,9891,355,766,869,16,63286


In [39]:
# Stack the per-interval samples into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each interval contributes its own
# 0-based labels and the combined index contains duplicates.
df = pd.concat(samples.values(), ignore_index=True)

In [40]:
# Sanity check: total number of sampled repos across all intervals.
len(df)

100

In [42]:
def compose_repo_link(row) -> str:
    """Build the ``<user_name>/<repo_name>`` identifier for one repository.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``user_name`` and ``repo_name``.

    Returns:
        str: Both values joined by a single slash.
    """
    owner, name = row['user_name'], row['repo_name']
    return f"{owner}/{name}"
# Derive the "user/repo" identifier for every row of the combined sample.
df["github_id"] = df.apply(compose_repo_link, axis="columns")

In [43]:
# Preview the first 10 rows, now including the github_id column.
df.head(10)

Unnamed: 0,user_name,repo_name,stars,watchers,forks,commits_no,contributors_no,size_kb,github_id
0,ObaidaNa,albaath-marks,0,1,0,5,1,630,ObaidaNa/albaath-marks
1,ja153903,jabbariao-dotcom-api,0,1,0,16,1,36,ja153903/jabbariao-dotcom-api
2,paulkeysdev,AirBnB_clone,0,1,0,1,0,49,paulkeysdev/AirBnB_clone
3,evandrogouveia,api-camara,0,1,0,68,1,3545,evandrogouveia/api-camara
4,FelipeCastro2021,Portfolio_FelipeCastro2022,0,1,0,20,1,2384,FelipeCastro2021/Portfolio_FelipeCastro2022
5,pfernandom,blog,0,2,0,92,1,110605,pfernandom/blog
6,DalhousieAI,pytorch-logit-logic,0,2,0,11,1,1714,DalhousieAI/pytorch-logit-logic
7,E-Shop-FR,E-bot,0,0,0,122,3,89,E-Shop-FR/E-bot
8,jveinti2,my-store-angular,0,1,0,3,0,204,jveinti2/my-store-angular
9,svanmeter93,Book-Search-Engine,0,1,0,0,0,0,svanmeter93/Book-Search-Engine


In [44]:
# Persist the combined sample; index=False leaves the DataFrame index out of the CSV.
df.to_csv("data/representative_set.csv", index=False)

## GitHub investigation playground

In [4]:
# Example repository for the exploration cells below. Uses the bound-method
# call on the shared client, matching how repos are fetched elsewhere in this
# notebook.
repo = g.get_repo("Netflix/pollyjs")

In [5]:
# Fetch the repository's README; its content is decoded and analysed in the cells below.
readme = repo.get_readme()

In [21]:
import re
# Markdown ATX heading: 1-6 '#' at the start of a line, then a space and the
# heading text. The inline (?m) flag makes ^ and $ match at every line, so
# "## " in the middle of a line is no longer treated as a heading and a final
# heading without a trailing newline is still found.
# Limitation: '# ...' lines inside fenced code blocks still match.
pattern = r"(?m)^#{1,6} .*$"

In [24]:
# Decode the README content to text and collect all matches of the heading pattern.
headings = re.findall(pattern, readme.decoded_content.decode())

In [27]:
# Strip the leading '#' markers and surrounding spaces/newlines from each match.
cleaned_headings = [heading.strip("# \n") for heading in headings]
cleaned_headings

['Why Polly?',
 'Features',
 'Getting Started',
 'Usage',
 'Credits',
 'Prior Art',
 'Contributors',
 "We're hiring!",
 'License']

In [30]:
# Count the emoji contained in the decoded README text.
from emoji import emoji_count
emoji_count(readme.decoded_content.decode())

6

In [49]:
# Size of CONTRIBUTING.md as reported by the API — presumably in bytes; confirm.
repo.get_contents("CONTRIBUTING.md").size

2929

In [59]:
# Count issues in every state (open and closed).
# NOTE(review): GitHub's issues endpoint reportedly also returns pull
# requests, so this total may include PRs — confirm before using it as a metric.
issues = repo.get_issues(state='all')
issues.totalCount

473

In [64]:
# Creation time of the first issue returned — presumably the most recently
# created one under the API's default ordering; confirm.
issues[0].created_at

datetime.datetime(2023, 3, 14, 18, 12, 6)