# Creating a representative set of GitHub Repos

In [90]:
from github import Github, GithubException
import json
import pandas as pd
import configparser
from tqdm import tqdm

In [35]:
def get_access_token(config_path='../config.cfg'):
    """Read Github API access token from config file.

    Args:
        config_path (str): Path to an INI-style config file with a ``token``
            option inside an ``[ACCESS]`` section. Defaults to
            ``'../config.cfg'``, which is resolved relative to the current
            working directory.

    Returns:
        str: Access Token

    Raises:
        FileNotFoundError: If the config file could not be read.
        KeyError: If the ``ACCESS`` section or its ``token`` option is missing.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed. An empty list therefore means the
    # config file is missing; fail with a clear message here instead of the
    # opaque KeyError: 'ACCESS' that the lookup below would raise.
    if not config.read(config_path):
        raise FileNotFoundError(f"Could not read config file: {config_path!r}")
    return config['ACCESS']['token']

In [36]:
# Authenticated GitHub API client; `g` is reused by the later cells that query repositories.
g = Github(get_access_token())

## Southampton ePrints

Considering only repos from the Southampton ePrints within the last year. Problem: there are quite few repos, and huge repos with over 10,000 commits are missing.

In [34]:
# Load the mapping of PDF link -> list of {"user": ..., "repo": ...} entries.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open("data/cleaned_urls_eprints.soton.ac.uk_2022-_github.com.json", "r", encoding="utf-8") as f:
    repo_dict = json.load(f)

In [65]:
# Column-wise accumulator for the ePrints repo metrics: one independent, initially
# empty list per output column.
df_dict = {
    column: []
    for column in (
        "pdf_link",
        "user_name",
        "repo_name",
        "stars",
        "watchers",
        "forks",
        "commits_no",
        "contributors_no",
        "size_kb",
    )
}

In [66]:
# Start from empty columns so that re-running this cell cannot append duplicate rows.
for column_values in df_dict.values():
    column_values.clear()

# Query the GitHub API once per (PDF, repo) pair and record its metrics.
for pdf_link, repos in tqdm(repo_dict.items()):
    for repo_data in repos:
        repo = g.get_repo(f"{repo_data['user']}/{repo_data['repo']}")
        try:
            commits_no = repo.get_commits().totalCount
        except GithubException:
            # Same fallback as parse_samples further down: if the commit list
            # cannot be fetched (e.g. a repository without any commits), count
            # it as 0 instead of aborting the whole run.
            commits_no = 0
        # Gather the complete row first and append it only afterwards, so that
        # a failing API call can never leave the columns with unequal lengths.
        row = {
            "pdf_link": pdf_link,
            "user_name": repo_data['user'],
            "repo_name": repo_data['repo'],
            "stars": repo.get_stargazers().totalCount,
            "watchers": repo.get_subscribers().totalCount,
            "forks": repo.get_forks().totalCount,
            "commits_no": commits_no,
            "contributors_no": repo.get_contributors().totalCount,
            "size_kb": repo.size,
        }
        for column, value in row.items():
            df_dict[column].append(value)

100%|██████████| 17/17 [00:34<00:00,  2.04s/it]


In [67]:
# Turn the column-wise lists collected above into a DataFrame (one row per PDF/repo pair).
df = pd.DataFrame(df_dict)

In [68]:
# Display the collected ePrints repo metrics.
df

Unnamed: 0,pdf_link,user_name,repo_name,stars,watchers,forks,commits_no,contributors_no,size_kb
0,https://eprints.soton.ac.uk/457919/1/CLPsych_S...,stuartemiddleton,uos_clpsych,1,3,1,88,3,2985
1,https://eprints.soton.ac.uk/473266/1/2211.1562...,paboyle,Grid,132,30,84,6868,37,63648
2,https://eprints.soton.ac.uk/458195/1/SIGIR_202...,TGMclustering,TGMclustering,0,1,0,8,1,571
3,https://eprints.soton.ac.uk/457426/1/ACMFACCT_...,seatgeek,fuzzywuzzy,8848,268,883,384,60,336
4,https://eprints.soton.ac.uk/454952/1/2201.1170...,JAEarly,MILLI,10,2,1,3,1,24795
5,https://eprints.soton.ac.uk/454808/1/JPAL_AAMA...,ilkaza,JPAL-HA,1,2,0,17,1,1320
6,https://eprints.soton.ac.uk/457013/1/EmeCom_at...,olipinski,rl_werewolf,1,1,0,362,2,5524
7,https://eprints.soton.ac.uk/467378/1/Multiobje...,Miya-Liu,equitable-ridesharing,0,1,0,7,1,90114
8,https://eprints.soton.ac.uk/467378/2/4.pdf,Miya-Liu,equitable-ridesharing,0,1,0,7,1,90114
9,https://eprints.soton.ac.uk/468206/1/Camera_re...,AntoniaMarcu,Data-Modification,1,1,0,2,1,194


## Overall GitHub

"Sampling" from GitHub, taking stars as indicator.

In [97]:
def parse_samples(slice):
    """Collect basic metrics for a sequence of GitHub repositories.

    Args:
        slice: Iterable of repository objects, e.g. a slice of a repository
            search result. Each object must provide ``owner.login``, ``name``,
            ``size`` and the ``get_stargazers``, ``get_subscribers``,
            ``get_forks``, ``get_commits`` and ``get_contributors`` methods,
            whose results expose ``totalCount``.

    Returns:
        pandas.DataFrame: One row per repository with the columns
        ``user_name``, ``repo_name``, ``stars``, ``watchers``, ``forks``,
        ``commits_no``, ``contributors_no`` and ``size_kb``.
    """
    column_names = (
        "user_name",
        "repo_name",
        "stars",
        "watchers",
        "forks",
        "commits_no",
        "contributors_no",
        "size_kb",
    )
    samples_dict = {column: [] for column in column_names}

    for repo in slice:
        owner_login = repo.owner.login
        repo_name = repo.name
        star_total = repo.get_stargazers().totalCount
        watcher_total = repo.get_subscribers().totalCount
        fork_total = repo.get_forks().totalCount
        try:
            commit_total = repo.get_commits().totalCount
        except GithubException:
            # Best effort: any GitHub API error while counting commits is
            # recorded as 0 commits rather than aborting the sample.
            commit_total = 0
        contributor_total = repo.get_contributors().totalCount

        row = (
            owner_login,
            repo_name,
            star_total,
            watcher_total,
            fork_total,
            commit_total,
            contributor_total,
            repo.size,
        )
        for column, value in zip(column_names, row):
            samples_dict[column].append(value)

    return pd.DataFrame(samples_dict)

In [98]:
# Tunable sampling parameters.
SAMPLES_PER_INTERVAL = 20     # number of search hits kept per star bucket
CREATED_AFTER = "2018-01-01"  # only repos created after this date are considered

# Star buckets in GitHub search syntax.
# NOTE: `a..b` ranges are inclusive on both ends, so repos with exactly 100 or
# 1000 stars match two neighbouring buckets.
stars_intervals = ["<1", "1..100", "100..1000", "1000..10000", ">10000"]

# Maps each star bucket to a DataFrame of repo metrics. The "sample" is simply
# the first SAMPLES_PER_INTERVAL hits in the order the search returns them.
samples = {}
for interval in tqdm(stars_intervals):
    query = f"stars:{interval} fork:false created:>{CREATED_AFTER}"
    result = g.search_repositories(query)
    samples[interval] = parse_samples(result[:SAMPLES_PER_INTERVAL])

100%|██████████| 5/5 [02:24<00:00, 28.80s/it]


In [100]:
# Inspect the sample collected for the 1000..10000 stars interval.
samples["1000..10000"]

Unnamed: 0,user_name,repo_name,stars,watchers,forks,commits_no,contributors_no,size_kb
0,upscayl,upscayl,9985,58,267,256,12,411174
1,InterviewReady,system-design-resources,9984,205,1104,49,5,70
2,deepmind,alphafold,9968,199,1702,110,14,16387
3,vipstone,faceai,9965,388,2393,150,4,40473
4,qinguoyi,TinyWebServer,9964,76,2706,159,9,57318
5,openai,DALL-E,9961,228,1807,3,1,7
6,gyoogle,tech-interview-for-developer,9949,148,2080,773,59,6632
7,airbytehq,airbyte,9942,172,2443,10312,420,1989906
8,macrozheng,mall-swarm,9936,273,4617,448,1,53241
9,kautukkundan,Awesome-Profile-README-templates,9930,165,7000,344,236,482


In [101]:
# Stack the per-interval samples into a single table. Every per-interval frame
# carries its own 0..n-1 index, so renumber the rows to keep the index labels
# of the combined frame unique.
df = pd.concat(samples.values(), ignore_index=True)

In [103]:
# Persist the combined sample. With pandas' default `index=True`, the row index
# is written as the first, unnamed CSV column.
df.to_csv("data/representative_set.csv")