In [82]:
# Requires PyGithub 2.0.1.rc01 or later, which handles GitHub rate limiting (automatic back-off and retry)

/Users/justin/work/explore/play-data-science/social-mining


In [9]:
from github import Github
import pandas as pd
import numpy as np
import json
import requests
import configparser
from datetime import date, timedelta
from timeit import default_timer as timer
from pathlib import Path

In [12]:
def load_access_token():
    """Return the GitHub access token stored in ``credential.ini``.

    The file is looked up relative to the current working directory and is
    expected to contain a ``[github]`` section with an ``access_token`` key.

    Raises:
        KeyError: if the ``github`` section or the ``access_token`` key is
            absent. A missing file surfaces the same way, because
            ``ConfigParser.read`` silently skips files it cannot open.
    """
    config = configparser.ConfigParser()
    config.read('credential.ini')
    return config['github']['access_token']

def search_repo_iteratively(client, dict_list, lang, fork, stars, start, end):
    """Run one repository search and append every hit's raw payload to ``dict_list``.

    Args:
        client: object exposing ``search_repositories(query, sort=..., order=...)``
            (a PyGithub ``Github`` client in this notebook).
        dict_list: list extended in place with one plain ``dict`` per repository.
        lang: value for the ``lang:`` search qualifier.
        fork: truthy -> ``fork:true``, falsy -> ``fork:false``.
        stars: lower bound used in ``stars:>=``.
        start, end: bounds of the ``created:`` range, already formatted as text.

    Returns:
        None; results are delivered through ``dict_list``.
    """
    fork_flag = str(bool(fork)).lower()
    # An earlier variant of this query additionally filtered on
    # "archived:false template:false".
    query = f"lang:{lang} stars:>={stars} fork:{fork_flag} created:{start}..{end}"
    hits = client.search_repositories(query, sort="stars", order="desc")
    # Read the already-fetched payload straight from the instance dict and
    # store a shallow copy of it.
    dict_list.extend(dict(vars(hit)['_rawData'].items()) for hit in hits)
        
def daterange(start_date, end_date, slice):
    """Split the inclusive span ``start_date..end_date`` into consecutive windows.

    Every window covers ``slice`` days except possibly the last one, which is
    shortened so that it ends exactly on ``end_date``.

    Args:
        start_date: first day of the span (``datetime.date``).
        end_date: last day of the span, inclusive.
        slice: window length in days; must be positive.

    Yields:
        ``(window_start, window_end)`` tuples, both ends inclusive.
        Nothing is yielded when ``end_date`` precedes ``start_date``.

    Raises:
        ValueError: if ``slice`` is not positive (raised when iteration starts).
    """
    if slice <= 0:
        raise ValueError(f"slice must be a positive number of days, got {slice!r}")
    tot_days = (end_date - start_date).days + 1
    # With a non-positive day count the old floor-div/modulo arithmetic
    # produced a spurious window *before* start_date; range() simply yields
    # nothing in that case.
    for offset in range(0, tot_days, slice):
        s = start_date + timedelta(offset)
        # Clamp the final window to the days that are actually left.
        e = s + timedelta(min(slice, tot_days - offset) - 1)
        yield (s, e)

def yearrange(start_year, end_year, slice):
    """Split the inclusive year span ``start_year..end_year`` into calendar buckets.

    Every bucket covers ``slice`` whole years except possibly the last one,
    which is shortened so that it ends with ``end_year``.

    Args:
        start_year: first year of the span.
        end_year: last year of the span, inclusive.
        slice: bucket length in years; must be positive.

    Yields:
        ``(date(first_year, 1, 1), date(last_year, 12, 31))`` tuples.
        Nothing is yielded when ``end_year`` precedes ``start_year``.

    Raises:
        ValueError: if ``slice`` is not positive (raised when iteration starts).
    """
    if slice <= 0:
        raise ValueError(f"slice must be a positive number of years, got {slice!r}")
    # Stepping with range() avoids the floor-div/modulo arithmetic that
    # produced a spurious bucket for a reversed span (e.g. 2022..2020).
    for first_year in range(start_year, end_year + 1, slice):
        last_year = min(first_year + slice - 1, end_year)
        yield (date(first_year, 1, 1), date(last_year, 12, 31))
        
def format_date(d):
    """Render ``d`` as ``YYYY-MM-DD`` text via its ``strftime`` method."""
    day_pattern = "%Y-%m-%d"
    return d.strftime(day_pattern)

def collect_data(start_year, end_year, extra_year_range, lang, fork, stars, slice, subdir, years_per_file=2):
    """Harvest repository search results and save them as Excel workbooks.

    The span ``start_year..end_year`` is cut into buckets of ``years_per_file``
    years (plus ``extra_year_range`` when given). Buckets are processed newest
    first; each bucket is searched in windows of ``slice`` days and written to
    ``<subdir>/<lang>-repo-<stars>-<start>-<end>.xlsx``. Finally all buckets
    are concatenated into ``<subdir>/<lang>-repo-<stars>-combined.xlsx``.

    Args:
        start_year: first year to collect.
        end_year: last year to collect, inclusive.
        extra_year_range: optional ``(start_date, end_date)`` pair appended
            after the year buckets (e.g. a partial current year), or ``None``.
        lang: language passed to the search query.
        fork: fork flag passed to the search query.
        stars: minimum star count passed to the search query.
        slice: number of days covered by each individual search request.
        subdir: output directory; created (including parents) if missing.
        years_per_file: number of years per workbook; defaults to 2.

    Notes:
        The combined workbook is built from the in-memory frames rather than
        by re-reading the per-bucket workbooks. Re-reading turned each
        workbook's saved index into an extra ``Unnamed: 0`` column, so the
        combined file no longer carries that column.
    """
    sub = Path(subdir)
    # parents=True lets a nested output directory be created in one call.
    sub.mkdir(parents=True, exist_ok=True)
    client = Github(load_access_token(), per_page=100)

    date_ranges = list(yearrange(start_year, end_year, years_per_file))
    if extra_year_range is not None:
        date_ranges.append(extra_year_range)
    # Process the most recent bucket first.
    date_ranges.reverse()

    frames = []
    for date_range in date_ranges:
        range_start, range_end = date_range[0], date_range[1]
        t0 = timer()
        dict_list = []
        for t in daterange(range_start, range_end, slice):
            search_repo_iteratively(client, dict_list, lang, fork, stars, format_date(t[0]), format_date(t[1]))
        t1 = timer()
        print(f"Collect {lang} data between {range_start} and {range_end} took {t1-t0} seconds")
        df = pd.DataFrame(dict_list)
        df.to_excel(sub / ("%s-repo-%d-%s-%s.xlsx" % (lang, stars, range_start, range_end)))
        frames.append(df)
        t2 = timer()
        print(f"Save {lang} data between {range_start} and {range_end} took {t2-t1} seconds")

    if not frames:
        # pd.concat raises ValueError on an empty list; there is nothing to combine.
        print(f"No date ranges to collect for {lang}; combined workbook not written")
        return

    t3 = timer()
    combined = pd.concat(frames)
    combined.to_excel(sub / ("%s-repo-%d-combined.xlsx" % (lang, stars)))
    t4 = timer()
    print(f"Combine and save {lang} data took {t4-t3} seconds")

### Collect golang repositories with stars>=10, including forks (to compare with GHS)

In [11]:
%%time
subdir="ghs-compare"
fork = True
stars = 10
slice = 15
lang="golang"
start_year = 2008
end_year = 2022
extra = (date(2023, 1, 1), date(2023, 8, 11))
collect_data(start_year, end_year, extra, lang, fork, stars, slice, subdir)

Collect golang data between 2023-01-01 and 2023-08-11 took 70.19333025906235 seconds
Save golang data between 2023-01-01 and 2023-08-11 took 2.9992968959268183 seconds
Collect golang data between 2022-01-01 and 2022-12-31 took 176.9628741350025 seconds
Save golang data between 2022-01-01 and 2022-12-31 took 7.681846981053241 seconds
Collect golang data between 2020-01-01 and 2021-12-31 took 496.4912161080865 seconds
Save golang data between 2020-01-01 and 2021-12-31 took 21.172533147968352 seconds
Collect golang data between 2018-01-01 and 2019-12-31 took 520.5672592349583 seconds
Save golang data between 2018-01-01 and 2019-12-31 took 21.336455759010278 seconds
Collect golang data between 2016-01-01 and 2017-12-31 took 416.8428523009643 seconds
Save golang data between 2016-01-01 and 2017-12-31 took 16.97026139998343 seconds
Collect golang data between 2014-01-01 and 2015-12-31 took 283.73086229898036 seconds
Save golang data between 2014-01-01 and 2015-12-31 took 12.62168198893778 se

Request GET /search/repositories?sort=stars&order=desc&q=lang%3Agolang+stars%3A%3E%3D10+fork%3Atrue+created%3A2013-08-08..2013-08-22&per_page=100 failed with 403: Forbidden
Setting next backoff to 2.187122s


Collect golang data between 2012-01-01 and 2013-12-31 took 108.50337658403441 seconds
Save golang data between 2012-01-01 and 2013-12-31 took 4.349049465032294 seconds


Request GET /search/repositories?sort=stars&order=desc&q=lang%3Agolang+stars%3A%3E%3D10+fork%3Atrue+created%3A2010-07-15..2010-07-29&per_page=100 failed with 403: Forbidden
Setting next backoff to 14.011813s
Request GET /search/repositories?sort=stars&order=desc&q=lang%3Agolang+stars%3A%3E%3D10+fork%3Atrue+created%3A2011-10-08..2011-10-22&per_page=100 failed with 403: Forbidden
Setting next backoff to 35.180746s


Collect golang data between 2010-01-01 and 2011-12-31 took 91.34662862902042 seconds
Save golang data between 2010-01-01 and 2011-12-31 took 0.8600214379839599 seconds


Request GET /search/repositories?sort=stars&order=desc&q=lang%3Agolang+stars%3A%3E%3D10+fork%3Atrue+created%3A2008-12-26..2009-01-09&per_page=100 failed with 403: Forbidden
Setting next backoff to 38.737497s


Collect golang data between 2008-01-01 and 2009-12-31 took 70.71417723502964 seconds
Save golang data between 2008-01-01 and 2009-12-31 took 0.09253387490753084 seconds
Combine and save golang data took 155.88087240199093 seconds
CPU times: user 4min 40s, sys: 4.44 s, total: 4min 44s
Wall time: 41min 19s


### combine all golang repositories with stars>=10, non-fork

In [94]:
# Stitch the per-range workbooks under round2/ into a single CSV file.
# NOTE(review): this cell reads `date_ranges`, `lang` and `stars` from the
# kernel's global namespace. `lang` and `stars` are assigned by the collection
# cell, but no visible cell assigns `date_ranges` at notebook scope (inside
# collect_data it is only a local), so on a fresh kernel this presumably
# raises NameError unless it is defined elsewhere - confirm.
cost_dfs = []
for date_range in date_ranges:
    # Same file-name pattern that collect_data uses for its per-range
    # workbooks, with the directory fixed to round2/.
    # NOTE(review): read_excel is called without index_col; if these workbooks
    # were saved with their index, it comes back as an extra column - confirm.
    df = pd.read_excel("round2/%s-repo-%d-%s-%s.xlsx" % (lang, stars, date_range[0], date_range[1]))
    cost_dfs.append(df)
combined = pd.concat(cost_dfs)
combined.to_csv('round2/combined.csv')


## DEBUGGING CELLS

In [None]:
# Sanity check: print the 15-day windows daterange() produces for June-July 2023.
start_date = date(2023, 6, 1)
end_date = date(2023, 7, 31)

for window_start, window_end in daterange(start_date, end_date, 15):
    print(window_start, window_end)

In [None]:
# Sanity check: list everyone who starred one known repository.
client = Github(load_access_token(), per_page=100)
user = client.get_user("schnell18")
repo = user.get_repo("influx-demo")
stargazers = repo.get_stargazers()
for stargazer in stargazers:
    print(stargazer)

In [105]:
# Sanity check: print the 3-year buckets yearrange() produces for 2008-2022.
start_year = 2008
end_year = 2022

for bucket_start, bucket_end in yearrange(start_year, end_year, 3):
    print(bucket_start, bucket_end)

2008-01-01 2010-12-31
2011-01-01 2013-12-31
2014-01-01 2016-12-31
2017-01-01 2019-12-31
2020-01-01 2022-12-31
