In [1]:
from github import Github, NamedUser
from tqdm.notebook import tqdm
import time
import os
import datetime
from collections import Counter
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
def get_timestamp(date):
    """Convert a datetime (or its ``str()`` form) to whole seconds since the epoch.

    Parameters
    ----------
    date : datetime.datetime or str
        Either a datetime object or text shaped like
        ``'YYYY-MM-DD HH:MM:SS'`` with an optional ``'.ffffff'`` fraction.

    Returns
    -------
    int
        Unix timestamp with any fractional second dropped.

    Raises
    ------
    ValueError
        If the text form of ``date`` does not match the expected pattern.
    """
    # A timezone-aware datetime prints with a trailing UTC offset
    # ('... 12:00:00+00:00'), which the strptime patterns below reject.
    # It already names an absolute instant, so convert it directly.
    if isinstance(date, datetime.datetime) and date.utcoffset() is not None:
        return int(date.replace(microsecond=0).timestamp())

    date_str = str(date)
    pattern = '%Y-%m-%d %H:%M:%S'
    if '.' in date_str:
        pattern += '.%f'
    # mktime() reads the parsed fields as *local* time; struct_time carries no
    # sub-second part, so the fraction is discarded here.
    return int(time.mktime(time.strptime(date_str, pattern)))

def make_act_dict(users, times, act:dict=None):
    """Group activity times by user.

    ``users`` and ``times`` are walked in parallel (stopping at the shorter
    one). Every time whose user is not ``None`` is appended to that user's
    list. When ``act`` is supplied it is extended in place and returned;
    otherwise a new dict is built.
    """
    activity = {} if act is None else act
    for who, when in zip(users, times):
        if who is not None:
            activity.setdefault(who, []).append(when)
    return activity

def _as_utc(moment):
    """Return ``moment`` as a timezone-aware UTC datetime (naive values are taken to be UTC)."""
    if moment.tzinfo is None:
        return moment.replace(tzinfo=datetime.timezone.utc)
    return moment.astimezone(datetime.timezone.utc)


def get_issues_lifetime(issues):
    """Return the lifetime of every issue in whole days.

    An issue whose ``state`` is ``'open'`` is measured from ``created_at`` to
    the current time; any other issue is measured from ``created_at`` to
    ``closed_at``.

    Parameters
    ----------
    issues : iterable
        Objects exposing ``state``, ``created_at`` and ``closed_at``
        (datetime values; naive ones are interpreted as UTC, which is how the
        GitHub API reports timestamps).

    Returns
    -------
    numpy.ndarray
        One floor-divided day count per issue that could be processed.
        Issues that raise while being processed are reported with ``print``
        and skipped, so the result may be shorter than ``issues``.
    """
    one_day = datetime.timedelta(days=1)
    # Take "now" once, in UTC, so open issues are compared against the same
    # clock GitHub uses rather than the machine's local wall time.
    now = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)

    issues_lifetime = []
    for issue in issues:
        try:
            end = now if issue.state == 'open' else _as_utc(issue.closed_at)
            issues_lifetime.append((end - _as_utc(issue.created_at)) // one_day)
        except Exception as e:
            # Best effort: one malformed issue must not abort the whole scan.
            print(e)

    return np.array(issues_lifetime)

def fetch_issues(repo):
    """Download every issue of ``repo`` and report the open/closed totals.

    Returns a ``(open_count, closed_count, issues)`` tuple, where ``issues``
    is a list holding each item yielded by ``repo.get_issues(state='all')``.
    A progress bar is shown while the issues are downloaded.
    """
    n_open = repo.get_issues(state='open').totalCount
    n_closed = repo.get_issues(state='closed').totalCount

    every_issue = repo.get_issues(state='all')
    expected_total = repo.get_issues(state='all').totalCount
    issues = list(tqdm(every_issue, total=expected_total))

    return n_open, n_closed, issues


def get_information(repo):
    """Collect commit activity, popularity and issue statistics for ``repo`` and pickle them.

    The result is written to ``minimal_pkls/<owner>____<repo>.pkl`` as a dict
    with these keys:

    * ``'forks'`` / ``'stars'`` - ``repo.forks_count`` / ``repo.stargazers_count``
    * ``'issues_state'`` - array with 0 for closed issues and 1 otherwise
    * ``'issues_lifetime'`` - output of ``get_issues_lifetime``
    * ``'activities'`` - author id -> list of commit timestamps

    Commits without an author are ignored. Commits that raise while being
    read are printed and skipped.

    Raises
    ------
    AssertionError
        If fewer than two distinct commit authors were found.
    """
    commits = [o for o in tqdm(repo.get_commits(), total=repo.get_commits().totalCount)]
    commit_users = []
    commit_times = []
    for commit in commits:
        try:
            author = commit.author
            if author is None:
                continue
            user_id = author.id
            stamp = get_timestamp(commit.commit.committer.date)
        except Exception as e:
            print(e)
            continue
        # Append only after both values were obtained so the two lists can
        # never get out of step (they are zipped together below).
        commit_users.append(user_id)
        commit_times.append(stamp)
    act = make_act_dict(commit_users, commit_times)
    assert len(act) > 1

    forks = repo.forks_count
    stars = repo.stargazers_count
    open_issues, closed_issues, issues = fetch_issues(repo)
    states = np.array([0 if issue.state == 'closed' else 1 for issue in issues])

    stats = {
        'forks': forks,
        'stars' : stars,
        'issues_state' : states,
        'issues_lifetime' : get_issues_lifetime(issues),
        'activities': act
    }

    # Make sure the output directory exists and close the file deterministically.
    os.makedirs('minimal_pkls', exist_ok=True)
    with open(f'minimal_pkls/{"____".join(repo.full_name.split("/"))}.pkl', 'wb') as out_file:
        pickle.dump(stats, out_file)

In [3]:
# Never embed an access token in the notebook: read it from the environment.
# Export GITHUB_TOKEN before starting the kernel; a missing variable raises
# KeyError here instead of silently running unauthenticated.
# Any token that was previously committed in this cell should be revoked.
g = Github(os.environ["GITHUB_TOKEN"])

In [4]:
# Repositories with more than 10,000 stars, and the commit count of each one
# (one entry per search result, in result order).
search = g.search_repositories(query='stars:>10000')
commit_counts = []
for found_repo in tqdm(search, total=search.totalCount):
    commit_counts.append(found_repo.get_commits().totalCount)
n_commits = np.array(commit_counts)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [5]:
# print(np.min(n_commits), np.max(n_commits), np.mean(n_commits), np.std(n_commits))
# plt.figure(figsize=(20,4))
# plt.title('commits')
# plt.hist(n_commits, 50)
# plt.show()

In [35]:
def check_existance(name):
    """Return True when the pickle for repository ``name`` ('owner/repo') already exists."""
    pickle_stem = name.replace('/', '____')
    return os.path.exists(f'minimal_pkls/{pickle_stem}.pkl')
# Positions (into the search results) of repositories whose commit count lies
# in the inclusive range 5,000 .. 80,000.
indices = np.flatnonzero((n_commits >= 5000) & (n_commits <= 80000))
len(indices)

250

In [None]:
# Scrape each selected repository unless its pickle is already on disk.
for idx in tqdm(indices):
    repo = search[int(idx)]
    if not check_existance(repo.full_name):
        print(repo.full_name)
        get_information(repo)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

golang/go


HBox(children=(FloatProgress(value=0.0, max=48337.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=46601.0), HTML(value='')))

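## Fix issues

Migrate the old `owner-repo.pkl` files: drop their stored `issues` entry, re-fetch the repository's issues, and save the result as `owner____repo.pkl`.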

In [7]:
path = 'minimal_pkls/'


def check_existance(name):
    """Return True when ``path`` already holds the '____'-joined pickle for ``name`` ('owner/repo')."""
    return os.path.exists(path + name.replace('/', '____') + '.pkl')

In [32]:
# Length, in seconds, of the window used to schedule a retry after a failed
# fetch (presumably GitHub's hourly rate-limit window - confirm).
BIN = 3600
prev_time = get_timestamp(datetime.datetime.now())
prev_req = 0  # running count of issues fetched without a retry

for idx in tqdm(indices):
    repo = search[int(idx)]
    name = repo.full_name
    if check_existance(name):
        continue
    # Only repositories that still have an old-style 'owner-repo.pkl' need fixing.
    legacy_file = path + '-'.join(name.split('/')) + '.pkl'
    if not os.path.exists(legacy_file):
        continue

    print(name)
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook produced itself.
    with open(legacy_file, 'rb') as legacy_handle:
        obj = pickle.load(legacy_handle)
    # Tolerate files that never had (or already lost) the 'issues' entry.
    obj.pop('issues', None)

    try:
        open_issues, closed_issues, issues = fetch_issues(repo)
        prev_req += len(issues)
    except Exception as fetch_error:
        # 'except Exception' keeps Ctrl-C (KeyboardInterrupt) working, unlike
        # a bare 'except:'.
        print(f'fetch failed ({fetch_error!r}); waiting before retrying')
        now = get_timestamp(datetime.datetime.now())
        # Sleep until the next BIN-second boundary counted from prev_time.
        diff = BIN - ((now - prev_time) % BIN)
        for t in range(diff):
            print(f'sleeping for {t+1}/{diff} seconds...', end='\r')
            time.sleep(1)
        open_issues, closed_issues, issues = fetch_issues(repo)

    states = np.array([0 if issue.state == 'closed' else 1 for issue in issues])
    obj['issues_lifetime'] = get_issues_lifetime(issues)
    obj['issues_state'] = states

    # Close the new file before deleting the old one, so the legacy pickle is
    # removed only once its replacement is fully written.
    with open(path + '____'.join(name.split('/')) + '.pkl', 'wb') as fixed_handle:
        pickle.dump(obj, fixed_handle)
    os.remove(legacy_file)

HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))

reddit-archive/reddit


HBox(children=(FloatProgress(value=0.0, max=891.0), HTML(value='')))


facebook/relay


HBox(children=(FloatProgress(value=0.0, max=3519.0), HTML(value='')))


pubkey/rxdb


HBox(children=(FloatProgress(value=0.0, max=3237.0), HTML(value='')))


processing/p5.js


HBox(children=(FloatProgress(value=0.0, max=5255.0), HTML(value='')))


material-components/material-components-web


HBox(children=(FloatProgress(value=0.0, max=7003.0), HTML(value='')))


dianping/cat


HBox(children=(FloatProgress(value=0.0, max=2117.0), HTML(value='')))


taosdata/TDengine


HBox(children=(FloatProgress(value=0.0, max=6625.0), HTML(value='')))


arendst/Tasmota


HBox(children=(FloatProgress(value=0.0, max=11452.0), HTML(value='')))


vnpy/vnpy


HBox(children=(FloatProgress(value=0.0, max=3018.0), HTML(value='')))


handsontable/handsontable


HBox(children=(FloatProgress(value=0.0, max=7987.0), HTML(value='')))


sleeping for 2449/2449 seconds...

HBox(children=(FloatProgress(value=0.0, max=7987.0), HTML(value='')))


bokeh/bokeh


HBox(children=(FloatProgress(value=0.0, max=11351.0), HTML(value='')))



