In [1]:
import datetime
import json
import os
import pickle
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from github import Github, NamedUser
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

In [2]:
def get_timestamp(date):
    """Convert a date to an integer Unix timestamp in seconds.

    Parameters
    ----------
    date : datetime.datetime or str
        A datetime object, or anything whose ``str()`` has the form
        ``'%Y-%m-%d %H:%M:%S'``.

    Returns
    -------
    int
        Seconds since the epoch.

    Raises
    ------
    ValueError
        If a non-datetime value does not match ``'%Y-%m-%d %H:%M:%S'``.

    Notes
    -----
    Datetimes are converted directly instead of being round-tripped through
    ``str()``; the old round-trip raised ``ValueError`` as soon as the value
    carried microseconds or a UTC offset.

    Naive datetimes are still interpreted in the machine's local timezone
    (``time.mktime``), exactly as before, so existing results do not change.
    NOTE(review): if the API hands back naive UTC datetimes this shifts every
    absolute value by the local UTC offset; differences between two
    timestamps are unaffected except across DST changes -- confirm.
    """
    if isinstance(date, datetime.datetime):
        if date.tzinfo is not None:
            # Aware datetimes carry their own offset; no local-time guessing.
            return int(date.timestamp())
        # timetuple() drops microseconds and leaves tm_isdst=-1, which is
        # what strptime produced for the string form of the same value.
        return int(time.mktime(date.timetuple()))
    return int(time.mktime(time.strptime(str(date), '%Y-%m-%d %H:%M:%S')))

def get_commit_tuple(commit):
    """Return ``(author_id, timestamp)`` for a commit object.

    Parameters
    ----------
    commit
        Object exposing ``author`` (may be falsy) and
        ``commit.committer.date``.

    Returns
    -------
    tuple
        ``(author_id, timestamp)`` where ``author_id`` is ``commit.author.id``
        or ``None`` when the commit has no author, and ``timestamp`` is the
        committer date converted with ``get_timestamp``.
    """
    # Local names avoid shadowing the builtin `id` and the `time` module.
    author = commit.author
    author_id = author.id if author else None
    # The original called the undefined name `get_time`, so every call raised
    # NameError; `get_timestamp` is the converter defined in this notebook.
    committed_at = get_timestamp(commit.commit.committer.date)
    return (author_id, committed_at)

def crawl(func, max_count:int=None, **kwargs):
    """Materialise the results of a paginated API call into a list.

    Parameters
    ----------
    func : callable
        Called as ``func(**kwargs)``; the result must expose ``totalCount``
        and support slicing.
    max_count : int, optional
        Upper bound on the number of items to fetch. ``None`` (default)
        fetches ``totalCount`` items.
    **kwargs
        Forwarded to ``func``.

    Returns
    -------
    list
        The first ``min(totalCount, max_count)`` items, shown with a tqdm
        progress bar while they are being fetched.
    """
    # Call the API method once and reuse the result for both the count and
    # the items; the original invoked `func` twice.
    items = func(**kwargs)
    n = items.totalCount
    print('total count:', n)

    max_count = n if max_count is None else min(n, max_count)
    return list(tqdm(items[:max_count], total=max_count))


def make_act_dict(users, times, act:dict=None):
    """Group event times by user.

    Parameters
    ----------
    users : iterable
        User keys, paired positionally with ``times``; ``None`` entries are
        skipped.
    times : iterable
        Event times, one per entry of ``users``.
    act : dict, optional
        Existing mapping to extend in place. A new dict is created when
        omitted.

    Returns
    -------
    dict
        Mapping ``user -> list of times`` in input order. This is the same
        object as ``act`` when one was supplied.
    """
    activity = {} if act is None else act
    for uid, stamp in zip(users, times):
        if uid is not None:
            activity.setdefault(uid, []).append(stamp)
    return activity

In [3]:
# Read the GitHub access token from the GITHUB_TOKEN environment variable
# instead of embedding it in the notebook. A missing variable raises KeyError
# here, which is clearer than a later authentication or rate-limit failure.
# The token that used to be hardcoded on this line must be treated as leaked
# and revoked in the GitHub account settings.
g = Github(os.environ["GITHUB_TOKEN"])

In [4]:
# Repository to analyse; `repo_name` is also used to build the output file name
# in the save cell at the end of the notebook.
repo_name = "numpy/numpy"
repo = g.get_repo(repo_name)

In [5]:
# Timestamp of the repository's creation. Later cells subtract it from every
# event timestamp, so stored times are seconds relative to this value.
offset = get_timestamp(repo.created_at)
offset

1284402759

26759

## issues

In [15]:
# Fetch issues through crawl with no max_count, i.e. as many as totalCount reports.
# NOTE(review): get_issues is called without arguments -- confirm whether its
# default filter (e.g. issue state) is the intended one.
issues = crawl(repo.get_issues)

total count: 2319


HBox(children=(FloatProgress(value=0.0, max=2319.0), HTML(value='')))




In [372]:
# Replace each issue object by its creation time relative to `offset`.
# NOTE(review): this rebinds `issues` from API objects to ints, so the cell is
# not idempotent -- re-running it without re-running the crawl cell fails.
issues = [get_timestamp(issue.created_at)-offset for issue in issues]

In [373]:
# Sanity check: show the first and last entries of `issues` converted back to
# datetimes (adding `offset` undoes the subtraction applied above).
print(datetime.datetime.fromtimestamp(issues[0] + offset),
      ' : ', 
      datetime.datetime.fromtimestamp(issues[-1] + offset))

2021-06-18 11:39:53  :  2018-02-05 09:02:14


## stars

In [374]:
# Fetch stargazers together with their star dates, capped at 39900 entries
# (crawl lowers the cap to totalCount when fewer exist).
stars = crawl(repo.get_stargazers_with_dates, max_count=39900)

total count: 37338


HBox(children=(FloatProgress(value=0.0, max=37338.0), HTML(value='')))




In [375]:
# Replace each stargazer entry by its `starred_at` time relative to `offset`.
# NOTE(review): rebinds `stars` in place, so this cell cannot be re-run on its own.
stars = [get_timestamp(star.starred_at)-offset for star in stars]

## commits

In [376]:
# Fetch commits through crawl with no max_count.
commits = crawl(repo.get_commits)

total count: 1389


HBox(children=(FloatProgress(value=0.0, max=1389.0), HTML(value='')))




In [377]:
# Build two parallel lists: the author id and the commit time (relative to
# `offset`) of every commit that has an author.
commit_users = []
commit_times = []
commit_errors = []
for commit in commits:
    try:
        author = commit.author
        if author is None:
            continue
        # Resolve both values BEFORE appending either one: previously a
        # failure in the timestamp step left commit_users one element longer
        # than commit_times, silently mis-pairing every later entry.
        user_id = author.id
        commit_time = get_timestamp(commit.commit.committer.date) - offset
    except Exception as exc:
        # Best effort: skip unreadable commits but keep a record of them.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        commit_errors.append(exc)
        continue
    commit_users.append(user_id)
    commit_times.append(commit_time)

if commit_errors:
    print(f'skipped {len(commit_errors)} commit(s); first error: {commit_errors[0]!r}')

In [378]:
# Number of commits per author id; the displayed value is the number of
# distinct author ids.
unique_users = Counter(commit_users)
len(unique_users)

75

In [379]:
# Map each author id to the list of that author's relative commit times.
act = make_act_dict(commit_users, commit_times)

## forks

In [380]:
# Fetch forks through crawl with no max_count.
forks = crawl(repo.get_forks)

total count: 10887


HBox(children=(FloatProgress(value=0.0, max=10887.0), HTML(value='')))




In [381]:
# Replace each fork object by its creation time relative to `offset`.
# NOTE(review): rebinds `forks` in place, so this cell cannot be re-run on its own.
forks = [get_timestamp(fork.created_at)-offset for fork in forks]

## save

In [382]:
# `releases` was used below without being defined in any cell of this
# notebook, so a fresh "Restart & Run All" raised NameError here. It is
# rebuilt the same way as the other event lists.
# NOTE(review): `created_at` is assumed, for consistency with issues and
# forks -- switch to `published_at` if that is what the deleted cell used.
releases = [get_timestamp(release.created_at) - offset
            for release in crawl(repo.get_releases)]

# Bundle all event series. Lists that are reversed here are stored in the
# opposite order to the one the crawl returned; `stars` is stored as fetched.
stats = {
    'forks': forks[::-1],
    'stars' : stars,
    'releases' : releases[::-1],
    'issues': issues[::-1],
    'activities': act
}

In [383]:
# Persist the collected statistics to pkls/<owner>-<name>.pkl.
# The directory is created on demand, and the `with` block guarantees the
# file is flushed and closed (the original left the handle open).
os.makedirs('pkls', exist_ok=True)
with open(f'pkls/{"-".join(repo_name.split("/"))}.pkl', 'wb') as fh:
    pickle.dump(stats, fh)