# GitHub Metrics v5

Metrics to measure the evolution of GitHub projects

In [None]:
# Requirements -- uncomment to install
# !pip install -q pandas tqdm PyGithub openpyxl xlsxwriter 

In [None]:
import time
from functools import lru_cache
from datetime import date, datetime
from operator import itemgetter as item

import pandas as pd
from github import Github, RateLimitExceededException
from tqdm.auto import tqdm, trange
import requests

tqdm.pandas()

## Setup

Create one or more Personal Access Tokens.
Only one access token per user can be used, so in order to use multiple tokens, multiple users
need to be created.

Documentation about how to create a token [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)

In [None]:
# Personal Access Tokens; one GitHub client is created per entry
# (see MultiTokenGithub). Avoid committing real tokens with the notebook.
TOKENS = [
    '<paste-the-tokens-here>',
]

In [None]:
# Current rate limits for the given tokens: builds a throwaway client per
# token and shows what get_rate_limit() returns for each of them.

[Github(token).get_rate_limit() for token in TOKENS]

In [None]:
def rate_limited(gh, func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, waiting out a rate limit once if needed.

    If the call raises ``RateLimitExceededException`` the current limits are
    read from ``gh``, the function sleeps until the exhausted limit resets
    (plus a 30 second margin) and the call is retried exactly once. An
    exception raised by the retry propagates to the caller.

    Args:
        gh: Client exposing ``get_rate_limit()``.
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.
    """
    try:
        return func(*args, **kwargs)
    except RateLimitExceededException:
        limits = gh.get_rate_limit()
        print(f"Rate limit exceeded running {func}")

        # Pick the first exhausted bucket; fall back to the core limit.
        if limits.search.remaining == 0:
            limited = limits.search
        elif limits.graphql.remaining == 0:
            limited = limits.graphql
        else:
            limited = limits.core

        # ``reset`` may be a naive datetime (treated as UTC, as before) or a
        # timezone-aware one; subtracting a naive "now" from an aware value
        # raises TypeError, so build "now" to match.
        reset = limited.reset
        if reset.tzinfo is None:
            now = datetime.utcnow()
        else:
            now = datetime.now(reset.tzinfo)

        seconds = (reset - now).total_seconds() + 30
        if seconds > 0.0:
            print(f"Waiting for {seconds} seconds...")
            time.sleep(seconds)
            print("Resuming")

        return func(*args, **kwargs)

In [None]:
class MultiTokenGithub:
    """Proxy that spreads API calls over several GitHub clients.

    One ``Github`` client is created per token. Any attribute that is not
    found on this object is resolved as a method call on the currently
    selected client; when that call raises ``RateLimitExceededException`` the
    client with the most remaining core requests is selected and the call is
    retried once.
    """

    def __init__(self, tokens):
        """Create one client per token and select the best one to start with."""
        self._gs = [Github(token, per_page=100) for token in tokens]
        self._g = self._get_github()[0]

    def _get_github(self):
        """Return ``(client, remaining, reset)`` for the best available client.

        The best client is the one with the most remaining core requests;
        ties are broken by the earliest reset time.
        """
        candidates = []
        for client in self._gs:
            limit = client.get_rate_limit().core
            candidates.append((client, limit.remaining, limit.reset))

        ranked = sorted(candidates, key=lambda entry: (-entry[1], entry[2]))
        return ranked[0]

    def _caller(self, attr):
        """Build a function that calls ``attr`` on the selected client."""
        def call(*args, **kwargs):
            try:
                return getattr(self._g, attr)(*args, **kwargs)
            except RateLimitExceededException:
                print('Rate limit hit - switching client')
                self._g, remaining, reset = self._get_github()
                if remaining:
                    return getattr(self._g, attr)(*args, **kwargs)

                # ``reset`` may be naive (treated as UTC) or timezone-aware;
                # build "now" to match so the subtraction cannot raise.
                if reset.tzinfo is None:
                    now = datetime.utcnow()
                else:
                    now = datetime.now(reset.tzinfo)

                seconds = (reset - now).total_seconds() + 30
                # A reset that is already in the past gives a negative value,
                # which time.sleep() rejects with ValueError.
                if seconds > 0.0:
                    print(f'No remaining requests - Waiting for {seconds} seconds...')
                    time.sleep(seconds)
                    print("Resuming")

                return getattr(self._g, attr)(*args, **kwargs)

        return call

    def __getattr__(self, attr):
        """Expose every unknown attribute as a rate-limit-aware method call."""
        return self._caller(attr)

In [None]:
@lru_cache
def get_repo(gh, repo):
    """Fetch a repository object, memoized per ``(gh, repo)`` pair.

    A name without an organization part is looked up under ``sdv-dev``.
    """
    full_name = repo if '/' in repo else f'sdv-dev/{repo}'
    return rate_limited(gh, gh.get_repo, full_name)

In [None]:
def get_num_pages(pages, per_page=100):
    """Return how many pages are needed to hold ``pages.totalCount`` items.

    Uses ceiling division, so an exact multiple of ``per_page`` does not
    produce a trailing empty page and an empty listing needs no page at all.

    Args:
        pages: Object exposing a ``totalCount`` attribute.
        per_page: Number of items per page (100 matches the clients created
            by this notebook).

    Returns:
        int: Number of pages to request.
    """
    total = pages.totalCount
    return -(-total // per_page)

def to_list(gh, pages):
    """Download every page of a paginated result into one flat list.

    Both the page count and each page fetch go through ``rate_limited``;
    a progress bar tracks the pages.
    """
    page_count = rate_limited(gh, get_num_pages, pages)

    collected = []
    for index in trange(page_count):
        collected += rate_limited(gh, pages.get_page, index)

    return collected

In [None]:
@lru_cache
def get_issues(gh, repo):
    """Return the list of issues (any state) of ``repo``, memoized per call key."""
    repository = get_repo(gh, repo)
    paginated = rate_limited(gh, repository.get_issues, state='all')
    return to_list(gh, paginated)

In [None]:
def get_all_issues(gh, repos):
    """Build a DataFrame with one row per issue across ``repos``.

    No filtering is applied: entries that are pull requests are kept
    alongside plain issues (presumably intended -- confirm).

    Returns:
        pandas.DataFrame with the columns ``user``, ``repository``,
        ``number``, ``created_at``, ``closed_at``, ``state``, ``comments``
        and ``title``.
    """
    def as_row(repo, issue):
        # Flatten the fields of interest into a plain dict.
        return {
            'user': issue.user.login,
            'repository': repo,
            'number': issue.number,
            'created_at': issue.created_at,
            'closed_at': issue.closed_at,
            'state': issue.state,
            'comments': issue.comments,
            'title': issue.title,
        }

    rows = []
    for repo in tqdm(repos):
        repo_issues = get_issues(gh, repo)
        rows.extend(as_row(repo, issue) for issue in tqdm(repo_issues))

    return pd.DataFrame(rows)

In [None]:
GRAPHQL_URL = 'https://api.github.com/graphql'


def run_query(query, token, timeout=60):
    """POST a GraphQL query to GitHub and return the decoded JSON payload.

    Args:
        query: GraphQL query text.
        token: Value sent verbatim as the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        dict: The decoded JSON response.

    Raises:
        Exception: If the HTTP status is not 200, or if the response body
            reports GraphQL errors.
    """
    headers = {'Authorization': token}
    response = requests.post(
        GRAPHQL_URL,
        json={'query': query},
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))

    payload = response.json()
    # GraphQL reports query-level failures in the body of a 200 response;
    # surface them here instead of failing later on missing data.
    if payload.get('errors'):
        raise Exception("Query returned errors: {}. {}".format(payload['errors'], query))

    return payload


# GraphQL query template, filled in with str.format():
#   {0} -> repository owner
#   {1} -> repository name
#   {2} -> empty string, or a ', after: "<cursor>"' clause for later pages
# Doubled braces are literal braces after formatting.
query = """
{{
  repository(owner: "{0}", name: "{1}") {{
    stargazers(first: 100 {2}) {{
      pageInfo {{
        endCursor
        hasNextPage
        hasPreviousPage
        startCursor
      }}
      edges {{
        starredAt
        node {{
          login
        }}
      }}
    }}
  }}
}}
"""


def get_repo_stargazers_gql(gh, repo):
    """List the stargazers of ``repo`` through the GraphQL API.

    Pages of up to 100 stargazers are requested until the API reports that
    no further page exists.

    Args:
        gh: ``MultiTokenGithub`` instance; the Authorization header of its
            currently selected client is reused for the GraphQL requests.
        repo: Repository in ``<owner>/<name>`` form.

    Returns:
        list[dict]: One dict per stargazer with the keys ``user``,
        ``starred_at`` (formatted as ``YYYY-MM-DD HH:MM:SS``) and
        ``repository``.
    """
    # NOTE(review): this reaches into name-mangled private attributes of the
    # client; presumably tied to a specific PyGithub version -- confirm
    # before upgrading the library.
    token = gh._g._Github__requester._Requester__authorizationHeader
    owner, repo_name = repo.split('/')

    after_clause = ""
    stargazers = []
    total = get_repo(gh, repo).stargazers_count
    with tqdm(total=total) as pbar:
        while True:
            this_query = query.format(owner, repo_name, after_clause)
            result = run_query(this_query, token)
            connection = result['data']['repository']['stargazers']
            page_info = connection['pageInfo']

            for edge in connection['edges']:
                username = edge['node']['login']
                star_time = datetime.strptime(edge['starredAt'], '%Y-%m-%dT%H:%M:%SZ')
                star_time = star_time.strftime('%Y-%m-%d %H:%M:%S')
                stargazers.append({
                    'user': username,
                    'starred_at': star_time,
                    'repository': repo,
                })
                pbar.update(1)

            if not page_info['hasNextPage']:
                break

            # Only build the cursor clause when another page exists:
            # endCursor is null for a repository without stargazers, and
            # concatenating it to a string would raise TypeError.
            after_clause = ', after: "' + page_info['endCursor'] + '"'

    return stargazers

In [None]:
def get_repo_stargazers(gh, repo):
    """List the stargazers of ``repo`` using the paginated client API.

    Returns:
        list[dict]: One dict per stargazer with the keys ``user``,
        ``starred_at`` and ``repository``.
    """
    repository = get_repo(gh, repo)
    paginated = rate_limited(gh, repository.get_stargazers_with_dates)
    entries = to_list(gh, paginated)

    return [
        {
            'user': entry.user.login,
            'starred_at': entry.starred_at,
            'repository': repo,
        }
        for entry in tqdm(entries)
    ]

In [None]:
def get_stargazers(gh, repos):
    """Collect the stargazers of all ``repos``, keeping one row per user.

    When a user starred several repositories, only the row with the
    earliest ``starred_at`` is kept.

    Returns:
        pandas.DataFrame with the columns ``user``, ``starred_at`` and
        ``repository``, sorted by ``starred_at``.
    """
    rows = []
    for repo in tqdm(repos):
        rows.extend(get_repo_stargazers_gql(gh, repo))

    # Name the columns explicitly so that an empty result still has the
    # expected schema; otherwise sorting by 'starred_at' raises KeyError.
    all_stargazers = pd.DataFrame(rows, columns=['user', 'starred_at', 'repository'])

    by_date = all_stargazers.sort_values('starred_at')
    return by_date.drop_duplicates(subset=['user'], keep='first')

In [None]:
@lru_cache
def get_user(gh, user):
    """Resolve ``user`` to a user object, memoized per ``(gh, user)`` pair.

    A string is treated as a login and looked up through ``gh``; any other
    value is handed back untouched.
    """
    if isinstance(user, str):
        return rate_limited(gh, gh.get_user, user)

    return user

@lru_cache
def get_profile(gh, user):
    """Return the public profile of ``user`` as a ``pandas.Series``.

    ``user`` is resolved with ``get_user``; the result is memoized per
    ``(gh, user)`` pair.
    """
    account = get_user(gh, user)

    # (output label, attribute on the user object), in output order.
    fields = (
        ('user', 'login'),
        ('name', 'name'),
        ('email', 'email'),
        ('blog', 'blog'),
        ('company', 'company'),
        ('location', 'location'),
        ('twitter', 'twitter_username'),
        ('repos', 'public_repos'),
        ('gists', 'public_gists'),
        ('followers', 'followers'),
        ('following', 'following'),
        ('user_created_at', 'created_at'),
        ('user_updated_at', 'updated_at'),
        ('bio', 'bio'),
    )
    return pd.Series({label: getattr(account, attribute) for label, attribute in fields})

In [None]:
def get_user_profiles(gh, users):
    """Build a DataFrame with one profile row per entry of ``users``."""
    return pd.DataFrame([get_profile(gh, user) for user in tqdm(users)])

In [None]:
# Columns, in output order, selected for the issues table
# (see get_github_stats).
ISSUES_COLUMNS = [
    'user',
    'repository',
    'number',
    'created_at',
    'closed_at',
    'state',
    'comments',
    'title',
]
# Columns, in output order, selected for the issue users table
# (see add_user_profiles). 'db_account_issue_creation' holds the number of
# days between the account creation and the user's first issue.
USERS_COLUMNS = [
    'user',
    'first_issue_date',
    'db_account_issue_creation',
    'name',
    'email',
    'blog',
    'company',
    'location',
    'twitter',
    'repos',
    'gists',
    'followers',
    'following',
    'user_created_at',
    'user_updated_at',
    'bio'
]

def add_user_profiles(gh, issues, stargazers):
    """Attach profile data to issue authors and to stargazers.

    Profiles are fetched once for the union of both user sets.

    Returns:
        tuple: ``(issue_users, stargazers)`` where ``issue_users`` has one
        row per issue author (their earliest issue) restricted to
        ``USERS_COLUMNS``, and ``stargazers`` is the input frame
        left-joined with the profiles on ``user``.
    """
    first_issues = issues.sort_values('created_at').drop_duplicates(subset='user', keep='first')
    stargazer_logins = stargazers.user.unique()

    logins = set(first_issues.user) | set(stargazer_logins)
    profiles = get_user_profiles(gh, logins)

    issue_users = first_issues.rename(columns={'created_at': 'first_issue_date'})
    issue_users = issue_users.merge(profiles, on='user', how='left')

    # Days elapsed between account creation and the user's first issue.
    elapsed = issue_users['first_issue_date'] - issue_users['user_created_at']
    issue_users['db_account_issue_creation'] = elapsed.dt.days

    stargazer_profiles = stargazers.merge(profiles, on='user', how='left')

    return issue_users[USERS_COLUMNS], stargazer_profiles

In [None]:
def add_sheet(writer, data, sheet):
    """Write ``data`` to a sheet and size each column to fit its content.

    Each column is made as wide as its longest cell (or its header, if
    longer) plus two characters.

    Args:
        writer: ``pandas.ExcelWriter`` whose worksheets provide
            ``set_column`` (assumes the xlsxwriter engine -- confirm).
        data: DataFrame to write.
        sheet: Name of the sheet to create.
    """
    data.to_excel(writer, sheet_name=sheet, index=False)
    worksheet = writer.sheets[sheet]

    # Walk the columns by position: this stays well defined even with
    # duplicated column labels.
    for col_idx, column in enumerate(data.columns):
        lengths = data.iloc[:, col_idx].astype(str).map(len)
        # An empty column has no maximum (NaN); fall back to the header width.
        longest = int(lengths.max()) if len(lengths) else 0
        column_width = max(longest, len(str(column)))
        worksheet.set_column(col_idx, col_idx, column_width + 2)

def create_excel(name, issues, users, stargazers):
    """Write the three tables to ``github-stats-<name>-<today>.xlsx``.

    The workbook gets the sheets ``Issues``, ``Unique Issue Users`` and
    ``Unique Stargazers``, in that order.
    """
    today = date.today().isoformat()
    filename = f'github-stats-{name}-{today}.xlsx'

    print(f'Creating file {filename}')

    sheets = (
        ('Issues', issues),
        ('Unique Issue Users', users),
        ('Unique Stargazers', stargazers),
    )
    with pd.ExcelWriter(filename, mode='w') as writer:
        for sheet_name, frame in sheets:
            add_sheet(writer, frame, sheet_name)

In [None]:
def get_github_stats(repos, name):
    """Collect issues, users and stargazers of ``repos`` into an Excel file.

    Args:
        repos: Repository names in ``<org-name>/<repo-name>`` form.
        name: Label used in the output filename
            (``github-stats-<name>-<today>.xlsx``).
    """
    # Drop repeated repository names (keeping first-seen order) so that a
    # repository listed twice does not contribute its issues twice.
    repos = list(dict.fromkeys(repos))

    gh = MultiTokenGithub(TOKENS)

    print('Getting issues')
    issues = get_all_issues(gh, repos)

    print('Getting stargazers')
    stargazers = get_stargazers(gh, repos)

    print('Getting users')
    users, stargazers = add_user_profiles(gh, issues, stargazers)
    issues = issues[ISSUES_COLUMNS]

    create_excel(name, issues, users, stargazers)

In [None]:
# Usage
# - repos: List of repositories to grab and aggregate. Each must have the format <org-name>/<repo-name>
# - name: Name that will be used to create the output filename, which will be 'github-stats-<name>-<today>.xlsx'

repos = [
    'scikit-learn/scikit-learn',
]
name = 'scikit-learn'

get_github_stats(
    repos=repos,
    name=name
)

In [None]:
# Usage
# - repos: List of repositories to grab and aggregate. Each must have the format <org-name>/<repo-name>
# - name: Name that will be used to create the output filename, which will be 'github-stats-<name>-<today>.xlsx'

repos = [
    'huggingface/transformers',
    'huggingface/datasets',
    'huggingface/huggingface_hub',
    'huggingface/optimum',
    'huggingface/notebooks',
    'huggingface/accelerate',
    'huggingface/awesome-huggingface',
    'huggingface/swift-coreml-transformers',
    'huggingface/neuralcoref',
    'huggingface/tokenizers',
    'huggingface/allennlp',
    'huggingface/autonlp',
    'huggingface/knockknock',
    'huggingface/node-question-answering',
    'huggingface/pytorch-openai-transformer-lm',
]
name = 'huggingface'

get_github_stats(
    repos=repos,
    name=name
)

In [None]:
# Usage
# - repos: List of repositories to grab and aggregate. Each must have the format <org-name>/<repo-name>
# - name: Name that will be used to create the output filename, which will be 'github-stats-<name>-<today>.xlsx'

# Each repository is listed once: a repeated entry would add its issues to
# the output again.
repos = [
    'grafana/grafana',
    'grafana/loki',
    'grafana/tempo',
    'grafana/k6',
]
name = 'grafana'

get_github_stats(
    repos=repos,
    name=name
)