## PySAL Change Log Statistics

This notebook generates the summary statistics for use in the 6-month releases of PySAL, which is now (2017-07) a meta package. 

It assumes the subpackages have been git cloned in a directory below the location of this notebook. It also requires network connectivity for some of the reporting.


In [103]:
from __future__ import print_function
import os
import json
import re
import sys
import pandas

from datetime import datetime, timedelta
from time import sleep
from subprocess import check_output
try:
    from urllib import urlopen
except:
    from urllib.request import urlopen

import ssl
import yaml

context = ssl._create_unverified_context()


In [2]:
with open('packages.yml') as package_file:
    packages = yaml.load(package_file)

In [3]:
CWD = os.path.abspath(os.path.curdir)

Our last main release was `2017-11-03`:

In [4]:
start_date = '2017-11-03'
since_date = '--since="{start}"'.format(start=start_date)
since_date
since = datetime.strptime(start_date+" 0:0:0", "%Y-%m-%d %H:%M:%S")
since

datetime.datetime(2017, 11, 3, 0, 0)

## Total commits by subpackage

In [5]:
cmd = ['git', 'log', '--oneline', since_date]
ncommits = len(check_output(cmd).splitlines())

In [6]:
activity = {}
total_commits = 0
for package in packages:
    subpackages = packages[package].split()
    for subpackage in subpackages:
        os.chdir(CWD)
        os.chdir('tmp/{subpackage}'.format(subpackage=subpackage))
        ncommits = len(check_output(cmd).splitlines())
        total_commits += ncommits
        activity[subpackage] = ncommits

In [7]:
activity, total_commits

({'libpysal': 161,
  'esda': 9,
  'pointpats': 15,
  'spaghetti': 10,
  'giddy': 84,
  'mapclassify': 2,
  'splot': 163,
  'gwr': 103,
  'spglm': 20,
  'spint': 21,
  'spreg': 30,
  'spvcm': 20},
 638)

## List Contributors

Some of our contributors have many aliases for the same identity. So, we've added a mapping to make sure that individuals are listed once (and only once). 

In [70]:
identities = {'Levi John Wolf': ('ljwolf', 'Levi John Wolf'),
              'Serge Rey': ('Serge Rey', 'Sergio Rey', 'sjsrey', 'serge'),
              'Wei Kang': ('Wei Kang', 'weikang9009'),
              'Dani Arribas-Bel': ('Dani Arribas-Bel', 'darribas')
}

def regularize_identity(string):
    string = string.decode()
    for name, aliases in identities.items():
        for alias in aliases:
            if alias in string:
                string = string.replace(alias, name)
    if len(string.split(' '))>1:
        string = string.title()
    return string.lstrip('* ')

In [71]:
author_cmd = ['git', 'log', '--format=* %aN', since_date]

In [72]:
from collections import Counter

In [73]:
authors_global = set()
authors = {}
global_counter = Counter()
counters = dict()
for package in packages:
    subpackages = packages[package].split()
    for subpackage in subpackages:
        os.chdir(CWD)
        os.chdir('tmp/{subpackage}'.format(subpackage=subpackage))
        ncommits = len(check_output(cmd).splitlines())
        all_authors = check_output(author_cmd).splitlines()
        counter = Counter([regularize_identity(author) for author in all_authors])
        global_counter += counter
        counters.update({'.'.join((package,subpackage)): counter})
        unique_authors = sorted(set(all_authors))
        authors[subpackage] =  unique_authors
        authors_global.update(unique_authors)
        #total_commits += ncommits
        #activity[subpackage] = ncommits

In [74]:
counters

{'lib.libpysal': Counter({'Levi John Wolf': 73,
          'James Gaboardi': 2,
          'Serge Rey': 48,
          'Eli Knaap': 5,
          'Dani Arribas-Bel': 19,
          'Taylor Oshan': 4,
          'Wei Kang': 8,
          'Stefanie Lumnitz': 2}),
 'explore.esda': Counter({'Levi John Wolf': 4, 'Serge Rey': 5}),
 'explore.pointpats': Counter({'Levi John Wolf': 7,
          'Wei Kang': 4,
          'Hu Shao': 2,
          'Serge Rey': 2}),
 'explore.spaghetti': Counter({'Levi John Wolf': 2, 'James Gaboardi': 8}),
 'dynamics.giddy': Counter({'Wei Kang': 55,
          'Serge Rey': 18,
          'Stefanie Lumnitz': 8,
          'Levi John Wolf': 1,
          'Eli Knaap': 2}),
 'viz.mapclassify': Counter({'Levi John Wolf': 1, 'Serge Rey': 1}),
 'viz.splot': Counter({'Serge Rey': 5,
          'Levi John Wolf': 8,
          'Stefanie Lumnitz': 147,
          'Thequackdaddy': 1,
          'Dani Arribas-Bel': 1,
          'Jsignell': 1}),
 'model.gwr': Counter({'Taylor Oshan': 46,
       

In [85]:
contributor_table = pandas.DataFrame.from_dict(counters).fillna(0).astype(int).T

In [86]:
contributor_table.to_html('./contributor_table.html')

In [115]:
totals = contributor_table.sum(axis=0).T
totals.sort_index().to_frame('commits')

Unnamed: 0,commits
Dani Arribas-Bel,20
Eli Knaap,7
Hu Shao,2
James Gaboardi,10
Jsignell,1
Levi John Wolf,140
Serge Rey,96
Stefanie Lumnitz,157
Taylor Oshan,85
Thequackdaddy,1


In [116]:
_.to_html('./commits_by_person.html')

## Disaggregate by PR, Issue

In [92]:
from datetime import datetime, timedelta
ISO8601 = "%Y-%m-%dT%H:%M:%SZ"
PER_PAGE = 100
element_pat = re.compile(r'<(.+?)>')
rel_pat = re.compile(r'rel=[\'"](\w+)[\'"]')


In [108]:

def parse_link_header(headers):
    link_s = headers.get('link', '')
    urls = element_pat.findall(link_s)
    rels = rel_pat.findall(link_s)
    d = {}
    for rel,url in zip(rels, urls):
        d[rel] = url
    return d

def get_paged_request(url):
    """get a full list, handling APIv3's paging"""
    results = []
    while url:
        #print("fetching %s" % url, file=sys.stderr)
        f = urlopen(url)
        results.extend(json.load(f))
        links = parse_link_header(f.headers)
        url = links.get('next')
    return results

def get_issues(project="pysal/pysal", state="closed", pulls=False):
    """Get a list of the issues from the Github API."""
    which = 'pulls' if pulls else 'issues'
    url = "https://api.github.com/repos/%s/%s?state=%s&per_page=%i" % (project, which, state, PER_PAGE)
    return get_paged_request(url)


def _parse_datetime(s):
    """Parse dates in the format returned by the Github API."""
    if s:
        return datetime.strptime(s, ISO8601)
    else:
        return datetime.fromtimestamp(0)


def issues2dict(issues):
    """Convert a list of issues to a dict, keyed by issue number."""
    idict = {}
    for i in issues:
        idict[i['number']] = i
    return idict


def is_pull_request(issue):
    """Return True if the given issue is a pull request."""
    return 'pull_request_url' in issue


def issues_closed_since(period=timedelta(days=365), project="pysal/pysal", pulls=False):
    """Get all issues closed since a particular point in time. period
can either be a datetime object, or a timedelta object. In the
latter case, it is used as a time before the present."""

    which = 'pulls' if pulls else 'issues'

    if isinstance(period, timedelta):
        period = datetime.now() - period
    url = "https://api.github.com/repos/%s/%s?state=closed&sort=updated&since=%s&per_page=%i" % (project, which, period.strftime(ISO8601), PER_PAGE)
    allclosed = get_paged_request(url)
    # allclosed = get_issues(project=project, state='closed', pulls=pulls, since=period)
    filtered = [i for i in allclosed if _parse_datetime(i['closed_at']) > period]

    # exclude rejected PRs
    if pulls:
        filtered = [ pr for pr in filtered if pr['merged_at'] ]

    return filtered


def sorted_by_field(issues, field='closed_at', reverse=False):
    """Return a list of issues sorted by closing date date."""
    return sorted(issues, key = lambda i:i[field], reverse=reverse)


def report(issues, show_urls=False):
    """Summary report about a list of issues, printing number and title.
    """
    # titles may have unicode in them, so we must encode everything below
    if show_urls:
        for i in issues:
            role = 'ghpull' if 'merged_at' in i else 'ghissue'
            print('* :%s:`%d`: %s' % (role, i['number'],
                                        i['title'].encode('utf-8')))
    else:
        for i in issues:
            print('* %d: %s' % (i['number'], i['title'].encode('utf-8')))



In [109]:
all_issues = {}
all_pulls = {}
total_commits = 0
for package in packages:
    subpackages = packages[package].split()
    for subpackage in subpackages:
        prj = 'pysal/{subpackage}'.format(subpackage=subpackage)
        os.chdir(CWD)
        os.chdir('tmp/{subpackage}'.format(subpackage=subpackage))
        #sub_issues = issues_closed_since(project=prj, period=since)
        sleep(5)
        issues = issues_closed_since(since, project=prj,pulls=False)
        pulls = issues_closed_since(since, project=prj,pulls=True)
        issues = sorted_by_field(issues, reverse=True)
        pulls = sorted_by_field(pulls, reverse=True)
        n_issues, n_pulls = map(len, (issues, pulls))
        n_total = n_issues + n_pulls
        all_issues[subpackage] = n_total, n_pulls
os.chdir(CWD)

HTTPError: HTTP Error 403: Forbidden

In [105]:
urlopen('https://api.github.com/rate_limit')

<http.client.HTTPResponse at 0x7fd9f1526668>

In [106]:
rq = _

In [107]:
rq.read()

b'{"resources":{"core":{"limit":60,"remaining":0,"reset":1531857287},"search":{"limit":10,"remaining":10,"reset":1531856690},"graphql":{"limit":0,"remaining":0,"reset":1531860230}},"rate":{"limit":60,"remaining":0,"reset":1531857287}}'

In [94]:
subs = all_issues.keys()
table = []
for sub in subs:
    total, pr = all_issues[sub]
    row = [sub, activity[sub], total, pr]
    table.append(row)

In [95]:
table

[['libpysal', 161, 76, 35],
 ['esda', 9, 12, 5],
 ['pointpats', 15, 8, 4],
 ['spaghetti', 10, 7, 2],
 ['giddy', 84, 39, 17],
 ['mapclassify', 2, 2, 1],
 ['splot', 163, 30, 14],
 ['gwr', 103, 39, 13],
 ['spglm', 20, 10, 5],
 ['spint', 21, 12, 6],
 ['spreg', 30, 9, 4],
 ['spvcm', 20, 3, 1]]

In [96]:
import pandas

In [97]:
df = pandas.DataFrame(table, columns=['package', 'commits', 'total issues', 'pulls'])

In [29]:
df.sort_values(['commits','pulls'], ascending=False)\
  .to_html('./commit_table.html', index=None)