## PySAL Change Log Statistics: Table Generation

This notebook generates the summary statistics for use in the 6-month releases of PySAL, which is now (2017-07) a meta package. 

It assumes the subpackages have been git cloned in a directory below the location of this notebook. It also requires network connectivity for some of the reporting.

Run this notebook after `gitcount.ipynb`


In [1]:
from __future__ import print_function
import os
import json
import re
import sys
import pandas
import subprocess
from subprocess import check_output

import yaml
from datetime import datetime, timedelta

from dateutil.parser import parse
import pytz

utc=pytz.UTC

In [2]:
from datetime import datetime, timedelta
from time import sleep
from subprocess import check_output
try:
    from urllib import urlopen
except:
    from urllib.request import urlopen

import ssl
import yaml

context = ssl._create_unverified_context()

In [3]:
with open('packages.yml') as package_file:
    packages = yaml.load(package_file)

  


In [4]:
CWD = os.path.abspath(os.path.curdir)

In [5]:
CWD

'/home/serge/Dropbox/p/pysal/src/pysal'

Our last main release was `2019-01-30`:

In [6]:
start_date = '2019-01-30'
since_date = '--since="{start}"'.format(start=start_date)
since_date
since = datetime.strptime(start_date+" 0:0:0", "%Y-%m-%d %H:%M:%S")
since

datetime.datetime(2019, 1, 30, 0, 0)

In [7]:
import pickle 

In [8]:
issue_details = pickle.load( open( "issue_details.p", "rb" ) )
pull_details = pickle.load( open( "pull_details.p", "rb" ) )


In [9]:
# get dates of tags
with open('subtags', 'r') as tag_name:
        tags = tag_name.readlines()


In [10]:
tag_dates = {}
#root = '/home/serge/Dropbox/p/pysal/src/pysal/tmp/'
root = CWD + "/tmp/"
for record in tags:
    pkg, tag = record.strip().split()
    tag = tag.split('/')[-1]
    pkdir = root+pkg
    cmd = "git log -1 --format=%ai {tag}".format(tag=tag)
    os.chdir(pkdir)
    #print(cmd)
    result = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE)
    tag_string = result.stdout.decode('utf-8')
    tag_date = tag_string.split()[0]
    tag_dates[pkg] = tag_date
    print(pkg, tag, tag_date)

os.chdir(CWD)

    

libpysal v4.1.0 2019-07-01
esda v2.1.1 2019-07-01
giddy v2.2.0 2019-06-20
inequality v1.0.0 2018-10-31
pointpats v2.1.0 2019-07-01
spaghetti v1.3 2019-05-21
mapclassify v2.1.1 2019-06-28
spreg v1.1.0 2019-06-29
segregation 1.1.1 2019-07-19
spglm v1.0.7 2019-07-18
spint v1.0.5 2019-01-04
splot v1.1.1 2019-07-13
mgwr v2.1.1 2019-07-18
spvcm v0.2.1post1 2019-01-04


In [11]:
# get issues for a package and filter on tag date


for pkg in tag_dates.keys():
    issues = issue_details[pkg]
    tag_date = utc.localize(parse(tag_dates[pkg]))
    keep = []
    for issue in issues:
        closed = parse(issue['closed_at'])
        if closed <= tag_date:
            keep.append(issue)
    print(pkg, len(issues), len(keep))
    issue_details[pkg] = keep
    keep = []
    pulls = pull_details[pkg]
    for pull in pulls:
        closed = parse(pull['closed_at'])
        if closed <= tag_date:
            keep.append(pull)
    print(pkg, len(pulls), len(keep)) 
    pull_details[pkg] = keep
        
  

libpysal 35 26
libpysal 17 11
esda 28 9
esda 17 7
giddy 23 13
giddy 19 10
inequality 1 0
inequality 1 0
pointpats 11 7
pointpats 11 7
spaghetti 52 40
spaghetti 26 21
mapclassify 16 15
mapclassify 13 12
spreg 6 3
spreg 3 1
segregation 106 105
segregation 79 78
spglm 5 3
spglm 5 3
spint 0 0
spint 0 0
splot 28 25
splot 19 16
mgwr 19 15
mgwr 10 7
spvcm 0 0
spvcm 0 0


In [12]:
# commits
cmd = ['git', 'log', '--oneline', since_date]

activity = {}
total_commits = 0
for package in packages:
    subpackages = packages[package].split()
    for subpackage in subpackages:
        tag_date = tag_dates[subpackage]
        os.chdir(CWD)
        os.chdir('tmp/{subpackage}'.format(subpackage=subpackage))
        cmd_until = cmd + ['--until="{tag_date}"'.format(tag_date=tag_date)]
        ncommits = len(check_output(cmd_until).splitlines())
        ncommits_total = len(check_output(cmd).splitlines())
        print(subpackage, ncommits_total, ncommits, tag_date)
        total_commits += ncommits
        activity[subpackage] = ncommits

libpysal 80 80 2019-07-01
esda 67 67 2019-07-01
giddy 32 32 2019-06-20
inequality 0 0 2018-10-31
pointpats 25 20 2019-07-01
spaghetti 120 120 2019-05-21
segregation 456 456 2019-07-19
mapclassify 43 43 2019-06-28
splot 95 95 2019-07-13
spreg 16 16 2019-06-29
spglm 14 14 2019-07-18
spint 0 0 2019-01-04
mgwr 44 44 2019-07-18
spvcm 0 0 2019-01-04


In [13]:
cmd_until

['git', 'log', '--oneline', '--since="2019-01-30"', '--until="2019-01-04"']

In [14]:
identities = {'Levi John Wolf': ('ljwolf', 'Levi John Wolf'),
              'Serge Rey': ('Serge Rey', 'Sergio Rey', 'sjsrey', 'serge'),
              'Wei Kang': ('Wei Kang', 'weikang9009'),
              'Dani Arribas-Bel': ('Dani Arribas-Bel', 'darribas'),
              'Antti Härkönen': ( 'antth', 'Antti Härkönen', 'Antti Härkönen', 'Antth'  ),
              'Juan C Duque': ('Juan C Duque', "Juan Duque"),
              'Renan Xavier Cortes': ('Renan Xavier Cortes', 'renanxcortes', 'Renan Xavier Cortes'   ),
              'Taylor Oshan': ('Tayloroshan', 'Taylor Oshan', 'TaylorOshan'),
              'Tom Gertin': ('@Tomgertin', 'Tom Gertin', '@tomgertin')
}

def regularize_identity(string):
    string = string.decode()
    for name, aliases in identities.items():
        for alias in aliases:
            if alias in string:
                string = string.replace(alias, name)
    if len(string.split(' '))>1:
        string = string.title()
    return string.lstrip('* ')

In [15]:
author_cmd = ['git', 'log', '--format=* %aN', since_date]

In [16]:
author_cmd.append('blank')

In [17]:
author_cmd

['git', 'log', '--format=* %aN', '--since="2019-01-30"', 'blank']

In [18]:
from collections import Counter

In [19]:
authors_global = set()
authors = {}
global_counter = Counter()
counters = dict()
cmd = ['git', 'log', '--oneline', since_date]
total_commits = 0
activity = {}
for package in packages:
    subpackages = packages[package].split()
    for subpackage in subpackages:
        os.chdir(CWD)
        os.chdir('tmp/{subpackage}'.format(subpackage=subpackage))
        ncommits = len(check_output(cmd).splitlines())
        
        
        tag_date = tag_dates[subpackage]
        author_cmd[-1] = '--until="{tag_date}"'.format(tag_date=tag_date)
        #cmd_until = cmd + ['--until="{tag_date}"'.format(tag_date=tag_date)]

        
        all_authors = check_output(author_cmd).splitlines()
        counter = Counter([regularize_identity(author) for author in all_authors])
        global_counter += counter
        counters.update({'.'.join((package,subpackage)): counter})
        unique_authors = sorted(set(all_authors))
        authors[subpackage] =  unique_authors
        authors_global.update(unique_authors)
        total_commits += ncommits
        activity[subpackage] = ncommits

In [20]:
authors_global

{b'* @tomgertin',
 b'* Antti H\xc3\xa4rk\xc3\xb6nen',
 b'* Dani Arribas-Bel',
 b'* Elliott Sales de Andrade',
 b'* Filipe Fernandes',
 b'* James Gaboardi',
 b'* Juan C Duque',
 b'* Juan Duque',
 b'* Levi John Wolf',
 b'* Martin Fleischmann',
 b'* Renan Xavier Cortes',
 b'* Serge Rey',
 b'* Sergio Rey',
 b'* Stefanie Lumnitz',
 b'* Taylor Oshan',
 b'* TaylorOshan',
 b'* Wei Kang',
 b'* Ziqi Li',
 b'* antth',
 b'* eli knaap',
 b'* ljwolf',
 b'* renanxcortes',
 b'* serge',
 b'* weikang9009'}

In [21]:
activity

{'libpysal': 80,
 'esda': 67,
 'giddy': 32,
 'inequality': 0,
 'pointpats': 25,
 'spaghetti': 120,
 'segregation': 456,
 'mapclassify': 43,
 'splot': 95,
 'spreg': 16,
 'spglm': 14,
 'spint': 0,
 'mgwr': 44,
 'spvcm': 0}

In [22]:
counters

{'lib.libpysal': Counter({'Serge Rey': 14,
          'Wei Kang': 6,
          'James Gaboardi': 54,
          'Martin Fleischmann': 3,
          'Elliott Sales De Andrade': 2,
          'Levi John Wolf': 1}),
 'explore.esda': Counter({'Serge Rey': 18,
          'James Gaboardi': 3,
          'Filipe Fernandes': 3,
          'Wei Kang': 16,
          'Levi John Wolf': 14,
          'Juan C Duque': 8,
          'Dani Arribas-Bel': 5}),
 'explore.giddy': Counter({'Wei Kang': 27, 'James Gaboardi': 5}),
 'explore.inequality': Counter(),
 'explore.pointpats': Counter({'Wei Kang': 18,
          'James Gaboardi': 1,
          'Serge Rey': 1}),
 'explore.spaghetti': Counter({'James Gaboardi': 97,
          'Wei Kang': 21,
          'Levi John Wolf': 1,
          'Tom Gertin': 1}),
 'explore.segregation': Counter({'Renan Xavier Cortes': 349,
          'Eli Knaap': 94,
          'Antti Härkönen': 4,
          'Wei Kang': 3,
          'James Gaboardi': 4,
          'Serge Rey': 2}),
 'viz.mapclass

In [23]:
counters

{'lib.libpysal': Counter({'Serge Rey': 14,
          'Wei Kang': 6,
          'James Gaboardi': 54,
          'Martin Fleischmann': 3,
          'Elliott Sales De Andrade': 2,
          'Levi John Wolf': 1}),
 'explore.esda': Counter({'Serge Rey': 18,
          'James Gaboardi': 3,
          'Filipe Fernandes': 3,
          'Wei Kang': 16,
          'Levi John Wolf': 14,
          'Juan C Duque': 8,
          'Dani Arribas-Bel': 5}),
 'explore.giddy': Counter({'Wei Kang': 27, 'James Gaboardi': 5}),
 'explore.inequality': Counter(),
 'explore.pointpats': Counter({'Wei Kang': 18,
          'James Gaboardi': 1,
          'Serge Rey': 1}),
 'explore.spaghetti': Counter({'James Gaboardi': 97,
          'Wei Kang': 21,
          'Levi John Wolf': 1,
          'Tom Gertin': 1}),
 'explore.segregation': Counter({'Renan Xavier Cortes': 349,
          'Eli Knaap': 94,
          'Antti Härkönen': 4,
          'Wei Kang': 3,
          'James Gaboardi': 4,
          'Serge Rey': 2}),
 'viz.mapclass

In [24]:
def get_tag(title, level="##", as_string=True):
    words = title.split()
    tag = "-".join([word.lower() for word in words])
    heading = level+" "+title
    line = "\n\n<a name=\"{}\"></a>".format(tag)
    lines = [line]
    lines.append(heading)
    if as_string:
        return "\n".join(lines)
    else:
        return lines

In [29]:
subs = issue_details.keys()
table = []
txt = []
lines = get_tag("Changes by Package", as_string=False)

for sub in subs:
    total= issue_details[sub]
    pr = pull_details[sub]
    
    row = [sub, activity[sub], len(total), len(pr)]
    table.append(row)
    #line = "\n<a name=\"{sub}\"></a>".format(sub=sub)
    #lines.append(line)
    #line = "### {sub}".format(sub=sub)
    #lines.append(line)
    lines.extend(get_tag(sub.lower(), "###", as_string=False))
    for issue in total:
        url = issue['html_url']
        title = issue['title']
        number = issue['number']
        line = "* [#{number}:]({url}) {title} ".format(title=title,
                                                     number=number,
                                                     url=url)
        lines.append(line)



In [30]:
line

'* [#41:](https://github.com/pysal/mgwr/pull/41) Memory optimization '

In [31]:
table

[['libpysal', 80, 26, 11],
 ['esda', 67, 9, 7],
 ['giddy', 32, 13, 10],
 ['inequality', 0, 0, 0],
 ['pointpats', 25, 7, 7],
 ['spaghetti', 120, 40, 21],
 ['segregation', 456, 105, 78],
 ['mapclassify', 43, 15, 12],
 ['splot', 95, 25, 16],
 ['spreg', 16, 3, 1],
 ['spglm', 14, 3, 3],
 ['spint', 0, 0, 0],
 ['mgwr', 44, 15, 7],
 ['spvcm', 0, 0, 0]]

In [32]:
os.chdir(CWD)

import pandas

In [33]:
df = pandas.DataFrame(table, columns=['package', 'commits', 'total issues', 'pulls'])

In [34]:
df.sort_values(['commits','pulls'], ascending=False)\
  .to_html('./commit_table.html', index=None)

In [35]:
df.sum()

package         libpysalesdagiddyinequalitypointpatsspaghettis...
commits                                                       992
total issues                                                  261
pulls                                                         173
dtype: object

In [36]:
contributor_table = pandas.DataFrame.from_dict(counters).fillna(0).astype(int).T

In [37]:
contributor_table.to_html('./contributor_table.html')

In [38]:
totals = contributor_table.sum(axis=0).T
totals.sort_index().to_frame('commits')

Unnamed: 0,commits
Antti Härkönen,4
Dani Arribas-Bel,5
Eli Knaap,94
Elliott Sales De Andrade,2
Filipe Fernandes,3
James Gaboardi,175
Juan C Duque,8
Levi John Wolf,18
Martin Fleischmann,3
Renan Xavier Cortes,357


In [39]:
totals = contributor_table.sum(axis=0).T
totals.sort_index().to_frame('commits').to_html('./commits_by_person.html')

In [40]:
totals

Antti Härkönen                4
Dani Arribas-Bel              5
Eli Knaap                    94
Elliott Sales De Andrade      2
Filipe Fernandes              3
James Gaboardi              175
Juan C Duque                  8
Levi John Wolf               18
Martin Fleischmann            3
Renan Xavier Cortes         357
Serge Rey                    86
Stefanie Lumnitz             63
Taylor Oshan                  9
Tom Gertin                    1
Wei Kang                    135
Ziqi Li                      24
dtype: int64

In [41]:
n_commits = df.commits.sum()
n_issues = df['total issues'].sum()
n_pulls = df.pulls.sum()

In [42]:
n_commits

992

In [43]:
#Overall, there were 719 commits that closed 240 issues, together with 105 pull requests across 12 packages since our last release on 2017-11-03.
#('{0} Here is a really long '
#           'sentence with {1}').format(3, 5))
line = ('Overall, there were {n_commits} commits that closed {n_issues} issues,'  
    ' together with {n_pulls} pull requests since our last release' 
        ' on {since_date}.\n'.format(n_commits=n_commits, n_issues=n_issues,
        n_pulls=n_pulls, since_date = start_date))

In [44]:
line

'Overall, there were 992 commits that closed 261 issues, together with 173 pull requests since our last release on 2019-01-30.\n'

## append html files to end of changes.md with tags for toc

In [45]:
with open('changes.md', 'w') as fout:
    fout.write(line)
    fout.write("\n".join(lines))
    fout.write(get_tag("Summary Statistics"))
    
    with open('commit_table.html') as table:
        table_lines = table.readlines()
        title = "Package Activity"
        fout.write(get_tag(title,"###"))
        fout.write("\n")
        fout.write("".join(table_lines))
                
    with open('commits_by_person.html') as table:
        table_lines = table.readlines()
        title = "Contributor Activity"
        fout.write(get_tag(title,"###"))
        fout.write("\n")
        fout.write("".join(table_lines))
        
    with open('contributor_table.html') as table:
        table_lines = table.readlines()
        title = "Contributor by Package Activity"
        fout.write(get_tag(title,"###"))
        fout.write("\n")
        fout.write("".join(table_lines))