# Repository overview

In [1]:
import os
import sys
import pandas as pd
from datetime import datetime, timedelta
# Put the parent of the current working directory on the import path so the
# project-local modules imported below can be resolved.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import settings
from dataprocessor import load_repo_commits
from project import get_project_output_dir_by_name, get_project_path

Set your project and repo config here

In [2]:
# Project whose path is resolved via get_project_path() when loading commits.
PROJECT_NAME = 'pydata-project'
# Repository within that project whose commits are analysed in this notebook.
REPO_NAME = 'numpy'

Load repo TSV file

In [3]:
# Load the commit table for REPO_NAME from the project's directory.
# NOTE(review): presumably a DataFrame indexed by commit_hash, one row per commit,
# with author_date parsed as datetimes — confirm against dataprocessor.load_repo_commits.
data = load_repo_commits(get_project_path(PROJECT_NAME), REPO_NAME)

Loading commits dataframe for numpy
  infer_datetime_format=infer_datetime_format)


In [4]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0_level_0,author_name,author_email,author_date,committer_name,committer_email,committer_date,subject
commit_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
707f33f6a55076bc12e25e736d910545377420e8,seberg,sebastian@sipsolutions.net,2017-08-22 08:27:08,GitHub,noreply@github.com,2017-08-22 08:27:08,Merge pull request #9586 from b-carter/fix_non...
6aed49135b26fe23620b15cab6c3b99c56545867,Brandon Carter,bcarter@mit.edu,2017-08-21 23:24:24,Brandon Carter,bcarter@mit.edu,2017-08-21 23:24:24,update example in np.nonzero docstring
a0c5f64f4f1a7f6ee80dc9b95aa813df28a87d72,Charles Harris,charlesr.harris@gmail.com,2017-08-19 15:00:56,GitHub,noreply@github.com,2017-08-19 15:00:56,Merge pull request #9582 from seberg/rcond-def...
bd3a2c580e2d5f0a7a958b3e7d942c230648f2e3,Sebastian Berg,sebastian@sipsolutions.net,2017-08-19 13:48:34,Sebastian Berg,sebastian@sipsolutions.net,2017-08-19 14:35:08,ENH: Warn to change lstsq default for rcond
3c887aa5242857ef92870d2988de7c899c6415be,Eric Wieser,wieser.eric@gmail.com,2017-08-18 00:57:23,GitHub,noreply@github.com,2017-08-18 00:57:23,Merge pull request #9581 from MSeifert04/doc_f...


## Repository overview
TODO
- Top 10 contributors (last 3 months, last 2 weeks)
  * How many commits
  * Percentage of commits
- Commits per week chart over X weeks
- Commit message length distribution over all commits
- LOC over time
- Author count over time
- New authors per week for last X weeks
- Files over time
- Directories over time
- Fix commits over time

#### Commit and author count

In [5]:
# Rows are counted as commits; distinct author e-mail addresses are counted as authors.
total_commit_count = data.shape[0]
total_author_count = data['author_email'].nunique()
print("Commit count: {}".format(total_commit_count))
print("Author count: {}".format(total_author_count))

Commit count: 16463
Author count: 697


#### Oldest commit and age of the repository

In [6]:
# The last row is treated as the oldest commit.
# NOTE(review): this assumes the frame is ordered newest-first — confirm against the loader.
oldest_commit_datetime = data.iloc[-1].author_date.to_pydatetime()
age_delta = datetime.now() - oldest_commit_datetime
# A year is approximated as 365 days (leap days ignored) - precise enough for this purpose.
age_delta_years, age_delta_remaining_days = divmod(age_delta.days, 365)
print("Repository is {years} year(s) and {days} day(s) old.".format(years=age_delta_years, days=age_delta_remaining_days))
print("First commit was written {}".format(oldest_commit_datetime))

Repository is 15 year(s) and 253 day(s) old.
First commit was written 2001-12-18 15:45:10


### TOP Contributors

In [7]:
# Number of authors kept in each "top contributors" ranking (read by get_top_contributors).
TOP_COUNT = 10

In [8]:
def get_top_contributors(df, top_count=None):
    """Rank authors by number of commits and keep the most active ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Commit table with one row per commit and an ``author_email`` column.
    top_count : int, optional
        How many authors to keep. Defaults to the notebook-level ``TOP_COUNT``
        when omitted, which preserves the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``author_email`` and sorted by descending commit count, with
        columns ``commit_count`` and ``percentage_of_commits``. The percentage
        is relative to all rows of ``df``, not only the rows of the authors kept.
    """
    if top_count is None:
        # Resolve the global at call time so edits to the config cell take effect.
        top_count = TOP_COUNT
    commit_count = len(df)
    top_committers = (
        df.groupby('author_email')
        .size()
        .sort_values(ascending=False)
        .head(top_count)
        .to_frame(name='commit_count')
    )
    top_committers['percentage_of_commits'] = (top_committers['commit_count'] / commit_count) * 100.0
    return top_committers

#### All time top contributors

In [9]:
# Rank authors over the complete commit history.
alltime_top_contributors = get_top_contributors(data)

Combined commit count and share of all commits (in percent) contributed by the top contributors

In [10]:
# Column-wise totals: combined commit count and combined percentage of the ranked authors.
alltime_top_contributors.sum()

commit_count             10729.000000
percentage_of_commits       65.170382
dtype: float64

In [11]:
# Keep only the commits whose author e-mail appears in the all-time ranking.
is_top_author = data['author_email'].isin(alltime_top_contributors.index)
alltime_top_commits = data.loc[is_top_author]

In [12]:
# Show a bounded preview instead of rendering the whole frame, which holds
# every commit made by the top contributors.
alltime_top_commits.head(10)

Unnamed: 0_level_0,author_name,author_email,author_date,committer_name,committer_email,committer_date,subject
commit_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
707f33f6a55076bc12e25e736d910545377420e8,seberg,sebastian@sipsolutions.net,2017-08-22 08:27:08,GitHub,noreply@github.com,2017-08-22 08:27:08,Merge pull request #9586 from b-carter/fix_non...
a0c5f64f4f1a7f6ee80dc9b95aa813df28a87d72,Charles Harris,charlesr.harris@gmail.com,2017-08-19 15:00:56,GitHub,noreply@github.com,2017-08-19 15:00:56,Merge pull request #9582 from seberg/rcond-def...
bd3a2c580e2d5f0a7a958b3e7d942c230648f2e3,Sebastian Berg,sebastian@sipsolutions.net,2017-08-19 13:48:34,Sebastian Berg,sebastian@sipsolutions.net,2017-08-19 14:35:08,ENH: Warn to change lstsq default for rcond
773ac68f901b0f87bb48fd10f11b9c45b21ca031,Charles Harris,charlesr.harris@gmail.com,2017-08-17 15:01:46,GitHub,noreply@github.com,2017-08-17 15:01:46,Merge pull request #9577 from bashtage/dirichl...
b9e5ea63c707747d7999a644be8b30d95eaceded,Charles Harris,charlesr.harris@gmail.com,2017-08-17 14:42:39,GitHub,noreply@github.com,2017-08-17 14:42:39,Merge pull request #9575 from pv/cabs-blacklist
3cfe9360e827dde42ff736e34760d90a29394819,Pauli Virtanen,pav@iki.fi,2017-08-17 12:14:26,Pauli Virtanen,pav@iki.fi,2017-08-17 12:14:26,BUG: core: remove extra return statement
36d969994d59cd0ce960e773b230b8e48082adfc,Pauli Virtanen,pav@iki.fi,2017-08-16 22:53:11,Pauli Virtanen,pav@iki.fi,2017-08-17 11:21:35,BUG: core: blacklist MSVC cabs* on win32
1fff440ba7c3da6cfe9f5d5b534e22fa43f18482,Pauli Virtanen,pav@iki.fi,2017-08-16 22:52:49,Pauli Virtanen,pav@iki.fi,2017-08-17 11:21:35,TST: core: add a test that exercise untested n...
26c79664d2edd48e1777cc4b31ecb952e2ec30d5,Charles Harris,charlesr.harris@gmail.com,2017-08-16 19:19:17,GitHub,noreply@github.com,2017-08-16 19:19:17,Merge pull request #9574 from pv/fpflag
3deb8fa4f4d25e434c86188593f95dfc2fed041c,Pauli Virtanen,pav@iki.fi,2017-08-16 16:15:18,Pauli Virtanen,pav@iki.fi,2017-08-16 18:39:14,TST: add FPU mode check also for pytest


#### Last three months top contributors

In [13]:
# "Three months" is approximated as 3 * 30 days counted back from the current time.
three_months_cutoff = datetime.today() - timedelta(days=3 * 30)
authored_since_cutoff = data['author_date'] >= three_months_cutoff
last_three_months_commits = data[authored_since_cutoff]

# Percentages are relative to the commits inside this window only.
last_three_months_top_contributors = get_top_contributors(last_three_months_commits)
last_three_months_top_contributors

Unnamed: 0_level_0,commit_count,percentage_of_commits
author_email,Unnamed: 1_level_1,Unnamed: 2_level_1
charlesr.harris@gmail.com,99,34.982332
wieser.eric@gmail.com,71,25.088339
ralf.gommers@gmail.com,14,4.946996
sebastian@sipsolutions.net,12,4.240283
juliantaylor108@gmail.com,8,2.826855
licht-t@math.dis.titech.ac.jp,8,2.826855
pav@iki.fi,7,2.473498
jaimefrio@google.com,6,2.120141
eldering@jive.eu,4,1.413428
ealloc@gmail.com,4,1.413428


#### Last two weeks top contributors

In [14]:
# Commits authored within the 14 days preceding the current time.
two_weeks_cutoff = datetime.today() - timedelta(days=14)
last_two_weeks_commits = data[data['author_date'] >= two_weeks_cutoff]

In [15]:
# Rank authors within the two-week window; percentages are relative to that window's commits.
last_two_weeks_top_contributors = get_top_contributors(last_two_weeks_commits)
last_two_weeks_top_contributors

Unnamed: 0_level_0,commit_count,percentage_of_commits
author_email,Unnamed: 1_level_1,Unnamed: 2_level_1
charlesr.harris@gmail.com,13,36.111111
pav@iki.fi,6,16.666667
wieser.eric@gmail.com,3,8.333333
sebastian@sipsolutions.net,3,8.333333
ralf.gommers@googlemail.com,1,2.777778
nico.schloemer@gmail.com,1,2.777778
mike@nolta.net,1,2.777778
michaelseifert04@yahoo.de,1,2.777778
kevin.k.sheppard@gmail.com,1,2.777778
jrbourbeau@gmail.com,1,2.777778


### Commits over time