# Repository overview

In [1]:
import os
import sys
import pandas as pd
from datetime import datetime, timedelta
# Put the parent of the current working directory on the import path,
# presumably so the project-local imports that follow can be resolved.
nb_dir = os.path.dirname(os.getcwd())
already_importable = nb_dir in sys.path
if not already_importable:
    sys.path.append(nb_dir)

import settings

from project_utils import get_project_output_dir_by_name

Set your project and repo config here

In [2]:
# Project name; used to look up the project's output directory.
PROJECT_NAME = 'pydata-project'
# Repository name; substituted into the git log TSV file name.
REPO_NAME = 'pandas'

Load repo TSV file

In [3]:
# Build the path of the repository's git log export inside the project's
# output directory.
project_output_dir = get_project_output_dir_by_name(PROJECT_NAME)
tsv_file_name = 'git_log_{repo}.tsv'.format(repo=REPO_NAME)
tsv_path = os.path.join(project_output_dir, tsv_file_name)

In [4]:
# `DataFrame.from_csv` is deprecated (and removed in pandas 1.0); `read_csv`
# is the supported replacement. `index_col=0` reproduces from_csv's default of
# using the first column (commit_hash) as the index. `infer_datetime_format`
# is dropped because it is deprecated in current pandas and not needed for
# the date columns to be parsed.
data = pd.read_csv(
    tsv_path,
    sep=settings.FIELD_SEPARATOR,
    index_col=0,
    parse_dates=['author_date', 'committer_date'],
)
data.head()

  infer_datetime_format=infer_datetime_format)


Unnamed: 0_level_0,author_name,author_email,author_date,committer_name,committer_email,committer_date,subject
commit_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
95f4f7dc78ac21a132b86b01c31efc5b0fdbceab,jbrockmendel,jbrockmendel@gmail.com,2017-08-17 10:13:36,Jeff Reback,jeff@reback.net,2017-08-17 10:13:36,Replace imports of * with explicit imports (#1...
ecaac87c526f5642389dc36e6ee565fe8d21bfd7,jschendel,jschendel@users.noreply.github.com,2017-08-17 10:10:52,Jeff Reback,jeff@reback.net,2017-08-17 10:10:52,CLN: replace %s syntax with .format in core.dt...
57befd18cb8ea8d641ea88a5c8ef916a09a9a1aa,jbrockmendel,jbrockmendel@gmail.com,2017-08-16 20:52:29,gfyoung,gfyoung17@gmail.com,2017-08-16 20:52:29,CLN: Remove have_pytz (#17266)
6fe68325de93a5f745ff49eac57589d33a1d53c1,Daniel Grady,d.c.grady@gmail.com,2017-08-15 22:44:54,Jeff Reback,jeff@reback.net,2017-08-15 22:44:54,BUG: Fix behavior of argmax and argmin with in...
47b397309e9601640170aedd6f70486a54d638fd,jschendel,jschendel@users.noreply.github.com,2017-08-15 20:42:39,Jeff Reback,jeff@reback.net,2017-08-15 20:42:39,Fix bugs in IntervalIndex.is_non_overlapping_m...


## Repository overview
TODO
- Top 10 contributors (3 months, 2 weeks)
  * How many commits
  * Percentage of commits
- Commits per week chart over X weeks
- Commit message length distribution over all commits
- LOC over time
- Author count over time
- New authors per week for last X weeks
- Files over time
- Directories over time
- Fix commits over time

#### Commit and author count

In [5]:
# One row per commit; distinct author e-mail addresses stand in for authors.
total_commit_count = data.shape[0]
total_author_count = data['author_email'].nunique()

print("Commit count: {}".format(total_commit_count))
print("Author count: {}".format(total_author_count))

Commit count: 15502
Author count: 1058


#### Oldest commit and age of the repository

In [6]:
# NOTE(review): taking the last row assumes the log is ordered newest-first,
# so the final row is the oldest commit - confirm against the export step.
oldest_commit_datetime = data['author_date'].iloc[-1].to_pydatetime()
age_delta = datetime.now() - oldest_commit_datetime

# A 365-day year is not exact, but precise enough for this purpose.
DAYS_PER_YEAR = 365
age_delta_years, age_delta_extra_days = divmod(age_delta.days, DAYS_PER_YEAR)

print("Repository is {years} year(s) and {days} day(s) old.".format(years=age_delta_years, days=age_delta_extra_days))
print("First commit was written {}".format(oldest_commit_datetime))

Repository is 8 year(s) and 26 day(s) old.
First commit was written 2009-07-31 15:07:16


### TOP Contributors

In [7]:
# Number of authors kept in each "top contributors" table.
TOP_COUNT = 10

In [8]:
def get_top_contributors(df, top_count=None):
    """Rank authors by commit count and report each author's share of commits.

    Parameters
    ----------
    df : pandas.DataFrame
        Commit log with one row per commit and an 'author_email' column.
    top_count : int, optional
        Number of authors to keep. When omitted, the notebook-level
        ``TOP_COUNT`` constant is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'author_email' and sorted by descending commit count, with
        the columns 'commit_count' and 'percentage_of_commits'. Percentages
        are relative to all rows of ``df``, not only the returned authors.
    """
    if top_count is None:
        # Resolved at call time so changing TOP_COUNT in the config cell
        # takes effect without redefining this function.
        top_count = TOP_COUNT

    commit_count = len(df)
    top_committers = (
        df.groupby('author_email')
        .size()
        .sort_values(ascending=False)
        .head(top_count)  # explicit positional truncation
        .to_frame(name='commit_count')
    )
    top_committers['percentage_of_commits'] = (top_committers['commit_count'] / commit_count) * 100.0
    return top_committers

#### All time top contributors

In [9]:
# Rank authors over the complete commit history.
alltime_top_contributors = get_top_contributors(df=data)

Share of all commits made by the top contributors

In [10]:
# Column-wise totals: commits made by the top authors and their combined share.
alltime_top_contributors.sum(axis=0)

commit_count             11442.000000
percentage_of_commits       73.809831
dtype: float64

In [11]:
# Keep only the commits authored by one of the all-time top contributors.
is_top_author = data['author_email'].isin(alltime_top_contributors.index)
alltime_top_commits = data.loc[is_top_author]

In [12]:
# Preview only the first rows: the frame holds every commit by the top
# authors, which is far too large to render usefully as a cell output.
alltime_top_commits.head(10)

Unnamed: 0_level_0,author_name,author_email,author_date,committer_name,committer_email,committer_date,subject
commit_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0fafd4f8f7967f83845a74905d7e3ed9432807b6,Phillip Cloud,cpcloud@gmail.com,2017-08-14 10:31:41,Jeff Reback,jeff@reback.net,2017-08-14 10:31:41,ENH/PERF: Remove frequency inference from .dt ...
dbffba81914c922925e098411d0f773a759f7992,Joris Van den Bossche,jorisvandenbossche@gmail.com,2017-08-10 10:37:39,Jeff Reback,jeff@reback.net,2017-08-10 10:37:39,CLN/ASV clean-up frame stat ops benchmarks (#1...
929c66fd74da221078a67ea7fd3dbcbe21d642e0,Joris Van den Bossche,jorisvandenbossche@gmail.com,2017-08-04 07:44:53,GitHub,noreply@github.com,2017-08-04 07:44:53,REF: repr - allow block to override values tha...
55ae03986dab53f39c1df2b8a5e3532f89ad22be,Jeff Reback,jeff@reback.net,2017-08-03 12:07:18,Jeff Reback,jeff@reback.net,2017-08-03 12:07:18,DOC: whatsnew 0.21.0 fixes
3fadc62e75bb09b2f39ddd2169baa182fb2ea720,Jeff Reback,jeff@reback.net,2017-08-03 01:02:12,GitHub,noreply@github.com,2017-08-03 01:02:12,TST: test for categorical index monotonicity (...
8e6b09ff3a09de58e82da6dcabbfddba61a743d6,Jeff Reback,jeff@reback.net,2017-08-02 11:22:17,Jeff Reback,jeff@reback.net,2017-08-02 11:22:17,"DOC: doc typos, xref #15838"
f4330611ff5ac1cbb4a89c4a7dab3d0900f9e64a,Jeff Reback,jeff@reback.net,2017-08-02 09:47:57,GitHub,noreply@github.com,2017-08-02 09:47:57,ENH: add to/from_parquet with pyarrow & fastpa...
ab49d1fcda17cdb5571959a0d85d5ee872638b4c,Jeff Reback,jeff@reback.net,2017-08-01 20:09:22,GitHub,noreply@github.com,2017-08-01 20:09:22,CI: bump version of xlsxwriter to 0.5.2 (#17142)
7358f096ef76207b05bcce0bd02f3a45246e8b09,Jeff Reback,jeff@reback.net,2017-08-01 18:19:16,GitHub,noreply@github.com,2017-08-01 18:19:16,COMPAT: make sure use_inf_as_null is deprecate...
b03f7e52e859c5d20141a47aa4d6880a321af84d,Joris Van den Bossche,jorisvandenbossche@gmail.com,2017-07-29 21:58:03,GitHub,noreply@github.com,2017-07-29 21:58:03,DOC: further clean-up null/na changes (#17113)


#### Last three months top contributors

In [13]:
# "Three months" is approximated as 3 * 30 days, counted back from now.
three_months_ago = datetime.today() - timedelta(days=3 * 30)
authored_in_last_three_months = data['author_date'] >= three_months_ago
last_three_months_commits = data[authored_in_last_three_months]

last_three_months_top_contributors = get_top_contributors(df=last_three_months_commits)
last_three_months_top_contributors

Unnamed: 0_level_0,commit_count,percentage_of_commits
author_email,Unnamed: 1_level_1,Unnamed: 2_level_1
jeff@reback.net,44,16.0
gfyoung17@gmail.com,39,14.181818
TomAugspurger@users.noreply.github.com,14,5.090909
jbrockmendel@gmail.com,14,5.090909
jorisvandenbossche@gmail.com,8,2.909091
dsm054@gmail.com,8,2.909091
me@pietrobattiston.it,7,2.545455
terji78@gmail.com,6,2.181818
jschendel@users.noreply.github.com,6,2.181818
cbartak@gmail.com,4,1.454545


#### Last two weeks top contributors

In [14]:
# Commits authored within the 14 days preceding now.
two_weeks_ago = datetime.today() - timedelta(weeks=2)
authored_in_last_two_weeks = data['author_date'] >= two_weeks_ago
last_two_weeks_commits = data[authored_in_last_two_weeks]

In [15]:
# Rank authors using only the commits from the last two weeks.
last_two_weeks_top_contributors = get_top_contributors(df=last_two_weeks_commits)
last_two_weeks_top_contributors

Unnamed: 0_level_0,commit_count,percentage_of_commits
author_email,Unnamed: 1_level_1,Unnamed: 2_level_1
jbrockmendel@gmail.com,5,27.777778
jschendel@users.noreply.github.com,4,22.222222
gfyoung17@gmail.com,2,11.111111
terji78@gmail.com,1,5.555556
me@pietrobattiston.it,1,5.555556
matti.picus@gmail.com,1,5.555556
kerncece@gmail.com,1,5.555556
daniel.himmelstein@gmail.com,1,5.555556
d.c.grady@gmail.com,1,5.555556
cpcloud@gmail.com,1,5.555556


### Commits over time