# Repository overview

In [None]:
import os
import sys
import pandas as pd
from datetime import datetime, timedelta
# Put the parent of the current working directory on the import path
# (presumably so the project-local imports that follow can be resolved).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import settings
from dataprocessor import load_repo_commits
from project import get_project_output_dir_by_name, get_project_path

Set your project and repo config here

In [None]:
# Project name; passed to get_project_path() when the commit data is loaded.
PROJECT_NAME = 'pydata-project'
# Repository name; passed to load_repo_commits() to select which repo to load.
REPO_NAME = 'numpy'

Load repo TSV file

In [None]:
# Resolve the configured project's path, then load the commits of the selected repo.
project_path = get_project_path(PROJECT_NAME)
data = load_repo_commits(project_path, REPO_NAME)

In [None]:
# Preview the first few rows of the loaded commit data.
data.head()

## Repository overview
TODO
- Top 10 contributors (3 months, 2 weeks)
  * How many commits
  * Percentage of commits
- Commits per week chart over X weeks
- Commit message length distribution over all commits
- LOC over time
- Author count over time
- New authors per week for last X weeks
- Files over time
- Directories over time
- Fix commits over time

#### Commit and author count

In [None]:
# One row per commit, so the row count is the commit count.
total_commit_count = data.shape[0]
print("Commit count: {}".format(total_commit_count))
# Authors are identified by their distinct e-mail addresses.
author_emails = data['author_email']
total_author_count = author_emails.nunique()
print("Author count: {}".format(total_author_count))

#### Oldest commit and age of the repository

In [None]:
# The last row is treated as the oldest commit.
# NOTE(review): this assumes `data` is ordered newest-first — confirm against load_repo_commits.
oldest_commit_datetime = data['author_date'].iloc[-1].to_pydatetime()
age_delta = datetime.now() - oldest_commit_datetime
# Split the age into whole 365-day "years" plus leftover days.
# This is not exact (leap years are ignored) - precise enough for this purpose.
age_delta_years, age_delta_remaining_days = divmod(age_delta.days, 365)
print("Repository is {years} year(s) and {days} day(s) old.".format(
    years=age_delta_years,
    days=age_delta_remaining_days,
))
print("First commit was written {}".format(oldest_commit_datetime))

### TOP Contributors

In [None]:
# Number of contributors that get_top_contributors() keeps in its result.
TOP_COUNT = 10

In [None]:
def get_top_contributors(df, top_count=None):
    """Return the most active committers in ``df`` and their share of commits.

    Rows are grouped by the ``author_email`` column and the authors are
    ranked by the number of rows (commits) they account for.

    Parameters
    ----------
    df : pandas.DataFrame
        Commit data; must contain an ``author_email`` column.
    top_count : int, optional
        Maximum number of contributors to return.  When omitted, the
        notebook-level ``TOP_COUNT`` setting is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by author e-mail and sorted by descending commit count, with
        the columns ``commit_count`` and ``percentage_of_commits``.  The
        percentage is relative to all rows of ``df``, not only to the commits
        of the returned authors.

    Raises
    ------
    ValueError
        If ``top_count`` is negative.
    """
    if top_count is None:
        # Looked up at call time so changing TOP_COUNT in the notebook takes effect.
        top_count = TOP_COUNT
    if top_count < 0:
        # A negative count would silently drop rows from the end instead of limiting the result.
        raise ValueError("top_count must be non-negative, got {}".format(top_count))

    total_commits = len(df)
    commits_per_author = df.groupby('author_email').size()
    top_committers = (
        commits_per_author
        # Stable sort so authors tied at the cutoff are ordered deterministically.
        .sort_values(ascending=False, kind='mergesort')
        # head() is explicitly positional, unlike [:n] on a label-indexed Series.
        .head(top_count)
        .to_frame(name='commit_count')
    )
    top_committers['percentage_of_commits'] = (top_committers['commit_count'] / total_commits) * 100.0
    return top_committers

#### All time top contributors

In [None]:
# Rank contributors over the full set of loaded commits.
alltime_top_contributors = get_top_contributors(data)

Top contributors' combined share of all contributions

In [None]:
# Column sums: total commits by the top contributors and their combined percentage of all commits.
alltime_top_contributors.sum()

In [None]:
# Select the commits whose author e-mail belongs to one of the all-time top contributors.
is_top_author = data['author_email'].isin(alltime_top_contributors.index)
alltime_top_commits = data.loc[is_top_author]

In [None]:
# Display the commits authored by the all-time top contributors.
alltime_top_commits

#### Last three months top contributors

In [None]:
# "Three months" is approximated as 3 * 30 days counted back from now.
three_months_ago = datetime.today() - timedelta(days=3 * 30)
authored_recently = data['author_date'] >= three_months_ago
last_three_months_commits = data[authored_recently]

# Rank contributors within that window and display the result.
last_three_months_top_contributors = get_top_contributors(last_three_months_commits)
last_three_months_top_contributors

#### Last two weeks top contributors

In [None]:
# Keep only the commits authored within the last 14 days.
two_weeks_ago = datetime.today() - timedelta(weeks=2)
last_two_weeks_commits = data[data['author_date'] >= two_weeks_ago]

In [None]:
# Rank contributors within the two-week window and display the result.
last_two_weeks_top_contributors = get_top_contributors(last_two_weeks_commits)
last_two_weeks_top_contributors

### Commits over time

In [None]:
# Wrap the author dates in a DatetimeIndex so date components (e.g. .hour) can serve as grouping keys.
author_date_index = pd.DatetimeIndex(data['author_date'])

In [None]:
# Group the commits by the hour of day of their author date and count non-null values per column.
data.groupby(author_date_index.hour).count()