# Dissertation writing analysis

## This notebook analyzes the words changed in my dissertation on a commit-by-commit basis

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Default (width, height) in inches shared by the plots in this notebook.
figsize = (15, 9)

### Dump the statistics using git and texcount

In [None]:
# Run the helper script that extracts per-commit statistics.
# NOTE(review): presumably this writes timestamps.list and word_stats.list,
# which the loading cells below read -- confirm against dump_stats.sh.
! bash ./dump_stats.sh

### Load in the statistics

In [None]:
# Headerless two-column CSV: commit identifier and its timestamp string.
timestamps = pd.read_csv(
    "timestamps.list",
    names=["commit", "timestamp"],
    header=None,
)

In [None]:
# Headerless two-column CSV: commit identifier and the word count at that commit.
words = pd.read_csv(
    "word_stats.list",
    names=['commit', 'words'],
    header=None,
)

### Join the tables, convert to datetime types

In [None]:
# Inner-join the word counts with the commit timestamps on the commit column.
word_stamps = words.merge(timestamps, on='commit', sort=False)
# Parse the timestamp strings (no UTC offset in the format) into datetimes.
times = pd.to_datetime(word_stamps['timestamp'], format='%Y-%m-%dT%H:%M:%S')
# Index by commit time in chronological order, then label the naive times as
# UTC and convert them to America/Denver local time.
word_stamps = (
    word_stamps
    .set_index(times)
    .sort_index()
    .tz_localize('UTC')
    .tz_convert("America/Denver")
)

### Also find the number of words added by commit

In [None]:
# Change in word count relative to the previous commit in time order;
# the first commit has no predecessor, so its value is NaN.
word_stamps['dword'] = word_stamps['words'].diff(periods=1)

### Plot the word count as a function of commit

In [None]:
# Preview the first 20 commits of the joined table.
word_stamps.head(n=20)

In [None]:
# Per-commit change in word count over time. Uses the shared `figsize`
# constant (instead of a hard-coded tuple) so every figure in the notebook
# is resized from one place.
word_stamps.plot(y='dword', figsize=figsize)

In [None]:
# Word count over time, with a dot marking each commit.
word_stamps.plot(
    y='words',
    figsize=figsize,
    grid=True,
    marker='.',
)

#### September 2015 looks particularly fruitful; let's zoom in on September 22–24

In [None]:
# Select the rows from 22 through 24 September 2015 (string date slicing on a
# DatetimeIndex includes both end dates) and plot their word counts.
(
    word_stamps
    .loc['2015-09-22':'2015-09-24']
    .plot(y='words', marker='o', figsize=figsize)
)

In [None]:
# word_stamps['2015-09-23':'2015-09-23']

### How about the number of commits per day (and per month)?

In [None]:
# word_stamps.resample('1D').count()['commit']

In [None]:
# Number of commits in each calendar-day bin.
commits_per_day = word_stamps['commit'].resample('1D').count()
commits_per_day.plot(grid=True, figsize=figsize)

In [None]:
# Number of commits in each monthly bin.
commits_per_month = word_stamps['commit'].resample('1M').count()
commits_per_month.plot(grid=True, figsize=figsize)

### What about the number of commits per day of week?

In [None]:
# Count commits per local weekday and show the bars Sunday through Saturday.
weekdays = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
# `DatetimeIndex.weekday_name` no longer exists in current pandas;
# `day_name()` is its replacement and yields the same English day names.
commits_by_weekday = (
    word_stamps['commit']
    .groupby(word_stamps.index.day_name(), sort=False)
    .count()
)
# fill_value=0 keeps a weekday that has no commits on the axis as a zero-height
# bar instead of a NaN entry.
commits_by_weekday.reindex(weekdays, fill_value=0).plot.bar(grid=True, figsize=figsize)

In [None]:
# Count commits per ISO (year, week).
#
# The ISO year is used together with the ISO week: pairing the calendar year
# with the ISO week misfiles commits around New Year (e.g. 1 Jan 2016 belongs
# to ISO week 53 of 2015, not "week 53 of 2016").
# `Timestamp.isocalendar()` is used per commit because `DatetimeIndex.weekofyear`
# is no longer available in current pandas.
iso_year_week = np.array(
    [stamp.isocalendar()[:2] for stamp in word_stamps.index], dtype=int
).reshape(-1, 2)  # reshape keeps the (n, 2) shape even when there are no commits
# The keys are passed as a list: a tuple would be treated as one single key
# rather than "group by both of these".
commits_by_week = (
    word_stamps['commit']
    .groupby([iso_year_week[:, 0], iso_year_week[:, 1]])
    .count()
)
commits_by_week.plot.bar(figsize=figsize)

### Alright, how about the local time and weekday for each commit?

In [None]:
# For every commit: seconds elapsed since local midnight, and the weekday
# number (pandas numbers weekdays from Monday=0).
commit_index = word_stamps.index
seconds_since_midnight = (
    commit_index.hour * 3600
    + commit_index.minute * 60
    + commit_index.second
)
df = pd.DataFrame({
    'time': seconds_since_midnight,
    'weekday': commit_index.weekday,
})
df.head()

In [None]:
# Scatter each commit's time of day (seconds after midnight) against its weekday number.
df.plot(kind='scatter', x='weekday', y='time', figsize=figsize)