In [None]:
import numpy 
import pandas
import datetime
import random
import calendar
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.animation import FuncAnimation
from matplotlib.widgets import MultiCursor
from IPython.display import HTML
from matplotlib.widgets import Slider, RadioButtons
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib
# Raise the size cap on embedded animation output: 2**128 is so large that the
# limit is effectively disabled for the animations rendered in this notebook.
matplotlib.rcParams['animation.embed_limit'] = 2**128
# Display animation objects through their JavaScript/HTML ('jshtml') representation.
matplotlib.rcParams['animation.html'] = 'jshtml'
# Show up to 200 columns when a DataFrame is displayed.
pandas.options.display.max_columns = 200

In [None]:
# Install ipympl and enable the widgets notebook extension via shell commands.
# NOTE(review): every plotting cell in this notebook selects `%matplotlib inline`,
# so these two commands may no longer be needed — confirm before removing.
!pip install ipympl
!jupyter nbextension enable --py widgetsnbextension

# Reading, validating & processing data

In [None]:
# Seed the stdlib RNG: it drives both the random row sampling in read_csv and
# the per-repo colour choice, so a fresh "Restart & Run All" gives the same
# sample and the same colours every time.
SEED = 42
random.seed(SEED)

# Layout of the raw `date` strings in the CSV.
DATE_FORMAT = "%a %b %d %H:%M:%S %Y %z"
# Single-value parser kept for ad-hoc use; the column itself is parsed in one
# vectorised call after loading (see below).
dateparser = lambda x: pandas.to_datetime(datetime.datetime.strptime(x, DATE_FORMAT), utc=True)

p = 0.01 # number of rows in full file = 3240997
df = pandas.read_csv('/kaggle/input/github-commit-messages-dataset/full.csv',
                     usecols=['author', 'date', 'repo', 'commit', 'message'],
                     # nrows=100,
                     # Row 0 is the header and is always kept; every other row is
                     # kept with probability p.
                     skiprows=lambda i: i > 0 and random.random() > p
                    )

# Parse the timestamps after loading instead of through read_csv's `date_parser=`
# argument (deprecated in pandas 2.x). All values are normalised to UTC.
df['date'] = pandas.to_datetime(df['date'], format=DATE_FORMAT, utc=True)

# One random (but, thanks to the seed and the sorted iteration, reproducible)
# hex colour per repo. The digit pool avoids the extremes 0/1 and E/F.
color = {}
for repo in sorted(df.repo.dropna().unique()):
    color[repo] = '#' + ''.join([random.choice('23456789ABCD') for j in range(6)])

In [None]:
# Rows without a commit message are discarded before any check is made.
df.dropna(subset=['message'], inplace=True)

# Invariants the rest of the notebook relies on:
# 1. no row has a missing value in any of the loaded columns,
assert not df.isna().any(axis=1).any()
# 2. every commit hash occurs exactly once,
assert df.commit.nunique(dropna=False) == len(df)
# 3. every message is truthy, i.e. not an empty string.
assert df.message.map(bool).all()

In [None]:
# Number of commits per (repo, year, month). Year and month are taken from the
# commit timestamp; the resulting series keeps the name `date`, so downstream
# code reads the *count* from a column called `date`.
monthly_counts = (
    df.assign(year=df.date.dt.year.astype('int64'),
              month=df.date.dt.month.astype('int64'))
      .groupby(['repo', 'year', 'month'])
      .date.count()
)

min_date = df.date.min()
max_date = df.date.max()

# Every calendar month from the month of the first commit to the month of the
# last commit, inclusive, in chronological order.
first_month = (min_date.year, min_date.month)
last_month = (max_date.year, max_date.month)
months_in_range = [
    (year, month)
    for year in range(min_date.year, max_date.year + 1)
    for month in range(1, 13)
    if first_month <= (year, month) <= last_month
]

# Give every repo a row for every month in range, filling months without any
# commit with 0. A single reindex replaces growing the frame one row at a time.
full_index = pandas.MultiIndex.from_tuples(
    [(repo, year, month)
     for repo in monthly_counts.index.get_level_values('repo').unique()
     for year, month in months_in_range],
    names=['repo', 'year', 'month'],
)
processed_df = (
    monthly_counts.reindex(full_index, fill_value=0)
                  .reset_index()
                  .sort_values(['repo', 'year', 'month'])
)

# Human-readable month label: '<year> <first three letters of the month name>'.
processed_df['year_month'] = (
    processed_df.year.astype(str) + ' '
    + processed_df.month.map(lambda m: calendar.month_name[m][:3])
)
repos = processed_df.repo.unique().tolist()

# Month labels in chronological order; used as animation frames.
year_months = [str(year) + ' ' + calendar.month_name[month][:3] for year, month in months_in_range]

# Total number of commits vs. time per repo (Animation)

In [None]:
%matplotlib inline
# Running total of commits per repo, month by month.
# Only the numeric `date` column (the monthly commit count) is aggregated:
# `processed_df` also carries the string column `year_month`, which a frame-wide
# sum()/cumsum() would try to aggregate too (and which newer pandas versions
# reject for cumsum).
total_df = (
    processed_df.groupby(['repo', 'year', 'month']).date.sum()
                .groupby(level='repo').cumsum()
                .reset_index()
)
# Rebuild the '<year> <Mon>' label for the cumulative frame.
total_df['year_month'] = (
    total_df.year.astype(str) + ' '
    + total_df.month.map(lambda m: calendar.month_name[m][:3])
)
# Series indexed by (year_month, repo) -> cumulative commits up to that month.
time_repo_wise_df = total_df.groupby(['year_month', 'repo']).date.sum()

del total_df
# Number of bars shown per animation frame.
max_number = 6

def get_month_plot(year_month):
    """Redraw `ax` with the top `max_number` repos by cumulative commits for one month.

    Used as the per-frame callback of the bar-chart animation.

    Parameters
    ----------
    year_month : str
        First-level key of `time_repo_wise_df` (built as '<year> <3-letter month>').

    The function draws on the module-level axes `ax` and returns nothing.
    """
    # Largest cumulative totals for this month, biggest first, capped at max_number.
    top = time_repo_wise_df[year_month].sort_values(ascending=False)[:max_number]
    # Reverse so the biggest total is drawn last, i.e. as the top-most bar.
    monthly_values = list(zip(top.index, top))[::-1]
    names = [name for name, _ in monthly_values]
    totals = [total for _, total in monthly_values]

    ax.clear()
    ax.barh(names, totals, color=[color[name] for name in names])
    ax.set_yticks([])
    # Small horizontal gap so the text labels do not sit exactly on the bar end.
    dx = top.max() / 200
    for i, (name, value) in enumerate(monthly_values):
        ax.text(value+dx, i, f'{value:,.0f}',  size=14, ha='left',  va='center')
        ax.text(value-dx, i, name, weight=600, size=14, ha='right', va='bottom')
    # Large, grey month label anchored in axes coordinates.
    ax.text(1, 0.4, year_month, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.set_title('Total number of commits till {} - Top {}'.format(year_month, max_number))
    ax.grid(which='major', axis='x', linestyle='-')
    # Remove the frame from this axes explicitly; plt.box() would instead act on
    # whichever axes pyplot currently considers active.
    ax.set_frame_on(False)


# Canvas for the bar-chart animation; `get_month_plot` redraws `ax` once per
# month label in `year_months`.
fig = plt.figure(figsize=(15, 8))
ax = fig.subplots()
anim = FuncAnimation(fig, get_month_plot, frames=year_months)
# Leaving the animation as the cell's last expression makes the notebook display it.
anim

# Number of commits vs. time per repo

In [None]:
%matplotlib inline
# Commits per repo per calendar year (the count lives in the `date` column),
# ordered by repo and then by year.
year_df = (
    processed_df
    .groupby(['repo', 'year'])['date']
    .sum()
    .reset_index()
    .sort_values(by=['repo', 'year'])
)

def get_repo_time_plot(repo):
    """Redraw `ax2` with the yearly commit counts of a single repo.

    Used as the per-frame callback of the per-repo animation.

    Parameters
    ----------
    repo : str
        Repo name; selects rows of `year_df` and the line colour from `color`.

    The function draws on the module-level `fig2`/`ax2` and returns nothing.
    """
    repo_df = year_df[year_df.repo == repo]
    ax2.clear()
    ax2.plot(repo_df.year, repo_df.date, label=repo, color=color[repo])
    # Pad both axes by one unit so the line does not touch the frame.
    ax2.set_xlim(repo_df.year.min() - 1, repo_df.year.max() + 1)
    ax2.set_ylim(repo_df.date.min() - 1, repo_df.date.max() + 1)
    ax2.set_xlabel('Year', size=20)
    ax2.set_ylabel('Commits', size=20)
    ax2.set_title(repo, size=20)
    # Adjust this figure explicitly rather than pyplot's "current" figure.
    fig2.subplots_adjust(left=0.25, bottom=0.25)
    fig2.canvas.draw_idle()


# Canvas for the per-repo animation: one frame per entry of `repos`, each drawn
# by `get_repo_time_plot`.
fig2 = plt.figure(figsize=(16, 8))
ax2 = fig2.subplots()
anim4 = FuncAnimation(fig2, get_repo_time_plot, frames=repos)
# Last expression of the cell, so the notebook displays the animation.
anim4

In [None]:
%matplotlib inline
# Commits per repo per calendar year (count stored in the `date` column),
# ordered by repo and then by year.
year_df = (
    processed_df
    .groupby(['repo', 'year'])['date']
    .sum()
    .reset_index()
    .sort_values(by=['repo', 'year'])
)

# Wide figure whose axes occupy only the left part of the canvas.
fig4 = plt.figure(figsize=(20, 10))
ax4 = fig4.add_axes([0.1, 0.1, 0.5, 0.75])

def get_all_year(year):
    """Redraw `ax4` with one line per repo, showing yearly commits up to `year`.

    Per-frame callback of the all-repos animation; `year` is the last year
    (inclusive) whose data is drawn in this frame.
    """
    ax4.clear()
    ax4.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:.0f}'))

    # Restrict to the years visible in this frame once, then split by repo.
    visible = year_df[year_df.year <= year]
    for repo_name in repos:
        history = visible[visible.repo == repo_name]
        ax4.plot(history.year, history.date, label=repo_name, figure=fig4, color=color[repo_name])

    ax4.set_title('Number of commits per year', size=20)
    ax4.set_xlabel('Year', size=20)
    ax4.set_ylabel('Number of commits', size=20)
    # Legend is anchored just outside the right edge of the axes.
    ax4.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# One frame per calendar year covered by the data, first to last inclusive.
anim2 = FuncAnimation(
    fig4,
    get_all_year,
    frames=range(processed_df.year.min(), processed_df.year.max() + 1),
)
# Last expression of the cell, so the notebook displays the animation.
anim2

# Number of commits per repo

In [None]:
# Total commits per repo, largest share first.
s = processed_df.groupby('repo')['date'].sum().sort_values(ascending=False)

pie_fig, pie_ax = plt.subplots(figsize=(12, 8))
pie_ax.pie(
    s.tolist(),
    labels=s.index.tolist(),
    autopct='%1.1f%%',   # percentage label with one decimal place
    startangle=90,
)
pie_ax.axis('equal')
plt.show()

# Number of unique users vs. Number of commits across repos

In [None]:
# Distinct commits and distinct authors per repo, busiest repo first.
user_commit_df = (
    df.groupby('repo')[['commit', 'author']]
      .nunique()
      .sort_values('commit', ascending=False)
)

In [None]:
# One point per repo: x = distinct commits, y = distinct authors, coloured with
# the repo's entry in `color`.
scatter_fig, scatter_axis = plt.subplots(figsize=(10, 5))
scatter_axis.scatter(
    user_commit_df.commit,
    user_commit_df.author,
    c=[color[name] for name in user_commit_df.index],
)
scatter_axis.set_title('Number of unique users vs. Number of commits', size=20)
scatter_axis.set_xlabel('Number of commits', size=20)
scatter_axis.set_ylabel('Number of users', size=20)
plt.show()

### A single point, torvalds/linux, dominates the scale of the plot. Excluding it, the scatter plot looks as follows

In [None]:
# Same scatter plot without the torvalds/linux outlier.
# errors='ignore': the rows are randomly sampled, so the repo is not guaranteed
# to be present; a missing label must not abort the cell with a KeyError.
user_commit_df2 = user_commit_df.drop(labels=['torvalds/linux'], errors='ignore')
scatter_fig2, scatter_axis2 = plt.subplots(figsize=(10, 5))
scatter_axis2.scatter(user_commit_df2.commit, user_commit_df2.author, c=[color[repo] for repo in user_commit_df2.index])
scatter_axis2.set_xlabel('Number of commits', size=20)
scatter_axis2.set_ylabel('Number of users', size=20)
scatter_axis2.set_title('Number of unique users vs. Number of commits without torvalds/linux', size=20)
plt.show()

# Users contributing to more than one repo

In [None]:
# Distinct (non-null) authors of each repo, collected in a single pass over `df`
# instead of re-filtering and re-grouping the whole frame for every repo pair.
authors_by_repo = {
    repo_name: set(authors.dropna())
    for repo_name, authors in df.groupby('repo').author
}

# overlap[i][j] for i != j: number of authors who committed to both repos[i] and repos[j].
# overlap[i][i]: number of authors of repos[i] who committed to no other repo.
overlap = [[0] * len(repos) for _ in repos]
for i, repo_i in enumerate(repos):
    own_authors = authors_by_repo.get(repo_i, set())
    shared_with_any = set()
    for j, repo_j in enumerate(repos):
        if i == j:
            continue
        shared = own_authors & authors_by_repo.get(repo_j, set())
        overlap[i][j] = len(shared)
        shared_with_any |= shared
    overlap[i][i] = len(own_authors - shared_with_any)

# Plain integer frame; no Python sets are stored in cells and no element-wise
# post-processing is needed.
heatmap_df = pandas.DataFrame(overlap, index=repos, columns=repos)
heatmap_df

### Inference: Users very rarely contribute across projects

# WordCloud of various commit messages

In [None]:
%matplotlib inline
# One lower-cased text blob per repo. Messages are joined with a space so the
# last word of one message is not fused with the first word of the next.
text_df = df.groupby('repo').message.apply(lambda x: ' '.join(x).lower())
# Default word-cloud stop words plus a few terms excluded from the clouds.
stopwords = set(STOPWORDS)
stopwords.update(['pr', 'close', 'commit', 'spark'])

def get_wordcloud(repo):
    """Redraw `cloud_ax` with the word cloud of one repo's commit messages.

    Per-frame callback of the word-cloud animation.

    Parameters
    ----------
    repo : str
        Repo name; key into `text_df`, also used as the plot title.

    The function draws on the module-level `cloud_fig`/`cloud_ax` and returns nothing.
    """
    cloud_ax.clear()
    # text_df is already lower-cased when it is built, so no further
    # normalisation is needed here.
    text = text_df.loc[repo]
    wordcloud = WordCloud(stopwords=stopwords).generate(text)
    cloud_ax.imshow(wordcloud, interpolation='bilinear')
    cloud_ax.axis("off")
    cloud_ax.set_title(repo, size=20)
    # Adjust this figure explicitly rather than pyplot's "current" figure.
    cloud_fig.subplots_adjust(left=0.25, bottom=0.25)
    cloud_fig.canvas.draw_idle()


# Canvas for the word-cloud animation: one frame per entry of `repos`, each
# drawn by `get_wordcloud`.
cloud_fig = plt.figure(figsize=(16, 8))
cloud_ax = cloud_fig.subplots()
cloud_anim = FuncAnimation(cloud_fig, get_wordcloud, frames=repos)
# Last expression of the cell, so the notebook displays the animation.
cloud_anim