## Preparation

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import re

In [None]:
# Install the plotting and widget packages through an IPython shell escape
# (the leading "!" hands the rest of the line to the system shell).
# NOTE(review): this cell sits after the cell that imports plotly.express;
# on a fresh environment it presumably has to be run first - confirm.
!pip install plotly-express ipywidgets jupyter-dash

## Topic analysis

In [None]:
def split_by_topics(row):
    """Attach the parsed topic names of a repository record as ``row['topic']``.

    ``row['topics']`` is scanned for single-quoted runs of word characters
    and hyphens, e.g. ``"['cli', 'web-framework']"`` yields
    ``['cli', 'web-framework']``.

    Args:
        row: Mapping-like record (for example a pandas Series) that has a
            ``'topics'`` entry. It is modified in place.

    Returns:
        The same ``row`` object, with ``row['topic']`` set to the list of
        extracted topic names, or to the string ``'NOTOPIC'`` when no topic
        was found.
    """
    topics = row['topics']
    # A missing value (e.g. NaN read from an empty CSV cell) is not a string
    # and would make the regex call raise TypeError; treat it as "no topics".
    found = re.findall(r"'([-\w]+)'", topics) if isinstance(topics, str) else []
    row['topic'] = found if found else 'NOTOPIC'
    return row

def expand(file):
    """Load a repository CSV and return it with one row per topic entry.

    The ``created_at``, ``updated_at`` and ``pushed_at`` columns are parsed
    as dates. ``split_by_topics`` is applied to every row to fill a
    ``topic`` column, which is then exploded so that each element of a
    row's topic list gets a row of its own.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The exploded DataFrame.
    """
    frame = pd.read_csv(
        file,
        parse_dates=['created_at', 'updated_at', 'pushed_at'],
    )
    with_topics = frame.apply(split_by_topics, axis=1)
    return with_topics.explode(['topic'])

# Load the candidate repositories, expanded to one row per topic entry.
df = expand("repo-candidates/main.csv")

# Per-topic totals, topics with the most repositories first.
# NOTE: the name `sum` shadows the builtin; it is kept because later cells
# read this table under that name.
sum = df.groupby("topic").agg(
    repositories=("id", "count"),
    stars=("stargazers_count", "sum"),
    forks=("forks_count", "sum"),  # column was misspelled "folks"
    watchers=("watchers_count", "sum"),
).sort_values("repositories", ascending=False)

# Per-year, per-topic totals, keyed by the year taken from `created_at`.
df['year'] = df['created_at'].dt.year
sum2 = df.groupby(["year", "topic"]).agg(
    topic_members=("id", "count"),
    topic_stars=("stargazers_count", "sum"),
    topic_forks=("forks_count", "sum"),
)

# Keep only (year, topic) groups with more than 15 members and turn both
# index levels back into ordinary columns (year first, then topic). Doing it
# in one expression avoids mutating the query result in place.
df_vis = sum2.query("topic_members > 15").reset_index()


In [None]:
# Descriptive statistics of the per-topic table `sum`, with the 95th and 99th
# percentiles added to the quartiles, cast to int32.
stats = sum.describe(
    percentiles=[0.25, 0.5, 0.75, 0.95, 0.99],
).astype('int32')

In [None]:
# Write the summary statistics to a LaTeX table file.
stats.to_latex(buf="__stats.tex")

In [None]:
# Write the first 20 rows of `sum` that have more than 2 repositories to a
# LaTeX table file.
(
    sum.query("repositories > 2")
    .head(20)
    .to_latex(buf="__top20.tex")
)

### Prepare dataframe for treemap and sunburst

In [None]:
# Drop NOTOPIC, go and golang, as they overlap too much.
df_vis = df_vis.query("topic != 'NOTOPIC' and topic != 'go' and topic != 'golang'")

# Treemap with the hierarchy all -> year -> topic; tiles are sized by
# `topic_members` and colored by `topic_forks`.
# (px.colors.sequential.Sunset was the alternative palette tried here.)
fig = px.treemap(
    df_vis,
    path=[px.Constant("all"), 'year', 'topic'],
    values='topic_members',
    color='topic_forks',
    color_continuous_scale=px.colors.sequential.Sunsetdark,
    title="Github Golang Topic Distribution by Year",
    width=1200,
    height=820,
)
fig.update_layout(margin={"t": 20, "l": 25, "r": 25, "b": 25})
fig.show()

In [None]:
# Restrict the sunburst to (year, topic) groups with more than 59 members.
df_vis2 = df_vis.query("topic_members > 59")

# Sunburst with the hierarchy all -> year -> topic; sectors are sized by
# `topic_members` and colored by `topic_forks`.
# (px.colors.sequential.Sunset was the alternative palette tried here.)
fig = px.sunburst(
    df_vis2,
    path=[px.Constant("all"), 'year', 'topic'],
    values='topic_members',
    color='topic_forks',
    color_continuous_scale=px.colors.sequential.Sunsetdark,
    title="Github Golang Topic Distribution by Year",
    width=1200,
    height=820,
)
fig.update_layout(margin={"t": 20, "l": 25, "r": 25, "b": 25})
fig.show()