# GitHub analysis

In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import tol_colors as tc
from datetime import datetime
from datetime import timezone
from matplotlib import pyplot as plt

## Engagement numbers

In [None]:
stars_df = pd.read_csv("../data/analysis/stars.csv", index_col=0)
stars_df.head()

In [None]:
bins = np.hstack([0, 10**(np.arange(0,6))])
counts, bins, _ = plt.hist(stars_df.groupby("github_user_cleaned_url").count()["user"], bins=bins, ec="black", alpha=0.7)
plt.xscale('log')

In [None]:
print(counts)
print(bins)

In [None]:
forks_df = pd.read_csv("../data/analysis/forks.csv", index_col=0)
forks_df.head()

In [None]:
bins = np.hstack([0, 10**(np.arange(0,5))])
counts, bins, _ = plt.hist(forks_df.groupby("github_user_cleaned_url").count()["user"], bins = bins, ec="black", alpha=0.7)
plt.xscale('log')

In [None]:
fork_counts = forks_df.groupby("github_user_cleaned_url")["user"].count()
fork_counts.rename("forks_no", inplace=True)
star_counts = stars_df.groupby("github_user_cleaned_url")["user"].count()
star_counts.rename("stars_no", inplace=True)
engagement = pd.merge(left=fork_counts, right=star_counts, on="github_user_cleaned_url")
engagement.head()

In [None]:
plt.scatter(engagement["forks_no"], engagement["stars_no"], alpha=0.5)
plt.xlabel("Number of forks")
plt.ylabel("Number of stars")
plt.xscale('log')
plt.yscale('log')

In [None]:
cell_text = [
    [f"{fork_counts.mean():.2f}", f"{fork_counts.std():.2f}", f"{fork_counts.median():.2f}", f"{fork_counts.min():.2f}", f"{fork_counts.max():.2f}"],
    [f"{star_counts.mean():.2f}", f"{star_counts.std():.2f}", f"{star_counts.median():.2f}", f"{star_counts.min():.2f}", f"{star_counts.max():.2f}"]
    ]
fig, ax = plt.subplots()

# hide axes
#fig.patch.set_visible(False)
ax.axis('off')
ax.axis('tight')

table = plt.table(cellText=cell_text,
                  rowLabels=["forks", "stars"],
                  colLabels=["mean", "std", "median", "min", "max"])

fig.tight_layout()

## Timelines

In [None]:
def plot_against_days_since_creation(df, column_name, label):
    """Plot the number of events per day since repository creation.

    Draws three bar charts side by side: the complete timeline, the first
    year (days 0-364) and the first month (days 0-30).

    Args:
        df (pd.DataFrame): input dataframe; needs ``column_name`` and a
            ``github_user_cleaned_url`` column, whose non-null entries are
            counted per day
        column_name (str): name of column with days since creation info
        label (str): what number we are looking at (used in the y-axis label)
    """
    counted = df.groupby(column_name)["github_user_cleaned_url"].count()
    days = counted.index
    # Select the zoomed-in windows by day value, not by position: the grouped
    # index only contains days on which something happened, so gaps (or
    # negative offsets) would make a positional ``[:365]`` cover a different
    # period than the first year.
    panels = [
        ("Complete timeline across all repositories", counted),
        ("First year", counted[(days >= 0) & (days < 365)]),
        ("First month", counted[(days >= 0) & (days < 31)]),
    ]

    plt.figure(figsize=(20, 4))
    for position, (title, window) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        plt.bar(window.index, window.to_numpy())
        plt.xlabel("days since repository creation")
        plt.ylabel(f"number of {label} on that day")
        plt.title(title)

    plt.show()

In [None]:
def plot_against_weeks_since_creation(df, column_name, label):
    """Plot the number of events per week since repository creation.

    Draws three bar charts side by side: the complete timeline, the first
    year (weeks 0-51) and the first month (weeks 0-3). The input dataframe
    is not modified.

    Args:
        df (pd.DataFrame): input dataframe; needs ``column_name`` and a
            ``github_user_cleaned_url`` column, whose non-null entries are
            counted per week
        column_name (str): name of column with days since creation info
        label (str): what number we are looking at (used in the y-axis label)
    """
    # Group on a derived key instead of writing a helper column into ``df``:
    # callers pass filtered slices, and mutating those triggers pandas'
    # SettingWithCopyWarning.
    week_of_event = (df[column_name] // 7).rename(f"weekly_{column_name}")
    counted = df["github_user_cleaned_url"].groupby(week_of_event).count()
    weeks = counted.index
    # Select the zoomed-in windows by week value, not by position: weeks
    # without any event are missing from the grouped index.
    panels = [
        ("Complete timeline across all repositories", counted),
        ("First year", counted[(weeks >= 0) & (weeks < 52)]),
        ("First month", counted[(weeks >= 0) & (weeks < 4)]),
    ]

    plt.figure(figsize=(20, 4))
    for position, (title, window) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        plt.bar(window.index, window.to_numpy())
        plt.xlabel("weeks since repository creation")
        plt.ylabel(f"number of {label} in that week")
        plt.title(title)

    plt.show()

In [None]:
data_dir = "../data/analysis"

In [None]:
metadata = pd.read_csv(os.path.join(data_dir, "metadata.csv"), index_col=0)
metadata.created_at = pd.to_datetime(metadata.created_at, utc=True)
metadata.rename(columns={"created_at": "repo_created_at"}, inplace=True)
stars = pd.read_csv(os.path.join(data_dir, "stars.csv"), index_col=0)
stars.date = pd.to_datetime(stars.date, utc=True)

In [None]:
df = pd.merge(metadata, stars, on="github_user_cleaned_url")
df.dropna(subset=["date"], inplace=True)
df["starred_on_day_since_creation"] = (df.date - df.repo_created_at).dt.days

In [None]:
plot_against_days_since_creation(df, "starred_on_day_since_creation", "stars")

In [None]:
plot_against_weeks_since_creation(df, "starred_on_day_since_creation", "stars")

In [None]:
forks = pd.read_csv(os.path.join(data_dir, "forks.csv"), index_col=0)
forks.date = pd.to_datetime(forks.date, utc=True)
df = pd.merge(metadata, forks, on="github_user_cleaned_url")
df.dropna(subset=["date"], inplace=True)
df["forked_on_day_since_creation"] = (df.date - df.repo_created_at).dt.days
plot_against_days_since_creation(df, "forked_on_day_since_creation", "forks")

In [None]:
plot_against_weeks_since_creation(df, "forked_on_day_since_creation", "forks")

In [None]:
issues = pd.read_csv(os.path.join(data_dir, "issues.csv"), index_col=0)
issues.created_at = pd.to_datetime(issues.created_at, utc=True)
issues.closed_at = pd.to_datetime(issues.closed_at, utc=True)
df = pd.merge(metadata, issues, on="github_user_cleaned_url")
df.dropna(subset=["created_at"], inplace=True)
df["issue_opened_on_day_since_creation"] = (df.created_at - df.repo_created_at).dt.days
df = df[df.issue_opened_on_day_since_creation >= 0]
plot_against_days_since_creation(df, "issue_opened_on_day_since_creation", "issues opened")

In [None]:
plot_against_weeks_since_creation(df, "issue_opened_on_day_since_creation", "issues opened")

## file presence

In [None]:
data_dir = "../data/analysis/"

In [None]:
contents = pd.read_csv(os.path.join(data_dir, "contents.csv"), index_col=0)
contents.head()

In [None]:
contents[pd.notna(contents.citation_added) == True]

In [None]:
pd.notna(contents.contributing_added).value_counts()

In [None]:
contents.loc[pd.notna(contents.contributing_added)]

In [None]:
contents[contents.readme_emojis > 0]

In [None]:
contents_df = pd.merge(metadata, contents, on="github_user_cleaned_url")
# The timeline cells rename ``metadata.created_at`` to ``repo_created_at``;
# accept either name so this cell works regardless of which version of
# ``metadata`` is currently loaded.
creation_col = "repo_created_at" if "repo_created_at" in contents_df.columns else "created_at"
repo_created = pd.to_datetime(contents_df[creation_col], utc=True)
contents_df["citation_added"] = pd.to_datetime(contents_df.citation_added, utc=True)
# Replace the timestamp by the week (since repository creation) in which the
# citation file was added.
contents_df["citation_added"] = (contents_df.citation_added - repo_created).dt.days // 7

In [None]:
contents_df

In [None]:
# One scatter series per license so that the legend can tell them apart.
# Missing licenses are labelled "None" for the comparison only (NaN never
# compares equal to itself); ``contents`` itself is left untouched here.
license_labels = contents.license.fillna("None")
for license_name in license_labels.unique():
    subset = contents[license_labels == license_name]
    plt.scatter(subset.readme_size, subset.contributing_size, alpha=0.3, marker='.', label=license_name)
plt.xlabel("Size of README file")
plt.ylabel("Size of CONTRIBUTING.md")
plt.xlim(-1000, 30000)
plt.ylim(-10, 300)
plt.legend()
plt.show()

In [None]:
contents.license = contents.license.fillna('None')
contents.license.value_counts().plot(kind='bar')

In [None]:
# License keys that are counted as "permissive" when deriving ``license_type``.
# NOTE(review): "gpl-3.0" and "gpl-2.0" are copyleft licenses, which the page
# linked below contrasts with permissive ones - confirm whether they belong here.
permissive_licenses = ["mit", "gpl-3.0", "apache-2.0", "bsd-3-clause", "gpl-2.0", "bsd-2-clause"] # https://en.wikipedia.org/wiki/Permissive_software_license

In [None]:
contents.license = contents.license.fillna('None')
contents["license_type"] = np.where(contents.license.isin(permissive_licenses), "permissive", np.where(contents.license == "None", "None", np.where(contents.license == "other", "unknown", "non-permissive")))

In [None]:
contents.license_type.value_counts().plot(kind='bar')

In [None]:
contents.github_user_cleaned_url[contents.license == "other"]

In [None]:
contents.github_user_cleaned_url[contents.license == "bsd-3-clause"]

In [None]:
contents.license = contents.license.fillna('None')
contents.plot(x="license", y="readme_size", kind="scatter", alpha=0.3, rot=50)

### README size

In [None]:
readme_size_series = contents.set_index("github_user_cleaned_url").readme_size

In [None]:
# Byte-size thresholds and the category name of the bin that starts at each one.
bins = [0, 1, 300, 1500, 10000]
binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
# "detailed" is open-ended: it only exists if some README exceeds the last threshold.
if readme_size_series.max() > bins[-1]:
    bins.append(readme_size_series.max())
counts, bins = np.histogram(readme_size_series, bins)
# np.histogram bins are half-open, except the last one, which is closed.
# Name the last bin by its own position: without a README above the last
# threshold it is the "informative" bin, not the "detailed" one.
last = len(bins) - 2
binlabels = [f"{binmeanings[i]}\n[{bins[i]} - {bins[i+1]})" for i in range(last)]
binlabels += [f"{binmeanings[last]}\n[{bins[-2]} - {bins[-1]}]"]
fig, ax = plt.subplots(ncols=1, nrows=1)
ax.bar(binlabels, counts)
ax.bar_label(ax.containers[0])
ax.tick_params(axis='x', labelrotation=45)
ax.set(xlabel="size of README in Bytes", ylabel="repository count")

In [None]:
# Same binning as the bar chart, shown as shares of all repositories.
bins = [0, 1, 300, 1500, 10000]
binmeanings = ["none", "ultra-short", "short", "informative", "detailed"]
# "detailed" is open-ended: it only exists if some README exceeds the last threshold.
if readme_size_series.max() > bins[-1]:
    bins.append(readme_size_series.max())
counts, bins = np.histogram(readme_size_series, bins)
# Name the last bin by its own position: without a README above the last
# threshold it is the "informative" bin, not the "detailed" one.
last = len(bins) - 2
binlabels = [f"{binmeanings[i]}\n[{bins[i]} - {bins[i+1]})" for i in range(last)]
binlabels += [f"{binmeanings[last]}\n[{bins[-2]} - {bins[-1]}]"]
fig, ax = plt.subplots(ncols=1, nrows=1)
ax.pie(counts, labels=binlabels)

In [None]:
bins = [0, 1, 300, 1500, 10000]
if readme_size_series.max() > bins[-1]:
    bins.append(readme_size_series.max())
lower = bins[0]
for upper in bins[1:]:
    tmp = readme_size_series[readme_size_series.between(lower, upper)]
    tmp = tmp.sort_values()
    samples_low = tmp.iloc[:3]
    samples_high = tmp.iloc[-3:]
    print((lower, upper))
    print("lower:")
    print(samples_low)
    print("higher:")
    print(samples_high)
    print()
    lower = upper


In [None]:
series = readme_size_series[readme_size_series.between(6000, 15000)]
plt.scatter(x=series, y=[1]*len(series), alpha=0.5)

### connect with engagement numbers

In [None]:
forks = pd.read_csv(os.path.join(data_dir, "forks.csv"))

In [None]:
forks_count = forks.groupby("github_user_cleaned_url").date.count().rename("no_forks")

In [None]:
df = pd.merge(contents, forks_count, left_on="github_user_cleaned_url", right_index=True)

In [None]:
df.plot(
    kind="scatter",
    x="license_type",
    y="no_forks",
    alpha=0.5
)

## Team size


In [None]:
metadata = pd.read_csv(os.path.join(data_dir, "metadata.csv"), index_col=0)
metadata["created_at"] = pd.to_datetime(metadata.created_at)
contributions = pd.read_csv(os.path.join(data_dir, "contributions.csv"), index_col=0)
contributions["week_co"] = pd.to_datetime(contributions.week_co)

In [None]:
contrib_df = pd.merge(metadata[["github_user_cleaned_url", "created_at"]], contributions)
contrib_df["week_since_repo_creation"] = (contrib_df.week_co - contrib_df.created_at).dt.days // 7
index_levels = ["github_user_cleaned_url", "author", "week_since_repo_creation"]
team_df = contrib_df[index_levels + ["commits"]].set_index(index_levels).sort_index()
# user is active contributor if made at least one commit in last 12 weeks
# (the window counts rows - assumes one row per author and week, TODO confirm).
# Roll per (repository, author): grouping by author alone lets the window run
# across repository boundaries for authors who contribute to several repos.
group_levels = ["github_user_cleaned_url", "author"]
windowed_team_df = (
    team_df.groupby(level=group_levels)
    .rolling(window=12, min_periods=0)
    .sum()
    # groupby().rolling() prepends the group keys to the index; drop them again
    .droplevel(list(range(len(group_levels))))
)
windowed_team_df["active contributors"] = windowed_team_df.commits > 0
# team size
team_size = windowed_team_df.groupby(level=["github_user_cleaned_url", "week_since_repo_creation"])["active contributors"].value_counts()[:,:,True]
# Pin the name: later cells read this as the "active contributors" column,
# but pandas 2.0 names value_counts() results "count".
max_team_size = team_size.groupby(level="github_user_cleaned_url").max().rename("active contributors")
max_team_size

In [None]:
ax = max_team_size.hist()
ax.bar_label(ax.containers[0])
plt.show()

### team size vs license

In [None]:
# Reload the per-repository contents table and derive a coarse license category.
contents = pd.read_csv(os.path.join(data_dir, "contents.csv"), index_col=0)
# NOTE(review): "gpl-3.0" and "gpl-2.0" are copyleft licenses, which the page
# linked below contrasts with permissive ones - confirm whether they belong here.
permissive_licenses = ["mit", "gpl-3.0", "apache-2.0", "bsd-3-clause", "gpl-2.0", "bsd-2-clause"] # https://en.wikipedia.org/wiki/Permissive_software_license
# Missing licenses become the literal string 'None' so they can be matched below.
contents.license = contents.license.fillna('None')
# Categories: "permissive" (listed above), "None" (no license), "unknown"
# (license == "other") and "non-permissive" (everything else).
contents["license_type"] = np.where(
    contents.license.isin(permissive_licenses), "permissive", np.where(
    contents.license == "None", "None", np.where(
    contents.license == "other", "unknown", "non-permissive")))

In [None]:
df = pd.merge(max_team_size, contents[["github_user_cleaned_url", "license_type", "license"]], left_index=True, right_on="github_user_cleaned_url")

In [None]:
n = len(df.license_type.unique())
fig, axs = plt.subplots(ncols=n, nrows=1, figsize=(6*n, 6))
for i, lt in enumerate(df.license_type.unique()):
    ax = axs[i]
    bins = [1, 2, 5, 10]
    tmp = df[df["license_type"] == lt]
    if tmp["active contributors"].max() > bins[-1]:
        bins.append(tmp["active contributors"].max())
    counts, bins = np.histogram(tmp["active contributors"], bins)
    binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
    binlabels += [f"[{bins[-2]} - {bins[-1]}]"]
    ax.bar(binlabels, counts)
    ax.bar_label(ax.containers[0])
    ax.set(xlabel="maximum team size", ylabel="repository count", title=lt)
plt.suptitle("Team size per license type")
plt.show()

In [None]:
n = len(df.license_type.unique())
fig, axs = plt.subplots(ncols=n, nrows=1, figsize=(6*n, 6))
for i, lt in enumerate(df.license_type.unique()):
    ax = axs[i]
    bins = [1, 2, 5, 10]
    tmp = df[df["license_type"] == lt]
    if tmp["active contributors"].max() > bins[-1]:
        bins.append(tmp["active contributors"].max())
    counts, bins = np.histogram(tmp["active contributors"], bins)
    binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
    binlabels += [f"[{bins[-2]} - {bins[-1]}]"]
    ax.pie(x=counts, labels=binlabels, autopct='%1.1f%%')
    #ax.bar_label(ax.containers[0])
    #ax.set(xlabel="maximum team size", ylabel="repository count", title=lt)
    ax.set(title=f"{lt} ({len(tmp)} repos)")
plt.suptitle("Maximum team size per license type")
plt.savefig("../data/analysis/overall/team_size_per_license_type.png")
plt.show()

In [None]:
bins = [0, 1, 2, 5, 10]
if df["active contributors"].max() > bins[-1]:
    bins.append(df["active contributors"].max())
counts, bins = np.histogram(df["active contributors"], bins)
binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
binlabels += [f"[{bins[-2]} - {bins[-1]}]"]

n = len(bins)-2
fig, axs = plt.subplots(ncols=n, nrows=1, figsize=(6*n, 6))
lower=bins[1]
# iterate
for i, upper in enumerate(bins[2:-1]):
    ax = axs[i]
    tmp = df[(df["active contributors"] >= lower) & (df["active contributors"] < upper)]
    tmp.license_type.value_counts().sort_index().plot(
        kind='bar',
        ax=ax,
        xlabel="license type",
        ylabel="repository count",
        title="team size "+binlabels[1+i]
    )
    lower = upper
# add last with inclusive upper limit
ax = axs[-1]
upper = bins[-1]
tmp = df[(df["active contributors"] >= lower) & (df["active contributors"] <= upper)]
tmp.license_type.value_counts().sort_index().plot(
    kind='bar',
    ax=ax,
    xlabel="license type",
    ylabel="repository count",
    title="team size "+binlabels[-1]
)
plt.suptitle("License type per team size")
plt.show()

In [None]:
bins = [0, 1, 2, 5, 10]
if df["active contributors"].max() > bins[-1]:
    bins.append(df["active contributors"].max())
counts, bins = np.histogram(df["active contributors"], bins)
binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
binlabels += [f"[{bins[-2]} - {bins[-1]}]"]

n = len(bins)-2
fig, axs = plt.subplots(ncols=n, nrows=1, figsize=(6*n, 6))
lower=bins[1]
# iterate
for i, upper in enumerate(bins[2:-1]):
    ax = axs[i]
    tmp = df[(df["active contributors"] >= lower) & (df["active contributors"] < upper)]
    tmp.license.value_counts().sort_index().plot(
        kind='bar',
        ax=ax,
        xlabel="license",
        ylabel="repository count",
        title="team size "+binlabels[1+i]
    )
    lower = upper
# add last with inclusive upper limit
ax = axs[-1]
upper = bins[-1]
tmp = df[(df["active contributors"] >= lower) & (df["active contributors"] <= upper)]
tmp.license.value_counts().sort_index().plot(
    kind='bar',
    ax=ax,
    xlabel="license",
    ylabel="repository count",
    title="team size "+binlabels[-1]
)
plt.suptitle("License type per team size")
plt.show()

In [None]:
bins = [0, 1, 2, 5, 10]
if df["active contributors"].max() > bins[-1]:
    bins.append(df["active contributors"].max())
counts, bins = np.histogram(df["active contributors"], bins)
binlabels = [f"[{bins[i]} - {bins[i+1]})" for i in range(len(bins)-2)]
binlabels += [f"[{bins[-2]} - {bins[-1]}]"]

n = len(bins)-2
fig, axs = plt.subplots(ncols=n, nrows=1, figsize=(6*n, 6))
lower=bins[1]
# iterate
for i, upper in enumerate(bins[2:-1]):
    ax = axs[i]
    tmp = df[(df["active contributors"] >= lower) & (df["active contributors"] < upper)]
    tmp.license_type.value_counts().sort_index().plot(
        kind='pie',
        ax=ax,
        #xlabel="license type",
        ylabel="",
        title=f"team size {binlabels[1+i]} ({len(tmp)} repos)",
        autopct='%1.1f%%'
    )
    lower = upper
# add last with inclusive upper limit
ax = axs[-1]
upper = bins[-1]
tmp = df[(df["active contributors"] >= lower) & (df["active contributors"] <= upper)]
tmp.license_type.value_counts().sort_index().plot(
    kind='pie',
    ax=ax,
    #xlabel="license type",
    ylabel="",
    title=f"team size {binlabels[-1]} ({len(tmp)} repos)",
    autopct='%1.1f%%'
)
plt.suptitle("License type per team size")
plt.savefig("../data/analysis/overall/license_type_per_team_size.png")
plt.show()

## README headlines

In [None]:
readme_df = pd.read_csv(os.path.join(data_dir, "readme_history.csv"), index_col=0)
readme_df.head()

In [None]:
readme_df[readme_df.added_headings.str.contains("Neo4J 2.0.1", na=False)]

In [None]:
import re
import ast

# Markdown link "[text](url)"; group 1 captures the link text.
# Raw string: "\[" and "\(" are invalid escape sequences in a normal literal.
pattern = r"\[(.+?)\]\(.+?\)"
text = 'COVID-19 image data collection ([🎬 video about the project](https://www.youtube.com/watch?v=ineWmqfelEQ))'
# Replace every link by its text (count=0 means "all occurrences").
re.sub(pattern, r'\1', text, count=0)

In [None]:
import string
import re

def clean_heading(h):
    """Normalise a README heading for counting and comparison.

    Strips leading numbering (digits, whitespace, "." and ":"), replaces
    markdown links ``[text](url)`` by their link text, strips leading and
    trailing punctuation and lower-cases the result. Punctuation inside the
    heading is kept.

    Args:
        h (str): raw heading text

    Returns:
        str: the cleaned heading
    """
    h = h.lstrip(string.digits + string.whitespace + ".:")
    # Raw string: "\[" and "\(" are invalid escape sequences in a normal literal.
    h = re.sub(r"\[(.+?)\]\(.+?\)", r"\1", h)
    # The former ``h.replace(string.punctuation, "")`` only matched the whole
    # punctuation alphabet as one substring, i.e. it never changed anything;
    # it is dropped so that the code states what actually happens.
    return h.strip(string.punctuation).lower()
    

In [None]:
import ast

headings = []
for l in readme_df.added_headings.dropna():
    headings += ast.literal_eval(l)
headings = [clean_heading(h) for h in headings]

In [None]:
readme_df.added_cites[(readme_df.added_cites != "[]") & (readme_df.added_cites.notna())]

### word cloud

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
stopwords = STOPWORDS
custom = set(["trades", "glosat", "glosat_table_dataset", "nilmtk", "bert", "lemon", "cascadetabnet"])
stopwords = stopwords.union(custom)

In [None]:
wordcloud = WordCloud(
    collocation_threshold=15,
    stopwords=stopwords,
    scale=10,
    background_color="white",
    random_state=42
    ).generate(" ".join(headings))

In [None]:
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

### timelines

In [None]:
def analyse_headings(df):
    """Flag rows whose added headings mention ownership or usage keywords.

    Adds two boolean columns to ``df`` (in place): ``ownership_addition`` and
    ``usage_addition``. A row is flagged when its ``added_headings`` string
    contains one of the respective keywords, ignoring case; rows without
    headings are flagged ``False``.

    Args:
        df (pd.DataFrame): dataframe with a string column ``added_headings``

    Returns:
        pd.DataFrame: the same dataframe with the two columns added
    """
    interesting_words = {
        "ownership": ["license", "example", "reference", "citation", "cited", "publication", "paper"],
        "usage": ["requirements", "using", "example", "usage", "run", "install", "installing", "installation", "tutorial", "tutorials", "build", "guide", "documentation"]
    }
    for topic, words in interesting_words.items():
        # na=False keeps the result a pure boolean mask (NaN entries cannot be
        # used for boolean indexing); case=False because headings are stored
        # with their original capitalisation.
        df[f"{topic}_addition"] = df.added_headings.str.contains("|".join(words), case=False, na=False)
    return df

In [None]:
readme_df.author_date = pd.to_datetime(readme_df.author_date, utc=True)
df = pd.merge(metadata, readme_df, on="github_user_cleaned_url")
df.dropna(subset=["author_date"], inplace=True)
# ``metadata`` is re-read in the team-size section with the creation date
# still called ``created_at``; accept either name, and make the timestamps
# timezone-aware so they can be subtracted from ``author_date``.
creation_col = "repo_created_at" if "repo_created_at" in df.columns else "created_at"
repo_created = pd.to_datetime(df[creation_col], utc=True)
df["authored_on_day_since_creation"] = (df.author_date - repo_created).dt.days

In [None]:
df = analyse_headings(df)

In [None]:
df[df["ownership_addition"] | df["usage_addition"]].head()

## example: esbmc/esbmc

In [None]:
def load_data(data_dir, filename, repo, to_datetime=None):
    """Load one analysis CSV and keep only the rows of a single repository.

    Args:
        data_dir (str): directory that contains the CSV file
        filename (str): name of the CSV file; its first column is used as index
        repo (str): value of ``github_user_cleaned_url`` to keep
        to_datetime (str | list[str] | None): name(s) of the column(s) to
            parse as UTC timestamps; ``None`` leaves all columns as read

    Returns:
        pd.DataFrame: the rows belonging to ``repo``
    """
    df = pd.read_csv(os.path.join(data_dir, filename), index_col=0)
    # Copy the selection so the column assignments below act on an
    # independent frame instead of a slice of the full table.
    df = df[df["github_user_cleaned_url"] == repo].copy()
    if to_datetime is None:
        columns = []
    elif isinstance(to_datetime, str):
        columns = [to_datetime]
    else:
        columns = list(to_datetime)
    for column in columns:
        df[column] = pd.to_datetime(df[column], utc=True)
    return df

In [None]:
data_dir = "../data/analysis"
repo = "esbmc/esbmc"
contents = load_data(data_dir, "contents.csv", repo, ["citation_added", "contributing_added"])
contributions = load_data(data_dir, "contributions.csv", repo, "week_co")
forks = load_data(data_dir, "forks.csv", repo, "date")
issues = load_data(data_dir, "issues.csv", repo, ["created_at", "closed_at"])
metadata = load_data(data_dir, "metadata.csv", repo, "created_at")
readme_history = load_data(data_dir, "readme_history.csv", repo, "author_date")
stars = load_data(data_dir, "stars.csv", repo, "date")

### User type wrt. issues

In [None]:
end = (datetime.now(tz=timezone.utc) - metadata.created_at.iloc[0]).days // 7
x_data = pd.Series(np.arange(end), name="week_since_repo_creation")

In [None]:
merged_df = pd.merge(issues, metadata, on="github_user_cleaned_url", suffixes=(None,"_repo"))

In [None]:
merged_df["created_at"] = (merged_df["created_at"] - merged_df["created_at_repo"]).dt.days // 7
merged_df["closed_at"] = (merged_df["closed_at"] - merged_df["created_at_repo"]).dt.days // 7

In [None]:
created = merged_df.groupby(["user", "created_at"])["state"].count().rename("created_count")
created.index.rename({"created_at": "week_since_repo_creation"}, inplace=True)

In [None]:
closed = merged_df.groupby(["closed_by", "closed_at"])["state"].count().rename("closed_count")
closed.index.rename({"closed_at": "week_since_repo_creation", "closed_by": "user"}, inplace=True)

In [None]:
issues_by_user = pd.merge(created, closed, left_index=True, right_index=True, how="outer")

In [None]:
df = pd.merge(x_data, pd.Series(issues_by_user.index.unique(level="user")), how="cross").set_index(["user", "week_since_repo_creation"])
df = pd.merge(df, issues_by_user, left_index=True, right_index=True, how="outer")
df.fillna(0, inplace=True)

In [None]:
df

In [None]:
windowed_df = df.groupby(level="user").rolling(window=12, min_periods=0).sum().droplevel(0)

In [None]:
idx = pd.IndexSlice
windowed_df.loc[idx[:, 0], :]

In [None]:
conditions = [(windowed_df.created_count > 0) & (windowed_df.closed_count == 0), (windowed_df.created_count == 0) & (windowed_df.closed_count > 0), (windowed_df.created_count > 0) & (windowed_df.closed_count > 0)]
choices = ["opening", "closing", "both"]
windowed_df["status"] = np.select(conditions, choices, default="inactive")

In [None]:
cset_light = tc.tol_cset('light')
cset_light

In [None]:
# ``plt.cm.register_cmap`` was deprecated in matplotlib 3.7 and removed in 3.9;
# use the colormap registry where available and fall back to the old call on
# versions that predate it.
import matplotlib

rainbow_12 = tc.tol_cmap('rainbow_discrete', 12)
if hasattr(matplotlib, "colormaps"):
    # The registry refuses duplicate names, so skip when the cell is re-run.
    if 'rainbow_discrete_12' not in matplotlib.colormaps:
        matplotlib.colormaps.register(rainbow_12, name='rainbow_discrete_12')
else:
    plt.cm.register_cmap('rainbow_discrete_12', rainbow_12)

In [None]:
palette = sns.color_palette("rainbow_discrete_12", n_colors=12)
palette

In [None]:
palette_short = [palette[i] for i in range(len(palette)) if i in [1, 4, 6, 9]]

In [None]:
fig, ax = plt.subplots(figsize=(20, 4))
sns.scatterplot(
    ax=ax,
    data=windowed_df,
    x="week_since_repo_creation",
    y="user",
    hue="status",
    hue_order=["inactive", "opening", "closing", "both"],
    palette=palette_short,
    marker="|",
    s=500,
    )
left, right = ax.get_xlim()
ax.set_xlim(left=0, right=right+10)
plt.show()

### Team size

In [None]:
contrib_df = pd.merge(metadata[["github_user_cleaned_url", "created_at"]], contributions)
contrib_df["week_since_repo_creation"] = (contrib_df.week_co - contrib_df.created_at).dt.days // 7

In [None]:
team_df = contrib_df[["author", "week_since_repo_creation", "commits"]].set_index(["author", "week_since_repo_creation"]).sort_index()

In [None]:
windowed_team_df = team_df.groupby(level="author").rolling(window=12, min_periods=0).sum().droplevel(0)

In [None]:
windowed_team_df["active contributor"] = windowed_team_df.commits > 0

In [None]:
fig, ax = plt.subplots(figsize=(20, 4))
sns.scatterplot(
    ax=ax,
    data=windowed_team_df,
    x="week_since_repo_creation",
    y="author",
    hue="active contributor",
    hue_order=[False, True],
    palette=['#d62728', '#2ca02c'],
    marker="|",
    s=500,
    )
left, right = ax.get_xlim()
ax.set_ylabel("user")
ax.set_xlim(left=0, right=right+10)
plt.show()

In [None]:
team_size = windowed_team_df.groupby(level="week_since_repo_creation")["active contributor"].value_counts()[:,True].reindex(windowed_team_df.index.levels[1], fill_value=0)

In [None]:
team_size.plot(
    figsize=(20, 4),
    xlabel="week since repo creation",
    ylabel="contributor team size",
)
plt.show()

In [None]:
became_active = team_df.groupby(level="author").cumsum()
became_active["contributor"] = became_active.commits > 0
became_active

In [None]:
team_growth = became_active.groupby(level="week_since_repo_creation")["contributor"].value_counts()[:,True].reindex(became_active.index.levels[1], fill_value=0)
#windowed_team_df["active contributor"] = windowed_team_df.commits > 0

In [None]:
team_growth.plot(
    figsize=(20, 4),
    xlabel="week since repo creation",
    ylabel="contributor team size",
)
plt.show()

### Issues opened and closed

In [None]:
issues_timeline_df = pd.merge(metadata, issues, on="github_user_cleaned_url", suffixes=("_repo", None))

In [None]:
issues_timeline_df["opened_in_week_since_repo_creation"] = (issues_timeline_df.created_at - issues_timeline_df.created_at_repo).dt.days // 7
issues_timeline_df["closed_in_week_since_repo_creation"] = (issues_timeline_df.closed_at - issues_timeline_df.created_at_repo).dt.days // 7

In [None]:
issues_timeline_df

In [None]:
end = (datetime.now(tz=timezone.utc) - metadata.created_at.iloc[0]).days // 7
x_data = pd.Series(np.arange(end), name="week_since_repo_creation")

In [None]:
issue_count_timeline = pd.DataFrame(x_data)
issue_count_timeline["open_issues_count"] = issue_count_timeline.apply(lambda x: len(issues_timeline_df[
                                                                                        (issues_timeline_df.opened_in_week_since_repo_creation <= x.week_since_repo_creation) &
                                                                                        ((issues_timeline_df.closed_in_week_since_repo_creation >= x.week_since_repo_creation) |
                                                                                         (issues_timeline_df.closed_in_week_since_repo_creation.isna()))
                                                                                        ]), axis=1)
issue_count_timeline["closed_issues_count"] = issue_count_timeline.apply(lambda x: len(issues_timeline_df[
                                                                                        (issues_timeline_df.closed_in_week_since_repo_creation < x.week_since_repo_creation)
                                                                                        ]), axis=1)

In [None]:
issue_count_timeline.rename(columns={"open_issues_count": "open issues", "closed_issues_count": "closed issues"}).plot(
    x="week_since_repo_creation",
    y=["open issues", "closed issues"],
    xlabel="week since repo creation",
    ylabel="count"
    )
plt.show()

In [None]:
open_issues_count_p = np.diff(issue_count_timeline['open_issues_count']) / np.diff(issue_count_timeline['week_since_repo_creation'])
week_since_repo_creation_p = (np.array(issue_count_timeline['week_since_repo_creation'])[:-1] + np.array(issue_count_timeline['week_since_repo_creation'])[1:]) / 2

In [None]:

plt.plot(issue_count_timeline['week_since_repo_creation'], issue_count_timeline['open_issues_count'], label="open issues")
plt.plot(week_since_repo_creation_p, open_issues_count_p, label="diff open issues")
plt.legend()
plt.show()

### Engagement

In [None]:
forks_df = pd.merge(forks, metadata, on="github_user_cleaned_url")
forks_df["week_since_repo_creation"] = (forks_df.date - forks_df.created_at).dt.days // 7
forks_df = forks_df[["week_since_repo_creation", "user"]].groupby("week_since_repo_creation").count().rename(columns={"user": "no_forks"}).sort_index()

In [None]:
stars_df = pd.merge(stars, metadata, on="github_user_cleaned_url")
stars_df["week_since_repo_creation"] = (stars_df.date - stars_df.created_at).dt.days // 7
stars_df = stars_df[["week_since_repo_creation", "user"]].groupby("week_since_repo_creation").count().rename(columns={"user": "no_stars"}).sort_index()

In [None]:
end = (datetime.now(tz=timezone.utc) - metadata.created_at.iloc[0]).days // 7
x_data = pd.Series(np.arange(end), name="week_since_repo_creation")
engagement_df = pd.merge(x_data, forks_df, on="week_since_repo_creation", how="outer")
engagement_df = pd.merge(engagement_df, stars_df, on="week_since_repo_creation", how="outer").fillna(0)
engagement_df = engagement_df.set_index("week_since_repo_creation")

In [None]:
engagement_df = engagement_df.cumsum()

In [None]:
engagement_df.plot()

### Diffs

In [None]:
def get_diff_series(x, y):
    """Approximate the derivative of ``y`` with respect to ``x``.

    Args:
        x: sequence of sample positions
        y: sequence of sample values, same length as ``x``

    Returns:
        tuple: ``(x_p, y_p)`` where ``y_p`` holds the difference quotients of
        neighbouring samples and ``x_p`` the midpoints between the
        corresponding neighbouring positions
    """
    positions = np.array(x)
    midpoints = (positions[:-1] + positions[1:]) / 2
    slopes = np.diff(y) / np.diff(x)
    return (midpoints, slopes)

In [None]:
x_p, y_p = get_diff_series(team_growth.index, team_growth)
plt.plot(x_p, y_p, label="diff team growth")
x_p, y_p = get_diff_series(engagement_df.index, engagement_df["no_stars"])
plt.plot(x_p, y_p, label="diff stars")
x_p, y_p = get_diff_series(engagement_df.index, engagement_df["no_forks"])
plt.plot(x_p, y_p, label="diff forks")
plt.legend()
plt.show()

Note: these difference plots are not useful unless a curve is fitted to the data first.

### Highlights

In [None]:
import string
import re

def clean_heading(h):
    """Normalise a README heading for counting and comparison.

    Strips leading numbering (digits, whitespace, "." and ":"), replaces
    markdown links ``[text](url)`` by their link text, strips leading and
    trailing punctuation and lower-cases the result. Punctuation inside the
    heading is kept.

    Args:
        h (str): raw heading text

    Returns:
        str: the cleaned heading
    """
    h = h.lstrip(string.digits + string.whitespace + ".:")
    # markdown links; raw string because "\[" and "\(" are invalid escape
    # sequences in a normal literal
    h = re.sub(r"\[(.+?)\]\(.+?\)", r"\1", h)
    # The former ``h.replace(string.punctuation, "")`` only matched the whole
    # punctuation alphabet as one substring, i.e. it never changed anything;
    # it is dropped so that the code states what actually happens.
    return h.strip(string.punctuation).lower()

In [None]:
import ast

headings = readme_history.added_headings.dropna().apply(ast.literal_eval).explode().dropna()

In [None]:
headings = headings.apply(clean_heading)

In [None]:
def analyse_headings(df):
    """Flag rows whose added headings mention ownership or usage keywords.

    Adds two boolean columns to ``df`` (in place): ``ownership_addition`` and
    ``usage_addition``. A row is flagged when its ``added_headings`` string
    contains one of the respective keywords, ignoring case; rows without
    headings are flagged ``False``.

    Args:
        df (pd.DataFrame): dataframe with a string column ``added_headings``

    Returns:
        pd.DataFrame: the same dataframe with the two columns added
    """
    interesting_words = {
        "ownership": ["license", "example", "reference", "citation", "cited", "publication", "paper"],
        "usage": ["requirements", "using", "example", "usage", "run", "install", "installing", "installation", "tutorial", "tutorials", "build", "guide", "documentation"]
    }
    for topic, words in interesting_words.items():
        # na=False keeps the result a pure boolean mask: the columns are used
        # for boolean indexing, which rejects NaN entries.
        df[f"{topic}_addition"] = df.added_headings.str.contains("|".join(words), case=False, na=False)
    return df

In [None]:
df = pd.merge(metadata, readme_history, on="github_user_cleaned_url")
df.dropna(subset=["author_date"], inplace=True)
df["authored_in_week_since_creation"] = ((df.author_date - df.created_at).dt.days // 7)#.clip(0)
#df["authored_in_week_since_creation"].clip(0, inplace=True)

In [None]:
df

In [None]:
df = analyse_headings(df)

In [None]:
df

In [None]:
issue_count_timeline.rename(columns={"open_issues_count": "open issues", "closed_issues_count": "closed issues"}).plot(
    x="week_since_repo_creation",
    y=["open issues", "closed issues"],
    xlabel="week since repo creation",
    ylabel="count"
    )
ownership_added = df[df.ownership_addition].authored_in_week_since_creation
plt.scatter(ownership_added, (-2 * np.ones((len(ownership_added),))), marker="v", color="black", label="ownership heading")
usage_added = df[df.usage_addition].authored_in_week_since_creation
plt.scatter(usage_added, (-1 * np.ones((len(usage_added),))), marker="v", color="red", label="usage heading")
plt.legend()

In [None]:
contents.head()

In [None]:
def analyse_headings(df):
    """Flag rows whose added headings mention ownership or usage topics.

    Adds two boolean columns to ``df`` in place, ``ownership_addition`` and
    ``usage_addition``, and returns the same frame. A row is flagged when its
    ``added_headings`` text contains any keyword of the respective group as a
    case-insensitive substring.

    Rows whose ``added_headings`` is missing are flagged ``False`` (not NaN),
    so both columns can be used directly as boolean masks.

    Args:
        df: DataFrame with a string column ``added_headings``.

    Returns:
        The same DataFrame with the two flag columns added.
    """
    # NOTE(review): "example" is listed in both groups, and matching is by
    # substring (so "run" also matches e.g. "prune") — confirm this is intended.
    interesting_words = {
        "ownership": ["license", "example", "reference", "citation", "cited", "publication", "paper"],
        "usage": ["requirements", "using", "example", "usage", "run", "install", "installing", "installation", "tutorial", "tutorials", "build", "guide", "documentation"]
    }
    for topic, keywords in interesting_words.items():
        # The joined keywords form a regex alternation; na=False keeps the
        # result strictly boolean even when added_headings is missing.
        df[f"{topic}_addition"] = df.added_headings.str.contains("|".join(keywords), case=False, na=False)
    return df

In [None]:
def date_highlights(readme_history, contents, metadata, ax):
    """Scatter README and repository-file milestones onto ``ax``.

    Every milestone type is drawn on its own horizontal row (y = 1..5). The x
    position is the number of whole weeks between the repository's
    ``created_at`` and the milestone date.

    Args:
        readme_history: DataFrame with ``github_user_cleaned_url``,
            ``author_date``, ``added_headings`` and ``added_cites``.
        contents: DataFrame with ``github_user_cleaned_url``,
            ``citation_added`` and ``contributing_added``.
        metadata: DataFrame with ``github_user_cleaned_url`` and
            ``created_at``.
        ax: matplotlib Axes to draw on.

    The date columns must be datetime-like, because the ``.dt`` accessor is
    used on their differences.
    """

    def weeks_since_creation(timestamps, created_at):
        # Whole weeks elapsed between repository creation and each timestamp.
        return (timestamps - created_at).dt.days // 7

    def mark(weeks, row, label, **scatter_kwargs):
        # One "v" marker per event, all on the horizontal line y == row.
        ax.scatter(weeks, np.full(len(weeks), float(row)), marker="v", label=label, **scatter_kwargs)

    df = pd.merge(metadata, readme_history, on="github_user_cleaned_url")
    df.dropna(subset=["author_date"], inplace=True)
    df["authored_in_week_since_creation"] = weeks_since_creation(df.author_date, df.created_at)

    contents_df = pd.merge(metadata, contents, on="github_user_cleaned_url")
    # Repositories without the respective file have no date and are dropped.
    citation_file_weeks = weeks_since_creation(contents_df.citation_added, contents_df.created_at).dropna()
    contributing_file_weeks = weeks_since_creation(contents_df.contributing_added, contents_df.created_at).dropna()

    # headings
    df = analyse_headings(df)
    readme_weeks = df["authored_in_week_since_creation"]
    # .eq(True) keeps the masks strictly boolean even if a flag is missing.
    mark(readme_weeks[df.ownership_addition.eq(True)], 1, "ownership heading", s=100)
    mark(readme_weeks[df.usage_addition.eq(True)], 2, "usage heading")
    # citation in README: rows whose added_cites is present and not the empty list "[]".
    # The x values must be the week column; passing the filtered frame itself
    # to scatter does not match the 1-D y values.
    has_citation = df.added_cites.notna() & (df.added_cites != "[]")
    mark(readme_weeks[has_citation], 3, "citation in README")
    # citation file
    mark(citation_file_weeks, 4, "citation file")
    # contributing file
    mark(contributing_file_weeks, 5, "contributing file")

In [None]:
# Draw the date_highlights markers on a fresh single-axes figure and show it
# with a legend.
fig, ax = plt.subplots(1, 1)
date_highlights(readme_history, contents, metadata, ax)
plt.legend()
plt.show()

In [None]:
# Load paper_data from the cleaned-links CSV and preview its merge with the
# repository metadata; the merged frame is only displayed, not stored.
paper_data = pd.read_csv("../data/analysis/cleaned_links/joined.csv")
pd.merge(metadata, paper_data, on="github_user_cleaned_url")

### User fork/star highlight

In [None]:
# Distinct values drawn from issues.user, issues.closed_by (missing values
# dropped) and contributions.author, de-duplicated across the three sources.
users = np.unique(np.concatenate([issues.user.unique(), issues.closed_by.dropna().unique(), contributions.author.unique()]))

In [None]:
# Rows of `forks` whose `user` appears in `users`.
forks_users_df = forks[forks.user.isin(users)]

In [None]:
# Display the filtered forks.
forks_users_df

In [None]:
# Rows of `stars` whose `user` appears in `users`, displayed right away.
stars_users_df = stars[stars.user.isin(users)]
stars_users_df

## Test aggregated datasets

In [None]:
# Repository to inspect; compared against the github_user_cleaned_url column
# when the aggregated datasets are filtered.
repo_id = "ziqizhang/sti"

In [None]:
# Load each aggregated CSV and keep only the rows that belong to repo_id.
aggregated_csvs = {
    "overall_timeline_df": "../data/analysis/aggregated_timeline.csv",
    "commit_author_df": "../data/analysis/aggregated_commit_author_timeline.csv",
    "issue_user_df": "../data/analysis/aggregated_issue_user_timeline.csv",
    "overall_df": "../data/analysis/aggregated_overall.csv",
}
dfs = {}
for name, csv_path in aggregated_csvs.items():
    frame = pd.read_csv(csv_path)
    dfs[name] = frame[frame.github_user_cleaned_url == repo_id]

In [None]:
# Prepare the figure: one frameless full-size axis plus six stacked panels.
fig = plt.figure(figsize=(20, 20))
# Created first and switched off; the dashed event lines are later drawn on
# this axis (presumably so that they can run across all panels).
overlay_axis = fig.subplots()
overlay_axis.axis('off')
# Six rows sharing one x-axis; height_ratios sets their relative heights.
axs = fig.subplots(nrows=6, sharex=True, height_ratios=[3, 3, 2, 2, 2, 1])
for ax in axs:
    # Fully transparent panel background.
    ax.patch.set_alpha(0)

In [None]:
# user_type_wrt_issues (panel 0): one "|" marker per row of the issue-user
# timeline at (week, user), coloured by that row's user_status.
sns.scatterplot(
    ax=axs[0],
    data=dfs["issue_user_df"],
    x="week_since_repo_creation",
    y="user",
    hue="user_status",
    hue_order=["inactive", "opening", "closing", "both"],
    palette=['#d62728', '#1f77b4', '#ff7f0e', '#2ca02c'],  # one colour per hue_order entry
    marker="|",
    s=500,
    )
axs[0].set_ylabel("issue user")
axs[0].legend(loc="upper right")
axs[0].grid(True, axis="x")  # vertical grid lines only

In [None]:
# contributor_team (panels 1 and 2)
# Replace the boolean activity flag with readable labels for the legend.
# Work on a copy (the stored frame is a filtered slice of the loaded CSV) and
# pass values other than True/False through unchanged, so that re-running this
# cell does not turn the already-mapped labels into NaN.
status_labels = {True: "active", False: "inactive"}
commit_author_df = dfs["commit_author_df"].copy()
commit_author_df["active_contributors"] = commit_author_df["active_contributors"].map(
    lambda flag: status_labels.get(flag, flag)
)
dfs["commit_author_df"] = commit_author_df
# plot per-user status: one "|" marker per row at (week, author)
sns.scatterplot(
    ax=axs[1],
    data=commit_author_df,
    x="week_since_repo_creation",
    y="author",
    hue="active_contributors",
    hue_order=["inactive", "active"],
    palette=['#d62728', '#2ca02c'],  # one colour per hue_order entry
    marker="|",
    s=500,
)
axs[1].set_ylabel("contributing user")
axs[1].grid(True, axis="x")
axs[1].legend()
# plot the active_contributors and contributors columns of the overall timeline per week
dfs["overall_timeline_df"].plot(
    ax=axs[2],
    lw=2,
    x="week_since_repo_creation",
    y=["active_contributors", "contributors"],
    ylabel="number of contributors"
)
axs[2].legend(loc="upper right")
axs[2].grid(True)

In [None]:
# no_open_and_closed_issues (panel 3): the open_count and closed_count columns
# of the overall timeline, plotted per week since repository creation.
dfs["overall_timeline_df"].plot(
    ax=axs[3],
    x="week_since_repo_creation",
    y=["open_count", "closed_count"],
    ylabel="issues"
)
axs[3].legend(loc="upper right")
axs[3].grid(True)

In [None]:
# engagement (panel 4): the forks_count and stars_count columns of the overall
# timeline, plotted per week since repository creation.
dfs["overall_timeline_df"].plot(
    ax=axs[4],
    x="week_since_repo_creation",
    y=["forks_count", "stars_count"],
    # This panel shows forks and stars, not issues, so label it accordingly.
    ylabel="forks and stars"
)
axs[4].legend(loc="upper right")
axs[4].grid(True)

In [None]:
# helper function for highlights
def calc_y_timeline(data):
    """Compute stacking offsets so that events sharing an x value do not overlap.

    The sequences in ``data`` are processed in order. The first occurrence of
    an x value (across all sequences) gets y = 0, the second y = -1, the
    third y = -2, and so on.

    Args:
        data: sequence of iterables of hashable x values, one iterable per
            event type.

    Returns:
        A list with one list of y offsets per input iterable, aligned
        element-wise with the corresponding x values.
    """
    # Running tally of how often each x value has been placed so far; a dict
    # lookup avoids rescanning every previously seen value for each event.
    placed = {}
    ys = []
    for xs in data:
        offsets = []
        for x in xs:
            already_placed = placed.get(x, 0)
            offsets.append(-already_placed)
            placed[x] = already_placed + 1
        ys.append(offsets)
    return ys

In [None]:
# date_highlights (panel 5): one "^" marker per milestone event, plus a dashed
# vertical line per event drawn on the overlay axis.
ax = axs[5]
ax.set(ylim=(-6, 0.4), yticks=[])
ax.set_xlabel("weeks since repository creation", loc="right")
# Label and ticks of the x-axis go on top; the other three spines are hidden.
ax.xaxis.set_label_position('top')
ax.xaxis.tick_top()
for side in ('right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

events_df = dfs["overall_timeline_df"]
# (flag column of events_df, legend label) per milestone type, in plotting order.
milestones = [
    ("ownership_added", "ownership heading"),
    ("usage_added", "usage heading"),
    ("citation_added", "citation in README"),
    ("citation_file_added", "citation file"),
    ("contributing_file_added", "contributing file"),
    ("paper_published", "mention in publication"),
]
labels = [label for _, label in milestones]
# Weeks in which the respective flag column is set.
data = [events_df[events_df[column]].week_since_repo_creation for column, _ in milestones]
# Events that share a week are stacked downwards (0, -1, -2, ...).
ys = calc_y_timeline(data)

prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
ymax = 86
for i, (weeks, offsets) in enumerate(zip(data, ys)):
    ax.scatter(weeks, offsets, marker="^", s=100, label=labels[i], color=colors[i])
    # Dashed line from each marker's y offset up to ymax on the overlay axis.
    overlay_axis.vlines(weeks, offsets, ymax, linestyles='dashed', color=colors[i])
ax.legend(loc="upper right", ncols=2)

In [None]:
# final adjustments
ymax = 86  # upper y-limit of the overlay axis; same value as the top of the dashed event lines
xl, xr = plt.xlim()  # x-limits of pyplot's current axes
plt.xlim(xl, xr+15)  # widen by 15 on the right (presumably to leave room for the legends — confirm)
overlay_axis.set(xlim=(xl, xr+15), ylim=(-6, ymax))  # give the overlay the same widened x-range
fig.suptitle(repo_id)
fig.tight_layout(rect=[0, 0.03, 1, 0.98])  # fit the panels into this rectangle of the figure

In [None]:
# Display the assembled figure.
fig