# 0. Datenbankabfrage und Definition von Funktionen

In [None]:
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np

np.seterr(all="ignore")
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sqlalchemy import create_engine
import mysql.connector
import psycopg2

pd.options.mode.chained_assignment = None
from IPython.display import display, Markdown, HTML
import matplotlib.pyplot as plt
import seaborn as sns

display(Markdown(f"### Letztes Update: {date.today()}"))

In [None]:
# SQLAlchemy engine for the MySQL "serlo" database.
# NOTE(review): user and password are hard-coded in the URL — confirm this
# notebook is only ever run against a local/development database.
mysql_engine = create_engine("mysql+mysqlconnector://root:secret@localhost:3306/serlo")

# Raw DBAPI connection obtained from the MySQL engine (exposes .cursor()).
mysql_db = mysql_engine.raw_connection()

# Raw DBAPI connection to the PostgreSQL "kratos" database.
postgres_db = create_engine(
    "postgresql+psycopg2://serlo:secret@localhost:5432/kratos"
).raw_connection()


def cached(func):
    """Memoize a one-argument function.

    Results are stored in a dictionary private to the decorated function and
    keyed by the argument, so the argument must be hashable. A result of
    ``None`` is cached like any other value. If ``func`` raises, nothing is
    stored and the exception propagates.

    Args:
        func: Callable taking exactly one positional argument.

    Returns:
        A wrapper with the same call signature that invokes ``func`` at most
        once per distinct argument.
    """
    from functools import wraps

    cache = {}

    # wraps() keeps __name__/__doc__ of the decorated function intact, which
    # makes tracebacks and introspection point at the real function.
    @wraps(func)
    def return_func(arg):
        if arg not in cache:
            cache[arg] = func(arg)
        return cache[arg]

    return return_func


def query(db, sql):
    """Execute ``sql`` on ``db`` and return every result row.

    Args:
        db: Connection object providing ``cursor()``.
        sql: Complete SQL statement to execute.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the statement.
    """
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close the cursor on success and on error alike; it was previously
        # left open after every call.
        cursor.close()


def querySingleton(sql):
    """Run ``sql`` on the MySQL connection and return the first column of each row."""
    first_column = []
    for row in query(mysql_db, sql):
        first_column.append(row[0])
    return first_column


@cached
def getParent(termId):
    """Return the ``parent_id`` of the ``term_taxonomy`` row with id ``termId``.

    The lookup result is memoized per ``termId``. Raises ``IndexError`` when
    the query yields no row.
    """
    sql = (
        """
        select parent_id from term_taxonomy where id = %s;
    """
        % termId
    )
    parent_ids = querySingleton(sql)
    return parent_ids[0]


def getTermName(termId):
    """Return the ``term.name`` joined to the ``term_taxonomy`` row ``termId``.

    Raises ``IndexError`` when the query yields no row.
    """
    sql = (
        """
        select term.name from term_taxonomy
        join term on term.id = term_taxonomy.term_id
        where term_taxonomy.id = %s;
    """
        % termId
    )
    names = querySingleton(sql)
    return names[0]


@cached
def getSubject(termId):
    """Resolve the subject name for a taxonomy term.

    Resolution rules, in order:

    * a fixed set of term ids is reported as "Prüfungsbereich Mathematik";
    * term 106082 and every direct child of term 106081 are reported under
      their own term name;
    * a term without a parent, or whose parent has no parent, is reported
      under its own term name;
    * otherwise the subject of the parent term is returned (recursively).

    Results are memoized per ``termId``.

    Args:
        termId: Id of a ``term_taxonomy`` row (anything ``int()`` accepts).

    Returns:
        The subject name as a string.
    """
    term = int(termId)

    # Terms that are always attributed to the math exam area.
    if term in {
        79733,
        81317,
        20852,
        87814,
        87827,
        85477,
        87860,
        75049,
        76750,
        87496,
        75678,
        91252,
        91253,
    }:
        return "Prüfungsbereich Mathematik"
    if term == 106082:
        return getTermName(termId)

    parent = getParent(termId)

    # A term without a parent cannot be walked further up. Looking up the
    # parent of ``None`` would interpolate "None" into the SQL statement.
    if parent is None:
        return getTermName(termId)

    if parent == 106081:
        return getTermName(termId)

    # Only now is the grandparent needed; the term right below the root is
    # the subject itself.
    grandparent = getParent(parent)
    if grandparent is None:
        return getTermName(termId)

    return getSubject(parent)


@cached
def getSubjectFromUuid(uuid):
    """Find the subject an entity belongs to.

    If the entity has rows in ``term_taxonomy_entity``, the subject of the
    first returned taxonomy term is used. Otherwise the first parent found in
    ``entity_link`` is resolved recursively. Returns ``None`` when the entity
    has neither. Results are memoized per ``uuid``.
    """
    term_ids = querySingleton(
        f"""
        select term_taxonomy_id from term_taxonomy_entity
        where term_taxonomy_entity.entity_id  = {uuid};
    """
    )
    if term_ids:
        return getSubject(term_ids[0])

    parent_ids = querySingleton(
        f"""
        select parent_id from entity_link
        where entity_link.child_id  = {uuid};
    """
    )
    return getSubjectFromUuid(parent_ids[0]) if parent_ids else None

In [None]:
def read_event_log_edits():
    """Load event-log rows with ``event_id = 5`` into a DataFrame.

    Only events after 2018 by users other than "Legacy" are read. The result
    is indexed by the event id, the ``uuid_id`` column is renamed to ``uuid``
    and a ``subject`` column is derived from it via ``getSubjectFromUuid``.
    """
    edits_sql = """
        select event_log.id, event_log.actor_id, event_log.date, user.username, event_parameter_uuid.uuid_id from event_log
        join user on user.id = event_log.actor_id
        join event_parameter on event_parameter.log_id = event_log.id
        join event_parameter_uuid on event_parameter_uuid.event_parameter_id = event_parameter.id
        where event_log.event_id = 5
        and year(event_log.date) > 2018
        and user.username != "Legacy"
    """
    frame = pd.read_sql(edits_sql, con=mysql_engine)
    frame = frame.set_index("id").rename(columns={"uuid_id": "uuid"})
    frame["subject"] = frame["uuid"].map(getSubjectFromUuid)
    return frame


# Edit events (event_id = 5) including the derived "subject" column.
event_log_edits = read_event_log_edits()

# One row per identity in the PostgreSQL database: username and interest come
# from the "traits" JSON, legacy_id from "metadata_public" (cast to int).
interest_df = pd.DataFrame(
    query(
        postgres_db,
        "SELECT traits ->> 'username', (metadata_public ->> 'legacy_id')::int, traits ->> 'interest' FROM identities;",
    ),
    columns=["username", "legacy_id", "interest"],
)

# Attach the interest to every edit by matching actor_id with legacy_id.
# pd.merge defaults to an inner join, so edits without a matching identity are
# dropped. The MySQL username (suffix _x) is kept, the other one is discarded.
merged_df_edits = (
    pd.merge(event_log_edits, interest_df, left_on="actor_id", right_on="legacy_id")
    .drop(["legacy_id", "username_y"], axis=1)
    .rename(columns={"username_x": "username"})
)

In [None]:
def get_number_of_all_authors(
    days=90, edits=10, week=0, year=0, days2=0, interest="all", subject="all"
):
    """Count authors with at least ``edits`` edits inside a time window.

    The window is ``days`` days long and ends
    ``days2 + week * 7 + year * 365`` days before now; both bounds are
    exclusive. Rows are taken from ``merged_df_edits``.

    Args:
        days: Length of the window in days.
        edits: Minimum number of edits (non-null ``uuid`` values) an author
            needs inside the window to be counted.
        week: Shift the window back by this many weeks.
        year: Shift the window back by this many 365-day years.
        days2: Shift the window back by this many days.
        interest: ``"teacher"`` keeps rows whose interest is ``"teacher"``,
            ``"no teachers"`` keeps rows whose interest is the empty string,
            any other value applies no interest filter.
        subject: ``"all"`` for no filter, otherwise the subject to keep.

    Returns:
        The number of (actor_id, username) groups reaching the threshold, as
        a NumPy integer.
    """
    # Take "now" once so that both window bounds refer to the same instant.
    now = pd.Timestamp.today()
    shift = days2 + week * 7 + year * 365
    lower_date = now - pd.Timedelta(days=days + shift)
    upper_date = now - pd.Timedelta(days=shift)

    dates = merged_df_edits["date"]
    selected = merged_df_edits[(lower_date < dates) & (upper_date > dates)]

    if interest == "teacher":
        selected = selected.loc[selected["interest"].isin(["teacher"])]
    elif interest == "no teachers":
        selected = selected.loc[selected["interest"].isin([""])]

    if subject != "all":
        selected = selected.loc[selected["subject"].isin([subject])]

    # Count only the column that is needed and compare vectorised instead of
    # applying a Python lambda per author; this function is called once per
    # plotted day by the time charts.
    edits_per_author = selected.groupby(["actor_id", "username"])["uuid"].count()

    return (edits_per_author >= edits).sum()

In [None]:
def get_number_of_all_edits(
    days=90, edits=1, week=0, year=0, days2=0, interest="all", subject="all"
):
    """Count the edits made by authors with at least ``edits`` edits in a window.

    The window is ``days`` days long and ends
    ``days2 + week * 7 + year * 365`` days before now; both bounds are
    exclusive. Rows are taken from ``merged_df_edits``.

    Args:
        days: Length of the window in days.
        edits: Minimum number of edits (non-null ``uuid`` values) an author
            needs inside the window for their edits to be counted.
        week: Shift the window back by this many weeks.
        year: Shift the window back by this many 365-day years.
        days2: Shift the window back by this many days.
        interest: ``"teacher"`` keeps rows whose interest is ``"teacher"``,
            ``"no teachers"`` keeps rows whose interest is the empty string,
            any other value applies no interest filter.
        subject: ``"all"`` for no filter, otherwise the subject to keep.

    Returns:
        The number of selected rows whose ``actor_id`` belongs to an author
        reaching the threshold, as a NumPy integer.
    """
    # Take "now" once so that both window bounds refer to the same instant.
    now = pd.Timestamp.today()
    shift = days2 + week * 7 + year * 365
    lower_date = now - pd.Timedelta(days=days + shift)
    upper_date = now - pd.Timedelta(days=shift)

    dates = merged_df_edits["date"]
    selected = merged_df_edits[(lower_date < dates) & (upper_date > dates)]

    if interest == "teacher":
        selected = selected.loc[selected["interest"].isin(["teacher"])]
    elif interest == "no teachers":
        selected = selected.loc[selected["interest"].isin([""])]

    if subject != "all":
        selected = selected.loc[selected["subject"].isin([subject])]

    # Vectorised threshold check instead of a per-row lambda.
    edits_per_author = selected.groupby(["actor_id", "username"])["uuid"].count()
    active_authors = edits_per_author[edits_per_author >= edits]
    active_ids = active_authors.index.get_level_values("actor_id")

    # Keep only the edits of authors at or above the threshold.
    return selected["actor_id"].isin(active_ids).sum()

In [None]:
def get_one_year_dates(days=365, year1=0, year2=0):
    """Return consecutive daily timestamps ending ``year2`` years before now.

    The last timestamp is now minus ``year2 * 365`` days; the list reaches
    back ``days + year1 * 365`` days from there, one entry per day, keeping
    the current time of day.

    Args:
        days: Number of days covered (before adding ``year1``).
        year1: Extends the range by this many 365-day years.
        year2: Shifts the whole range back by this many 365-day years.

    Returns:
        A list of ``pd.Timestamp`` objects in ascending order; empty when the
        range covers no full day.
    """
    # One snapshot of "now" keeps both bounds aligned to the same instant.
    now = pd.Timestamp.today()
    lower_date = now - pd.Timedelta(days=days + year1 * 365 + year2 * 365)
    upper_date = now - pd.Timedelta(days=year2 * 365)

    # First entry is one day after the lower bound, the upper bound is
    # included when it falls exactly on a daily step.
    first_date = lower_date + pd.Timedelta(days=1)

    return list(pd.date_range(start=first_date, end=upper_date, freq="D"))


def new_range(days=365):
    """Return the integers from ``days - 1`` down to ``0`` as a list."""
    return list(range(days - 1, -1, -1))

In [None]:
# Event-log rows with event_id = 4 after 2018, excluding the "Legacy" user.
event_log_contents = pd.read_sql(
    """
        select event_log.id, event_log.actor_id, event_log.date, user.username from event_log
        join user on user.id = event_log.actor_id
        where event_log.event_id = 4
        and year(event_log.date) > 2018
        and user.username != "Legacy"
    """,
    con=mysql_engine,
)

# Join to attach the interest of each author (actor_id matched with
# legacy_id). pd.merge defaults to an inner join, so events without a matching
# identity are dropped; the MySQL username (suffix _x) is the one kept.
merged_df_contents = (
    pd.merge(event_log_contents, interest_df, left_on="actor_id", right_on="legacy_id")
    .drop(["legacy_id", "username_y"], axis=1)
    .rename(columns={"username_x": "username"})
)

In [None]:
def get_number_of_created_contents(
    days=90, edits=1, week=0, year=0, days2=0, interest="all"
):
    """Count the events in ``merged_df_contents`` inside a time window.

    Only events of authors with at least ``edits`` such events in the window
    are counted. The window is ``days`` days long and ends
    ``days2 + week * 7 + year * 365`` days before now; both bounds are
    exclusive.

    Args:
        days: Length of the window in days.
        edits: Minimum number of events (non-null ``id`` values) an author
            needs inside the window for their events to be counted.
        week: Shift the window back by this many weeks.
        year: Shift the window back by this many 365-day years.
        days2: Shift the window back by this many days.
        interest: ``"teacher"`` keeps rows whose interest is ``"teacher"``,
            ``"no teachers"`` keeps rows whose interest is the empty string,
            any other value applies no interest filter.

    Returns:
        The number of selected rows whose ``actor_id`` belongs to an author
        reaching the threshold, as a NumPy integer.
    """
    # Take "now" once so that both window bounds refer to the same instant.
    now = pd.Timestamp.today()
    shift = days2 + week * 7 + year * 365
    lower_date = now - pd.Timedelta(days=days + shift)
    upper_date = now - pd.Timedelta(days=shift)

    dates = merged_df_contents["date"]
    selected = merged_df_contents[(lower_date < dates) & (upper_date > dates)]

    if interest == "teacher":
        selected = selected.loc[selected["interest"].isin(["teacher"])]
    elif interest == "no teachers":
        selected = selected.loc[selected["interest"].isin([""])]

    # Vectorised threshold check instead of a per-row lambda.
    events_per_author = selected.groupby(["actor_id", "username"])["id"].count()
    active_authors = events_per_author[events_per_author >= edits]
    active_ids = active_authors.index.get_level_values("actor_id")

    # Keep only the events of authors at or above the threshold.
    return selected["actor_id"].isin(active_ids).sum()

In [None]:
def get_top_authors(
    days=90, week=0, year=0, days2=0, interest="all", subject="all", top=10
):
    """Rank authors by number of edits inside a time window.

    The window is ``days`` days long and ends
    ``days2 + week * 7 + year * 365`` days before now (bounds exclusive).
    ``interest`` and ``subject`` filter the rows of ``merged_df_edits`` the
    same way as in the counting helpers.

    Returns:
        A DataFrame with the columns ``username`` and ``edits`` holding the
        ``top`` authors plus one row named "others" with the summed edits of
        everybody else, sorted by ``edits`` in descending order.
    """
    shift = days2 + week * 7 + year * 365
    lower_date = pd.Timestamp.today() - pd.Timedelta(days=days + shift)
    upper_date = pd.Timestamp.today() - pd.Timedelta(days=shift)

    in_window = (lower_date < merged_df_edits["date"]) & (
        upper_date > merged_df_edits["date"]
    )
    window = merged_df_edits[in_window]

    if interest == "teacher":
        window = window.loc[window["interest"].isin(["teacher"])]
    elif interest == "no teachers":
        window = window.loc[window["interest"].isin([""])]

    if subject != "all":
        window = window.loc[window["subject"].isin([subject])]

    # reset_index() materialises the row index as an "index" column, which is
    # then counted per author.
    per_author = (
        window.reset_index()
        .groupby(by=["username"], as_index=False)["index"]
        .count()
        .rename(columns={"index": "edits"})
        .sort_values(by=["edits"], ascending=[False])
        .reset_index(drop=True)
    )

    leaders = per_author.head(top)
    remaining_edits = per_author["edits"].iloc[top:].sum()
    others_row = pd.DataFrame(
        {"username": "others", "edits": remaining_edits}, index=[0]
    )

    ranking = pd.concat([leaders, others_row], ignore_index=True)

    return ranking.sort_values(by=["edits"], ascending=[False])

In [None]:
def create_kpi_table():
    """Render a table with the KPIs of the last 90 days and their yearly change.

    For each activity level (at least 1, 10 and 100 edits) and each audience
    ("all" and "teacher") the table shows the number of authors, the number
    of edits and the number of created contents, each with the value of the
    window shifted back by one year and the percent change between the two.
    The percent-change cells are coloured by the size of the change. The
    figure is shown with the "png" renderer; nothing is returned.
    """
    activity_thresholds = (1, 10, 100)
    interests = ("all", "teacher")

    index = pd.MultiIndex.from_product(
        [["all authors", "active authors", "very active authors"], ["all", "teacher"]]
    )
    kpi_df = pd.DataFrame(index=index, columns=[])

    def compute_percent_change(current, previous):
        # Without a previous value the change is reported as 0 (nothing then,
        # nothing now) or as 1, i.e. +100 % (something appeared).
        if previous == 0:
            return 0 if current == 0 else 1
        return (current - previous) / previous

    def collect(get_value_func, year):
        # One value per (activity level, audience) pair, in index order.
        return [
            get_value_func(days=90, year=year, edits=edits, interest=interest)
            for edits in activity_thresholds
            for interest in interests
        ]

    def add_kpi_columns(get_value_func, value_column, last_year_column, change_column):
        # Every KPI is computed once and reused for the percent change; the
        # change was previously derived from a second round of identical calls.
        current_values = collect(get_value_func, year=0)
        last_year_values = collect(get_value_func, year=1)

        kpi_df[value_column] = current_values
        kpi_df[last_year_column] = last_year_values
        kpi_df[change_column] = [
            compute_percent_change(current, previous)
            for current, previous in zip(current_values, last_year_values)
        ]
        kpi_df[change_column] = kpi_df[change_column] * 100
        kpi_df[change_column] = kpi_df[change_column].fillna(0).round().astype(int)

    add_kpi_columns(
        get_number_of_all_authors, "number of authors", "last year", "% change"
    )
    add_kpi_columns(get_number_of_all_edits, "edits", "last_year", "%_change")
    add_kpi_columns(
        get_number_of_created_contents, "created contents", "last._year", "%._change"
    )

    # Thresholds in descending order: the first one reached decides the colour.
    color_steps = (
        (100, "#2DAE58"),
        (75, "#3DC067"),
        (50, "#4ED276"),
        (25, "#8EE5A7"),
        (1, "#BFF1CE"),
        (-0.9, "#F2FAF4"),
        (-25, "#F9E2D4"),
        (-50, "#FAAE8D"),
        (-75, "#F48767"),
        (-100, "#ED6A49"),
    )

    def get_color(val):
        for threshold, color in color_steps:
            if val >= threshold:
                return color

        return "#884324"

    standard_color = ["white", "#D6EAF8"] * 5
    # Iterate over the values instead of using positional Series[i], which is
    # deprecated for label-based indexes.
    number_of_authors_color = [get_color(change) for change in kpi_df["% change"]]
    number_of_edits_color = [get_color(change) for change in kpi_df["%_change"]]
    number_of_contents_color = [get_color(change) for change in kpi_df["%._change"]]

    header_list = [
        "",
        "",
        "<b>Anzahl Autor*innen<b>",
        "letztes<br>Jahr",
        "% Veränderung",
        "<b>Anzahl Bearbeitungen<b>",
        "letztes<br>Jahr",
        "% Veränderung",
        "<b>Anzahl erstellte Inhalte<b>",
        "letztes<br>Jahr",
        "% Veränderung",
    ]

    # creating kpi_table

    fig = go.Figure(
        data=[
            go.Table(
                header=dict(
                    values=header_list,
                    fill_color="darkblue",
                    font_color="white",
                    align="center",
                    line_color="grey",
                ),
                cells=dict(
                    values=[
                        [
                            "<b>alle Autor*innen<b>",
                            "Bearbeit. >= 1",
                            "<b>aktive Autor*innen<b>",
                            "Bearbeit. >= 10",
                            "<b>sehr aktive Autor*innen<b>",
                            "Bearbeit. >= 100",
                        ],
                        ["alle", "Lehrkräfte"] * 3,
                        kpi_df["number of authors"],
                        kpi_df["last year"],
                        kpi_df["% change"],
                        kpi_df["edits"],
                        kpi_df["last_year"],
                        kpi_df["%_change"],
                        kpi_df["created contents"],
                        kpi_df["last._year"],
                        kpi_df["%._change"],
                    ],
                    height=70,
                    align="center",
                    font=dict(size=14),
                    # One colour list per table column: plain striping except
                    # for the three percent-change columns.
                    fill_color=[standard_color] * 4
                    + [number_of_authors_color]
                    + [standard_color] * 2
                    + [number_of_edits_color]
                    + [standard_color] * 2
                    + [number_of_contents_color],
                    line_color="grey",
                ),
            )
        ],
    )

    fig.update_layout(
        height=700,
        width=1200,
        title="KPIs kumuliert über die letzten 90 Tage",
        title_x=0.5,
        title_font=dict(size=24),
    )

    fig.show(renderer="png")

In [None]:
def create_kpi_figure(title, get_value_func, split="activity"):
    """Show three KPI cards comparing current values with those of last year.

    Args:
        title: Title of the figure.
        get_value_func: Callable accepting the keyword arguments ``edits``,
            ``interest`` and ``year`` and returning a number.
        split: ``"activity"`` shows the value for authors with at least 1, 10
            and 100 edits. ``"interest"`` shows the value for all authors,
            for teachers, and the share of teachers in percent.

    Raises:
        ValueError: If ``split`` is neither ``"activity"`` nor ``"interest"``.
    """
    split_mapping = {
        "activity": [
            ["alle", "aktive", "sehr aktive"],
            [1, 10, 100],
            ["all", "all", "all"],
        ],
        "interest": [
            ["alle", "Lehrkräfte", "Anteil Lehrkräfte in %"],
            [1, 1, 1],
            ["all", "teacher", "no teachers"],
        ],
    }

    if split not in split_mapping:
        raise ValueError(
            f"split must be one of {sorted(split_mapping)}, got {split!r}"
        )
    card_titles, thresholds, interests = split_mapping[split]

    fig = make_subplots(
        rows=1,
        cols=3,
        subplot_titles=("KPI 1", "KPI 2", "KPI 3"),
        specs=[[{"type": "indicator"}] * 3],
        horizontal_spacing=0.2,
    )
    annotations = []

    def add_kpi_card(fig, annotations, title, value, last_year_value, row, col):
        # Horizontal paper position of the "last year" note under each card.
        col_mapping = {1: 0.1, 2: 0.5, 3: 0.9}

        annotations.append(
            go.layout.Annotation(
                text=f"letztes Jahr: {last_year_value}",
                x=col_mapping.get(col),
                y=-0.15,
                showarrow=False,
                font=dict(size=25, color="black"),
            )
        )

        fig.add_trace(
            go.Indicator(
                mode="number+delta",
                value=value,
                delta={"reference": last_year_value, "font": {"size": 25}},
                title={"text": title, "font": {"color": "black"}},
                number={"valueformat": ",d", "font": {"color": "black"}},
            ),
            row=row,
            col=col,
        )

    def share_in_percent(part, total):
        # A share of nothing is reported as 0 instead of dividing by zero.
        if total == 0:
            return 0
        return round((part * 100) / total)

    # The first two cards are plain values; keep them for the share card.
    current_values = []
    last_year_values = []
    for i in range(2):
        current = get_value_func(edits=thresholds[i], interest=interests[i])
        last_year = get_value_func(
            edits=thresholds[i], interest=interests[i], year=1
        )
        current_values.append(current)
        last_year_values.append(last_year)
        add_kpi_card(fig, annotations, card_titles[i], current, last_year, 1, i + 1)

    if split == "activity":
        add_kpi_card(
            fig,
            annotations,
            card_titles[2],
            get_value_func(edits=thresholds[2], interest=interests[2]),
            get_value_func(edits=thresholds[2], interest=interests[2], year=1),
            1,
            2 + 1,
        )

    elif split == "interest":
        # Share of teachers among all authors. Both values use the same edit
        # threshold as the first two cards, so those results are reused.
        add_kpi_card(
            fig,
            annotations,
            card_titles[2],
            share_in_percent(current_values[1], current_values[0]),
            share_in_percent(last_year_values[1], last_year_values[0]),
            1,
            2 + 1,
        )

    fig.update_layout(
        title={
            "text": title,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=26, color="black"),
        },
        annotations=annotations,
        height=400,
        width=1000,
        paper_bgcolor="#D6EAF8",
    )

    # White vertical bars separating the three cards.
    for i in [0.7, 0.3]:
        fig.add_shape(
            type="rect",
            x0=i - 0.008,
            y0=-0.5,
            x1=i + 0.008,
            y1=1.13,
            line=dict(color="white", width=3),
            fillcolor="white",
            layer="below",
            opacity=1,
        )

    # White horizontal bar below the figure title.
    fig.add_shape(
        type="rect",
        x0=-0.1,
        x1=1.1,
        y0=1.13,
        y1=1.17,
        line=dict(color="white", width=3),
        fillcolor="white",
    )

    fig.show(renderer="png")

In [None]:
def create_time_chart(title, get_data_func, split):
    """Plot a KPI as line chart over the last four years (365 * 4 days).

    ``get_data_func`` is evaluated once per day and line with ``days2`` set to
    the number of days the window is shifted back. ``split="interest"`` draws
    one line for all authors and one for teachers; ``split="activity"`` draws
    one line each for the edit thresholds 1, 10 and 100.
    """
    day_offsets = new_range(days=365 * 4)

    if split == "interest":
        series_filters = [
            {"edits": 1},
            {"edits": 1, "interest": "teacher"},
        ]
        new_labels = {
            "wide_variable_0": "alle Autor*innen",
            "wide_variable_1": "Lehrkräfte",
        }

    elif split == "activity":
        series_filters = [
            {"edits": 1},
            {"edits": 10},
            {"edits": 100},
        ]
        new_labels = {
            "wide_variable_0": "alle Autor*innen",
            "wide_variable_1": "aktive Autor*innen",
            "wide_variable_2": "sehr aktive Autor*innen",
        }

    # One list of daily values per line, oldest day first.
    y = [
        [get_data_func(days2=offset, **filters) for offset in day_offsets]
        for filters in series_filters
    ]

    fig = px.line(x=get_one_year_dates(days=365 * 4), y=y, title=title)

    layout = dict(
        title={"text": title, "x": 0.5},
        plot_bgcolor="#ECF2FF",
        paper_bgcolor="#ECF2FF",
        height=600,
        width=1000,
        legend_title=None,
        xaxis_title=None,
        yaxis_title=None,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    fig.update_layout(**layout)

    # Replace the generated trace names with the readable labels.
    fig.for_each_trace(lambda trace: trace.update(name=new_labels[trace.name]))

    fig.show(renderer="png")

In [None]:
def create_share_graph(get_data_func, split):
    """Show stacked bars with the shares of the three author activity levels.

    The first bar splits the authors into "andere" (fewer than 10 edits),
    "aktive" (10 to 99 edits) and "sehr aktive" (at least 100 edits). The
    second bar splits the value returned by ``get_data_func`` the same way.

    Args:
        get_data_func: Callable accepting the keyword argument ``edits``.
            Its bar is labelled for ``get_number_of_all_edits`` and
            ``get_number_of_created_contents``; other callables get no label.
        split: Only ``"activity"`` is supported.

    Raises:
        ValueError: If ``split`` is not ``"activity"``. Other values used to
            fail half-way through on an unbound local variable.
    """
    if split != "activity":
        raise ValueError(
            f'create_share_graph only supports split="activity", got {split!r}'
        )

    label_mapping = {
        get_number_of_all_edits: ["Anteil an Bearbeitungen"],
        get_number_of_created_contents: ["Anteil an erstellten Inhalten"],
    }

    # (legend name, bar colour) from the least to the most active group.
    activity_levels = (
        ("andere", "lightgray"),
        ("aktive Autor*innen", "lightblue"),
        ("sehr aktive Autor*innen", "pink"),
    )

    def shares_in_percent(value_func):
        # Each threshold is evaluated once; the shares add up to 100.
        total = value_func(edits=1)
        active = value_func(edits=10)
        very_active = value_func(edits=100)
        if total == 0:
            return [0, 0, 0]
        return [
            ((total - active) * 100) / total,
            ((active - very_active) * 100) / total,
            (very_active * 100) / total,
        ]

    fig = go.Figure()

    author_shares = shares_in_percent(get_number_of_all_authors)
    for (name, color), share in zip(activity_levels, author_shares):
        fig.add_trace(
            go.Bar(
                x=["Anteil Autor*innen"],
                y=[share],
                name=name,
                marker_color=color,
            )
        )

    data_label = label_mapping.get(get_data_func)
    data_shares = shares_in_percent(get_data_func)
    for (name, color), share in zip(activity_levels, data_shares):
        fig.add_trace(
            go.Bar(
                x=data_label,
                y=[share],
                name=name,
                marker_color=color,
                # The legend entries already exist from the first bar.
                showlegend=False,
            )
        )

    fig.update_traces(texttemplate="%{y:.0f}%", textposition="inside")

    fig.update_layout(
        title="Anteil aktiver und sehr aktiver Autor*innen",
        title_x=0.5,
        barmode="stack",
        height=550,
        width=1000,
        plot_bgcolor="#ECF2FF",
        paper_bgcolor="#ECF2FF",
    )

    fig.show(renderer="png")

In [None]:
def create_top_author_chart(year=0):
    """Show a bar chart of the top authors returned by ``get_top_authors``.

    Args:
        year: Number of years the evaluated window is shifted back. A title
            is defined for 0 (current) and 1 (one year ago); other values
            produce a chart without title.
    """
    title_mapping = {
        0: "aktive Autor*Innen aktuell",
        1: "aktive Autor*Innen vor einem Jahr",
    }

    fig = px.bar(get_top_authors(year=year), x="username", y="edits")

    layout = dict(title_x=0.5, height=550, width=1000)
    # The title mapping was defined but never applied, leaving the chart
    # untitled although the layout already centres a title.
    title = title_mapping.get(year)
    if title is not None:
        layout["title_text"] = title

    fig.update_layout(**layout)
    fig.show(renderer="png")

In [None]:
def get_list_of_new_authors(days=90):
    """List authors who edited in the latest period but not in the one before.

    The latest period covers the ``days`` days before the most recent date in
    ``merged_df_edits``; the prior period covers the ``days`` days before
    that. An author counts as new when the username occurs in the latest but
    not in the prior period.

    Returns:
        A DataFrame indexed from 1, sorted by ``edits`` in descending order,
        with the columns ``username``, ``edits``, ``interest``, ``subjects``
        (comma separated), ``first_appearance`` (first edit in the whole
        data), ``in_last_90_days`` (first edit in the latest period) and
        ``latest_appearance``; the three date columns hold ``date`` objects.
    """
    edits_log = merged_df_edits.copy()
    edits_log["date"] = pd.to_datetime(edits_log["date"])

    period = pd.Timedelta(days=days)
    recent_start = edits_log["date"].max() - period
    prior_start = recent_start - period

    is_recent = edits_log["date"] > recent_start
    is_prior = (edits_log["date"] > prior_start) & (edits_log["date"] <= recent_start)
    recent_edits = edits_log[is_recent]
    prior_edits = edits_log[is_prior]

    newcomers = np.setdiff1d(
        recent_edits["username"].unique(), prior_edits["username"].unique()
    )

    # One summary row per new author, built from the latest period only.
    newcomer_edits = recent_edits[recent_edits["username"].isin(newcomers)]
    summary = (
        newcomer_edits.groupby("username")
        .agg(
            edits=("username", "count"),
            interest=("interest", "first"),
            subjects=("subject", lambda x: ", ".join(x.dropna().unique())),
            in_last_90_days=("date", "min"),
            latest_appearance=("date", "max"),
        )
        .reset_index()
    )

    # First edit of every author over the whole data set.
    first_seen = edits_log.groupby("username")["date"].min().reset_index()
    first_seen["date"] = first_seen["date"].dt.date

    summary = (
        summary.merge(first_seen, on="username", how="left")
        .rename(columns={"date": "first_appearance"})
        .sort_values(by="edits", ascending=False)
        .reset_index(drop=True)
    )
    summary.index = np.arange(1, len(summary) + 1)

    column_order = [
        "username",
        "edits",
        "interest",
        "subjects",
        "first_appearance",
        "in_last_90_days",
        "latest_appearance",
    ]
    summary = summary[column_order]

    for date_column in ("in_last_90_days", "latest_appearance"):
        summary[date_column] = summary[date_column].dt.date

    return summary

In [None]:
# Activation rate per subject


def calc_activation_rates(days, edits, baseline):
    """Display activation and loss rates per subject as a coloured HTML table.

    For every subject two windows of ``days`` days are compared: one ending
    now and one ending 30 days ago. Authors with at least ``baseline`` edits
    in both windows are considered; an author is active in a window when they
    have at least ``edits`` edits there.

    * activation rate: share of considered authors that are active in the
      current window but were not in the earlier one;
    * loss rate: share that was active in the earlier window but is not
      any more, reported as a negative number.

    Subjects without authors present in both windows are left out.

    Args:
        days: Length of each window in days.
        edits: Minimum number of edits to count as active.
        baseline: Minimum number of edits for an author to be considered.
    """
    # One snapshot of "now" so that all windows are aligned.
    now = pd.Timestamp.today()
    subject_list = list(filter(None, merged_df_edits["subject"].unique()))
    rate_rows = []

    for subject in subject_list:
        subject_edits = merged_df_edits[merged_df_edits["subject"] == subject]
        between_df = pd.DataFrame()

        # month 0 is the current window, month 1 the one shifted by 30 days.
        for month in range(0, 2):
            lower_date = now - pd.Timedelta(days=days + month * 30)
            upper_date = now - pd.Timedelta(days=month * 30)

            in_window = subject_edits[
                (lower_date < subject_edits["date"])
                & (subject_edits["date"] < upper_date)
            ]
            per_author = in_window.groupby(
                by=["actor_id", "username", "subject"], as_index=False
            ).count()
            # Delete all authors under baseline; copy so that the new column
            # is not written to a slice of ``per_author``.
            considered = per_author[per_author["uuid"] >= baseline].copy()
            considered["isActive"] = (considered["uuid"] >= edits).astype(int)

            if between_df.empty:
                between_df = considered
            else:
                # The merge suffixes the flags: isActive_x is the current
                # window, isActive_y the earlier one.
                between_df = pd.merge(
                    between_df, considered[["actor_id", "isActive"]], on=["actor_id"]
                )

        # Both flags only exist when the two windows could be merged.
        if (
            between_df.empty
            or "isActive_x" not in between_df.columns
            or "isActive_y" not in between_df.columns
        ):
            continue

        change = between_df["isActive_x"] - between_df["isActive_y"]
        activation_rate = change[change > 0].sum() / len(between_df)
        loss_rate = change[change < 0].sum() / len(between_df)
        rate_rows.append([subject, round(activation_rate, 2), round(loss_rate, 2)])

    # Build the frame once instead of concatenating onto an empty frame.
    rates_df = pd.DataFrame(
        rate_rows, columns=["subject", "activation_rate", "loss_rate"]
    )

    # Define a less intense color map

    def get_color(val):
        color_map = {
            0.5: "#2DAE58",
            0.4: "#3DC067",
            0.3: "#4ED276",
            0.15: "#8EE5A7",
            0.00001: "#BFF1CE",
            -0.0: "#F2FAF4",
            -0.15: "#F9E2D4",
            -0.3: "#FAAE8D",
            -0.4: "#F48767",
            -0.5: "#ED6A49",
            -2: "#eac1af",
        }

        # Thresholds are listed in descending order; zero stays uncoloured.
        for threshold, color in color_map.items():
            if val >= threshold and val != 0:
                return f"background-color: {color};"
        return ""

    # Apply the coloring function to the dataframe. Styler.applymap was
    # renamed to Styler.map in pandas 2.1; use the new name when available.
    styler = rates_df.style
    style_cells = getattr(styler, "map", None) or styler.applymap
    styled_table = style_cells(
        get_color, subset=["activation_rate", "loss_rate"]
    ).to_html()

    # Display the styled table with the new color map
    display(HTML(styled_table))

In [None]:
# Activation rate across all subjects


def calc_act_rates_wo_subj(days, edits, baseline):
    """Compute the activation and loss rate over all subjects.

    Two windows of ``days`` days are compared: one ending now and one ending
    30 days ago. Authors with at least ``baseline`` edits in both windows are
    considered; an author is active in a window when they have at least
    ``edits`` edits there.

    Args:
        days: Length of each window in days.
        edits: Minimum number of edits to count as active.
        baseline: Minimum number of edits for an author to be considered.

    Returns:
        A tuple ``(activation_rate, loss_rate)``: the share of considered
        authors that became active and the share that stopped being active
        (a negative number). ``(0, 0)`` is returned when no author is present
        in both windows; that case used to fail on unbound variables or a
        missing column.
    """
    # One snapshot of "now" so that both windows are aligned.
    now = pd.Timestamp.today()
    result_df = pd.DataFrame()

    # month 0 is the current window, month 1 the one shifted by 30 days.
    for month in range(0, 2):
        lower_date = now - pd.Timedelta(days=days + month * 30)
        upper_date = now - pd.Timedelta(days=month * 30)

        in_window = merged_df_edits[
            (lower_date < merged_df_edits["date"])
            & (merged_df_edits["date"] < upper_date)
        ]
        per_author = in_window.groupby(
            by=["actor_id", "username"], as_index=False
        ).count()
        # Delete all authors under baseline; copy so that the new column is
        # not written to a slice of ``per_author``.
        considered = per_author[per_author["uuid"] >= baseline].copy()
        considered["isActive"] = (considered["uuid"] >= edits).astype(int)

        if result_df.empty:
            result_df = considered
        else:
            # The merge suffixes the flags: isActive_x is the current window,
            # isActive_y the earlier one.
            result_df = pd.merge(
                result_df, considered[["actor_id", "isActive"]], on=["actor_id"]
            )

    # Both flags only exist when the two windows could be merged.
    if (
        result_df.empty
        or "isActive_x" not in result_df.columns
        or "isActive_y" not in result_df.columns
    ):
        return 0, 0

    change = result_df["isActive_x"] - result_df["isActive_y"]
    activation_rate = change[change > 0].sum() / len(result_df)
    loss_rate = change[change < 0].sum() / len(result_df)

    return activation_rate, loss_rate

In [None]:
def show_active_authors(
    active_authors_data, new_authors=list(get_list_of_new_authors()["username"])
):
    """Plot a horizontal bar chart of edit counts per author.

    Bars are red for the pseudo-author "others", turquoise for authors whose
    first row in the module-level ``merged_df_edits`` has interest "teacher",
    and blue otherwise (including authors that do not occur in
    ``merged_df_edits``). Bars of authors contained in ``new_authors`` get a
    red "x" drawn on them, and every bar is labelled with its edit count.

    Args:
        active_authors_data: DataFrame with an "edits" column and either a
            "username" column or the usernames as index. Nothing is drawn
            when it is empty. The frame passed in is not modified.
        new_authors: Usernames to mark as new. The default is evaluated once,
            when this function is defined, so it does not pick up later
            changes to the underlying data.
    """
    if len(active_authors_data) == 0:
        return
    # Ensure the data has a 'username' column without touching the caller's frame
    if "username" not in active_authors_data.columns:
        active_authors_data = active_authors_data.assign(
            username=active_authors_data.index
        )

    usernames = list(active_authors_data["username"])
    new_author_names = set(new_authors)
    # First recorded interest per author, built once instead of filtering the
    # whole edit table for every bar.
    interest_by_user = (
        merged_df_edits[["username", "interest"]]
        .drop_duplicates(subset="username")
        .set_index("username")["interest"]
        .to_dict()
    )

    soft_blue = "#1f77b4"
    soft_red = "#d62728"
    soft_turquoise = "#40E0D0"

    fig_height = len(active_authors_data) * 0.6
    fig, ax = plt.subplots(figsize=(15, fig_height))
    fig.set_facecolor("#ECF2FF")
    ax.set_facecolor("#ECF2FF")

    colors = []
    for username in usernames:
        if username == "others":
            colors.append(soft_red)
        elif interest_by_user.get(username) == "teacher":
            colors.append(soft_turquoise)
        else:
            colors.append(soft_blue)

    bars = sns.barplot(
        y=active_authors_data["username"],
        x=active_authors_data["edits"],
        ax=ax,
        orient="h",
        palette=colors,
    )

    # Highlight new authors with a red 'x'
    for bar in bars.patches:
        y = bar.get_y() + bar.get_height() / 2
        if not np.isfinite(y):
            continue
        # The bar centre sits on the category position; round instead of
        # truncating so float error cannot select the neighbouring author.
        position = int(round(y))
        if not 0 <= position < len(usernames):
            continue
        if usernames[position] in new_author_names:
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                y,
                "x",
                color="red",
                ha="center",
                va="center",
                weight="bold",
                fontsize=12,
            )

    # Display the number of edits next to the bars
    for index, value in enumerate(active_authors_data["edits"]):
        ax.text(value, index, " " + str(value), color="black", va="center")

    # Set title and labels
    ax.set_title("Active Authors Currently")
    ax.set_xlabel("Number of Edits")
    ax.set_ylabel("Username")

    # Create a custom legend
    legend_labels = ["teacher", "new authors"]
    legend_colors = [soft_turquoise, "red"]
    legend_hatches = ["", "x"]
    handles = [
        plt.Rectangle((0, 0), 1, 1, color=color, hatch=hatch)
        for color, hatch in zip(legend_colors, legend_hatches)
    ]
    ax.legend(handles, legend_labels)

    # Remove the borders for a cleaner look
    sns.despine(left=True, bottom=True)

    plt.show()

In [None]:
def show(subject=None, lower=10, time_interval=90):
    """Render the author-activity report, optionally restricted to a subject.

    The report is built from the module-level ``merged_df_edits`` and shows,
    in order: a line plot of the number of authors with at least ``lower``
    edits in a trailing window of ``time_interval`` rows of the per-day edit
    table, the current number and bar chart of such authors, the authors who
    reached ``lower`` edits ``time_interval`` rows earlier but no longer do,
    and (only when ``subject`` is given) the authors whose cumulative edit
    count was zero ``time_interval`` rows earlier and is positive now.

    Args:
        subject: Value of the "subject" column to filter on; ``None`` uses
            all edits. Nothing is shown when the subject has no edits.
        lower: Minimum number of edits in the window to count as active.
        time_interval: Window length, in rows of the per-day edit table.

    When the history is too short for a full window, or for the comparison
    with the earlier snapshot, a short notice is displayed and the sections
    that cannot be computed are skipped.
    """
    df = merged_df_edits

    if subject:
        df = df[df["subject"] == subject]
        if len(df) == 0:
            return

    edits_per_day = compute_edits_per_day_per_user(df)
    # Edits per author in the trailing window; rows without a complete
    # window are NaN and get dropped.
    rolling_edits = (
        edits_per_day.rolling(time_interval, min_periods=time_interval).sum().dropna()
    )
    if rolling_edits.empty:
        display(
            Markdown(
                f"Nicht genügend Daten für ein Zeitfenster von {time_interval} Tagen."
            )
        )
        return

    display(Markdown(f"### Plot Anzahl Autor:innen mit Edits >= {lower}"))

    fig, ax = plt.subplots(figsize=(15, 6))
    sns.lineplot(
        data=(rolling_edits >= lower).sum(axis=1),
        ax=ax,
        color="#1f77b4",
    )
    ax.set_title(
        f"Number of Authors with Edits >= {lower} over the past {time_interval} days"
    )
    ax.set_xlabel("Date")
    ax.set_ylabel("Number of Authors")
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
    ax.set_facecolor("#ECF2FF")
    fig.set_facecolor("#ECF2FF")
    sns.despine(left=True, bottom=True)

    plt.show()

    current_edits = rolling_edits.iloc[-1]
    count = (current_edits >= lower).sum()

    display(Markdown(f"Anzahl Autor:innen mit Edits >= {lower}: {count}"))

    display(Markdown(f"### Autor:innen mit aktuellen Edits >= {lower}"))
    active = (
        current_edits[current_edits >= lower]
        .to_frame("edits")
        .sort_values("edits", ascending=False)
    )
    show_active_authors(active)

    # The earlier snapshot lies `time_interval` rows before the last one; it
    # only exists when more than `time_interval` complete windows remain.
    if len(rolling_edits) <= time_interval:
        display(
            Markdown(
                f"Nicht genügend Daten für einen Vergleich mit dem Stand vor {time_interval} Tagen."
            )
        )
        return
    previous_edits = rolling_edits.iloc[-1 - time_interval]

    display(
        Markdown(
            f"### Verlorene Autor:innen mit aktuellen Edits < {lower} und vorher Edits >= {lower}"
        )
    )
    lost = (
        current_edits[(current_edits < lower) & (previous_edits >= lower)]
        .to_frame("edits")
        .sort_values("edits", ascending=False)
    )
    display(lost)

    if subject:
        display(
            Markdown(
                f"### Neue Autor:innen (Personen, die in den letzten {time_interval} Tagen dazugekommen sind)"
            )
        )
        cumulative = edits_per_day.cumsum()
        total_now = cumulative.loc[rolling_edits.index[-1]]
        total_before = cumulative.loc[rolling_edits.index[-1 - time_interval]]
        new = (
            total_now[(total_now > 0) & (total_before == 0)]
            .to_frame("edits")
            .sort_values("edits", ascending=False)
        )
        display(new)


def compute_edits_per_day_per_user(df, since=4 * 365.25):
    """Count edits per calendar day and user.

    Args:
        df: Edit table with a datetime "date" column, a "username" column and
            an "actor_id" column (non-null ``actor_id`` values are counted).
        since: Only rows newer than ``since`` days before the latest date in
            ``df`` are used.

    Returns:
        DataFrame with one row per day (index named "date") covering every
        day from the first to the last remaining edit without gaps, one
        float column per username in order of first appearance, and ``0``
        where a user made no edit. An empty frame is returned when no rows
        remain.
    """
    # NOTE: the window is anchored at the latest edit in `df`, not at today.
    current_date = df["date"].max()
    recent = df[df["date"] > current_date - pd.Timedelta(days=since)]
    if recent.empty:
        return pd.DataFrame(index=pd.DatetimeIndex([], name="date"), dtype=float)

    usernames = recent["username"].dropna().unique()
    counts = (
        recent.groupby([recent["date"].dt.floor("D"), "username"])["actor_id"]
        .count()
        .unstack("username", fill_value=0)
    )
    # Days on which nobody edited must still be rows: callers use row-count
    # based rolling windows and positional look-backs on this frame.
    all_days = pd.date_range(
        counts.index.min(), counts.index.max(), freq="D", name="date"
    )
    counts = counts.reindex(index=all_days, columns=usernames, fill_value=0)
    counts.columns.name = None

    return counts.astype(float)

In [None]:
def show_no_authors():
    """Display, per subject, the number of authors now and one year ago.

    For every subject in the module-level ``merged_df_edits`` the table shows
    the number of distinct usernames with an edit in the last 90 days
    ("current no authors"), the number with an edit between 455 and 365 days
    ago ("last year no authors"), and the relative change in percent. Rows
    are sorted by the current number of authors, descending. The percentage
    is coloured black for "0%", red for a decrease and green for an increase;
    an increase from zero stays ``inf`` (green) and 0 -> 0 stays ``NaN``
    (uncoloured).
    """
    # Take "now" once so every subject is measured against the same instant.
    today = pd.Timestamp.today()
    current_start = today - pd.Timedelta("90 days")
    last_year_start = today - pd.Timedelta("455 days")
    last_year_end = today - pd.Timedelta("365 days")

    def current_no_authors(d):
        return d[d["date"] > current_start]["username"].nunique()

    def last_year_no_authors(d):
        in_window = (d["date"] < last_year_end) & (d["date"] > last_year_start)
        return d[in_window]["username"].nunique()

    no_authors_data = merged_df_edits.groupby("subject").apply(
        lambda d: pd.Series(
            {
                "current no authors": current_no_authors(d),
                "last year no authors": last_year_no_authors(d),
            }
        )
    )

    percentage = (
        (
            no_authors_data["current no authors"]
            - no_authors_data["last year no authors"]
        )
        / no_authors_data["last year no authors"]
    ) * 100

    def format_percentage(x):
        # Only finite values can be written as a percentage; inf and NaN
        # are passed through unchanged.
        if pd.notna(x) and np.isfinite(x):
            return str(int(x)) + "%"
        return x

    # Round first, then format, so e.g. 12.7 becomes "13%" rather than "12%".
    no_authors_data["percentage difference"] = percentage.round().apply(
        format_percentage
    )

    no_authors_data = no_authors_data.sort_values(
        by="current no authors", ascending=False
    )

    def color_percentage(val):
        if isinstance(val, str):
            if val == "0%":
                return "color: black"
            if val.startswith("-"):
                return "color: red"
            if val.replace("%", "").isdigit():
                return "color: green"
            return ""
        # Non-string cells are the unformatted floats (inf / NaN).
        if val == np.inf:
            return "color: green"
        if val == -np.inf:
            return "color: red"
        return ""

    styled_table_sorted = no_authors_data.style.applymap(
        color_percentage, subset=["percentage difference"]
    ).to_html()

    display(HTML(styled_table_sorted))

In [None]:
def get_color(rate):
    """Return the display colour for a rate.

    Positive rates map to "green", negative rates to "red", and everything
    else (zero, or a value that compares neither above nor below zero) to
    "grey".
    """
    if rate > 0:
        return "green"
    return "red" if rate < 0 else "grey"