In [24]:
from datetime import date, datetime
import pandas as pd
import mysql.connector
pd.options.mode.chained_assignment = None

from IPython.display import display, Markdown, HTML

In [2]:
db = mysql.connector.connect(
    host="localhost",
    user="root",
    password="secret",
    port="3306",
    database="serlo",
    charset="latin1"
)

def cached(func):
    cache = dict()
    
    def return_func(arg):
        if (arg in cache):
            return cache[arg]
        else:
            result = func(arg)
            cache[arg] = result
            return result
    
    return return_func

def query(sql):
    c = db.cursor()
    c.execute(sql)
    
    return c.fetchall()

def querySingleton(sql):
    return [ x[0] for x in query(sql) ]

@cached
def getParent(termId):
    return querySingleton("""
        select parent_id from term_taxonomy where id = %s;
    """ % termId)[0]

def getTermName(termId):
    return querySingleton("""
        select term.name from term_taxonomy
        join term on term.id = term_taxonomy.term_id
        where term_taxonomy.id = %s;
    """ % termId)[0]

@cached
def getSubject(termId):
    if int(termId) in [79733, 81317, 20852, 87814, 87827, 85477, 87860, 75049, 76750, 87496, 75678, 91252, 91253]:
        return "Prüfungsbereich Mathematik"
    if int(termId) in [106082]:
        return getTermName(termId)
    
    parent = getParent(termId)
    grandparent = getParent(parent)
    
    if (parent == 106081):
        return getTermName(termId)
    
    return getSubject(parent) if grandparent != None else getTermName(termId)

@cached
def getSubjectFromUuid(uuid):
    taxonomyTerms = querySingleton(f"""
        select term_taxonomy_id from term_taxonomy_entity
        where term_taxonomy_entity.entity_id  = {uuid};
    """)
    
    if len(taxonomyTerms) > 0:
        return getSubject(taxonomyTerms[0])

    parents = querySingleton(f"""
        select parent_id from entity_link
        where entity_link.child_id  = {uuid};
    """)
    
    if len(parents) > 0:
        return getSubjectFromUuid(parents[0])
    
    return None


In [3]:
import json

json.dumps(querySingleton("""
select distinct(entity_link.parent_id ) from event_log join entity_link on entity_link.child_id = event_log.uuid_id where event_log.event_id = 4 and event_log.date > Date("2020-02-01");
"""));

In [4]:
def read_event_log():
    df = pd.read_sql("""
        select event_log.id, event_log.actor_id, event_log.date, user.username, event_parameter_uuid.uuid_id from event_log
        join user on user.id = event_log.actor_id
        join event_parameter on event_parameter.log_id = event_log.id
        join event_parameter_uuid on event_parameter_uuid.event_parameter_id = event_parameter.id
        where event_log.event_id = 5
        and year(event_log.date) > 2018
        and user.username != "Legacy"
    """, db)
    df.set_index("id", inplace=True)
    df.rename(columns={"uuid_id": "uuid"}, inplace=True)
    df["subject"] = df["uuid"].map(getSubjectFromUuid)
    return df

event_log = read_event_log()
event_log.head(100)

Unnamed: 0_level_0,actor_id,date,username,uuid,subject
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
252170,95850,2019-01-02 10:35:03,Leo1,127338,Chemie
252172,95850,2019-01-02 10:58:42,Leo1,127338,Chemie
252180,95849,2019-01-02 11:46:55,hernlmax,63496,Chemie
252185,95849,2019-01-02 11:51:49,hernlmax,127428,Chemie
252187,95849,2019-01-02 11:52:47,hernlmax,127428,Chemie
...,...,...,...,...,...
252503,121732,2019-01-04 16:48:53,Jonathan,127576,Mathe
252505,121732,2019-01-04 16:49:01,Jonathan,127602,Mathe
252510,95854,2019-01-04 16:50:43,markus_janker,127600,Chemie
252512,95854,2019-01-04 16:57:43,markus_janker,127600,Chemie


In [5]:
user_df = pd.read_sql("""
        select id, username,date, logins from user where date > '2022-01-01';
    """, db)

In [6]:
user_df.head()

Unnamed: 0,id,username,date,logins
0,234298,johnkiz12,2022-01-01 09:39:09,1
1,234306,ggacor77,2022-01-01 18:37:06,1
2,234309,blushinxmia,2022-01-01 19:21:36,0
3,234312,Amy_DuhCuteOne,2022-01-01 20:37:11,0
4,234364,fun88sktlive,2022-01-03 04:49:53,1


# 1. Wie viele User* Innen sind in 2022 dazu gekommen?

In [7]:
new_user_list = list(user_df.id.unique())

this_year_df = event_log['2021-12-31' < event_log['date']] 
editing_users_list = list(this_year_df.actor_id)
new_editing_users_list = set(editing_users_list) & set(new_user_list)

display(Markdown(f"Neue User*Innen in 2022: {len(new_user_list)}"))
display(Markdown(f"User*Innen mit mindestens einer Bearbeitung: {len(new_editing_users_list)}"))

Neue User*Innen in 2022: 2122

User*Innen mit mindestens einer Bearbeitung: 69

## 1.1 Darunter wie viele aktive Autor* Innen?

In [8]:
days = 90
edits = 10
result_df = pd.DataFrame()
week = 0
  
lower_date = pd.Timestamp.today() - pd.Timedelta(days = days + week*7)
upper_date = pd.Timestamp.today() - pd.Timedelta(days = week*7)
 
df2 = event_log[lower_date < event_log['date']] 

df3 = df2.groupby(by = ['actor_id', 'username'], as_index = False).count()
#Delete all authors under baseline
df4 = df3
df4['isActive'] = df4['uuid'].apply(lambda x: 1 if x >= edits else 0)

active_users_df = df4[df4['isActive'] == 1]

active_authors_list = list(active_users_df.actor_id)


new_active_authors_list = set(active_authors_list) & set(new_user_list)
display(Markdown(f"Neue Aktive Autor*Innen: {len(new_active_authors_list)}"))

Neue Aktive Autor*Innen: 10

## 1.2 Darunter wie viele sehr aktive Autor* Innen?

In [9]:
df4['isVeryActive'] = df4['uuid'].apply(lambda x: 1 if x >= 100 else 0)

very_active_users_df = df4[df4['isVeryActive'] == 1]

very_active_authors_list = list(very_active_users_df.actor_id)

new_very_active_authors_list = set(very_active_authors_list).intersection(new_user_list)
display(Markdown(f"Sehr aktive Autor*Innen: {len(new_very_active_authors_list)}"))

Sehr aktive Autor*Innen: 1

In [10]:
very_active_users_df[very_active_users_df['actor_id'].isin(new_very_active_authors_list)]

Unnamed: 0,actor_id,username,date,uuid,subject,isActive,isVeryActive
35,240298,Felix_Eccardt,2072,2072,2053,1,1


# 2. Wie viele neue Inhalte in 2022?

In [11]:
entity_create_df = pd.read_sql("""
        select * from event_log where event_id = 4 and date > '2022-01-01'
    """, db)

In [12]:
display(Markdown(f"Anzahl neuer Inhalte in 2022: {len(entity_create_df)}"))

Anzahl neuer Inhalte in 2022: 5079

# 3. Wie viele Bearbeitungen in 2022?

In [13]:
len(this_year_df)

17289

# 4. Liste der 10 aktivsten Autor* Innen und Zahl deren Bearbeitungen

In [14]:
active_allyear_df = this_year_df.groupby(by = ['actor_id', 'username'], as_index = False).count()
top_ten_df = active_allyear_df.nlargest(10, 'uuid')
top_ten_df = top_ten_df.drop(['date', 'subject'], axis=1)
top_ten_df = top_ten_df.rename(columns={"uuid": "edits"})

In [15]:
top_ten_df.reset_index()

Unnamed: 0,index,actor_id,username,edits
0,79,240298,Felix_Eccardt,4939
1,11,64900,Kowalsky,1934
2,38,148701,Sascha_Lill_95,1145
3,16,87602,Karin,1143
4,41,163773,LinaMaria,1143
5,44,169563,Peter,667
6,73,235184,korbi_d,545
7,51,197401,Annika-Hemlein,499
8,32,146667,BerndF,470
9,14,73435,kathongi,461
