In [1]:
import functools
import os

import matplotlib.pyplot as plt
import mysql.connector
import pandas as pd

# Connection settings are read from the environment so that real credentials
# never have to be saved in the notebook. Every fallback reproduces the
# previous hard-coded local setup, so an unconfigured run behaves as before.
db = mysql.connector.connect(
    host=os.environ.get("SERLO_DB_HOST", "localhost"),
    user=os.environ.get("SERLO_DB_USER", "root"),
    password=os.environ.get("SERLO_DB_PASSWORD", "secret"),
    port=os.environ.get("SERLO_DB_PORT", "3306"),
    database=os.environ.get("SERLO_DB_NAME", "serlo"),
    # NOTE(review): text is read as latin1 and fix_encoding() re-decodes it as
    # UTF-8 afterwards; presumably the stored data is double-encoded - confirm.
    charset="latin1"
)

# Show every row when a frame is displayed (no truncation).
pd.set_option('display.max_rows', None)



In [2]:
from IPython.display import display, Markdown

def fix_encoding(title):
    """Repair a string whose UTF-8 bytes were decoded as latin1.

    The text is converted back to bytes with the latin1 codec and those bytes
    are then decoded as UTF-8.
    """
    raw = bytes(title, "latin1")
    return str(raw, "utf8")

def m(text):
    """Show ``text`` as rendered Markdown in the cell output."""
    rendered = Markdown(text)
    display(rendered)

In [3]:
def cached(func):
    """Memoize a one-argument function.

    Results are kept in a dict private to the decorated function, keyed by the
    argument, so ``func`` runs at most once per distinct (hashable) argument.
    ``None`` results are cached like any other value.

    The wrapper keeps the decorated function's ``__name__`` and ``__doc__``
    (via ``functools.wraps``) so tracebacks and ``help()`` stay readable.
    """
    cache = dict()

    @functools.wraps(func)
    def return_func(arg):
        if arg not in cache:
            cache[arg] = func(arg)
        return cache[arg]

    return return_func

def query(sql):
    """Execute ``sql`` on the shared ``db`` connection and return all rows.

    Returns whatever ``cursor.fetchall()`` yields for the statement. The
    cursor is always closed afterwards, including when execution or fetching
    raises, so cursors do not pile up on the connection.
    """
    c = db.cursor()
    try:
        c.execute(sql)
        return c.fetchall()
    finally:
        c.close()

def querySingleton(sql):
    """Run ``sql`` and return the first column of each result row as a list."""
    first_column = []
    for row in query(sql):
        first_column.append(row[0])
    return first_column

@cached
def getParent(termId):
    """Return ``parent_id`` of the ``term_taxonomy`` row with id ``termId``.

    Raises ``IndexError`` when no such row exists, because the first element
    of the (then empty) result list is taken.
    """
    sql = """
        select parent_id from term_taxonomy where id = %s;
    """ % termId
    parent_ids = querySingleton(sql)
    return parent_ids[0]

def getTermName(termId):
    """Return the name of the term behind taxonomy entry ``termId``.

    The name is read from ``term`` (joined through ``term_taxonomy``) and
    passed through ``fix_encoding``. Raises ``IndexError`` when the query
    returns no row.
    """
    sql = """
        select term.name from term_taxonomy
        join term on term.id = term_taxonomy.term_id
        where term_taxonomy.id = %s;
    """ % termId
    names = querySingleton(sql)
    return fix_encoding(names[0])

@cached
def getSubject(termId):
    """Return the name of the subject that taxonomy term ``termId`` belongs to.

    Rules, checked in this order:

    * term 106082 is its own subject;
    * a term whose parent is 106081 is its own subject;
    * a term without a parent has no subject (``None``);
    * a term whose parent has no parent is its own subject;
    * otherwise the subject is that of the parent term (recursive).

    NOTE(review): 106081 and 106082 are hard-coded ``term_taxonomy`` ids whose
    meaning is not visible here - confirm and consider naming them.
    """
    # Disabled override, kept for reference:
    #if int(termId) in [79733, 81317, 20852, 87814, 87827, 85477, 87860, 75049, 76750, 87496, 75678, 91252, 91253]:
    #    return "Prüfungsbereich Mathematik"
    if int(termId) == 106082:
        return getTermName(termId)

    parent = getParent(termId)

    if parent == 106081:
        return getTermName(termId)

    if parent is None:
        return None

    grandparent = getParent(parent)

    if grandparent is None:
        # The parent is a root term, so this term itself is the subject.
        return getTermName(termId)

    return getSubject(parent)

@cached
def getSubjectFromUuid(uuid):
    """Resolve the subject name for an arbitrary uuid, or ``None``.

    The uuid is tried against several tables in turn and the first match wins:

    1. ``term_taxonomy``: the uuid is a taxonomy term itself;
    2. ``term_taxonomy_entity``: the uuid is an entity attached to a term;
    3. ``entity_link``: continue with the linked parent;
    4. ``comment``: continue with the parent comment if there is one,
       otherwise with the uuid the comment refers to;
    5. ``entity_revision``: continue with the revision's repository.
    """
    own_terms = querySingleton(f"""
        select term_taxonomy.id from term_taxonomy
        where term_taxonomy.id  = {uuid};
    """)
    if own_terms:
        return getSubject(own_terms[0])

    attached_terms = querySingleton(f"""
        select term_taxonomy_id from term_taxonomy_entity
        where term_taxonomy_entity.entity_id  = {uuid};
    """)
    if attached_terms:
        return getSubject(attached_terms[0])

    link_parents = querySingleton(f"""
        select parent_id from entity_link
        where entity_link.child_id  = {uuid};
    """)
    if link_parents:
        return getSubjectFromUuid(link_parents[0])

    comment_rows = query(f"""
        select parent_id, uuid_id from comment
        where comment.id  = {uuid};
    """)
    if comment_rows:
        parent_id, uuid_id = comment_rows[0]
        # Follow the parent comment when present, else the commented object.
        return getSubjectFromUuid(parent_id if parent_id else uuid_id)

    repositories = querySingleton(f"""
        select repository_id from entity_revision
        where entity_revision.id  = {uuid};
    """)
    if repositories:
        return getSubjectFromUuid(repositories[0])

    return None

#for uuid in [3, 82047, 127338, 63496, 1, 170741, 167497, 93379, 93387]:
#    display(getSubjectFromUuid(uuid))

In [4]:
@cached
def getEntityTypeFromUuid(uuid):
    """Return the entity type name for ``uuid``, or ``None``.

    The uuid is first looked up as an entity id; if that finds nothing it is
    looked up as an entity revision id and the type of the revision's
    repository entity is returned.
    """
    as_entity = querySingleton(f"""
        select type.name
        from type
        join entity on entity.type_id = type.id
        where entity.id = {uuid}
    """)
    if as_entity:
        return as_entity[0]

    as_revision = querySingleton(f"""
        select type.name
        from type
        join entity on entity.type_id = type.id
        join entity_revision on entity.id = entity_revision.repository_id
        where entity_revision.id = {uuid}
    """)
    return as_revision[0] if as_revision else None

In [5]:
@cached
def getInstanceFromUuid(uuid):
    """Return the instance subdomain that ``uuid`` belongs to, or ``None``.

    The uuid is tried as a taxonomy term, then as an entity, then as a
    comment. If none of these match, it is tried as an entity revision and the
    lookup continues with the revision's repository.
    """
    via_term = querySingleton(f"""
        select instance.subdomain
        from term_taxonomy
        join term on term_taxonomy.term_id = term.id
        join instance on instance.id = term.instance_id
        where term_taxonomy.id = {uuid}
    """)
    if via_term:
        return via_term[0]

    via_entity = querySingleton(f"""
        select instance.subdomain
        from entity
        join instance on instance.id = entity.instance_id
        where entity.id = {uuid};
    """)
    if via_entity:
        return via_entity[0]

    via_comment = querySingleton(f"""
        select instance.subdomain
        from comment
        join instance on instance.id = comment.instance_id
        where comment.id = {uuid};
    """)
    if via_comment:
        return via_comment[0]

    repositories = querySingleton(f"""
        select repository_id from entity_revision
        where entity_revision.id  = {uuid};
    """)
    if repositories:
        return getInstanceFromUuid(repositories[0])

    return None

#for uuid in [82047, 127338, 63496, 1, 170741, 167497, 93379, 93387]:
#    display(getInstanceFromUuid(uuid))

In [9]:
def event_category(row):
    """Map an ``(event_id, uuid_type)`` pair to an activity category.

    ``row`` may be any iterable whose first two items are the event id and the
    uuid discriminator (tuple, list or a pandas Series from
    ``DataFrame.apply(..., axis=1)``). The items are taken by iteration order,
    which avoids integer-key access on a string-labelled Series (deprecated in
    pandas 2.x).

    Returns the category name, or ``None`` for an event id no rule covers.
    """
    # mysql> select * from event;
    # +----+-----------------------------+-------------+
    # | id | name                        | description |
    # +----+-----------------------------+-------------+
    # |  1 | taxonomy/term/associate     | NULL        |
    # |  2 | taxonomy/term/create        | NULL        |
    # |  3 | license/object/set          | NULL        |
    # |  4 | entity/create               | NULL        |
    # |  5 | entity/revision/add         | NULL        |
    # |  6 | entity/revision/checkout    | NULL        |
    # |  7 | entity/link/create          | NULL        |
    # |  8 | discussion/create           | NULL        |
    # |  9 | discussion/comment/create   | NULL        |
    # | 10 | uuid/trash                  | NULL        |
    # | 11 | entity/revision/reject      | NULL        |
    # | 12 | taxonomy/term/update        | NULL        |
    # | 13 | uuid/restore                | NULL        |
    # | 14 | discussion/comment/archive  | NULL        |
    # | 15 | taxonomy/term/parent/change | NULL        |
    # | 16 | discussion/restore          | NULL        |
    # | 17 | taxonomy/term/dissociate    | NULL        |
    # | 18 | entity/link/remove          | NULL        |
    # +----+-----------------------------+-------------+

    values = list(row)
    event_id, uuid_type = values[0], values[1]

    # Trashing or restoring a comment counts as moderation; the same events on
    # any other uuid type fall through to "admin" below.
    if event_id in (10, 13) and uuid_type == "comment":
        return "moderation"

    # Ordered rules: the first group containing the event id wins.
    rules = (
        ((1, 2, 7, 12, 15, 17, 18), "taxonomy-architekt"),
        ((4, 5), "edit"),
        ((11, 6), "review"),
        ((14, 16), "moderation"),
        ((8, 9), "commenting"),
        ((10, 13, 3), "admin"),
    )
    for event_ids, category in rules:
        if event_id in event_ids:
            return category

    return None
    
def delete_consecutive_events(df):
    """Drop events that follow the same user's previous event within 3 seconds.

    ``df`` needs the columns ``user_id`` and ``date`` (datetime). It is sorted
    by user and date and modified **in place**; the same object is returned
    for convenience. Rows are removed by index label.

    The time gap is computed per user. A frame-wide ``diff()`` would compare
    the first event of one user with the last event of the previous user in
    the sort order and could wrongly drop it.
    """
    if df.empty:
        return df

    df.sort_values(["user_id", "date"], inplace=True)

    # Seconds since the previous event of the same user; NaN for a user's
    # first event, which therefore is never dropped.
    gap_seconds = df.groupby("user_id")["date"].diff().dt.total_seconds()

    is_repeat = (gap_seconds >= 0) & (gap_seconds < 3)
    df.drop(df[is_repeat].index, inplace=True)

    return df

In [19]:
def load_event_log():
    """Load entity create/revision events since 2018 as an annotated frame.

    Reads the event log (event ids 4 and 5) joined with user, event and uuid
    data, indexes it by ``event_log_id``, drops rapid repeats via
    ``delete_consecutive_events`` and restores the ``event_log_id`` order.
    It then adds the derived columns ``subject``, ``instance``,
    ``entity_type`` and ``event_category``.
    """
    events = pd.read_sql("""
        select
            event_log.id as event_log_id,
            event_log.date,
            user.id as user_id,
            event.id as event_id,
            event.name as event_name,
            user.username,
            event_log.uuid_id as uuid,
            uuid.trashed as uuid_trashed,
            uuid.discriminator as uuid_type
        from event_log
        join user on user.id = event_log.actor_id
        join event on event.id = event_log.event_id
        join uuid on event_log.uuid_id = uuid.id
        where year(event_log.date) >= 2018
            and event_log.event_id in (4,5)
        order by event_log.id
    """, db)
    events = events.set_index("event_log_id")

    # Works in place and leaves the frame sorted by user and date.
    delete_consecutive_events(events)
    events = events.sort_values(["event_log_id"])

    uuids = events["uuid"]
    events["subject"] = uuids.map(getSubjectFromUuid)
    events["instance"] = uuids.map(getInstanceFromUuid)
    events["entity_type"] = uuids.map(getEntityTypeFromUuid)

    category_inputs = events[["event_id", "uuid_type"]]
    events["event_category"] = category_inputs.apply(event_category, axis=1)

    return events

In [20]:
# Build the annotated event log; this queries the database and resolves
# subject, instance and entity type for every uuid in the result.
df = load_event_log()

In [21]:
# Calendar year of each event, used to group the per-year summary.
df["year"] = df["date"].map(lambda x: x.year)

In [22]:
# NOTE(review): this preview includes the `username` column; clear or redact
# the output before sharing the notebook if usernames count as personal data.
df.head()

Unnamed: 0_level_0,date,user_id,event_id,event_name,username,uuid,uuid_trashed,uuid_type,subject,instance,entity_type,event_category,year
event_log_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
193360,2018-01-01 15:36:17,62384,5,entity/revision/add,Rebi,93379,0,entityRevision,Mathe,de,text-solution,edit,2018
193362,2018-01-01 15:38:00,62384,5,entity/revision/add,Rebi,93380,0,entityRevision,Mathe,de,text-solution,edit,2018
193367,2018-01-02 10:41:16,70443,5,entity/revision/add,sansmile,93381,0,entityRevision,Biologie,de,article,edit,2018
193371,2018-01-02 10:53:29,10,5,entity/revision/add,Simon,93382,0,pageRevision,,,,edit,2018
193373,2018-01-02 12:03:13,10,5,entity/revision/add,Simon,93383,0,entityRevision,Community,de,article,edit,2018


In [25]:
# Per-year report for the subject "Nachhaltigkeit": number of edit events and
# how many of them are entity creations (event id 4).
for year in range(2019, 2023):
    in_scope = (df["year"] == year) & (df["subject"] == "Nachhaltigkeit")

    m(f"# Jahr {year}")

    edits = len(df[in_scope & (df["event_category"] == "edit")])
    m(f"Anzahl Bearbeitungen: {edits}")

    edits = len(df[in_scope & (df["event_id"] == 4)])
    m(f"Davon Neuerstellungen: {edits}")

# Jahr 2019

Anzahl Bearbeitungen: 628

Davon Neuerstellungen: 80

# Jahr 2020

Anzahl Bearbeitungen: 215

Davon Neuerstellungen: 32

# Jahr 2021

Anzahl Bearbeitungen: 447

Davon Neuerstellungen: 89

# Jahr 2022

Anzahl Bearbeitungen: 529

Davon Neuerstellungen: 71