# EventKG+Click dataset



This is a step-by-step walkthrough of the creation of the EventKG+Click dataset, which aims to facilitate the creation and evaluation of multilingual user interaction models and reflects the language-specific relevance of events and their relations. Our dataset EventKG+Click is based on two data sources:
* The Wikipedia clickstream, which reflects real-world user interactions with events and their relations within language-specific Wikipedia editions; and
* The EventKG knowledge graph, which contains semantic information regarding events and their relations that partially originates from Wikipedia. EventKG+Click is available online to enable further analyses and applications.

##  Dataset preparation

Link to clickstream dataset: 
https://dumps.wikimedia.org/other/clickstream/
    
As EventKG+Click and our analysis are based on Wikipedia click behaviour, we only consider those (source, target) click pairs in the clickstream where both the source and target are Wikipedia articles connected by a hyperlink. For our dataset, we adopted the Wikipedia clickstream that covers the period from December 1, 2019, to December 31, 2019, in three language versions: **English**, **German** and **Russian**.

The following example shows how we extract this information from the English version:

In [None]:
import pandas as pd
import json
from SPARQLWrapper import SPARQLWrapper, JSON, POST

# Load the December 2019 English clickstream dump: tab-separated, no header
# row, with malformed lines skipped instead of aborting the read.
English = pd.read_csv(
    "C:/Users/Admin/Downloads/clickstream-enwiki-2019-12.tsv",
    sep="\t",
    header=None,
    error_bad_lines=False,
)
English.columns = ["source", "target", "link_type", "count"]
# Keep only the rows whose link_type is 'link'.
English = English.loc[English["link_type"].eq("link")]


In [135]:
# URL of the SPARQL endpoint that every query in this notebook is sent to.
wds = "http://eventkginterface.l3s.uni-hannover.de/sparql"

# All templates below are filled with str.format(): {0}/{1} are placeholders,
# while the doubled braces {{ }} become the literal braces of the SPARQL query.
# NOTE(review): none of the templates declares its prefixes (owl:, rdf:,
# eventKG-r:, ...); presumably the endpoint predefines them -- confirm.

# Selects the entities that are owl:sameAs the IRI passed as {0} and returns
# each entity IRI from its 45th character on.
# NOTE(review): the offset 45 presumably strips the EventKG resource namespace
# so that only the local entity id remains -- confirm against the endpoint.
entity_mapping_rq='''

SELECT substr(str(?entity), 45)
{{
?entity owl:sameAs {0}
}}
'''

# Sums, as ?cnt, the eventKG-s:mentions values of all relations whose subject
# is entity {0} and whose object is entity {1}.
comention_rq='''
SELECT sum(?a) as ?cnt WHERE
{{
?relation2 rdf:subject eventKG-r:{0} .
?relation2 rdf:object eventKG-r:{1} .
?relation2 eventKG-s:mentions ?a. 
}}
'''


# For event {0}: the English labels (?location) of the dbo:Country resources
# that contain the event's place; the event id is echoed back as ?event.
event_location_rq= '''
SELECT DISTINCT "{0}" AS ?event ?location
WHERE {{
eventKG-r:{0} sem:hasPlace ?locationEventKG.
?locationEventKG so:containedInPlace ?co .
?co rdfs:label ?location .
?co rdf:type dbo:Country .
FILTER (LANG(?location)="en")
}}
'''

# For event {0}: its begin timestamp(s) as ?start; the event id is echoed back
# as ?event.
event_time_rq ='''
SELECT distinct "{0}" as ?event ?start
WHERE
{{
eventKG-r:{0}  sem:hasBeginTimeStamp ?start .
}}
'''

In [12]:
def sparql_request(service, query):
    """Run a SPARQL query and return its result bindings as a DataFrame.

    Args:
        service: URL of the SPARQL endpoint.
        query: Complete SPARQL query string.

    Returns:
        A ``pandas.DataFrame`` with one column per variable listed in the
        response head and one row per result binding. A variable that is
        missing from a binding yields ``None`` in that cell.
    """
    sparql = SPARQLWrapper(service)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()
    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']
    rows = [
        [binding.get(col, {}).get('value') for col in cols]
        for binding in processed_results['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=cols)


# The other cells of this notebook call the helper under this name, which was
# never defined; keep both names pointing at the same function.
get_sparql_dataframe = sparql_request

### Entity_mapping function

In order to leverage the rich information provided in EventKG and to find interlinked Wikipedia pages in different languages, we need to map the titles of Wikipedia pages to EventKG entities. 

*input: label of wikipedia pages*
*output: entity_id in EventKG*

In [None]:
def entity_mapping(label):
    """Map a Wikipedia page title to the id of the matching EventKG entity.

    The title is turned into a ``http://dbpedia.org/resource/`` IRI and looked
    up through ``entity_mapping_rq`` on the ``wds`` endpoint.

    Args:
        label: Wikipedia page title as it appears in the clickstream.

    Returns:
        The first value returned by the query.

    Raises:
        IndexError: If the query returns no row, i.e. no entity was found.
    """
    dbpedia_page = "<http://dbpedia.org/resource/" + label + ">"
    # Use the request helper that is actually defined in this file.
    temp = sparql_request(wds, entity_mapping_rq.format(dbpedia_page))
    return temp.iloc[0, 0]


In order to show how we have collected data using functions, we work on a small dataset which contains all clicked wikipedia 
pages after exploring **"World War II"**. 

In [None]:
# Load the sample click pairs.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"`; switch once the minimum pandas version allows it.
en_data = pd.read_csv("World_War_II_clickstream.txt", sep='\t', error_bad_lines=False)

# Every distinct page title, whether it occurs as source or as target. Each
# title is mapped once so that the merges below cannot duplicate rows.
en_entity = list(pd.concat([en_data["source"], en_data["target"]]).dropna().unique())

# Collect the mappings in a list and build the frame once, instead of growing
# a DataFrame row by row.
mapped_rows = []
for label in en_entity:
    try:
        mapped_rows.append({'label': label, 'ekg_entity': entity_mapping(str(label))})
    except Exception:
        # Best effort: a title without an EventKG entity is reported and skipped.
        print("The corresponding entity doesn't exist on EventKG")
mapped_labels = pd.DataFrame(mapped_rows, columns=['label', 'ekg_entity'])

# Attach the EventKG id of the target page, then of the source page. The
# second merge must start from the result of the first one, otherwise
# target_ekg is lost.
en_mapped = pd.merge(left=en_data, right=mapped_labels, how="left", left_on="target", right_on="label")
en_mapped = en_mapped.rename(columns={"ekg_entity": "target_ekg"})
en_mapped = pd.merge(left=en_mapped, right=mapped_labels, how="left", left_on="source", right_on="label")
en_mapped = en_mapped.rename(columns={"ekg_entity": "source_ekg"})
en_mapped = en_mapped[["source", "target", "source_ekg", "target_ekg", "count"]]

In [62]:
# Load the sample click pairs.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"`; switch once the minimum pandas version allows it.
en_data = pd.read_csv("World_War_II_clickstream", sep='\t', error_bad_lines=False)

# Distinct page titles across both columns. A title that is both a source and
# a target must be mapped only once: a duplicated label in mapped_labels would
# duplicate rows in the left merges below.
en_entity = list(pd.concat([en_data["source"], en_data["target"]]).dropna().unique())

# Build the mapping frame in one go instead of appending inside the loop.
mapped_rows = []
for label in en_entity:
    try:
        mapped_rows.append({'label': label, 'ekg_entity': entity_mapping(str(label))})
    except Exception:
        # Best effort: titles that cannot be mapped are skipped. Unlike a bare
        # `except`, this still lets Ctrl-C interrupt the loop.
        continue
mapped_labels = pd.DataFrame(mapped_rows, columns=['label', 'ekg_entity'])

# Attach the EventKG id of the target page, then of the source page.
en_mapped = pd.merge(left=en_data, right=mapped_labels, how="left", left_on="target", right_on="label")
en_mapped = en_mapped.rename(columns={"ekg_entity": "target_ekg"})
en_mapped = pd.merge(left=en_mapped, right=mapped_labels, how="left", left_on="source", right_on="label")
en_mapped = en_mapped.rename(columns={"ekg_entity": "source_ekg"})
en_mapped = en_mapped[["source", "target", "source_ekg", "target_ekg", "count"]]
# We are only interested in pairs where both pages were mapped to EventKG.
en_mapped = en_mapped.dropna()

### Joining three dataframes

After creating **de_mapped** and **ru_mapped**, which map the German and Russian Wikipedia pages to the knowledge graph in the same way, we join the three dataframes to get the intersection of events. We are interested in comparing the relevance of events and relations with respect to the three languages. 
Since we want to analyse events, we only keep targets which are events. To do so, we can simply use the prefixes of the entity ids in EventKG.

In [None]:
# Give every language its own column names before joining, so that the joined
# table exposes en_count / de_count / ru_count (the names the later cells use)
# instead of the ambiguous count_x / count_y / count.
# NOTE(review): assumes de_mapped and ru_mapped have the same columns as
# en_mapped; columns that do not exist are simply left untouched by rename().
per_language = [
    frame.rename(columns={
        "source": lang + "_source",
        "target": lang + "_target",
        "count": lang + "_count",
    })
    for lang, frame in (("de", de_mapped), ("en", en_mapped), ("ru", ru_mapped))
]

# Keep only the (source, target) pairs that exist in all three languages.
intersection_table = per_language[0]
for frame in per_language[1:]:
    intersection_table = pd.merge(intersection_table, frame, how="inner", on=['source_ekg', 'target_ekg'])

# Keep only the rows whose target is an event. The mask has to be computed on
# intersection_table itself: a mask taken from en_mapped has a different index.
intersection_table = intersection_table.loc[
    intersection_table["target_ekg"].str.startswith("event", na=False)
]



## Balancing data

Wikipedia language versions differ in terms of their size, number of users and the amount of edited content. In order to balance the effect of the size of each language version, we normalize the number of clicks with respect to the total
number of clicks in the respective language, which leads to normalized scores in the range [0, 1]. In order to create balanced click counts, we then multiply the normalized score by the total number of clicks in the clickstreams.

$balanced\_clicks(e_s,e_t,l) = clicks(e_s,e_t,l) \cdot \frac{\sum_{l' \in L}\sum_{e_s' \in E}\sum_{e_t' \in E} clicks(e_s',e_t',l')}{\sum_{e_s' \in E}\sum_{e_t' \in E} clicks(e_s',e_t',l)} $

Using the above formula, the normalized scores for each language are as follows; we apply them directly to the click counts in the **intersection_table** dataframe.


* English normalized score: 1.6
* German normalized score: 4.1
* Russian normalized score: 6.2

In [None]:
# Scale each language's raw click counts by its constant factor
# (English 1.6, German 4.1, Russian 6.2) to obtain the balanced counts.
intersection_table["balanced_en_count"] = intersection_table["en_count"].mul(1.6)
intersection_table["balanced_de_count"] = intersection_table["de_count"].mul(4.1)
intersection_table["balanced_ru_count"] = intersection_table["ru_count"].mul(6.2)

## Language-specific Relation Relevance

This score assigns a relevance score to the relation between a source entity $e_s$ and a target event $e_t$ in a given language $l$.

$relation\_relevance(e_s,e_t,l) = \frac{balanced\_clicks(e_s,e_t,l)}{\sum_{l' \in L} balanced\_clicks(e_s,e_t,l')} \in [0,1]$


In [None]:


# relation_relevance(e_s, e_t, l) is defined on the *balanced* click counts:
# the share of one language in the balanced clicks of all three languages.
# Using the raw counts would let the largest Wikipedia edition dominate.
balanced_total = (
    intersection_table["balanced_en_count"]
    + intersection_table["balanced_de_count"]
    + intersection_table["balanced_ru_count"]
)
intersection_table['en_normalized'] = intersection_table["balanced_en_count"] / balanced_total
intersection_table['de_normalized'] = intersection_table["balanced_de_count"] / balanced_total
intersection_table['ru_normalized'] = intersection_table["balanced_ru_count"] / balanced_total


# Event Location Closeness
    
Using the following function, we aim to get a set of binary influence factors that indicate whether an event happened in a location where the respective language (*English, German, Russian*) is an official language. To do so, we have created the **country_language** dataframe, which contains the countries where English, German or Russian is an official language.

*input: a list of events*
*output: event and 3 binary columns for English, German and Russian*



In [103]:

#input: list of events
#output: a dataframe with 3 binary columns which show whether the event happened in an English-, German- or Russian-speaking location
def get_location(events):
    """Flag, per event, whether it took place in an English-, German- or
    Russian-speaking country.

    Each event is looked up with ``event_location_rq``; the returned country
    labels are joined with the ``country_language`` pickle (columns
    ``country`` and ``language``).

    Args:
        events: Iterable of EventKG event ids.

    Returns:
        A DataFrame with the columns ``event``, ``english``, ``german`` and
        ``russian`` and one row per event that has at least one country listed
        in ``country_language``. Each language column is 1 if any of the
        event's countries has that language, else 0.
    """
    result_columns = ["event", "english", "german", "russian"]
    # Use the events that were passed in; the original overwrote the argument
    # with the targets of the global en_mapped.
    events = list(events)
    if not events:
        return pd.DataFrame(columns=result_columns)

    # NOTE(review): read_pickle executes arbitrary code from the file; only
    # load a "country_language" file from a trusted source.
    country_language = pd.read_pickle("country_language")

    # One query per event; concatenate once instead of appending in the loop.
    events_location = pd.concat(
        [sparql_request(wds, event_location_rq.format(event)) for event in events],
        ignore_index=True,
    )
    events_language = pd.merge(
        left=events_location,
        right=country_language,
        how="left",
        left_on="location",
        right_on="country",
    )
    # Keep only the countries with a known language; copy so that the new
    # columns are not written to a view of the merge result.
    events_language = events_language.loc[events_language["language"].notna()].copy()
    for column, language in (("english", "English"), ("german", "German"), ("russian", "Russian")):
        events_language[column] = (events_language["language"] == language).astype(int)

    # An event can span several countries; collapse them to a single row per
    # event so that later joins on the event id do not multiply rows.
    return events_language.groupby("event", as_index=False)[["english", "german", "russian"]].max()




# Event Recency

To observe the impact of recency on the language-specific user click behaviour, we use the **get_recency** function to compute a recency score, which is the number of days between the event start date and the start date of the clickstream dataset (2019-12-01).

*input: events list*
*output: dataframe with two columns: event, recency*

In [None]:
def get_recency(events=None):
    """Compute, per event, the number of days between its start date and the
    start of the clickstream period (2019-12-01).

    Args:
        events: Iterable of EventKG event ids. When omitted, a module-level
            ``events`` variable is used if one exists (the original
            zero-argument form relied on it); otherwise no event is processed.

    Returns:
        A DataFrame with the columns ``event`` and ``recency`` and one row per
        event returned by the endpoint. ``recency`` is missing for events
        whose start time cannot be parsed.
    """
    if events is None:
        events = globals().get("events")
    events = [] if events is None else list(events)
    if not events:
        return pd.DataFrame(columns=["event", "recency"])

    # One query per event; concatenate once instead of appending in the loop.
    events_time = pd.concat(
        [sparql_request(wds, event_time_rq.format(event)) for event in events],
        ignore_index=True,
    )
    # Parse before comparing, so that "most recent" is decided on dates and
    # not on the raw strings; unparsable values become NaT.
    events_time["start"] = pd.to_datetime(events_time["start"], errors="coerce")

    # Since there might be more than one start time for an event, we use the
    # most recent one.
    latest_start = events_time.groupby("event", as_index=False)["start"].max()
    latest_start["recency"] = (pd.Timestamp("2019-12-01") - latest_start["start"]).dt.days
    return latest_start[["event", "recency"]]





# Language Community Relevance - number of links to a wikipedia page

We use a dump and count the number of incoming links to wikipedia pages


In [323]:

# Incoming-link counts per page, read from a tab-separated dump.
links = pd.read_csv("worldwar_links.txt", sep="\t", error_bad_lines=False)
# Attach the link count of each target page; the join column "page" is
# dropped again afterwards.
events_link = en_mapped.merge(links, how="left", left_on=["target"], right_on=["page"])
events_link = events_link.drop(columns=["page"])
# Targets that are missing from the dump get a link count of 0.
events_link["links"] = events_link["links"].fillna(0)


# Language Community Relevance - number of comentions

*input: mapped_data dataframe which contains source and target entities*
*output: number of comentions in whole wikipedia*

In [319]:
########## mentions

#input: source and target entities
#output: number of their comentions in whole wikipedia

def get_comentions(df):
    """Look up the number of co-mentions for each (source, target) pair.

    Every distinct pair is queried with ``comention_rq`` on the ``wds``
    endpoint; pairs whose query fails or returns no row are skipped.

    Args:
        df: DataFrame with the columns ``source_ekg`` and ``target_ekg``.

    Returns:
        A DataFrame with the columns ``source_ekg``, ``target_ekg`` and
        ``comentions`` (numeric), in that order, with at most one row per
        distinct pair.
    """
    # A list keeps the column order fixed; the original passed a set literal.
    columns = ["source_ekg", "target_ekg", "comentions"]
    if len(df) == 0:
        return pd.DataFrame(columns=columns)

    # Query each pair once: a repeated pair would otherwise be requested again
    # and would multiply rows when the result is joined back on the pair.
    pairs = df[["source_ekg", "target_ekg"]].drop_duplicates()

    rows = []
    for source_ekg, target_ekg in zip(pairs["source_ekg"], pairs["target_ekg"]):
        try:
            temp = sparql_request(wds, comention_rq.format(source_ekg, target_ekg))
            rows.append({
                "source_ekg": source_ekg,
                "target_ekg": target_ekg,
                "comentions": temp.loc[0, "cnt"],
            })
        except Exception:
            # Best effort, as before; but Ctrl-C is no longer swallowed.
            continue

    comention = pd.DataFrame(rows, columns=columns)
    # The endpoint returns every value as a string; make the counts numeric so
    # that they can take part in the correlation analysis.
    comention["comentions"] = pd.to_numeric(comention["comentions"], errors="coerce")
    return comention
#we use EventKG for that



In [None]:

# the final table for the correlation analysis is built from:
#en_mapped
#en_comention
#events_link
#events_time
#events_language

## Correlations with Influence Factors

Given EventKG+Click and the influence factors, we now investigate the correlations between such influence factors and the language-specific relevance scores. To this end, we compute the Pearson correlation coefficients.

In [None]:
# events_link has one row per (source, target) pair, but the link count only
# depends on the target. Reduce it to one row per target first: joining the
# full table on target_ekg alone would match every pair with every other pair
# sharing that target and multiply the rows.
target_links = events_link[["target_ekg", "links"]].drop_duplicates(subset="target_ekg")

# Start from the English click pairs and attach one influence factor at a time.
en_merge = en_mapped.merge(en_comention, how="left", on=["source_ekg", "target_ekg"])
en_merge = en_merge.merge(target_links, how="left", on="target_ekg")
en_merge = en_merge.merge(events_time, how="left", left_on="target_ekg", right_on="event")
# Drop the join column so that it does not clash with the "event" column of
# the next frame.
en_merge = en_merge.drop(columns=["event"])
en_merge = en_merge.merge(events_language, how="left", left_on="target_ekg", right_on="event")

en_merge = en_merge[["source", "target", "source_ekg", "target_ekg", "count", "comentions", "links", "recency", "english", "german", "russian"]]
en_merge = en_merge.rename(columns={"source": "en_source", "target": "en_target", "count": "en_count"})

# The co-mention counts may still be strings as returned by the endpoint.
en_merge["comentions"] = pd.to_numeric(en_merge["comentions"], errors="coerce")

# Missing factors: 0 for the flags and counts, -1 for an unknown recency.
en_merge = en_merge.fillna({
    "english": 0,
    "german": 0,
    "russian": 0,
    "comentions": 0,
    "links": 0,
    "recency": -1,
})

# Pearson correlation (the default) over the numeric columns only; the title
# and id columns are strings and cannot be correlated.
en_merge.select_dtypes(include="number").corr()
