In [29]:
import json
import pandas as pd
from glob import glob
from collections import defaultdict
import numpy as np

In [19]:
data_dir = "../data/wikidata"

In [None]:
entities_dict=defaultdict(dict)
for file in glob(data_dir+'/event_wiki2data_entities_**.json'):
    with open(file) as f:
        data = json.load(f)
    for entity_name, uri in data.items():
        entity_id= uri.split("/")[-1][1:]
        print(entity_id)
        if entity_id not in entities_dict:
            entities_dict[entity_id]={
                "label":entity_name,
                "url":uri,
                "entity_id":entity_id}

In [21]:
df_entities= pd.DataFrame.from_dict(entities_dict, orient="index")

In [23]:
df_entities_linked = pd.read_csv("../data/wikidata/entities_events_detected.csv")

In [26]:
df_entities_linked.rename(columns={"Unnamed: 0":"entity_id"}, inplace=True)

In [28]:
entity_description= dict(zip(df_entities_linked["entity_id"], df_entities_linked["description"]))

In [33]:
description_list =[]
for idx in df_entities["entity_id"]:
    idx= int(idx)
    if idx in entity_description:
        description_list.append(entity_description[idx])
    else:
        description_list.append(np.nan)

In [35]:
df_entities["description"]=description_list

In [37]:
df_entities.head()

Unnamed: 0,label,url,entity_id,description
861662,LGM-30 Minuteman,http://www.wikidata.org/entity/Q861662,861662,
4808983,Assistant to the Secretary of Defense for Publ...,http://www.wikidata.org/entity/Q4808983,4808983,
83168,Germany–Russia relations,http://www.wikidata.org/entity/Q83168,83168,
177471,Intercontinental ballistic missile,http://www.wikidata.org/entity/Q177471,177471,
6514,Dardanelles,http://www.wikidata.org/entity/Q6514,6514,strait in northwestern Turkey


In [40]:
len(df_entities)

446

In [39]:
df_entities.to_csv("../data/wikidata/events_wiki2data_entities.csv")

In [48]:
from ast import literal_eval

In [41]:
df_events= pd.read_csv("../data/events/events_actors_final.csv")

In [44]:
wiki_ents = df_events["entities"].tolist() # wikipedia entites

In [49]:
wiki_ents = [literal_eval(x) for x in wiki_ents]

In [45]:
entities_label_dict= dict(zip(df_entities["label"], df_entities["entity_id"])) # wikidata entities

In [52]:
# mapping wikipedia entities to wikidata
wikidata_labels = []
wikidata_ids = []
for ents in wiki_ents:
    labels=[]
    ids=[]
    for ent in ents:
        if ent in entities_label_dict:
            labels.append(ent)
            ids.append(entities_label_dict[ent])
    wikidata_labels.append(labels)
    wikidata_ids.append(ids)
    
    

In [53]:
df_events["wiki2data_ids"]= wikidata_ids
df_events["wiki2data_labels"]= wikidata_labels


In [55]:
df_events.to_csv("../data/events/events_actors_wiki2data.csv", index=False)

In [46]:
df_events

Unnamed: 0,category,event_text,main_event,date,news,entities,first_sentence,first_actor,wikidata_ids,wikidata_labels
0,Armed conflicts and attacks,Russian troops enter the Chernobyl Exclusion Z...,Russo-Ukrainian War,2022-02-24,['https://www.dailysabah.com/world/europe/russ...,"['Chernobyl Exclusion Zone', 'Chernobyl Nuclea...",Russian troops enter the Chernobyl Exclusion Z...,Russian troops,"[1080137, 1050926]","['troop', 'Chernobyl Exclusion Zone']"
1,Armed conflicts and attacks,Ukraine reports that Russian Navy warships hav...,Russo-Ukrainian War,2022-02-24,['https://www.indiatvnews.com/news/world/russi...,"['Russian Navy', 'Snake Island (Black Sea)', '...",Ukraine reports that Russian Navy warships hav...,russian navy warships,"[212, 559549, 155868, 184429]","['Ukraine', 'United States Naval Academy', 'Sn..."
2,Armed conflicts and attacks,Russian troops enter Sumy and take control of ...,Russo-Ukrainian War,2022-02-24,['https://nv.ua/ukraine/events/voyna-rossii-pr...,"['Sumy', 'Kyiv', 'Moscow']",Russian troops enter Sumy and take control of ...,Russian troops,"[1080137, 156752, 11175, 34442, 1899, 649]","['troop', 'Sumy', 'circumflex', 'road', 'Kiev'..."
3,Armed conflicts and attacks,Russian gunships bombard Hostomel Airport near...,Russo-Ukrainian War,2022-02-24,['https://www.themoscowtimes.com/2022/02/24/ru...,"['Armed helicopter', 'Hostomel Airport', 'Russ...",Russian gunships bombard Hostomel Airport near...,Russian gunships,"[1316223, 409022, 1899]","['gunship', 'Airport', 'Kiev']"
4,Armed conflicts and attacks,"An-225 Mriya, the largest aircraft in the worl...",Russo-Ukrainian War,2022-02-24,['https://www.radiosvoboda.org/a/news-v-rezult...,"['An-225 Mriya', 'List of large aircraft', 'Wi...","An-225 Mriya, the largest aircraft in the worl...",,"[178351, 11436, 16502, 11423, 245097, 2380335,...","['An-225 Mriya', 'aircraft', 'world', 'mass', ..."
...,...,...,...,...,...,...,...,...,...,...
238,International relations,Russian President Vladimir Putin says that Rus...,Reactions to the 2021–2022 Russo-Ukrainian crisis,2022-03-04,['https://www.cotswoldjournal.co.uk/news/natio...,"['President of Russia', 'Vladimir Putin', 'Rus...",Russian President Vladimir Putin says that Rus...,russia,"[7747, 159, 454, 212]","['Vladimir Putin', 'Russia', 'peace', 'Ukraine']"
239,Law and crime,President Vladimir Putin signs amendments to t...,Reactions to the 2022 Russian invasion of Ukraine,2022-03-04,"['https://www.interfax.ru/russia/826193', 'htt...","['Vladimir Putin', 'Criminal Code of Russia', ...",President Vladimir Putin signs amendments to t...,President Vladimir Putin,"[7747, 1269627, 2091694, 159, 28478874, 499137...","['Vladimir Putin', 'amendment', 'Criminal Code..."
240,Law and crime,Ukranian President Volodymyr Zelenskyy address...,Reactions to the 2022 Russian invasion of Ukraine,2022-03-04,['https://guernseypress.com/news/world-news/20...,"['President of Ukraine', 'Volodymyr Zelenskyy'...",Ukranian President Volodymyr Zelenskyy address...,Ukranian President Volodymyr Zelenskyy,"[3874799, 515, 458]","['Volodymyr Zelenskiy', 'city', 'European Union']"
241,Law and crime,Italian police seize the yacht of Russian olig...,Reactions to the 2022 Russian invasion of Ukraine,2022-03-04,['https://www.theguardian.com/world/2022/mar/0...,"['Law enforcement in Italy', 'Yacht', 'Russian...",Italian police seize the yacht of Russian olig...,Italian police,"[35535, 170173, 1395387, 858321, 13318]","['police', 'yacht', 'Alexei Mordashov', 'port'..."


In [9]:
df_ls=[]
for file in glob(data_dir+'/entities_tweets_**.csv'):
    print(file)
    df=pd.read_csv(file, index_col=0)
    df["entity_id"]= df.index
    df_ls.append(df)
    

../data/wikidata/entities_tweets_2022-02-28 00:00:00.csv
../data/wikidata/entities_tweets_2022-03-01 00:00:00.csv
../data/wikidata/entities_tweets_2022-02-25 00:00:00.csv
../data/wikidata/entities_tweets_2022-02-24 00:00:00.csv
../data/wikidata/entities_tweets_2022-02-26 00:00:00.csv
../data/wikidata/entities_tweets_2022-02-27 00:00:00.csv
../data/wikidata/entities_tweets_2022-03-04 00:00:00.csv
../data/wikidata/entities_tweets_2022-03-03 00:00:00.csv
../data/wikidata/entities_tweets_2022-03-02 00:00:00.csv
../data/wikidata/entities_tweets_2022-03-05 00:00:00.csv


In [10]:
pd_merged = pd.concat(df_ls)

In [14]:
pd_merged.drop_duplicates(subset=["entity_id"], inplace=True)

In [15]:
pd_merged.to_csv(data_dir+'/entities_tweets.csv')

In [16]:
pd_merged.head()

Unnamed: 0,description,label,url,entity_id
278485,word or an unspaced phrase prefixed with the n...,hashtag,https://www.wikidata.org/wiki/Q278485,278485
212,sovereign state in Eastern Europe,Ukraine,https://www.wikidata.org/wiki/Q212,212
2995644,final consequence of a sequence of actions or ...,result,https://www.wikidata.org/wiki/Q2995644,2995644
49776,work stoppage caused by the mass refusal of em...,strike,https://www.wikidata.org/wiki/Q49776,49776
8473,organized body primarily tasked with preparing...,military,https://www.wikidata.org/wiki/Q8473,8473
