In [1]:
# Move up one directory; presumably so the relative `src` import and `data/` paths used below resolve.
# NOTE(review): not idempotent - re-running this cell climbs one more level, so restart the kernel before re-running.
%cd ..

/Users/mateoibarguen/Desktop/CSC 482/genealogy-extraction


In [2]:
import pandas as pd
from src.wiki_referencer.wiki_reference import WikiReferencer
from wikidata.client import Client
import numpy as np

In [3]:
# Shared lookup helper: every query cell below goes through this single instance.
wiki_referencer = WikiReferencer()

### Find gender of entity

In [6]:
# Gender lookup by entity id (a 'Q...' string); for this id the result is the string 'female'.
wiki_referencer.get_entity_gender('Q255457')

'female'

### Find the name of an entity

In [7]:
# Resolve an entity id to its name, returned as a plain string.
wiki_referencer.get_entity_name('Q274606')

'Berengar I of Italy'

#### Find the aliases of an entity (if it has any)

In [8]:
# Alternative names for the same entity, returned as a list of strings.
# NOTE(review): the return value for an entity without aliases is not shown here - confirm before relying on it.
wiki_referencer.get_entity_aliases('Q274606')

['Berengar', 'Berengario I']

### Find the article text

In [9]:
# Articles are looked up by an article id string (a different id space from the 'Q...' entity ids);
# slice to the first 100 characters so the preview stays short.
wiki_referencer.get_article_text('1467835')[:100]

'Berengar I (Latin: Berengarius, Perngarius; Italian: Berengario; c. 845 – 7 April 924) was the king '

### Find the entities of an article

In [10]:
# Entity ids associated with this article, returned as a list of 'Q...' strings.
wiki_referencer.get_article_entities('1467835')

['Q274606',
 'Q3769073',
 'Q59522651',
 'Q919247',
 'Q72067',
 'Q284400',
 'Q1662981',
 'Q3605160',
 'Q743131',
 'Q28778035']

### Find an entity's relationships

In [11]:
# Relations recorded for entity Q274606: the result maps each related entity id
# to a Wikidata property id. Per Wikidata, P22 = father, P25 = mother,
# P26 = spouse, P40 = child, P3373 = sibling.
# NOTE(review): the direction (property of Q274606 vs. of the related entity) is not visible here - confirm.
wiki_referencer.get_entity_relations('Q274606')

{'Q3769073': 'P40',
 'Q59522651': 'P40',
 'Q919247': 'P22',
 'Q72067': 'P26',
 'Q284400': 'P25',
 'Q1662981': 'P3373',
 'Q3605160': 'P3373',
 'Q743131': 'P3373',
 'Q28778035': 'P3373'}

#### Labels
The labels are already split by **article**: 80% of the **articles** are in the training set and 20% of the **articles** are in the test set. Each row of the labels holds an entity_a id, an entity_b id, a relation, and the article_id of the article where that relation is found. Each row in the labels is therefore a relation, not an article. 

In [12]:
# Load the pre-split label tables, one pickle per split, from the local data/ directory.
train_labels = pd.read_pickle('data/train_labels.pkl')
test_labels = pd.read_pickle('data/test_labels.pkl')

In [13]:
# Number of training rows per relation label.
count_relations = (
    train_labels
    .groupby('relation')['relation']
    .count()
)
# Normalise the counts so they sum to 1 across relation labels.
proportion_relations = count_relations.div(count_relations.sum())

In [14]:
# Relation labels, in the same order as the proportions displayed in the next cell.
proportion_relations.index

Index(['P22', 'P25', 'P26', 'P3373', 'P40'], dtype='object', name='relation')

In [15]:
# Share of training rows per relation, aligned position-by-position with the index shown in the previous cell.
proportion_relations.values

array([0.07800285, 0.07766049, 0.10659058, 0.34168331, 0.39606277])

In [16]:
# Split sizes: row count (one row per relation) and number of distinct article ids in each split.
print(f'Training set size: {train_labels.shape[0]} relations and {train_labels.article_id.nunique()} articles. ')
print(f'Test set size: {test_labels.shape[0]} relations and {test_labels.article_id.nunique()} articles. ')

Training set size: 17723 relations and 1362 articles. 
Test set size: 4711 relations and 341 articles. 


> The `wiki_referencer` methods make no API calls: all of the data is already stored locally in dictionaries, so calling them should be very fast. 

In [17]:
# Time a plain Python substring check on a short string.
%timeit "Berengar" in  " aslfkdjklsdf lsdfkj Berengar I of Italy"

53.1 ns ± 1.03 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [18]:
# Map every entity_a id in the training labels to its name through the local lookup.
# The displayed index repeats (0..8, then 0..20, ...), so train_labels does not have a unique index.
train_labels['entity_a'].apply(wiki_referencer.get_entity_name)

0     Berengar I of Italy
1     Berengar I of Italy
2     Berengar I of Italy
3     Berengar I of Italy
4     Berengar I of Italy
5     Berengar I of Italy
6     Berengar I of Italy
7     Berengar I of Italy
8     Berengar I of Italy
0       Otto von Habsburg
1       Otto von Habsburg
2       Otto von Habsburg
3       Otto von Habsburg
4       Otto von Habsburg
5       Otto von Habsburg
6       Otto von Habsburg
7       Otto von Habsburg
8       Otto von Habsburg
9       Otto von Habsburg
10      Otto von Habsburg
11      Otto von Habsburg
12      Otto von Habsburg
13      Otto von Habsburg
14      Otto von Habsburg
15      Otto von Habsburg
16      Otto von Habsburg
17      Otto von Habsburg
18      Otto von Habsburg
19      Otto von Habsburg
20      Otto von Habsburg
             ...         
1      Louis XI of France
2      Louis XI of France
3      Louis XI of France
4      Louis XI of France
5      Louis XI of France
6      Louis XI of France
7      Louis XI of France
8      Louis