# Extraction of a person's occupations from wikidata
Example with Obama and others

__author__ = "Pierre Nugues"

A few imports

In [11]:
import requests
import pandas as pd
import json

Setting the presentation options

In [12]:
pd.options.display.max_rows = 10000
pd.options.display.max_columns = 80
pd.options.display.width = 200

The query

In [13]:
prefixes = '''PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
'''

In [14]:
entity = 'Q22686'

# Q6279 Joseph Biden
# Q22686 Donald Trump
# Q76 Barack Obama
# Q2740012 Stefan Löfven
# Q4935873 Magdalena Andersson
# Q1780398 Ulf Kristersson
# p: entity to statement
# ps: statement to value
# can be replaced by wdt: (truthy, missing facts)
# See: https://stackoverflow.com/questions/36023098/querying-wikidata-difference-between-p-and-wdt-default-prefix

In [15]:
english_query = '''
SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{{
    wd:{0} p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc .
    FILTER (lang(?itemLabelOcc) = "en") .
}}
LIMIT 1000'''.format(entity)
print(english_query)


SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{
    wd:Q22686 p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc .
    FILTER (lang(?itemLabelOcc) = "en") .
}
LIMIT 1000


We query the data from wikidata

In [16]:
headers = {
    'User-Agent': 'NLP-project/1.0 (pierre.nugues@cs.lth.se)'
}

In [17]:
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data = requests.get(url, 
                    params={'query': prefixes + english_query, 'format': 'json'},
                    headers=headers).json()
data

{'head': {'vars': ['item', 'itemLabelOcc', 'lang']},
 'results': {'bindings': [{'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q33999'},
    'itemLabelOcc': {'xml:lang': 'en', 'type': 'literal', 'value': 'actor'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q36180'},
    'itemLabelOcc': {'xml:lang': 'en', 'type': 'literal', 'value': 'writer'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q43845'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'businessperson'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q215536'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'merchant'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q82955'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'politician'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q131524'},
    '

And we print it

In [18]:
profession = []
for item in data['results']['bindings']:
    profession.append({
        'id': item.get('item', {}).get('value'),
        'occupation': item.get('itemLabelOcc', {}).get('value'),
        'lang': item.get('itemLabelOcc', {}).get('xml:lang'),
    })

df = pd.DataFrame(profession)
df

Unnamed: 0,id,occupation,lang
0,http://www.wikidata.org/entity/Q33999,actor,en
1,http://www.wikidata.org/entity/Q36180,writer,en
2,http://www.wikidata.org/entity/Q43845,businessperson,en
3,http://www.wikidata.org/entity/Q215536,merchant,en
4,http://www.wikidata.org/entity/Q82955,politician,en
5,http://www.wikidata.org/entity/Q131524,entrepreneur,en
6,http://www.wikidata.org/entity/Q484876,chief executive officer,en
7,http://www.wikidata.org/entity/Q557880,investor,en
8,http://www.wikidata.org/entity/Q578109,television producer,en
9,http://www.wikidata.org/entity/Q3282637,film producer,en


Going multilingual

In [19]:
multilingual_query = '''
SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{{
    wd:{0} p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc . 
}}
LIMIT 1000'''.format(entity)

The query

In [20]:
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data = requests.get(url, 
                    params={'query': prefixes + multilingual_query, 'format': 'json'},
                    headers=headers).json()
#data

And we print it

In [21]:
profession = []
for item in data['results']['bindings']:
    profession.append({
        'id': item.get('item', {}).get('value'),
        'occupation': item.get('itemLabelOcc', {}).get('value'),
        'lang': item.get('itemLabelOcc', {}).get('xml:lang'),
    })

df = pd.DataFrame(profession)
print(df)

                                         id                          occupation       lang
0     http://www.wikidata.org/entity/Q33999                               actor         en
1     http://www.wikidata.org/entity/Q33999                             هنرپیشه         fa
2     http://www.wikidata.org/entity/Q33999                              attore         it
3     http://www.wikidata.org/entity/Q33999                                  俳優         ja
4     http://www.wikidata.org/entity/Q33999                              ਅਦਾਕਾਰ         pa
5     http://www.wikidata.org/entity/Q33999                              اداکار        pnb
6     http://www.wikidata.org/entity/Q33999                              oyuncu         tr
7     http://www.wikidata.org/entity/Q33999                               aktor         sq
8     http://www.wikidata.org/entity/Q33999                              aktyor         az
9     http://www.wikidata.org/entity/Q33999                         skuespiller         da