# Extraction of a person's occupations from wikidata
Example with Obama and others

__author__ = "Pierre Nugues"

A few imports

In [1]:
import requests
import pandas as pd
import json

Setting the presentation options

In [2]:
pd.options.display.max_rows = 10000
pd.options.display.max_columns = 80
pd.options.display.width = 200

The query

In [3]:
prefixes = '''PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
'''

In [4]:
entity = 'Q1780398'

# Q6279 Joseph Biden
# Q22686 Donald Trump
# Q76 Barack Obama
# Q2740012 Stefan Löfven
# Q4935873 Magdalena Andersson
# Q1780398 Ulf Kristersson
# p: entity to statement
# ps: statement to value
# can be replaced by wdt: (truthy, missing facts)
# See: https://stackoverflow.com/questions/36023098/querying-wikidata-difference-between-p-and-wdt-default-prefix

In [5]:
english_query = '''
SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{{
    wd:{0} p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc .
    FILTER (lang(?itemLabelOcc) = "en") .
}}
LIMIT 1000'''.format(entity)
print(english_query)


SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{
    wd:Q1780398 p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc .
    FILTER (lang(?itemLabelOcc) = "en") .
}
LIMIT 1000


We query the data from wikidata

In [6]:
headers = {
    'User-Agent': 'NLP-project/1.0 (pierre.nugues@cs.lth.se)'
}

In [7]:
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data = requests.get(url, 
                    params={'query': prefixes + english_query, 'format': 'json'},
                    headers=headers).json()
data

{'head': {'vars': ['item', 'itemLabelOcc', 'lang']},
 'results': {'bindings': [{'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q82955'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'politician'}}]}}

And we print it

In [8]:
profession = []
for item in data['results']['bindings']:
    profession.append({
        'id': item.get('item', {}).get('value'),
        'occupation': item.get('itemLabelOcc', {}).get('value'),
        'lang': item.get('itemLabelOcc', {}).get('xml:lang'),
    })

df = pd.DataFrame(profession)
df

Unnamed: 0,id,occupation,lang
0,http://www.wikidata.org/entity/Q82955,politician,en


Going multilingual

In [9]:
multilingual_query = '''
SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{{
    wd:{0} p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc . 
}}
LIMIT 1000'''.format(entity)

The query

In [10]:
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data = requests.get(url, 
                    params={'query': prefixes + multilingual_query, 'format': 'json'},
                    headers=headers).json()
#data

And we print it

In [11]:
profession = []
for item in data['results']['bindings']:
    profession.append({
        'id': item.get('item', {}).get('value'),
        'occupation': item.get('itemLabelOcc', {}).get('value'),
        'lang': item.get('itemLabelOcc', {}).get('xml:lang'),
    })

df = pd.DataFrame(profession)
print(df)

                                        id                occupation       lang
0    http://www.wikidata.org/entity/Q82955                 سیاست دان        pnb
1    http://www.wikidata.org/entity/Q82955               Kawpaq runa         qu
2    http://www.wikidata.org/entity/Q82955                   політік        rue
3    http://www.wikidata.org/entity/Q82955               राजनैतिज्ञः         sa
4    http://www.wikidata.org/entity/Q82955                  бэлиитик        sah
5    http://www.wikidata.org/entity/Q82955                දේශපාලනඥයා         si
6    http://www.wikidata.org/entity/Q82955               polijtikere        sma
7    http://www.wikidata.org/entity/Q82955               politihkkár        smj
8    http://www.wikidata.org/entity/Q82955                politikkár        smn
9    http://www.wikidata.org/entity/Q82955                sopolitiki         ss
10   http://www.wikidata.org/entity/Q82955                     ፖለቲከኛ         ti
11   http://www.wikidata.org/entity/Q829