# Extraction of a person's occupations from wikidata
Example with Obama

__author__ = "Pierre Nugues"

A few imports

In [1]:
import requests
import pandas as pd

Setting the presentation options

In [2]:
pd.options.display.max_rows = 10000
pd.options.display.max_columns = 80
pd.options.display.width = 200

The query

In [3]:
prefixes = '''PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
'''

# Q22686 Donald Trump
# Q76 Barack Obama
# p: entity to statement
# ps: statement to value
# can be replaced by wdt: (truthy, missing a facts)
# See: https://stackoverflow.com/questions/36023098/querying-wikidata-difference-between-p-and-wdt-default-prefix
english_query = '''
SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{
    wd:Q76 p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc .
    FILTER (lang(?itemLabelOcc) = "en") .
}
LIMIT 1000'''

We query the data from wikidata

In [4]:
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data = requests.get(url, params={'query': prefixes + english_query, 'format': 'json'}).json()
data

{'head': {'vars': ['item', 'itemLabelOcc', 'lang']},
 'results': {'bindings': [{'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q40348'},
    'itemLabelOcc': {'xml:lang': 'en', 'type': 'literal', 'value': 'lawyer'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q82955'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'politician'}},
   {'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q15958642'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'political writer'}},
   {'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q28532974'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'community organizer'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q372436'},
    'itemLabelOcc': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'statesperson'}}]}}

And we print it

In [5]:
profession = []
for item in data['results']['bindings']:
    profession.append({
        'id': item.get('item', {}).get('value'),
        'occupation': item.get('itemLabelOcc', {}).get('value'),
        'lang': item.get('itemLabelOcc', {}).get('xml:lang'),
    })

df = pd.DataFrame(profession)
df

Unnamed: 0,id,occupation,lang
0,http://www.wikidata.org/entity/Q40348,lawyer,en
1,http://www.wikidata.org/entity/Q82955,politician,en
2,http://www.wikidata.org/entity/Q15958642,political writer,en
3,http://www.wikidata.org/entity/Q28532974,community organizer,en
4,http://www.wikidata.org/entity/Q372436,statesperson,en


Going multilingual

In [6]:
multilingual_query = '''
SELECT DISTINCT ?item ?itemLabelOcc (lang(?itemLabel) as ?lang)
WHERE 
{
    wd:Q76 p:P106 ?occupation .
    ?occupation ps:P106 ?item .
    ?item rdfs:label ?itemLabelOcc . 
}
LIMIT 1000'''

The query

In [7]:
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
data = requests.get(url, params={'query': prefixes + multilingual_query, 'format': 'json'}).json()
#data

And we print it

In [8]:
profession = []
for item in data['results']['bindings']:
    profession.append({
        'id': item.get('item', {}).get('value'),
        'occupation': item.get('itemLabelOcc', {}).get('value'),
        'lang': item.get('itemLabelOcc', {}).get('xml:lang'),
    })

df = pd.DataFrame(profession)
print(df)

                                           id                      occupation       lang
0       http://www.wikidata.org/entity/Q40348                       Prokureur         af
1       http://www.wikidata.org/entity/Q40348                      Mmranimfoɔ         ak
2       http://www.wikidata.org/entity/Q40348                             ጠበቃ         am
3       http://www.wikidata.org/entity/Q40348                        Advogato         an
4       http://www.wikidata.org/entity/Q40348                           محامي         ar
5       http://www.wikidata.org/entity/Q40348                          abogáu        ast
6       http://www.wikidata.org/entity/Q40348                        Arxatiri         ay
7       http://www.wikidata.org/entity/Q40348                           vəkil         az
8       http://www.wikidata.org/entity/Q40348                            وکیل        azb
9       http://www.wikidata.org/entity/Q40348                         адвокат         ba
10      http://www.wi