# Exploring the Archaeologists in Wikidata

This Jupyter notebook explores Wikidata to find out how many archaeologists it records, what information is documented about them, and whether this population can be used for analysis.

This will mimic what is presented in the seminar.

## SPARQL wrapper

This is the Python code used to run SPARQL queries against a given SPARQL endpoint.

In [1]:
from typing import List, Dict
from SPARQLWrapper import SPARQLWrapper, JSON, SPARQLExceptions
from urllib.error import HTTPError
import pandas as pd

def __handle_row(row: Dict[str, dict]) -> Dict[str, str]:
    """Flatten one SPARQL result binding into a plain ``{variable: value}`` dictionary.

    Every entry of ``row`` is expected to be a mapping with a ``"value"`` key;
    only that value is kept, under the same variable name.
    """

    return {name: cell["value"] for name, cell in row.items()}

def query(sparql_url: str, request: str) -> List[Dict[str, str]]:
    """
    Execute the given SELECT request on the given endpoint and return its rows.

    Request needs to be only SELECT: won't work if it is a INSERT or DELETE request.

    Args:
        sparql_url: URL of the SPARQL endpoint to query.
        request: Text of the SPARQL SELECT query.

    Returns:
        One dictionary per result binding, mapping each variable name to the
        value returned for it. An empty list is returned when there is no
        result, and also when the endpoint rejects the query as malformed
        (the error message is printed in that case).

    Raises:
        Exception: If the endpoint answers with an HTTP error; the original
            ``HTTPError`` is attached as the cause.
    """

    # Init the endpoint; the agent string is sent as the User-Agent header.
    sparql_endpoint = SPARQLWrapper(
        sparql_url,
        agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    )
    sparql_endpoint.setReturnFormat(JSON)

    # Prepare the query
    sparql_endpoint.setQuery(request)

    # Execute the query and handle errors
    try:
        bindings = sparql_endpoint.queryAndConvert()["results"]["bindings"]
    except SPARQLExceptions.QueryBadFormed as error:
        print(error.msg)
        # Keep the declared return type: callers (e.g. a DataFrame constructor)
        # can consume an empty list, but not a bare boolean.
        return []
    except HTTPError as error:
        raise Exception(f"HTTP Error {error.code}: {error.reason}") from error

    # Keep only the value of each bound variable
    rows = [__handle_row(binding) for binding in bindings]

    # A single binding without any variable means "no result":
    # return an actual empty list
    if rows == [{}]:
        return []

    return rows


def execute(sparql_url: str, request: str) -> None:
    """
    Send an update request to the given endpoint using the POST method.
    Only INSERT or DELETE requests are meant to go through this function.

    A request rejected as malformed is not raised: its message is printed and
    ``False`` is returned. An HTTP error is re-raised as a generic
    ``Exception`` carrying the status code and the reason.
    """

    endpoint = SPARQLWrapper(sparql_url)

    # Load the request, then switch the endpoint to POST
    endpoint.setQuery(request)
    endpoint.method = "POST"

    try:
        endpoint.query()
    except HTTPError as http_error:
        message = f"HTTP Error {http_error.code}: {http_error.reason}"
        raise Exception(message)
    except SPARQLExceptions.QueryBadFormed as bad_request:
        print(bad_request.msg)
        return False



def _first_statement_line(query_string: str) -> str:
    """Return the first line that is not blank, a ``#`` comment or a PREFIX/BASE declaration (stripped, lower-cased)."""

    for line in query_string.splitlines():
        text = line.strip().lower()
        if text and not text.startswith(("#", "prefix", "base")):
            return text
    return ""


def run(sparql_url: str, query_string: str) -> "pd.DataFrame | None":
    """
    Wrapper of "query" and "execute" function.

    The request is routed on the keyword that opens it (after comments and
    PREFIX/BASE lines), so that a SELECT which merely mentions "insert" or
    "delete" in a comment, a literal or a variable name is not sent as an
    update. When the opening keyword is not recognised, the request is routed
    on the keywords found anywhere in its text.

    Args:
        sparql_url: URL of the SPARQL endpoint.
        query_string: Text of the SPARQL request.

    Returns:
        For a SELECT, a pandas DataFrame built from the rows returned by
        "query". For an INSERT/DELETE, whatever "execute" returns.

    Raises:
        Exception: If the request contains none of SELECT, INSERT, DELETE.
    """

    lowered = query_string.lower()
    opening = _first_statement_line(query_string)

    if opening.startswith("select"):
        is_update = False
    elif opening.startswith(("insert", "delete")):
        is_update = True
    else:
        # Opening keyword not recognised (e.g. "WITH ... DELETE", or prologue
        # and query on one line): fall back to a search in the whole text.
        is_update = "delete" in lowered or "insert" in lowered

    if is_update:
        return execute(sparql_url, query_string)
    if "select" in lowered:
        return pd.DataFrame(data=query(sparql_url, query_string))
    raise Exception('Query error: Only "SELECT", "INSERT", "DELETE" are supported.')


## Explore the archaeologists

In the next query, we count the number of persons who have the occupation (P106) archaeologist (Q3621491) or the field of work (P101) archaeology (Q23498).

In [2]:
# Count the results where ?item is a human (P31 = Q5) and has either the
# occupation (P106) Q3621491 or the field of work (P101) Q23498.
# The result is a one-row table with a single column, "eff".
# NOTE(review): COUNT(*) counts result rows, not distinct items, so an item
# matching both UNION branches is counted twice. COUNT(DISTINCT ?item) would
# count each person once; confirm which figure is intended.
run("https://query.wikidata.org/sparql",
    """ 
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT (count(*) as ?eff) # What is the difference between ?number and ?eff?
WHERE {
        {?item wdt:P106 wd:Q3621491}
        UNION
        {?item wdt:P101 wd:Q23498}
        
        ?item wdt:P31 wd:Q5
}
"""
)

Unnamed: 0,eff
0,28629


We can limit the search to archaeologists who were born after 1800 and before 1951 (both bounds are excluded by the filter, i.e. birth years 1801 to 1950).

In [4]:
# Count the results where ?item is a human (P31 = Q5), has either the
# occupation (P106) Q3621491 or the field of work (P101) Q23498, and has a
# date of birth (P569) whose four-digit year is strictly between 1800 and 1951.
# The year is extracted with a regular expression from the string form of the
# date, then cast to an integer for the comparison.
# The xsd: prefix is declared explicitly so that the query does not rely on
# the prefixes predefined by the endpoint.
# NOTE(review): COUNT(*) counts result rows, not distinct items, so an item
# matching both UNION branches, or having several dates of birth, is counted
# more than once. COUNT(DISTINCT ?item) would count each person once.
run("https://query.wikidata.org/sparql",
    """ 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT (count(*) as ?eff) # What is the difference between ?number and ?eff?
WHERE {
        {?item wdt:P106 wd:Q3621491}
        UNION
        {?item wdt:P101 wd:Q23498}
        
        ?item wdt:P31 wd:Q5;
              wdt:P569 ?birthDate.

        BIND(REPLACE(str(?birthDate), "(.*)([0-9]{4})(.*)", "$2") AS ?year)
        FILTER(xsd:integer(?year) > 1800 && xsd:integer(?year) < 1951)
}
"""
)

Unnamed: 0,eff
0,13838


We can then find the properties associated with those archaeologists, and how often each one is used.

In [5]:
# List the properties used on the selected items, most frequent first.
# Inner query: selects humans (P31 = Q5) with the occupation (P106) Q3621491
# or the field of work (P101) Q23498 and a date of birth (P569) whose
# four-digit year is strictly between 1800 and 1951, then groups every
# "?item ?p ?o" triple by predicate ?p and counts the rows of each group.
# Outer query: keeps the predicates that are the direct claim of a property
# entity (wikibase:directClaim) and asks the label service for the English
# label of that entity (?propLabel).
# NOTE(review): ?eff counts matching rows, not distinct items, so a property
# with several values on one item (or an item matching both UNION branches)
# contributes more than once.
run("https://query.wikidata.org/sparql",
    """ 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX wikibase: <http://wikiba.se/ontology#>

SELECT ?p ?propLabel ?eff
WHERE {
{
SELECT ?p  (count(*) as ?eff)
WHERE {
        {?item wdt:P106 wd:Q3621491}
        UNION
        {?item wdt:P101 wd:Q23498}    
    ?item wdt:P31 wd:Q5; # Any instance of a human.
            wdt:P569 ?birthDate.
    ?item  ?p ?o.

    BIND(REPLACE(str(?birthDate), "(.*)([0-9]{4})(.*)", "$2") AS ?year)
    FILTER(xsd:integer(?year) > 1800 && xsd:integer(?year) < 1951)
    }
GROUP BY ?p 

    }
?prop wikibase:directClaim ?p .

SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 


}  
ORDER BY DESC(?eff)
"""
)

Unnamed: 0,p,propLabel,eff
0,http://www.wikidata.org/prop/direct/P106,occupation,38243
1,http://www.wikidata.org/prop/direct/P569,date of birth,14393
2,http://www.wikidata.org/prop/direct/P735,given name,13841
3,http://www.wikidata.org/prop/direct/P31,instance of,13840
4,http://www.wikidata.org/prop/direct/P101,field of work,13764
...,...,...,...
1488,http://www.wikidata.org/prop/direct/P13204,Wellcome Collection concept ID,1
1489,http://www.wikidata.org/prop/direct/P13275,A Dictionary of Cultural Anthropology entry ID,1
1490,http://www.wikidata.org/prop/direct/P13337,domain name,1
1491,http://www.wikidata.org/prop/direct/P13377,Memoria Chilena ID,1
