In [1]:
%pip install SPARQLWrapper
%pip install fuzzywuzzy
%pip install geopy
from SPARQLWrapper import SPARQLWrapper, JSON
from fuzzywuzzy import fuzz
from geopy.distance import geodesic
import math

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.




In [17]:
def query_wikidata_coordinates(latitude : float, longitude : float, label : str, data=None):
    """Choose the Wikidata city that best corresponds to a local city.

    Candidates are ranked in two stages:

    1. Name match: every candidate whose English label has a fuzzy
       ``partial_ratio`` above 80 against ``label`` is kept, and the one with
       the highest score wins (the earliest candidate wins ties).
    2. Distance fallback: if no label clears the threshold, the candidate
       geographically closest to ``(latitude, longitude)`` is returned.

    Args:
        latitude: Latitude of the local city, in decimal degrees.
        longitude: Longitude of the local city, in decimal degrees.
        label: Name of the local city, compared against ``cityLabel``.
        data: Optional pre-fetched SPARQL JSON result (the structure returned
            by ``get_wikidata_results``). When ``None`` the candidates are
            fetched from Wikidata.

    Returns:
        The selected SPARQL binding (a dict with ``city``, ``cityLabel`` and
        ``location`` entries), or ``None`` when there are no candidates at all.
    """
    if data is None:
        print("Querying Wikidata......")
        # Reuse the shared helper rather than keeping a second copy of the
        # SPARQL text in this function.
        results = get_wikidata_results(latitude, longitude)
    else:
        results = data

    bindings = results["results"]["bindings"]
    if not bindings:
        # Without this guard the distance fallback below would finish without
        # ever assigning a result and fail with UnboundLocalError.
        return None

    # Stage 1: fuzzy name match.
    scored = [(fuzz.partial_ratio(b["cityLabel"]["value"], label), b) for b in bindings]
    name_matches = [pair for pair in scored if pair[0] > 80]
    if name_matches:
        # max() returns the first maximal element, which is the same winner a
        # stable descending sort would put at index 0.
        return max(name_matches, key=lambda pair: pair[0])[1]

    # Stage 2: nearest candidate by geodesic distance.
    closest_city = None
    min_distance = math.inf
    for b in bindings:
        # The location literal is WKT "Point(<lon> <lat>)"; geodesic() wants (lat, lon).
        lon_lat = b["location"]["value"].split("(")[1].split(")")[0].split(" ")
        candidate_coords = (float(lon_lat[1]), float(lon_lat[0]))
        distance = geodesic((latitude, longitude), candidate_coords).kilometers
        if distance < min_distance:
            min_distance = distance
            closest_city = b
    return closest_city

def get_wikidata_results(latitude : float, longitude : float, radius_km : float = 50, limit : int = 1000):
    """Fetch Wikidata cities located around a coordinate.

    Runs a ``wikibase:around`` geo-search on the public Wikidata SPARQL
    endpoint and keeps only items that are an instance of (a subclass of)
    city (``wd:Q515``) and have an English label.

    Args:
        latitude: Latitude of the search centre, in decimal degrees.
        longitude: Longitude of the search centre, in decimal degrees.
        radius_km: Search radius passed to ``wikibase:radius``. Defaults to
            50, the value that was previously hard-coded.
        limit: Maximum number of rows requested. Defaults to 1000, the value
            that was previously hard-coded.

    Returns:
        The parsed SPARQL JSON response; rows are under
        ``["results"]["bindings"]`` with ``city``, ``cityLabel`` and
        ``location`` entries.
    """
    # WKT points are written "Point(<longitude> <latitude>)", hence the order below.
    query_wikidata = f"""
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX bd: <http://www.bigdata.com/rdf#>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX geo: <http://www.opengis.net/ont/geosparql#>

    SELECT DISTINCT ?city ?cityLabel ?location
    WHERE {{
    SERVICE wikibase:around {{
        ?city wdt:P625 ?location .
        bd:serviceParam wikibase:center "Point({longitude} {latitude})"^^geo:wktLiteral .
        bd:serviceParam wikibase:radius "{radius_km}" .
    }}
    ?city wdt:P31/wdt:P279* wd:Q515 .
    ?city rdfs:label ?cityLabel .
    FILTER(LANG(?cityLabel) = "en") .

    }} LIMIT {limit}

    """

    sparql_wdata = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql_wdata.setQuery(query_wikidata)
    sparql_wdata.setReturnFormat(JSON)

    return sparql_wdata.query().convert()



def query_graphDB(endpoint : str = "http://localhost:7200/repositories/test"):
    """Fetch the local cities and the coordinates of places inside them.

    Selects, for every ``schema:Place`` contained in a ``schema:City``, the
    city IRI and name together with the place's longitude and latitude.

    Args:
        endpoint: URL of the SPARQL repository to query. Defaults to the
            local ``test`` repository that was previously hard-coded.

    Returns:
        The parsed SPARQL JSON response; rows are under
        ``["results"]["bindings"]`` with ``cityName``, ``longitude``,
        ``latitude`` and ``City`` entries.
    """
    query_my_data = """
    PREFIX schema: <https://schema.org/>
    PREFIX wdtn: <http://www.wikidata.org/prop/direct-normalized/>
    PREFIX data: <http://mydata.example.org/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX voc: <http://vocabulary.example.org/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?cityName ?longitude ?latitude ?City
    WHERE {
    ?s rdf:type schema:Place .
    ?s schema:containedInPlace ?City .
    ?City rdf:type schema:City .
    ?City schema:name ?cityName .
    ?s schema:longitude ?longitude .
    ?s schema:latitude ?latitude .
    }
    """

    sparql_graphdb = SPARQLWrapper(endpoint)
    sparql_graphdb.setQuery(query_my_data)
    sparql_graphdb.setReturnFormat(JSON)

    return sparql_graphdb.query().convert()

In [None]:

# Query the local GraphDB repository for cities and place coordinates.
# query_graphDB() already holds this exact SPARQL query and endpoint, so call
# it instead of keeping a second copy of the query text in this cell.
# Endpoint pattern: "http://localhost:7200/repositories/yourRepository"
results_Graphdb = query_graphDB()


In [6]:
# Build a per-city cache of Wikidata candidates so that the matching step can
# be re-run without querying Wikidata again.
results = query_graphDB()
matches = []
data = {}
for c in results["results"]["bindings"]:
    label = c["cityName"]["value"]
    # Coordinates arrive as string literals; convert them before querying.
    longitude, latitude = float(c["longitude"]["value"]), float(c["latitude"]["value"])
    data[label] = get_wikidata_results(latitude, longitude)



In [7]:
# Show the cached Wikidata responses, keyed by local city name.
data

{'Abbotsford': {'head': {'vars': ['city', 'cityLabel', 'location']},
  'results': {'bindings': [{'city': {'type': 'uri',
      'value': 'http://www.wikidata.org/entity/Q430267'},
     'location': {'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
      'type': 'literal',
      'value': 'Point(-122.478833333 48.754388888)'},
     'cityLabel': {'xml:lang': 'en',
      'type': 'literal',
      'value': 'Bellingham'}},
    {'city': {'type': 'uri',
      'value': 'http://www.wikidata.org/entity/Q244025'},
     'location': {'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
      'type': 'literal',
      'value': 'Point(-122.949167 49.25)'},
     'cityLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Burnaby'}},
    {'city': {'type': 'uri',
      'value': 'http://www.wikidata.org/entity/Q629056'},
     'location': {'datatype': 'http://www.opengis.net/ont/geosparql#wktLiteral',
      'type': 'literal',
      'value': 'Point(-122.803 49.0199)'},
     'cityLabel':

In [35]:
# Confirm the container type of the cache built above.
type(data)

dict

In [18]:
# Pair every local city with its best Wikidata candidate, using the cached
# responses in `data` rather than querying Wikidata again.
results = query_graphDB()
matches = []
for c in results["results"]["bindings"]:
    label = c["cityName"]["value"]
    longitude, latitude = float(c["longitude"]["value"]), float(c["latitude"]["value"])
    print(label, longitude, latitude)
    best_candidate = query_wikidata_coordinates(latitude, longitude, label, data=data[label])
    # Keep the GraphDB row next to the chosen Wikidata binding.
    result_wikidata = (c, best_candidate)
    matches.append(result_wikidata)
results

Abbotsford -122.35 49.083333
Charnwood -1.2097 52.7709
Montreal -73.561668 45.508888
London -0.076544 51.464462
Boston -71.064709 42.360338
City of Edinburgh -3.1883 55.9533


{'head': {'vars': ['cityName', 'longitude', 'latitude', 'City']},
 'results': {'bindings': [{'cityName': {'type': 'literal',
     'value': 'Abbotsford'},
    'longitude': {'datatype': 'http://www.w3.org/2001/XMLSchema#double',
     'type': 'literal',
     'value': '-1.2235E2'},
    'latitude': {'datatype': 'http://www.w3.org/2001/XMLSchema#double',
     'type': 'literal',
     'value': '4.9083333E1'},
    'City': {'type': 'uri',
     'value': 'http://mydata.example.org/public-locations-City/Abbotsford'}},
   {'cityName': {'type': 'literal', 'value': 'Charnwood'},
    'longitude': {'datatype': 'http://www.w3.org/2001/XMLSchema#double',
     'type': 'literal',
     'value': '-1.2097E0'},
    'latitude': {'datatype': 'http://www.w3.org/2001/XMLSchema#double',
     'type': 'literal',
     'value': '5.27709E1'},
    'City': {'type': 'uri',
     'value': 'http://mydata.example.org/public-locations-City/Charnwood'}},
   {'cityName': {'type': 'literal', 'value': 'Montreal'},
    'longitude': {

In [36]:
# Turn each (local city row, Wikidata binding) pair into one owl:sameAs
# statement in N-Triples syntax: "<local IRI> <owl:sameAs> <wikidata IRI> ."
triples = []
for m in matches:
    local_row, wikidata_row = m
    if wikidata_row is None:
        # No Wikidata counterpart for this city, so there is nothing to link.
        continue
    s = f'<{local_row["City"]["value"]}> <http://www.w3.org/2002/07/owl#sameAs> <{wikidata_row["city"]["value"]}> .\n'
    triples.append(s)

In [38]:
# N-Triples documents are UTF-8 by specification. Set the encoding explicitly
# so IRIs with non-ASCII characters are written correctly regardless of the
# platform's default text encoding.
with open("matches.nt", "w", encoding="utf-8") as f:
    f.writelines(triples)