Gets the Wikidata objects returned by the SPARQL query below and checks whether each one is connected to an OSM object, using an [API](https://osm.wikidata.link/tagged)

* this [Notebook](https://github.com/salgo60/ProjectOutdoorGyms/blob/main/Jupyter/OSM_Wikidata.ipynb)

* API [Wikidata to OpenStreetMap](https://osm.wikidata.link/tagged)
  * eg. [https://osm.wikidata.link/tagged/api/item/Q106708773](https://osm.wikidata.link/tagged/api/item/Q106708773)

* Another tool [osm.wikidata.link](https://osm.wikidata.link/search)
  

TODO: 
* 


In [1]:
from datetime import datetime
# Remember when this run began; the final cell uses it to report the elapsed time.
start_time = datetime.now()
print("Last run: ", start_time, sep=" ")

Last run:  2023-08-01 19:47:41.235111


In [2]:
import pandas as pd


In [3]:
#
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys,json
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"
 
# Query for the candidate items. It selects every ?site that reaches wd:Q46831
# through wdt:P31/wdt:P279* (presumably "instance of" followed by any number of
# "subclass of" steps - verify on Wikidata) and that has a wdt:P625 value, and
# then removes (MINUS) every ?site that already has at least one of wdt:P10689,
# wdt:P402 or wdt:P11693. Judging by the variable names these are the OSM
# id properties - TODO confirm which OSM element type each one refers to.
# ?qid is derived from the item URI by replacing everything up to the last "Q" with "Q".
# The two lines starting with "#" inside the string are SPARQL comments (disabled filters).
queryWD = """SELECT DISTINCT (REPLACE(STR(?site), ".*Q", "Q") AS ?qid) ?site WHERE {
  #?site wdt:P17 wd:Q33. 
  #?site wdt:P17/wdt:P30 wd:Q46. #Europe
  ?site wdt:P31/wdt:P279* wd:Q46831.
  ?site wdt:P625 ?coordinates.
   minus {
    { ?site wdt:P10689 ?OSMid. }
    UNION
    { ?site wdt:P402 ?OSMrelid. }
    UNION 
    { ?site wdt:P11693 ?OSMnode. }
  }
}"""


def get_sparql_dataframe(endpoint_url, query):
    """
    Run a SPARQL query and return its result set as a Pandas data frame.

    The columns are the variables listed in the response head, in that order.
    Each cell holds the binding's 'value' string; a variable that is unbound
    in a given row is stored as None.
    """
    agent = "salgo60/{}.{}".format(sys.version_info[0], sys.version_info[1])

    client = SPARQLWrapper(endpoint_url, agent=agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    raw_response = client.query().response

    # Parse the raw JSON body directly rather than going through the wrapper's converter.
    payload = json.load(raw_response)
    columns = payload['head']['vars']
    records = [
        [binding.get(name, {}).get('value') for name in columns]
        for binding in payload['results']['bindings']
    ]

    return pd.DataFrame(records, columns=columns)

# Fetch the candidate items and tag every row with its origin ("WD" = Wikidata).
WDo = get_sparql_dataframe(endpoint_url, queryWD).assign(Source="WD")
WDo.shape

(42543, 3)

In [4]:
# Preview the first five query results.
WDo.iloc[:5]

Unnamed: 0,qid,site,Source
0,Q72307,http://www.wikidata.org/entity/Q72307,WD
1,Q72931,http://www.wikidata.org/entity/Q72931,WD
2,Q72989,http://www.wikidata.org/entity/Q72989,WD
3,Q73004,http://www.wikidata.org/entity/Q73004,WD
4,Q74259,http://www.wikidata.org/entity/Q74259,WD


In [5]:
import urllib3, json
from tqdm import tqdm
# One shared connection pool for all lookups against the osm.wikidata.link API.
http = urllib3.PoolManager()

# One record per Wikidata item: {'wikidata': qid, 'osmid': ..., 'type': ...}.
# 'osmid' and 'type' stay "" when no OSM object was found; later cells
# split the table on osmid == "".
listWDo = []
for _, row in tqdm(WDo.iterrows(), total=WDo.shape[0]):
    qid = row["qid"]
    url = "https://osm.wikidata.link/tagged/api/item/" + qid

    # Reset per item. Without this, a failed request would leave the previous
    # item's payload in scope and its OSM id would be recorded for this QID.
    osmid = ""
    osmtype = ""
    status = None
    try:
        r = http.request('GET', url)
        status = r.status
        data = json.loads(r.data.decode('utf-8'))
    except (urllib3.exceptions.HTTPError, ValueError) as err:
        # Transport failure, or a body that is not valid UTF-8 JSON
        # (UnicodeDecodeError and JSONDecodeError are both ValueError).
        # Report it and record the item as unmatched. `status` is None when
        # the request itself failed, so this handler can never raise NameError.
        print(status, url, err)
    else:
        try:
            first_hit = data["osm"][0]
            # Assign both together so a partial hit never leaves id and type out of sync.
            osmid, osmtype = first_hit["id"], first_hit["type"]
        except (KeyError, IndexError, TypeError):
            # No "osm" entry, an empty list, or an unexpected payload shape:
            # treated as "no OSM object linked".
            pass

    listWDo.append({'wikidata': qid, 'osmid': osmid, 'type': osmtype})
print(len(listWDo))

100%|██████████| 42543/42543 [3:52:59<00:00,  3.04it/s]   

42543





In [6]:
# Tabulate the lookup records with a fixed column order.
OSMtot = pd.DataFrame(data=listWDo, columns=['wikidata', 'type', 'osmid'])
OSMtot.shape


(42543, 3)

In [7]:
# Allow long cell values to be shown without truncation, then preview ten rows.
pd.options.display.max_colwidth = 400
OSMtot.iloc[:10]

Unnamed: 0,wikidata,type,osmid
0,Q72307,relation,14019460.0
1,Q72931,,
2,Q72989,,
3,Q73004,,
4,Q74259,node,4855618221.0
5,Q75596,node,1208394572.0
6,Q76034,,
7,Q76570,,
8,Q83788,,
9,Q84155,,


In [8]:
# Shape of the subset whose lookup produced no OSM object (osmid left as "").
OSMtot.loc[OSMtot['osmid'] == ""].shape

(40858, 3)

In [9]:
# Items for which the lookup returned no OSM object.
OSMEmpty = OSMtot.loc[OSMtot['osmid'] == ""]

In [10]:
# (rows, columns) of the unmatched subset.
OSMEmpty.shape

(40858, 3)

In [11]:
# Write the unmatched items to disk, then preview the first five rows.
OSMEmpty.to_csv(path_or_buf="WD - OSM mountain missing.csv")
OSMEmpty.iloc[:5]

Unnamed: 0,wikidata,type,osmid
1,Q72931,,
2,Q72989,,
3,Q73004,,
6,Q76034,,
7,Q76570,,


In [12]:
# Items for which the lookup did return an OSM object: keep, export, preview.
OSMConnected = OSMtot.loc[OSMtot['osmid'] != ""]
OSMConnected.to_csv(path_or_buf="WD - OSM_mountain.csv")
OSMConnected.iloc[:5]

Unnamed: 0,wikidata,type,osmid
0,Q72307,relation,14019460
4,Q74259,node,4855618221
5,Q75596,node,1208394572
10,Q93332,node,4857603223
11,Q105274,node,5835016385


In [13]:
# One-line run summary: run date, number of queried items, number without an OSM match.
print(
    "*",
    start_time.strftime("%Y%m%d"),
    "WD objects",
    len(WDo),
    "ej OSM kopplade",
    len(OSMEmpty),
)


* 20230801 WD objects 42543 ej OSM kopplade 40858


Generate a Markdown table row, e.g.
| 20210526     | 2802 | 2050 |1147 | 254 | 213| 84|


In [14]:
# Emit the run as a Markdown table row: date | queried items | distinct unmatched items.
run_date = start_time.strftime("%Y%m%d")
print("|", run_date, "|", len(WDo), "|", OSMEmpty.wikidata.nunique(), "|")


| 20230801 | 42543 | 40858 |


In [15]:
# Take a single end timestamp and reuse it, so the printed end time and the
# elapsed time agree (previously the elapsed time came from a second now() call).
end = datetime.now()
print("Ended: ", end)
print('Time elapsed (hh:mm:ss.ms) {}'.format(end - start_time))

Ended:  2023-08-01 23:40:46.845889
Time elapsed (hh:mm:ss.ms) 3:53:05.611062
