Check Open Data portals for Swedish municipalities

In [1]:
from datetime import datetime
start_time  = datetime.now()
print("Last runa: ", start_time)

Last run:  2021-06-28 14:11:16.227553


In [2]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys,json
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

queryOpenData = """SELECT  (REPLACE(STR(?org), ".*Q", "Q") AS ?wikidata) ?org ?orgLabel ?portalLabel  ?www WHERE {
  ?org wdt:P8402 ?portal.
  ?org wdt:P31 wd:Q127448.
  ?org wdt:P361 ?lan.
  ?lan wdt:P31 wd:Q193556.
  OPTIONAL { ?portal wdt:P856 ?www }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv". }
}
GROUP BY ?org ?orgLabel ?portal ?portalLabel ?www  """


def get_sparql_dataframe(endpoint_url, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    user_agent = "salgo60/%s.%s" % (sys.version_info[0], sys.version_info[1])
 
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)

    return pd.DataFrame(out, columns=cols)

WDOpenData = get_sparql_dataframe(endpoint_url, queryOpenData)
WDOpenData.shape

(154, 5)

In [3]:
pd.set_option('max_colwidth', 400)
WDOpenData.head(10)

Unnamed: 0,wikidata,org,orgLabel,portalLabel,www
0,Q113692,http://www.wikidata.org/entity/Q113692,Haninge kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
1,Q113718,http://www.wikidata.org/entity/Q113718,Botkyrka kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
2,Q113730,http://www.wikidata.org/entity/Q113730,Tyresö kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
3,Q492575,http://www.wikidata.org/entity/Q492575,Huddinge kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
4,Q499460,http://www.wikidata.org/entity/Q499460,Nykvarns kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
5,Q505090,http://www.wikidata.org/entity/Q505090,Nynäshamns kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
6,Q506250,http://www.wikidata.org/entity/Q506250,Stockholms kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
7,Q516080,http://www.wikidata.org/entity/Q516080,Salems kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
8,Q516336,http://www.wikidata.org/entity/Q516336,Södertälje kommun,Stockholmskommunernas öppna data-portal,https://storsthlm.dataplatform.se
9,Q516336,http://www.wikidata.org/entity/Q516336,Södertälje kommun,Södertäljes öppna data,https://www.sodertalje.se/PSIdata


In [4]:
import urllib3, json
from tqdm import tqdm
http = urllib3.PoolManager()

listOpenData = []
for WD, row in tqdm(WDOpenData.iterrows(), total=WDOpenData.shape[0]):
    url = row["www"] 
    
    new_item = dict()
    new_item['wikidata'] = row["wikidata"] 
    #print(url)
    try:
        r = http.request('GET', url) 
        new_item['status'] = r.status

    except:
        print ("Error ", r.status, url, row["wikidata"] )
        new_item['status'] = r.status
    if r.status != 200:
        print (row["wikidata"] , r.status, url)
    new_item['url'] = url 
#    new_item['country'] = row["country"] 
    
    listOpenData.append(new_item)
print (len(listOpenData))

 11%|█         | 17/154 [00:03<00:34,  3.98it/s]

Q504692 404 https://www.halmstad.se/psidata


 19%|█▉        | 30/154 [00:06<00:39,  3.10it/s]

Error  200 http://www.helsingborg.se/psidata Q487648


 54%|█████▍    | 83/154 [00:27<00:25,  2.74it/s]

Error  200 https://www.hultsfred.se/psidata Q512002


 64%|██████▎   | 98/154 [00:32<00:11,  4.84it/s]

Q504505 404 https://www.vindeln.se/psidata


 83%|████████▎ | 128/154 [00:42<00:06,  3.86it/s]

Error  200 http://www.robertsfors.se/psidata Q507670


100%|██████████| 154/154 [00:54<00:00,  2.83it/s]

154





In [6]:
OpenDatatot = pd.DataFrame(listOpenData,
                  columns=['wikidata','status','url'])
OpenDatatot.shape


(154, 3)

In [9]:
OpenDatatot[OpenDatatot.status == 400]

Unnamed: 0,wikidata,status,url
