[#1436](https://github.com/salgo60/DIGG-skuggbacklog/issues/1426)

In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import difflib
import re

# --- 1Ô∏è‚É£  H√§mta kommuner i Dataportalen ------------------------------------
dp = SPARQLWrapper("https://admin.dataportal.se/sparql")
dp.setQuery("""
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dct:  <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT ?org ?name
WHERE {
  ?dataset a dcat:Dataset ;
           dct:publisher ?org .
  ?org foaf:name ?name .
  FILTER(CONTAINS(LCASE(?name), "kommun"))
}
ORDER BY ?name
""")
dp.setReturnFormat(JSON)
dp_res = dp.query().convert()
df_dp = pd.DataFrame([{k:v["value"] for k,v in b.items()} for b in dp_res["results"]["bindings"]])
print(f"Kommuner hittade i Dataportalen: {len(df_dp)}")

# extrahera ID ur URL: .../organisations/{id}
df_dp["dataportal_id"] = df_dp["org"].apply(lambda x: re.findall(r'/([^/]+)$', x)[0])
df_dp.to_csv("dataportal_kommuner.csv", index=False)

# --- 2Ô∏è‚É£  H√§mta kommuner fr√•n Wikidata -------------------------------------
wd = SPARQLWrapper("https://query.wikidata.org/sparql")
wd.setQuery("""
SELECT ?item ?itemLabel ?www WHERE {
  ?item wdt:P31 wd:Q127448 .
  OPTIONAL { ?item wdt:P856 ?www. }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
}
ORDER BY ?itemLabel
""")
wd.setReturnFormat(JSON)
wd_res = wd.query().convert()
df_wd = pd.DataFrame([{k:v["value"] for k,v in b.items()} for b in wd_res["results"]["bindings"]])
print(f"Kommuner hittade i Wikidata: {len(df_wd)}")

# --- 3Ô∏è‚É£  Matchning ---------------------------------------------------------
matches = []
for _, row in df_dp.iterrows():
    name = row["name"].strip()
    match = difflib.get_close_matches(name, df_wd["itemLabel"], n=1, cutoff=0.85)
    if match:
        wd_row = df_wd[df_wd["itemLabel"] == match[0]].iloc[0]
        matches.append({
            "dataportal_name": name,
            "dataportal_id": row["dataportal_id"],
            "wikidata_qid": wd_row["item"].split("/")[-1],
            "wikidata_label": wd_row["itemLabel"]
        })
    else:
        matches.append({
            "dataportal_name": name,
            "dataportal_id": row["dataportal_id"],
            "wikidata_qid": None,
            "wikidata_label": None
        })

df_match = pd.DataFrame(matches)
df_match.to_csv("matchning_kommuner.csv", index=False)
print(f"Matchningar funna: {df_match['wikidata_qid'].notna().sum()}")

# --- 4Ô∏è‚É£  QuickStatements-fil ----------------------------------------------
qs_lines = []
for _, row in df_match.dropna(subset=["wikidata_qid"]).iterrows():
    qid = row["wikidata_qid"]
    dp_id = row["dataportal_id"]
    qs_line = (
        f'{qid}\tP1343\tQ92961134\tS854\t"https://www.dataportal.se/metadatakvalitet/katalog/_quality/{dp_id}"'
    )
    qs_lines.append(qs_line)

pd.Series(qs_lines).to_csv("quickstatements_kommuner.tsv", index=False, header=False)
print("‚úÖ Fil skapad: quickstatements_kommuner.tsv")


Kommuner hittade i Dataportalen: 69
Kommuner hittade i Wikidata: 308
Matchningar funna: 65
‚úÖ Fil skapad: quickstatements_kommuner.tsv


In [4]:
df_match

Unnamed: 0,dataportal_name,dataportal_id,wikidata_qid,wikidata_label
0,QGIS-server √ñrebro kommun - Projket,www.qgisserverorebrokommunprojket.se,,
1,Ume√• kommun,7a00bd3796ed09a600646432cb321722,Q507709,Ume√• kommun
2,Upplands V√§sby kommun,upplandsvasby.se,Q499425,Upplands V√§sby kommun
3,√ñrebro kommun,www.orebro.se,Q297718,√ñrebro kommun
4,Ale kommun,SE2120001439,Q498470,Ale kommun
...,...,...,...,...
64,Ving√•kers kommun,SE2120000308,Q249378,Ving√•kers kommun
65,V√§rmd√∂ kommun,SE2120000035,Q493841,V√§rmd√∂ kommun
66,Ystads kommun,SE2120001181,Q505102,Ystads kommun
67,√Ñlvkarleby kommun,SE2120000258,Q59858,√Ñlvkarleby kommun


In [5]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

wd = SPARQLWrapper("https://query.wikidata.org/sparql")
wd.setQuery("""
SELECT ?item ?itemLabel ?orgnr WHERE {
  ?item wdt:P31 wd:Q127448;
        wdt:P6460 ?orgnr.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
}
ORDER BY ?itemLabel
""")
wd.setReturnFormat(JSON)
results = wd.query().convert()
df_wd_orgnr = pd.DataFrame([{k:v["value"] for k,v in b.items()} for b in results["results"]["bindings"]])
df_wd_orgnr.to_csv("wikidata_kommuner_orgnr.csv", index=False)
print(df_wd_orgnr.head())


                                     item        orgnr        itemLabel
0  http://www.wikidata.org/entity/Q498470  212000-1439       Ale kommun
1  http://www.wikidata.org/entity/Q503162  212000-1553  Alings√•s kommun
2  http://www.wikidata.org/entity/Q182007  212000-0639   Alvesta kommun
3  http://www.wikidata.org/entity/Q503167  212000-0498     Aneby kommun
4  http://www.wikidata.org/entity/Q431271  212000-2122    Arboga kommun


In [9]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import re

# -----------------------------------------------------------------------------
# 1Ô∏è‚É£ H√§mta kommuner fr√•n Dataportalen
# -----------------------------------------------------------------------------
print("üîπ H√§mtar kommuner fr√•n Dataportalen...")
dp = SPARQLWrapper("https://admin.dataportal.se/sparql")
dp.setQuery("""
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dct:  <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT ?org ?name
WHERE {
  ?dataset a dcat:Dataset ;
           dct:publisher ?org .
  ?org foaf:name ?name .
  FILTER(CONTAINS(LCASE(?name), "kommun"))
}
ORDER BY ?name
""")
dp.setReturnFormat(JSON)
dp_res = dp.query().convert()
df_dp = pd.DataFrame([{k:v["value"] for k,v in b.items()} for b in dp_res["results"]["bindings"]])
print(f"  üìò Hittade {len(df_dp)} organisationer i Dataportalen")

# Extrahera orgnr ur URI: http://dataportal.se/organisation/SE2120001553
def extract_orgnr(uri):
    m = re.search(r'SE(\d{10})', uri)
    return f"{m.group(1)[:6]}-{m.group(1)[6:]}" if m else None

df_dp["orgNumber"] = df_dp["org"].apply(extract_orgnr)
df_dp = df_dp.dropna(subset=["orgNumber"])
df_dp.to_csv("dataportal_kommuner.csv", index=False)
print(f"  ‚úÖ {len(df_dp)} hade giltigt organisationsnummer")

# -----------------------------------------------------------------------------
# 2Ô∏è‚É£ H√§mta kommuner fr√•n Wikidata
# -----------------------------------------------------------------------------
print("\nüîπ H√§mtar kommuner fr√•n Wikidata...")
wd = SPARQLWrapper("https://query.wikidata.org/sparql")
wd.setQuery("""
SELECT ?item ?itemLabel ?orgnr WHERE {
  ?item wdt:P31 wd:Q127448;
        wdt:P6460 ?orgnr.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv,en". }
}
ORDER BY ?itemLabel
""")
wd.setReturnFormat(JSON)
wd_res = wd.query().convert()
df_wd = pd.DataFrame([{k:v["value"] for k,v in b.items()} for b in wd_res["results"]["bindings"]])
print(f"  üìó Hittade {len(df_wd)} kommuner i Wikidata")

df_wd["QID"] = df_wd["item"].apply(lambda x: x.split("/")[-1])
df_wd.to_csv("wikidata_kommuner.csv", index=False)

# -----------------------------------------------------------------------------
# 3Ô∏è‚É£ Matcha p√• organisationsnummer
# -----------------------------------------------------------------------------
print("\nüîπ Matchar dataportal ‚Üî wikidata via orgnummer...")
df_join = pd.merge(df_dp, df_wd, left_on="orgNumber", right_on="orgnr", how="outer", indicator=True)

# Matchningar
df_match = df_join[df_join["_merge"] == "both"]
df_match.to_csv("matchade_kommuner.csv", index=False)

# Bara i Dataportalen
df_only_dp = df_join[df_join["_merge"] == "left_only"][["org", "name", "orgNumber"]]
df_only_dp.to_csv("endast_i_dataportal.csv", index=False)

# Bara i Wikidata
df_only_wd = df_join[df_join["_merge"] == "right_only"][["item", "itemLabel", "orgnr"]]
df_only_wd.to_csv("endast_i_wikidata.csv", index=False)

print(f"  üîó Matchade kommuner: {len(df_match)}")
print(f"  üü¶ Endast i Dataportalen: {len(df_only_dp)}")
print(f"  üü® Endast i Wikidata: {len(df_only_wd)}")

# -----------------------------------------------------------------------------
# 4Ô∏è‚É£ Skapa QuickStatements-fil
# -----------------------------------------------------------------------------
qs_lines = []
for _, row in df_match.iterrows():
    qid = row["QID"]
    uri = row["org"]
    line = f'{qid}\tP1343\tQ92961134\tS854\t"{uri}"'
    qs_lines.append(line)

pd.Series(qs_lines).to_csv("quickstatements_kommuner.tsv", index=False, header=False)
print(f"\n‚úÖ Klar! {len(qs_lines)} rader skapade i quickstatements_kommuner.tsv")

# -----------------------------------------------------------------------------
# 5Ô∏è‚É£ Sammanfattning
# -----------------------------------------------------------------------------
print("\nüìä Sammanfattning:")
print(df_match[["QID", "itemLabel", "orgNumber", "org"]].head(10))
print("\nFiler skapade:")
print("  ‚Ä¢ dataportal_kommuner.csv")
print("  ‚Ä¢ wikidata_kommuner.csv")
print("  ‚Ä¢ matchade_kommuner.csv")
print("  ‚Ä¢ endast_i_dataportal.csv")
print("  ‚Ä¢ endast_i_wikidata.csv")
print("  ‚Ä¢ quickstatements_kommuner.tsv")


üîπ H√§mtar kommuner fr√•n Dataportalen...
  üìò Hittade 69 organisationer i Dataportalen
  ‚úÖ 58 hade giltigt organisationsnummer

üîπ H√§mtar kommuner fr√•n Wikidata...
  üìó Hittade 290 kommuner i Wikidata

üîπ Matchar dataportal ‚Üî wikidata via orgnummer...
  üîó Matchade kommuner: 57
  üü¶ Endast i Dataportalen: 1
  üü® Endast i Wikidata: 233

‚úÖ Klar! 57 rader skapade i quickstatements_kommuner.tsv

üìä Sammanfattning:
        QID          itemLabel    orgNumber  \
0   Q498470         Ale kommun  212000-1439   
1   Q503162    Alings√•s kommun  212000-1553   
2   Q113718    Botkyrka kommun  212000-2882   
3   Q503144  Eskilstuna kommun  212000-0357   
4  Q1130264      Esl√∂vs kommun  212000-1173   
5   Q503127     Essunga kommun  212000-2916   
6   Q503214  Falk√∂pings kommun  212000-1744   
7   Q501545        Falu kommun  212000-2221   
8   Q505259   Gislaveds kommun  212000-0514   
9   Q503148   Gr√§storps kommun  212000-1595   

                                     

In [10]:
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dct:  <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?publisher ?name (COUNT(?dataset) AS ?datasets)
WHERE {
  ?dataset a dcat:Dataset ;
           dct:publisher ?publisher .
  ?publisher foaf:name ?name .
  FILTER(CONTAINS(LCASE(?name), "kommun"))
}
GROUP BY ?publisher ?name
ORDER BY DESC(?datasets)


SyntaxError: invalid syntax (394864254.py, line 1)