## Webscrape kulturpersoner uppsalakyrkogardar
version 0.4

checks uppsalakyrkogardar and compare with Wikidata. 

* This [notebook](https://github.com/salgo60/open-data-examples/blob/master/Check%20WD%20kulturpersoner%20uppsalakyrkogardar.ipynb)
* Wikidata SPARQL https://w.wiki/PgP
* Webpage to check https://kulturpersoner.uppsalakyrkogardar.se/galleri/

#### Other sources we sync
* [Kulturpersoner Uppsalakyrkogård](https://github.com/salgo60/open-data-examples/blob/master/Check%20WD%20kulturpersoner%20uppsalakyrkogardar.ipynb)
* [Litteraturbanken](https://github.com/salgo60/open-data-examples/blob/master/Litteraturbanken%20Author.ipynb) 
  * WD property [P5101](https://www.wikidata.org/wiki/Property_talk:P5101) [P5123](https://www.wikidata.org/wiki/Property_talk:P5123)
* [Nobelprize.org](https://github.com/salgo60/open-data-examples/blob/master/Nobel%20API.ipynb)
  * WD [property 8024](https://www.wikidata.org/wiki/Property:P8024)
* [SBL](https://github.com/salgo60/open-data-examples/blob/master/SBL.ipynb) 
  * WD [property 3217](https://www.wikidata.org/wiki/Property:P3217) 
* [SKBL](https://github.com/salgo60/open-data-examples/blob/master/Svenskt%20Kvinnobiografiskt%20lexikon%20part%203.ipynb)
  * WD [property 4963](https://www.wikidata.org/wiki/Property:P4963)
* [Svenska Akademien](https://github.com/salgo60/open-data-examples/blob/master/Svenska%20Akademien.ipynb) 
  * WD [property 5325](https://www.wikidata.org/wiki/Property:P5325) 


In [1]:
from datetime import datetime
start_time  = datetime.now()
print("Last run: ", start_time)

Last run:  2023-02-16 10:00:29.897586


In [2]:
#https://kulturpersoner.uppsalakyrkogardar.se/galleri/
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
def cleanupstring(s):
    out_s = s
    while '  ' in out_s:
        out_s = out_s.strip().replace('  ', ' ')
    return out_s, len(s)-len(out_s)

urls = [
    'https://kulturpersoner.uppsalakyrkogardar.se/galleri/']

newList = []

for url in urls:

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    soup.findAll('a')
    one_a_tag = soup.findAll('a')[0]
    #    print ( one_a_tag )
    #   link = one_a_tag['href']
    time.sleep(1)
    for i in range(1, len(soup.findAll('a'))):
        new_item = dict()
        one_a_tag = soup.findAll('a')[i]
        one_a_tag_href = one_a_tag['href']
        #print(one_a_tag_href)
        #print("\t",one_a_tag.text)
        if   "karta" not in one_a_tag_href \
                and not "Kyrkogårdsvandring" in one_a_tag.text \
                and not "Search" in one_a_tag.text \
                and not "Start" in one_a_tag.text \
                and not "Vandringar" in one_a_tag.text \
                and not "Guidad" in one_a_tag.text \
                and not "galleri" in one_a_tag_href \
                and not "om-oss"  in one_a_tag_href \
                and not "kyrkogardsexpedition"  in one_a_tag_href:
            #print(one_a_tag.text,"|",one_a_tag.text.replace(" ","_").replace("%C3%84","Ä"),"|",one_a_tag_href.replace("/genealogi/",""))
            
            #person = one_a_tag.text.replace("https://kulturpersoner.uppsalakyrkogardar.se/","").replace("-"," ").replace("%C3%84","Ä")
            #person = one_a_tag.text.replace("https://kulturpersoner.uppsalakyrkogardar.se/","")
            new_item["kulturgravname"] = one_a_tag.text
            new_item["href"] = one_a_tag['href'].replace("https://kulturpersoner.uppsalakyrkogardar.se/","").replace("/","")
            #print(new_item)
            newList.append(new_item)
print (len(newList) ," antal poster")

131  antal poster


In [3]:
import pandas as pd  
Kulturgravar = pd.DataFrame(newList)

In [4]:
Kulturgravar.head()

Unnamed: 0,kulturgravname,href
0,Digitala kyrkogårdsvandringar,digitalvandring
1,SV,#weglot_switcher
2,Anita Nathorst,anita-nathorst
3,Henri Osti,henri-osti
4,Fadime Sahindal,fadime-sahindal


## Wikidata  
check graves with ref kulturpersoner https://w.wiki/crt

In [5]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys,json
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

# https://w.wiki/3teL

query = """select ?person (REPLACE(STR(?person),".*Q","Q") AS ?qid) ?personLabel ?refURL ?href
{
   hint:Query hint:optimizer "None" .
   ?person wdt:P119 wd:Q4353116;
           p:P119 ?burial_statement. #Place of burial
   {?burial_statement prov:wasDerivedFrom ?ref}
   {?ref pr:P854 ?refURL} 
   FILTER (CONTAINS(str(?refURL),'kulturpersoner')) .  
   BIND(REPLACE(REPLACE(str(?refURL), "https://kulturpersoner.uppsalakyrkogardar.se/", ""),"/","") AS ?href)
 
   SERVICE wikibase:label { bd:serviceParam wikibase:language "sv"}
   Filter(!CONTAINS(str(?refURL),".mp4") ) # some links to mp4
}
Order by ?personLabel
"""

def get_sparql_dataframe(endpoint_url, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    user_agent = "salgo60/%s.%s" % (sys.version_info[0], sys.version_info[1])
 
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)

    return pd.DataFrame(out, columns=cols)

WDKulturGravar = get_sparql_dataframe(endpoint_url, query)


In [6]:
WDKulturGravar.to_csv("Check_WD_kulturpersoner_uppsalakyrkogardar.csv")
WDKulturGravar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   person       122 non-null    object
 1   qid          122 non-null    object
 2   personLabel  122 non-null    object
 3   refURL       122 non-null    object
 4   href         122 non-null    object
dtypes: object(5)
memory usage: 4.9+ KB


In [7]:
from IPython.display import display, HTML  
WDKulturGravar["WD"] = "<a href='http://www.wikidata.org/entity/" + WDKulturGravar['qid'].astype(str) + "#P119'>WD " + WDKulturGravar['qid'].astype(str) + "</a>"
WDKulturGravar["kulturperson"] = "<a href='https://kulturpersoner.uppsalakyrkogardar.se/" + WDKulturGravar['href'].astype(str) + "'>" + WDKulturGravar['href'].astype(str) + "</a>"

pd.set_option("display.max.columns", None) 
HTML(WDKulturGravar[{'WD','personLabel','kulturperson'}].tail(15).to_html(escape=False))


  HTML(WDKulturGravar[{'WD','personLabel','kulturperson'}].tail(15).to_html(escape=False))


Unnamed: 0,personLabel,kulturperson,WD
107,Rutger Sernander,rutger-sernander,WD Q919826
108,Sixtus Janson,sixtus-janson,WD Q38772992
109,Sonja Lyttkens,sonja-lyttkens,WD Q19844789
110,Svante Arrhenius,svante-arrhenius,WD Q80956
111,Sven Anders Hägg,sven-anders-hagg,WD Q99485352
112,Sven Lilja,sven-lilja,WD Q5951505
113,Thekla Knös,thekla-knos,WD Q4959294
114,Topper Martyn,topper-martyn,WD Q5978552
115,Tycho Hedén,tycho-heden,WD Q5795469
116,Ulla-Bella Fridh,ulla-bella-fridh-gabrielsson,WD Q4948950


compare href 

In [8]:
dfmerge = pd.merge(WDKulturGravar, Kulturgravar,how='outer', on='href',indicator=True)

In [9]:
dfmerge['_merge'] = dfmerge['_merge'].str.replace('left_only','Wikidata_only').str.replace('right_only','Kulturgravar_only')
dfmerge.rename(columns={"_merge": "Kulturgravar_Wikidata"},inplace = True)


In [10]:
dfmerge["Kulturgravar_Wikidata"].value_counts()

both                 128
Kulturgravar_only      4
Wikidata_only          1
Name: Kulturgravar_Wikidata, dtype: int64

### Any diff Wikidata <-> Kulturgravar

In [11]:
KulturgravarNotBoth = dfmerge[~(dfmerge["Kulturgravar_Wikidata"] == "both")].copy() 
KulturgravarNotBoth.shape[0]


5

In [12]:
KulturgravarNotBoth["refURL"]


29     https://kulturpersoner.uppsalakyrkogardar.se/e...
129                                                  NaN
130                                                  NaN
131                                                  NaN
132                                                  NaN
Name: refURL, dtype: object

In [13]:
Kulturgravaronly = dfmerge[dfmerge["Kulturgravar_Wikidata"] == "Kulturgravar_only"].copy() 

In [14]:
Kulturgravaronly.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 129 to 132
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   person                 0 non-null      object
 1   qid                    0 non-null      object
 2   personLabel            0 non-null      object
 3   refURL                 0 non-null      object
 4   href                   4 non-null      object
 5   WD                     0 non-null      object
 6   kulturperson           0 non-null      object
 7   kulturgravname         4 non-null      object
 8   Kulturgravar_Wikidata  4 non-null      object
dtypes: object(9)
memory usage: 320.0+ bytes


In [15]:
Kulturgravaronly["url"] = "https://kulturpersoner.uppsalakyrkogardar.se/" + Kulturgravaronly["href"] 
pd.set_option('column_space', 20)
pd.set_option('max_colwidth', 300)
Kulturgravaronly = Kulturgravaronly.reset_index(drop=True)  

Kulturgravaronly[["kulturgravname","url"]]

Unnamed: 0,kulturgravname,url
0,Digitala kyrkogårdsvandringar,https://kulturpersoner.uppsalakyrkogardar.se/digitalvandring
1,SV,https://kulturpersoner.uppsalakyrkogardar.se/#weglot_switcher
2,Henri Osti,https://kulturpersoner.uppsalakyrkogardar.se/henri-osti
3,Egmont Tornberg,https://kulturpersoner.uppsalakyrkogardar.se/egmont-tornberg


In [16]:
end = datetime.now()
print("Ended: ", end)
print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))

Ended:  2023-02-16 10:00:37.553541
Time elapsed (hh:mm:ss.ms) 0:00:07.656590
