In [5]:
# Jupyter Notebook for Finlandssvenka tekniker - articles about individual Swedish speaking enginers active in 1800s in Finland
# output are markdown files that have been published at https://projektfredrika.fi/finsvetekniker
#
# 01 & 02 Scrape all articles from tfif.fi/finsvetekniker
# 03 Clean the article text with chatgpt4 API call (content is OCRd and has some "junk" in it
# 04 Search for each engineer's Wikidata Q-code and Wikipedia language articles with Wikidata API call
# 05 for each Wikipedia article language version, generate a an improved Wikipedia article based on soley on the given source (Finlandssvenska tekniker)
# 06 do NER on each Finlandssvenska tekniker article and save the NER, label as well as related Wikidata item
# 07 Create markdown file for each enginer with 1) cleaned article 2) NER-list 3) Wikipedia article suggestion in all languages
# 08 Create markdown file summary of NER-entities from all engineers
# Several steps loads/saves the data to/from a pkl file to not lose intermediate results (mainly cumulating to a list of dicts in finsvetekn-03.pkl) 

In [6]:
# 01 Get URLs of all articles at tfif.fi/finsvetekniker as well as metadata
import pickle 
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://tfif.fi/finsvetekniker/"

def get_links(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        a_tags = soup.find_all('a')
        results = []
        current_band = ''  # Initialize variable to store the current band
        for tag in a_tags:
            # Check for the preceding h2 tag for the current a-tag
            preceding_h2 = tag.find_previous('h2')
            if preceding_h2:
                current_band = preceding_h2.text.strip()  # Update current band with the h2 content
            href = tag.get('href')
            title = tag.text.strip()
            if href and not any(result['url'] == href for result in results):
                # Add 'band' to the dictionary
                results.append({'url': href, 'title': title, 'band': current_band})
            print(title, href, current_band)
    
        return results
    else:
        return []
    
pages = get_links(url)
print("Links found: "+str(len(pages)))
for p in pages:
    print(p)
with open("finsvetekn-01.pkl", "wb") as f:
    pickle.dump(pages, f)

Edvin Leonard Bergroth Edvin_Leonard_Bergroth.html Finlandssvenska tekniker I (1923)
Carl Adolf Engström Carl_Adolf_Engstrom.html Finlandssvenska tekniker I (1923)
Gustaf Adolf Helsingius Gustaf_Adolf_Helsingius.html Finlandssvenska tekniker I (1923)
Jacob Robert Huber Jacob_Robert_Huber.html Finlandssvenska tekniker I (1923)
Carl Theodor Höijer Carl_Theodor_Hoijer.html Finlandssvenska tekniker I (1923)
Bruno V. Nordberg Bruno_V_Nordberg.html Finlandssvenska tekniker I (1923)
Karl Evert Palmén Karl_Evert_Palmen.html Finlandssvenska tekniker I (1923)
Henrik Theodor Tallqvist Henrik_Theodor_Tallqvist.html Finlandssvenska tekniker I (1923)
Evert Edelfrid Wasastjerna Evert_Edelfrid_Wasastjerna.html Finlandssvenska tekniker I (1923)
Karl Joel Appelberg Karl_Joel_Appelberg.html Finlandssvenska tekniker II (1924)
Adolf Ossian Aschan Adolf_Ossian_Aschan.html Finlandssvenska tekniker II (1924)
Julius Donatus Forsman Julius_Donatus_Forsman.html Finlandssvenska tekniker II (1924)
Fredrik Werner L

In [7]:
# 02 Fetch content from all collected URLs at tfif.fi/finsvetekniker 
import html2text
import pickle 

def get_page(pages):
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = True
    converter.ignore_emphasis = False

    for page in pages:
        try:
            title = page['title']
            url = "https://tfif.fi/finsvetekniker/"+page['url']
            print(title, url)
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                img_tag = soup.find('img')
                if img_tag and img_tag.has_attr('src'):
                    page['img'] = img_tag['src']
                else:
                    page['img'] = ''
                page['content_html'] = str(soup)
                page['content_text'] = soup.get_text(separator=' ', strip=True)
                page['content_md'] = converter.handle(response.content.decode('utf-8'))
        except Exception as e:
            print(f"Error fetching page {page['url']}: {str(e)}")
            page['content_text'] = 'Error fetching content'
            page['content_md'] = 'Error fetching content'
            page['content_html'] = 'Error fetching content'
            page['img'] = 'Error fetching image'
    return pages

pages_with_content = get_page(pages)
print(len(pages_with_content))

with open("finsvetekn-02.pkl", "wb") as f:
    pickle.dump(pages_with_content, f)

Edvin Leonard Bergroth https://tfif.fi/finsvetekniker/Edvin_Leonard_Bergroth.html
Carl Adolf Engström https://tfif.fi/finsvetekniker/Carl_Adolf_Engstrom.html
Gustaf Adolf Helsingius https://tfif.fi/finsvetekniker/Gustaf_Adolf_Helsingius.html
Jacob Robert Huber https://tfif.fi/finsvetekniker/Jacob_Robert_Huber.html
Carl Theodor Höijer https://tfif.fi/finsvetekniker/Carl_Theodor_Hoijer.html
Bruno V. Nordberg https://tfif.fi/finsvetekniker/Bruno_V_Nordberg.html
Karl Evert Palmén https://tfif.fi/finsvetekniker/Karl_Evert_Palmen.html
Henrik Theodor Tallqvist https://tfif.fi/finsvetekniker/Henrik_Theodor_Tallqvist.html
Evert Edelfrid Wasastjerna https://tfif.fi/finsvetekniker/Evert_Edelfrid_Wasastjerna.html
Karl Joel Appelberg https://tfif.fi/finsvetekniker/Karl_Joel_Appelberg.html
Adolf Ossian Aschan https://tfif.fi/finsvetekniker/Adolf_Ossian_Aschan.html
Julius Donatus Forsman https://tfif.fi/finsvetekniker/Julius_Donatus_Forsman.html
Fredrik Werner Lindberg https://tfif.fi/finsvetekniker/

In [8]:
# 03 Clean content with OpenAI. Saves and loads pkl file. The loop is limited to x articles in list, and articles that have not previously been cleaned
import pandas as pd
import os 
from openai import OpenAI
import pickle
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def split_text_into_chunks(text, max_chunk_size=5000):
    def find_sentence_end(text):
        for i in range(len(text) - 1, -1, -1):
            if text[i] in '.!?':
                return i + 1
        return len(text)
    chunks = []
    while text:
        chunk_size = min(max_chunk_size, len(text))
        end_index = find_sentence_end(text[:chunk_size])
        chunks.append(text[:end_index])
        text = text[end_index:].strip()
    return chunks

def clean_text(content):
    chat_completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": f"""Skriv om följande text med läsbara tecken, och radera möjligt 'smuts' som kommit med från OCR. Ändra inte orden till modern svenska. Behåll markdown formatteringen som sådan. Om det finns ett irrelevant sidnummer mit i texten radera den. Radera ologiska mellanslag, t.ex. Peters­ burg ska vara Petersburg. 
                
                {content}
                """,
            }
        ],
    )
    answer = chat_completion.choices[0].message.content
    tokens = chat_completion.usage.total_tokens
    return answer, tokens

#with open("finsvetekn-02.pkl", "rb") as f:
with open("finsvetekn-03.pkl", "rb") as f: # change to 02 to restart without previously done cleaning
    pages_with_content = pickle.load(f)

for i, page in enumerate(pages_with_content):
    if page.get("content_md_cleaned") == None and i >= 0 and i < 1:
        print("Cleaning with ChatGPT: " + page["title"] + ", length " + str(len(page["content_md"])))
        page["content_md_cleaned"] = ""
        chunks = split_text_into_chunks(page["content_md"])
        chunks_total = len(chunks)
        print(f"split into chunks: {chunks_total}")
        for j, chunk in enumerate(chunks):
            chunk_cleaned, tokens = clean_text(chunk)
            print(f"Chunk {j+1}/{chunks_total}, before: {len(chunk)}, after {len(chunk_cleaned)}, tokens {tokens}")
            page["content_md_cleaned"] += str(chunk_cleaned)
        print("before: " + str(len(page["content_md"])) + ", after: " + str(len(page["content_md_cleaned"])))
        with open('finsvetekniker/'+page["url"].replace(".html",".md"), 'w', encoding='utf-8') as file:
            file.write(page["content_md_cleaned"])

with open("finsvetekn-03.pkl", "wb") as f:
    pickle.dump(pages_with_content, f)


In [184]:
# 04 fetch Wikidata Q-code and Wikipedia languagae titles with engineer's name

import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import pickle
import re
from urllib.parse import unquote

def get_results(person_name):
    endpoint_url = "https://query.wikidata.org/sparql"
    query = """SELECT ?person ?personLabel ?article WHERE {
  ?person ?label "<person_name>"@sv.
  ?article schema:about ?person .
  FILTER(CONTAINS(STR(?article), "wikipedia.org"))
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],sv,fi,en". }
}
""".replace("<person_name>",person_name)
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

def get_language_from_url(url):
    # Regex to match the language code in a Wikipedia URL
    pattern = r'https?://([a-z]{2,3})\.wikipedia\.org'
    match = re.search(pattern, url)
    if match:
        return match.group(1)  # Return the language code
    else:
        return None  # No language code found
    
def get_wikipedia_title_from_url(url):
    # Regex to extract the title part of a Wikipedia URL
    match = re.search(r'https?://[a-z]{2,3}\.wikipedia\.org/wiki/(.+)', url)
    if match:
        return match.group(1)  # Return the title part of the URL
    else:
        return None  # No title found

with open("finsvetekn-03.pkl", "rb") as f: 
    pages_with_content = pickle.load(f)

for page in pages_with_content:
  wikilinks = ""
  person = page["title"]
  print(person)
  results = get_results(person)
  page["wiki"] = []
  for result in results["results"]["bindings"]:
    wiki_url = result["article"]["value"]
    wiki_lang = get_language_from_url(wiki_url)
    wiki_title = unquote(get_wikipedia_title_from_url(wiki_url))
    wikilinks += "* " +wiki_url + "  \n"
    page["wiki"].append({"wiki_lang":wiki_lang, "wiki_url":wiki_url, "wiki_title":wiki_title})
  print("wikin:", len(page["wiki"]))
  page["wikitext"] = unquote(wikilinks)
  print(page["wikitext"])

with open("finsvetekn-03.pkl", "wb") as f:
    pickle.dump(pages_with_content, f)

Edvin Leonard Bergroth
wikin: 5
* https://en.wikipedia.org/wiki/Edvin_Bergroth  
* https://fi.wikipedia.org/wiki/Edvin_Bergroth  
* https://ru.wikipedia.org/wiki/Бергрот,_Эдвин_Иванович  
* https://sv.wikipedia.org/wiki/Edvin_Bergroth  
* https://uz.wikipedia.org/wiki/Edvin_Bergroth  

Carl Adolf Engström
wikin: 5
* https://ar.wikipedia.org/wiki/أدولف_إنغستروم  
* https://en.wikipedia.org/wiki/Adolf_Engström  
* https://fi.wikipedia.org/wiki/Adolf_Engström  
* https://ko.wikipedia.org/wiki/아돌프_엥스트룀  
* https://sv.wikipedia.org/wiki/Carl_Adolf_Engström  

Gustaf Adolf Helsingius
wikin: 2
* https://fi.wikipedia.org/wiki/Gustaf_Adolf_Helsingius  
* https://sv.wikipedia.org/wiki/Gustaf_Adolf_Helsingius  

Jacob Robert Huber
wikin: 2
* https://fi.wikipedia.org/wiki/Robert_Huber_(liikemies)  
* https://sv.wikipedia.org/wiki/Jacob_Robert_Huber  

Carl Theodor Höijer
wikin: 10
* https://arz.wikipedia.org/wiki/ثيودور_هويچير  
* https://arz.wikipedia.org/wiki/ثيودور_هويچير  
* https://en.wikiped

In [12]:
# 05 for each Wikipedia article language version, generate a an improved Wikipedia article based on soley on the given source (Finlandssvenska tekniker)
# loop is limited to one engineer definied in restrict-variable
from openai import OpenAI
import os
import requests
apikey = os.environ.get("OPENAI_API_KEY")
print(apikey)
client = OpenAI(api_key=apikey)
    
def get_wikipedia_article(title, lang):
    url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': False,
    }
    response = requests.get(url, params=params)
    data = response.json()
    page = next(iter(data['query']['pages'].values()))
    content = page['extract'] if 'extract' in page else 'Content not found'
    return content

# Function to send a prompt referencing the uploaded file
def send_prompt_with_file(contents, wikipedia, lang):
    chat_completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {
                "role": "user",
                #prompt version 1: "content": f"Nedan finns en text om en person, och Wikipedia artikeln på språk som har wikipediaspråk {lang} om denna person. Gör förbättringar till Wikipedia artikeln på språket med wikipediaspråk {lang} baserat endast på den givna texten - visa ändringarna du föreslår geneom att bolda dem (med markdown formatering) - använd inte boldning för något annat. Behåll paragraffördelningen som den är, men för korta artiklar med bara en paragraf, utöka antalet paragrafer. Använd lättläst språk. \n# Text om en person\n {contents} \n# Wikipedia \n {wikipedia}",
                #prompt version 2: 
                "content": f"You are an experienced Wikipedian that writes high standard Wikipedia articles. You have new source material from Finlandssvenska tekniker (included below) that you will use to improve a Wikipedia article (also included below) in language {lang} (language given in wikipedia language code). \n\nBased on the source, improve the given Wikipedia article in the same language as the Wikipedia article. Follow these principles when writing your suggestion: \n\n1. Paragraph division: After a short introduction, follow with a new paragraph about upbringing and education (if available in source), and then paragraph(s) about career. Ensure logical flow and transitions between paragraphs to maintain coherence throughout the article. Divide and arrange paragraphs by theme (field of work, roles, achievements), and use chronological order when relevant.  \n2. Keep as much of the original content as possible as is, instead of completely rewriting existing text. Splitting a paragraph and re-arranging existing sentences is ok. Take into account accuracy, clarity, and coherence. Don't delete any existing facts from the article. Don't repeat things in the text.  \n3. Existing references: keep in the text as they are  \n4. References for new text: make sure additions are based on the given source text  \n5. Follow Wikipedia’s principals such as NPOV (Neutral Point of View), keep a neutral tone, avoid subjective judgment.  \n6. Encyclopaedic style: use a formal style and avoid informal sayings. Ensure the text is understandable to a broad audience by using simple language and some context.  \n7. Wikipedia technical standards: maintain the mark-up format given, and follow Wikipedia’s formatting rules for links, categories, sources. Link suitable keywords as is done on Wikipedia the first time a word is mentioned.  \n\nReply with the improved article in Wikipedia's markup code. \n\n# Wikipedia text {wikipedia} \n# Finlandssvenska tekniker article {contents}",
            }
        ],
    )
    print(chat_completion)
    answer = chat_completion.choices[0].message.content
    tokens = chat_completion.usage.total_tokens
    return answer, tokens

with open("finsvetekn-03.pkl", "rb") as f: 
    pages_with_content = pickle.load(f)

restrict = "Carl Adolf Engström"
stop = False
for page in pages_with_content:
    if page.get("content_md_cleaned") != None:
        print(page["title"])
        if page["title"] == restrict or stop:
            stop = False
            print(page["wikitext"])
            for w in page["wiki"]:
                lang = w["wiki_lang"]
                title = w["wiki_title"]
                print(lang, title)
                wiki_article = get_wikipedia_article(title, lang)
                w["original"] = wiki_article
                print(lang, title, len(wiki_article), wiki_article)
                contents = page["content_md_cleaned"]
                response, tokens = send_prompt_with_file(contents, wiki_article, lang)
                w["ai_version"] = response
                print(lang, title, "chatgpt, tokens consumed:", tokens)
                print(lang, title, "chatgpt, response:", response)

with open("finsvetekn-03.pkl", "wb") as f:
    pickle.dump(pages_with_content, f)



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
Edvin Leonard Bergroth
Carl Adolf Engström
* https://ar.wikipedia.org/wiki/أدولف_إنغستروم  
* https://en.wikipedia.org/wiki/Adolf_Engström  
* https://fi.wikipedia.org/wiki/Adolf_Engström  
* https://ko.wikipedia.org/wiki/아돌프_엥스트룀  
* https://sv.wikipedia.org/wiki/Carl_Adolf_Engström  

ar أدولف_إنغستروم
ar أدولف_إنغستروم 134 أدولف إنغستروم (بالفنلندية: Adolf Engström)‏ هو شخصية أعمال فنلندي، ولد في 17 فبراير 1855 في Vörå ‏ في فنلندا، وتوفي في 19 يونيو 1924.


In [195]:
# 06 do NER on each Finlandssvenska tekniker article and save the NER, label as well as related Wikidata item
import pickle
import spacy 

def countryqid(input):
    if input == None:
        return ""
    for p in input:
        if p["propertyId"] == "P17":
            return p["value"]
    return ""

def spacyfishing(text):
    nlp_model = spacy.load("sv_core_news_lg") # sm, md, lg
    nlp_model.add_pipe("entityfishing", config={"api_ef_base": "http://nerd.huma-num.fr/nerd/service", "language": "sv" , "extra_info": True}) 
    doc_en = nlp_model(text)
    results = []
    for ent in doc_en.ents:
        country_qid = countryqid(ent._.other_ids)
        result = {"text":ent.text, "label":ent.label_, "qid":ent._.kb_qid, "url_wikidata":ent._.url_wikidata, "nerd_score":ent._.nerd_score, "country_qid":country_qid} 
        results.append(result) 
    return results

with open("finsvetekn-03.pkl", "rb") as f:
    pages_with_content = pickle.load(f)

for i, page in enumerate(pages_with_content):
    if page.get("content_md_cleaned") != None:
        print(page["title"] + " getting entities")
        page["entities"] = spacyfishing(page["content_md_cleaned"])
        print(page["entities"])

with open("finsvetekn-04.pkl", "wb") as f:
    pickle.dump(pages_with_content, f)

Edvin Leonard Bergroth getting entities
[{'text': 'Edvin Leonard Bergroth', 'label': 'PRS', 'qid': 'Q11856628', 'url_wikidata': 'https://www.wikidata.org/wiki/Q11856628', 'nerd_score': 0.2897, 'country_qid': ''}, {'text': 'Jonatan Reuter', 'label': 'PRS', 'qid': 'Q6068803', 'url_wikidata': 'https://www.wikidata.org/wiki/Q6068803', 'nerd_score': 0.9443, 'country_qid': ''}, {'text': 'Edvin Leonard Bergroth', 'label': 'PRS', 'qid': 'Q11856628', 'url_wikidata': 'https://www.wikidata.org/wiki/Q11856628', 'nerd_score': 0.2897, 'country_qid': ''}, {'text': 'Edvin Bergroth', 'label': 'PRS', 'qid': 'Q11856628', 'url_wikidata': 'https://www.wikidata.org/wiki/Q11856628', 'nerd_score': 0.996, 'country_qid': ''}, {'text': 'Bergroth', 'label': 'PRS', 'qid': None, 'url_wikidata': None, 'nerd_score': None, 'country_qid': ''}, {'text': 'den 26 december 1836', 'label': 'TME', 'qid': None, 'url_wikidata': None, 'nerd_score': None, 'country_qid': ''}, {'text': 'Pihlajavesi', 'label': 'LOC', 'qid': 'Q20625

In [200]:
# 07 Create markdown file for each enginer with 1) cleaned article 2) NER-list 3) Wikipedia article suggestion
import pandas as pd
def lookup_country(q_code):
    data = pd.read_csv('land.csv', header=None, names=['QCode', 'Name'])
    result = data[data['QCode'] == q_code]['Name']
    if not result.empty:
        return result.iloc[0]
    else:
        return q_code

# save content with list to markdown file 
with open("finsvetekn-04.pkl", "rb") as f:
    pages_with_content = pickle.load(f)

df_cumulative = pd.DataFrame()
weight = 10
for i, page in enumerate(pages_with_content):
    if page.get("content_md_cleaned") != None:
        s = "\n\n## NER & Wikidata\n"
        print(page["title"])
        df = pd.DataFrame(page["entities"])
        df["title"] = page["title"]
        df["url"] = page["url"]
        text_counts = df.groupby('text').agg(
            count=('text', 'size'),  # Count the occurrences of each 'text'
            label=('label', 'first'),  # Get the first 'label' for each 'text'
            qid=('qid', 'first'),  # Get the first 'qid' for each 'text'
            url_wikidata=('url_wikidata', 'first'),  # Get the first 'url_wikidata' for each 'text'
            country_qid=('country_qid', 'first'),  # Get the first 'country_qid' for each 'text'
            nerd_score=('nerd_score', 'first')  # Get the first 'nerd_score' for each 'text'
        ).reset_index()
        text_counts_sorted_by_label_and_count = text_counts.sort_values(by=['label', 'count', 'text'], ascending=[True, False, True])
        current_label = None
        table_open = False
        for index, row in text_counts_sorted_by_label_and_count.iterrows():
            if row["label"] != current_label:
                if table_open == True:
                    s += f'</table>\n'
                    table_open = False
                current_label = row["label"]
                s += f'\n### {current_label}\n<table width=100%>\n'
                s += f'<tr><td width=15%>Antal</td><td width=40%>Ord</td><td width=15%>Wikidata</td><td width=15%>Land (P17)</td><td width=15%>Sannolikhet</td></tr>\n'
                table_open = True
            s += f"""<tr><td>{row["count"]}</td><td>{row["text"]}</td><td><a href='{row["url_wikidata"]}'>{row["qid"]}</a></td><td>{lookup_country(row["country_qid"])} </td><td>{row["nerd_score"]}</td></tr> \n"""
        if table_open == True:
            s += f'</table>\n'    
        df_cumulative = pd.concat([df_cumulative, df], ignore_index=True)
        frontmatter = f'''---
title: "{page["title"]}"
author: ""
book: "{page["band"]}"
year: ""
firstpage: ""
lastpage: ""
images: ["/finsvetekniker/{page["img"]}"]
wikidata: 
categories: ["Finlandssvenska tekniker"]
tags: ""
weight : {str(weight)}
---
\n'''
        weight += 10
        page["content_md_cleaned_with_ner"] = frontmatter + page["content_md_cleaned"] + s 
        page["content_md_cleaned_with_ner"] += "\n## Wikipedia \n" + page.get("wikitext", "not available")
        wikistuff = ""
        for w in page["wiki"]:
            wikistuff += f"\n### Wikipedia: {w['wiki_lang']}  \n"
            wikistuff += f"#### Original  \n"
            wikistuff += f"{w.get('original', 'inte skapad')}\n".replace('\n', '\n\n')
            wikistuff += f"#### AI-förslag  \n"
            wikistuff += f"{w.get('ai_version', 'inte skapad')}\n\n"
        page["content_md_cleaned_with_ner"] += wikistuff
        with open('finsvetekniker/'+page["url"].replace(".html",".md"), 'w', encoding='utf-8') as file:
            file.write(page["content_md_cleaned_with_ner"])
        #if i > 4:
        #    break

with open("finsvetekn-05.pkl", "wb") as f:
    pickle.dump(pages_with_content, f)

Edvin Leonard Bergroth
Carl Adolf Engström
Gustaf Adolf Helsingius
Jacob Robert Huber
Carl Theodor Höijer
Bruno V. Nordberg
Karl Evert Palmén
Henrik Theodor Tallqvist
Evert Edelfrid Wasastjerna


In [202]:
# 08 Create summary of NER-entities from all engineers

import pandas as pd
def lookup_country(q_code):
    data = pd.read_csv('land.csv', header=None, names=['QCode', 'Name'])
    result = data[data['QCode'] == q_code]['Name']
    if not result.empty:
        return result.iloc[0]
    else:
        return q_code

df = df_cumulative[['label', 'text', 'title', 'url', 'qid', 'country_qid']]  # Keep only the necessary columns
df = df.sort_values(by=['label', 'text'], ascending=[True, True])
df = df.drop_duplicates()
current_label = None
current_text = None
t = f'''---
title : "Finlandssvenska tekniker band I med index"
categories: ["Finlandssvenska tekniker"]
weight : 5
---\n\n'''
t += "# Finlandssvenska tekniker band I med index\n"

current_band = None
for i, page in enumerate(pages_with_content):
    if page["band"] != current_band:
        if i > 1:
            break
        current_band =  page["band"]
        t += f'\n### {page["band"]} \n'
    t += f'*  [{page["title"]}]({"/finsvetekniker/"+page["url"][:-5].lower()})  \n'

t += "## Finlandssvenska tekniker band I: NER-index\n"
t += f"Antaler artiklar indexerade: {df_cumulative['title'].nunique()}\n\n"

row_open = False
table_open = False
cell_open = False
for index, row in df.iterrows():
    if row["label"] != current_label:
        if table_open:
            t += "</td></tr></table> \n"
            table_open = False
        current_label = row["label"]
        t += f'\n### {current_label}\n<table width=100%><tr><td>Ord</td><td>Wikidata</td><td>Land (P17)</td><td>Omnämnd i</td></tr>'
        table_open = True
    if row["text"] != current_text:
        current_text = row["text"]
        current_qid = f'''<a href="https://www.wikidata.org/wiki/{row["qid"]}">{row["qid"]}</a>''' if row["qid"] is not None else "&#8203;"
        current_country = lookup_country(row["country_qid"]) if row["country_qid"] != "" else "&#8203;"
        if row_open:
            t += "</td></tr>\n"
            row_open = False
        t += f'\n<tr><td width=200>{current_text}</td><td width=100>{current_qid}</td><td width=100>{current_country}</td>\n<td>'
        row_open = True
    t += f""" <a href='{"/finsvetekniker/"+row["url"][:-5].lower()}'>{row["title"]}</a><br>\n"""
if row_open == True:
    t += "</td></tr>\n"
if table_open == True:
    t += f'</table>\n' 

print(t)
with open('finsvetekniker/ner_index.md', 'w', encoding='utf-8') as file:
    file.write(t)

---
title : "Finlandssvenska tekniker band I med index"
categories: ["Finlandssvenska tekniker"]
weight : 5
---

# Finlandssvenska tekniker band I med index

### Finlandssvenska tekniker I (1923) 
*  [Edvin Leonard Bergroth](/finsvetekniker/edvin_leonard_bergroth)  
*  [Carl Adolf Engström](/finsvetekniker/carl_adolf_engstrom)  
*  [Gustaf Adolf Helsingius](/finsvetekniker/gustaf_adolf_helsingius)  
*  [Jacob Robert Huber](/finsvetekniker/jacob_robert_huber)  
*  [Carl Theodor Höijer](/finsvetekniker/carl_theodor_hoijer)  
*  [Bruno V. Nordberg](/finsvetekniker/bruno_v_nordberg)  
*  [Karl Evert Palmén](/finsvetekniker/karl_evert_palmen)  
*  [Henrik Theodor Tallqvist](/finsvetekniker/henrik_theodor_tallqvist)  
*  [Evert Edelfrid Wasastjerna](/finsvetekniker/evert_edelfrid_wasastjerna)  
## Finlandssvenska tekniker band I: NER-index
Antaler artiklar indexerade: 9


### EVN
<table width=100%><tr><td>Ord</td><td>Wikidata</td><td>Land (P17)</td><td>Omnämnd i</td></tr>
<tr><td width=200>S