In [1]:
import requests
import xmlschema
import pickle
import os

In [2]:
# The MediaWiki XML export schema (XSD) is published at
url = 'https://www.mediawiki.org/xml/export-0.10.xsd'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
schema = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of handing an error page to the schema parser.
schema.raise_for_status()

# XML (~400 MB) has been downloaded from:
# https://dumps.wikimedia.org/fiwiktionary/20200820/fiwiktionary-20200820-pages-meta-current.xml.bz2
# NOTE(review): the path below has no .bz2 suffix, so the archive is presumably
# decompressed by hand before running the notebook - confirm.
xml = 'wikidata/fiwiktionary-20200820-pages-meta-current.xml'

# Build the schema object from the downloaded XSD text
xs = xmlschema.XMLSchema(schema.text)

# File used to cache the parsed XML as a pickled dict, to speed up later loads.
filename = os.path.join("wikidata", "wiki.pickle")

In [3]:
# Uncomment the lines below to parse the XML into a dict and cache it as a pickle. This takes several minutes.

# wiki = xs.to_dict(xml)

# f = open("wikidata" + os.path.sep + "wiki.pickle", "wb")
# pickle.dump(wiki, f)
# f.close()

In [4]:
# Load the cached dict from disk. The context manager closes the file even
# when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load a pickle
# that was produced locally by this notebook.
with open(filename, "rb") as f:
    wiki = pickle.load(f, encoding="utf-8")

In [5]:
def isolate_finnish_heading(page_content, verbose=False):
    """
    Return the part of a page that sits under the '==Suomi' (Finnish) heading.

    The page is split into rows on newlines. The section starts at a row
    beginning with '==Suomi' and ends just before the next row beginning
    with '==' followed by an uppercase A-Z letter. Deeper headings such as
    '===Erisnimi===' do not end the section because their third character
    is '='. If no closing heading follows, the section runs to the end of
    the page.

    Parameters
    ----------
    page_content : str
        Text of a single page.
    verbose : bool, optional
        When True, print which rows were chosen as section boundaries.

    Returns
    -------
    str
        The rows of the Finnish section joined with single spaces, or an
        empty string when no row starts with '==Suomi'.
    """
    # Imported locally so the function works regardless of which cell
    # imports `re` at notebook level.
    import re

    # Slice boundaries; None means "not found yet"
    start_at = None
    end_to = None

    # Turn the page to rows
    rows = page_content.split('\n')

    for i, row in enumerate(rows):
        # Row opens the Finnish section
        if re.search('^==Suomi', row):
            start_at = i
            if verbose:
                print(f'Start at {i} as {row}')
            continue

        # Only a heading that comes AFTER ==Suomi may close the section.
        # Without the start_at guard, a heading that precedes ==Suomi
        # (e.g. ==Englanti==) would stop the scan before the Finnish
        # section was ever reached.
        # NOTE(review): [A-Z] does not match headings that start with a
        # non-ASCII capital such as 'Ä' - confirm whether any exist.
        if start_at is not None and re.search('^==[A-Z]', row):
            end_to = i
            if verbose:
                print(f'End to {i} as {row}')
            break

    # No Finnish section on this page: nothing to isolate
    if start_at is None:
        if verbose:
            print("No ==Suomi heading found")
        return ''

    if end_to is None:
        if verbose:
            print("No other ==Country found")
        end_to = len(rows)

    # Rows from ==Suomi up to (not including) the closing heading, as one string
    return ' '.join(rows[start_at:end_to])

In [16]:
import pprint
import re

pp = pprint.PrettyPrinter(indent=4)

fintitles = []
fincontents = []

# Walk through every page of the parsed dump.
for page in wiki['page']:

    # Only pages whose 'ns' field is 0 are kept
    # (presumably the main article namespace - confirm).
    if page['ns'] != 0:
        continue

    # Text of the first revision listed for the page.
    text = page['revision'][0]['text']['$']

    # Skip pages that have no Finnish section at all.
    if '==Suomi==' not in text:
        continue

    # Skip entries that are already inflected plurals (e.g. eläinten).
    if 'taivm-mon-gen' in text:
        continue

    # Skip single-character titles.
    if len(page['title']) < 2:
        continue

    # Keep only what sits under the Finnish/Suomi heading; meanings the
    # same spelling has in other languages are irrelevant here.
    text = isolate_finnish_heading(text)

    fintitles.append(page['title'])
    fincontents.append(text)

In [21]:
import csv

# Total number of entries; only used in the progress messages
maxlines = len(fintitles)

# Destination of the parsed table
db_filename = os.path.join("wikidata", "wikidb_parsed.csv")

with open(db_filename, 'w', newline='', encoding='utf-8') as csvfile:

    # Column order of the output file
    fieldnames = ['title', 'is_verb', 'is_subs', 'is_adje', 'is_nume', 'is_name', 'orig_index']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for i, (title, content) in enumerate(zip(fintitles, fincontents)):
        # Each flag records whether its marker string occurs in the content
        is_verb = bool(re.search('fi-verbi', content))
        is_subs = bool(re.search('fi-subs', content))
        is_adje = bool(re.search('fi-adj', content))
        is_nume = bool(re.search('num-k', content))
        is_name = bool(re.search('erisnimi', content))

        # Pair the values with the header names, in header order
        row = dict(zip(fieldnames,
                       (title, is_verb, is_subs, is_adje, is_nume, is_name, i)))
        writer.writerow(row)

        # Progress message every 10000 entries
        if i % 10000 == 0:
            print(f'[INFO] Processing line {i}/{maxlines}')

[INFO] Processing line 0/127694
[INFO] Processing line 10000/127694
[INFO] Processing line 20000/127694
[INFO] Processing line 30000/127694
[INFO] Processing line 40000/127694
[INFO] Processing line 50000/127694
[INFO] Processing line 60000/127694
[INFO] Processing line 70000/127694
[INFO] Processing line 80000/127694
[INFO] Processing line 90000/127694
[INFO] Processing line 100000/127694
[INFO] Processing line 110000/127694
[INFO] Processing line 120000/127694


In [23]:
test_words = ['kuusi', 'kissa', 'juosta', 'pyöreä', 'tribus', 'eläinten', 'Ruttonen']

# CSV flag column -> text appended to the printed line when the flag is set.
# The pairs are listed in the order in which they are appended.
flag_labels = [
    ('is_verb', ' verbi '),
    ('is_subs', ' substantiivi '),
    ('is_adje', ' adjektiivi '),
    ('is_nume', ' numeraali '),
    ('is_name', ' erisnimi '),
]

with open(db_filename, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)

    for row in reader:
        # Only the hand-picked test words are reported
        if row['title'] not in test_words:
            continue

        info_to_print = f"#{row['orig_index']} : {row['title']} on:"

        # csv.DictReader yields strings, so a set flag reads back as 'True'
        info_to_print += ''.join(
            label for column, label in flag_labels if row[column] == 'True'
        )
        print(info_to_print)

#47 : kuusi on: substantiivi  numeraali 
#485 : pyöreä on: adjektiivi 
#1058 : kissa on: substantiivi 
#2653 : juosta on: verbi 
#16254 : tribus on: substantiivi 
#107577 : Ruttonen on: substantiivi  erisnimi 


In [24]:
# Look at the isolated Finnish section stored at position 107577
# (the orig_index printed for 'Ruttonen' by the test-word check).
fincontents[107577]

"==Suomi==  ===Erisnimi=== {{erisnimi|fi}} {{fi-taivutus|38}}  # {{sukunimi|fi}}  ====Taivutus==== {{fi-subs-nainen|Rutto}}  ====Lähteet==== * Esko Kirjalainen. ''Väestötietojärjestelmän suomalaisten nimiaineistot. [https://www.avoindata.fi/data/fi/dataset/none/resource/d25831d1-82a9-476f-8f7c-374c348efc14 Sukunimitilasto 4.9.2017].'' Väestörekisterikeskus. (32 nimenhaltijaa.)"

In [25]:
import pandas as pd

In [26]:
# Read the parsed CSV back into a DataFrame for interactive inspection.
df = pd.read_csv(db_filename) 

In [57]:
# Show 10 random rows flagged is_subs. No random_state is passed, so the
# selection is not pinned to a seed and differs between runs.
df[df.is_subs].sample(10)

Unnamed: 0,title,is_verb,is_subs,is_adje,is_nume,is_name,orig_index
18592,botulismi,False,True,False,False,False,18592
16728,emotionalismi,False,True,False,False,False,16728
12036,vitonen,False,True,False,False,False,12036
40878,suippopyrstökottarainen,False,True,False,False,False,40878
35997,celebesinhaukkapöllö,False,True,False,False,False,35997
80548,ruokatavarakauppa,False,True,False,False,False,80548
26206,tarjoushinta,False,True,False,False,False,26206
93423,kanervakasvi,False,True,False,False,False,93423
96094,kunoniat,False,True,False,False,False,96094
28609,tulitikkurasia,False,True,False,False,False,28609


In [67]:
# Show 10 random rows flagged is_name. No random_state is passed, so the
# selection is not pinned to a seed and differs between runs.
df[df.is_name].sample(10)

Unnamed: 0,title,is_verb,is_subs,is_adje,is_nume,is_name,orig_index
113727,Leppimäki,False,False,False,False,True,113727
113417,Majaneva,False,False,False,False,True,113417
23870,Raappana,False,False,False,False,True,23870
102106,Säämäjärvi,False,False,False,False,True,102106
114204,Vuorilahti,False,False,False,False,True,114204
107997,Vilppunen,False,True,False,False,True,107997
80388,Jyränki,False,False,False,False,True,80388
79015,Brasília,False,False,False,False,True,79015
108247,Keisala,False,False,False,False,True,108247
112069,Kylmäniemi,False,False,False,False,True,112069


In [68]:
# Summarize the columns, non-null counts, dtypes and memory use of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127694 entries, 0 to 127693
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       127694 non-null  object
 1   is_verb     127694 non-null  bool  
 2   is_subs     127694 non-null  bool  
 3   is_adje     127694 non-null  bool  
 4   is_nume     127694 non-null  bool  
 5   is_name     127694 non-null  bool  
 6   orig_index  127694 non-null  int64 
dtypes: bool(5), int64(1), object(1)
memory usage: 2.6+ MB


In [156]:
# Spot-check 5 random rows from the whole table (unseeded sample).
df.sample(5)

Unnamed: 0,title,is_verb,is_subs,is_adje,is_nume,is_name,orig_index
72751,änkyttäjä,False,True,False,False,False,72751
84109,jemmaten,False,False,False,False,False,84109
96201,imikät,False,True,False,False,False,96201
109351,Peuraharju,False,True,False,False,True,109351
94426,ulpukat,False,True,False,False,False,94426
