<a href="https://colab.research.google.com/github/rainermesi/estonian_literature_markov_chain/blob/main/wiki_word_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Generating new words using a Markov Chain and public domain Estonian literature

Working with Google Colab, this is needet to mount Google Drive to the notebook for persistent storage.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Modules

In [46]:
import numpy as np
import pandas as pd
from pandas.core.common import flatten
import random
import re
from collections import defaultdict
from collections import Counter
import unicodedata

! pip install BeautifulSoup4
from bs4 import BeautifulSoup
import requests
import time

! pip install ebooklib
import ebooklib
from ebooklib import epub
import os



Get list of public domain books. Tartu Public Libary publishes and distributes Estonian classics: https://www.luts.ee/index.php/111-e-raamatud

In [None]:
url = "https://www.luts.ee/index.php/111-e-raamatud"
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')

In [41]:
list_of_urls = []
for i in soup.find_all('a'):
  if i.get_text() == 'e-lugerisse (epub)':
    list_of_urls.append('https://www.luts.ee'+i.get('href'))

For some reason ebooklib does not like getting the epub file straight from requests. So I'm dowloading a copy of the books.

In [45]:
for i in list_of_urls:
  r = requests.get(i, allow_redirects=True)
  open(i.split('/')[-1], 'wb').write(r.content)
  time.sleep(5)

A function to parse the epubs into text

In [None]:
# code

A final loop to iterate over books and join them into one


In [None]:
for filename in os.listdir(os.curdir):
    if filename.endswith(".epub"):
        print(filename)
    else:
        continue

Read in Tammsaare corpus

In [None]:
book = epub.read_epub(r'/content/drive/My Drive/DATA/Wikipedia/etwiki_latest/Anton_Hansen_Tammsaare_Tode_ja_oigus_I.epub')

In [None]:
book_corpus = []

for i in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
  book_corpus.append(i.get_content())

In [None]:
def epub_to_text(corpus):
  blacklist = ['[document]','noscript','header','html','meta','head','input','script']
  output_str = ''
  output_list = []
  for i in corpus:
    soup = BeautifulSoup(i,'html.parser')
    text = soup.find_all(text=True)
    for t in text: 
      if t.parent.name not in blacklist:
        output_list.append(unicodedata.normalize("NFKD",t).strip())
  output_list = [i.split() for i in output_list]
  output_list = list(flatten(output_list))
  output_list_copy = [i for i in output_list if len(i) > 2]
  output_list = [i.replace(',','') for i in output_list_copy]
  output_list = [i.replace("'",'') for i in output_list]
  output_list = [i.replace("«",'') for i in output_list]
  output_list = [i.replace("»",'') for i in output_list]
  output_list = [i.lower() for i in output_list]
  return output_list

In [None]:
book_word_list = epub_to_text(book_corpus)

In [None]:
book_word_list

## Analysis

What are most popular words in corpus?

In [None]:
pd_word_series = pd.Series(book_word_list)

In [None]:
pd_word_series.value_counts().head(50)

How long are words in corpus?

In [None]:
def word_len_df_gen(in_list):
    count_list = [len(item) for item in in_list]
    count_df = pd.DataFrame.from_dict(Counter(count_list).items())
    count_df.sort_values(by=1,ascending=False)
    return count_df

In [None]:
word_len_df = word_len_df_gen(pd_word_series)
word_len_df.sort_values(by=[1],ascending=False).head(50)

Create a dictionay for the graph and parse the wiki corpus

In [None]:
def create_graph_dict(corpus):
  graphdict = defaultdict(lambda:defaultdict(int))
  for word in corpus:
    prev_letter = word[0]
    for letter in word[1:]:
      graphdict[prev_letter][letter] += 1
      prev_letter = letter
  return graphdict

In [None]:
graph_dict = create_graph_dict(pd_word_series)

Clean up the graph dictionary

In [None]:
def graph_cleanup(graph):
  abc = ['A', 'a', 'B', 'b', 'D', 'd', 'E', 'e', 'F', 'f', 'G', 'g', 'H', 'h', 'I', 'i', 'J', 'j', 'K', 'k', 'L', 'l', 'M', 'm', 'N', 'n', 'O', 'o', 'P', 'p', 'R', 'r', 'S', 's', 'Š', 'š', 'Z', 'z', 'Ž', 'ž', 'T', 't', 'U', 'u', 'V', 'v', 'Õ', 'õ', 'Ä', 'ä', 'Ö', 'ö', 'Ü', 'ü']
  abc = list(dict.fromkeys(i.lower() for i in abc))
  # clean primary keys
  tempgraph = dict((k, graph[k]) for k in abc if k in graph) 
  # clean nested key value pairs
  for letter in abc:
    try:
      for item in tempgraph[letter].copy():
        if item not in abc:
          del tempgraph[letter][item]
    except:
        print(letter,'not in corpus as key, skipping letter')
  return tempgraph

In [None]:
graph_dict = graph_cleanup(graph_dict)

f  not in corpus as key
š  not in corpus as key
z  not in corpus as key
ž  not in corpus as key
õ  not in corpus as key
ä  not in corpus as key
ö  not in corpus as key
ü  not in corpus as key


Traverse the graph_dict and create new words

In [None]:
def traverse_graph(graph, word_len=3, start_node=None):
  """Returns a list of words from a randomly weighted walk."""
  if word_len <= 0:
    return []
  
  # If not given, pick a start node at random.
  if not start_node:
    start_node = random.choice(list(graph.keys()))
  
  
  weights = np.array(
      list(graph[start_node].values()),
      dtype=np.float64)
  # Normalize letter counts to sum to 1. Create % weights for each letter.
  weights /= weights.sum()

  # Pick next letter using weighted distribution.
  choices = list(graph[start_node].keys())
  chosen_letter = np.random.choice(choices, None, p=weights)
  
  # recursively build a word until word_len = 0
  return [chosen_letter] + traverse_graph(
      graph, word_len=word_len-1,
      start_node=chosen_letter)

In [None]:
for i in range(10): 
  print(''.join(traverse_graph(graph_dict,word_len=random.choice(word_len_df[0]))))

ugakuisuudauistuhtta
aava
amama
telearit
sehimitesessetamimohela
amakehatsisistugajui
udalkubravagiloo
sstaragidindale
umelmidrulesaredmeev
orukiha
