# N-order Markov Model

## Imports

In [1]:
from glob import glob
from tqdm import tqdm

from re import sub, split
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('words')
stopwords = nltk.corpus.stopwords.words('english')
import re

from numpy import random, array

from urllib import request

## Load Text

In [2]:
# Read every file matched by corpus/* and concatenate the contents,
# terminating each file's text with a newline.
parts = []
for file in glob('corpus/*'):
    # Preface files are only reported here; they are still added to the corpus.
    if 'preface' in file.lower():
        print(file)
    # `with` closes each handle promptly instead of leaving it to the GC.
    with open(file, 'r', encoding='utf8') as f:
        parts.append(f.read() + '\n')
# Join once at the end rather than growing the string with repeated +=.
text = ''.join(parts)

## Clean Text

In [3]:
def clean(string):
    """Tokenize ``string`` and return its alphabetic tokens as one string.

    The text is split with ``nltk.word_tokenize``; tokens for which
    ``str.isalpha()`` is false are dropped, the remaining tokens are
    lower-cased and joined with single spaces.
    """
    tokens = nltk.word_tokenize(string)
    kept = (token.lower() for token in tokens if token.isalpha())
    return ' '.join(kept)

In [4]:
# Replace the raw corpus with its cleaned form: lower-cased alphabetic
# tokens separated by single spaces.
text = clean(text)

## Markov Model

In [5]:
# markov.py

from itertools import tee


def update(d, keys, value):
    """Increment the count of ``value`` under the nested path ``keys`` in ``d``.

    Walks ``d`` one level per key, creating an empty dict for any key that
    is missing, then adds 1 to the counter stored under ``value`` at the
    innermost level (starting from 0 if absent). Mutates ``d`` in place and
    returns ``None``.
    """
    node = d
    for key in keys:
        # Descend into the existing sub-dict, or create it on first sight.
        node = node.setdefault(key, {})
    node[value] = node.get(value, 0) + 1


def marginalizeF(fname, window):
    """Build the model from the UTF-8 text file ``fname``.

    Reads the whole file and returns ``marginalize(contents, window)``.
    """
    with open(fname, 'r', encoding='utf8') as handle:
        contents = handle.read()
    return marginalize(contents, window)


def marginalize(text, window):
    """Count which word follows each run of ``window - 1`` words in ``text``.

    Args:
        text: Whitespace-separated words.
        window: Number of words per sliding window; the first
            ``window - 1`` words of each window are the context and the
            last word is the target.

    Returns:
        A nested dict with one level per context word; the innermost
        level maps each target word to the number of times it followed
        that context.
    """
    d = dict()
    # split() with no argument splits on any whitespace run, so newlines
    # and tabs in raw file text separate words and repeated spaces (or an
    # empty text) never produce empty-string tokens.
    for w in slide(text.split(), window):
        features = w[:-1]
        target = w[-1]
        update(d, features, target)
    return d


def slide(iterable, size):
    """Return an iterator of consecutive ``size``-length tuples from ``iterable``.

    Makes ``size`` independent copies of the iterable with ``tee``,
    advances the k-th copy by k items, and zips the copies together, so
    each resulting tuple is one window position. Iterables shorter than
    ``size`` yield nothing.
    """
    copies = tee(iterable, size)
    for offset, copy in enumerate(copies):
        # Skip `offset` leading items; the default keeps short inputs quiet.
        for _ in range(offset):
            next(copy, None)
    return zip(*copies)

## Create Model

In [6]:
# Words per sliding window: the first window - 1 words are the context and
# the last word is the one being counted.
window = 4
# text = open('tales.txt', 'r', encoding='utf8').read()

# Nested dict of follow-up counts, one nesting level per context word.
d = marginalize(text, int(window))

## Generate Sentences

In [7]:
n_words = 25  # maximum number of words appended to each seed
n_sents = 5   # number of sentences to print

for i in range(n_sents):
    chain = ['once', 'upon', 'a', 'time']
    for _ in range(n_words):
        # The last (window - 1) words of the chain form the lookup context.
        window_d = chain[1 - window:]
        # Walk the nested model one context word at a time; a missing key
        # yields an empty dict instead of raising KeyError.
        d_t = d
        for context_word in window_d:
            d_t = d_t.get(context_word, {})
        if not d_t:
            # Context never seen in the corpus: there is nothing to sample
            # from, so end this sentence early rather than crash.
            break
        vals = list(d_t.values())
        # Sample the next word with probability proportional to its count.
        word = random.choice(
            list(d_t.keys()),
            p=array(vals)/sum(vals)
        )
        chain.append(word)
    print(str(i+1)+':', ' '.join(chain), '\n')

1: once upon a time the maiden sat down too and the knight of the glen s wild steed of bells and a clattering of horses hoofs whereat starting up 

2: once upon a time a little boy and her heart was far away and i got away as soon as he had expected the peas had taken root in 

3: once upon a time when was reigning in benares and when he had wandered about for many hours at the beautiful elderbush a row of beautiful pillars led to 

4: once upon a time there were two voices the boy from over the sea look into my right ear but i have taken on myself to remind your highness 

5: once upon a time there lived a woman some distance up came an enormous bull so fat that they could by no means replied maimoune i have no flute 



## Check Style

In [8]:
# Download the document as plain text, normalize line endings and strip the
# byte-order mark. The `with` block closes the HTTP response once it is read.
with request.urlopen('https://docs.google.com/document/export?format=txt&id=1nlGzXv09roHMtTjlJQhJ6ZnwWMDHeGKi_Xnk8mygjEw') as response:
    doc = response.read().decode('utf-8').replace('\r\n', '\n').replace('\ufeff', '')
# Same cleaning as the corpus, so tokens are comparable with the model keys.
doc_words = clean(doc).split(' ')

In [9]:
# Words processed so far, seeded with the first context (window - 1 words).
chain = doc_words[:window - 1]
# Maps a word to how many times it appeared after a context in which the
# model never saw it.
outliers = {}

for i, word in enumerate(doc_words[window - 1:]):
    # `word` is doc_words[i + window - 1]; the window - 1 words before it
    # are its context. Walk the model along that context; any unseen
    # context word leaves an empty dict, so `word` counts as an outlier.
    d_t = d
    for context_word in doc_words[i:i + window - 1]:
        d_t = d_t.get(context_word, {})
    # Testing `word` directly keeps the target correct for every window
    # size, including window == 2 where the context is a single word.
    if word not in d_t:
        outliers[word] = outliers.get(word, 0) + 1
    chain.append(word)

In [10]:
from collections import Counter

# Count every document word once up front instead of rescanning doc_words
# for each outlier, and use a set for constant-time stopword checks.
doc_word_counts = Counter(doc_words)
stopword_set = set(stopwords)

outlier_words = set()
for key, value in outliers.items():
    # Keep non-stopwords whose outlier count equals their total number of
    # occurrences in the document.
    if value == doc_word_counts[key] and key not in stopword_set:
        outlier_words.add(key)
# print(outlier_words)

## Test Vocab

In [11]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# Stems of every distinct word in the cleaned corpus text.
fairy_stems = {stemmer.stem(token) for token in set(text.split(' '))}

def check_fairy_word(word):
    """Return True if the stem of ``word`` is one of the stems in ``fairy_stems``."""
    stem = stemmer.stem(word)
    return stem in fairy_stems

In [12]:
from nltk.corpus import words
word_list = words.words()
# Set copy for constant-time membership tests; word_list itself is unchanged.
known_words = set(word_list)

# Maps a stem to the outlier words sharing it, keeping only words that are
# in the NLTK word list and whose stem does not occur in the corpus.
check_words = {}
for word in outlier_words:
    stem = stemmer.stem(word)
    if stem not in fairy_stems and word in known_words:
        check_words.setdefault(stem, []).append(word)

In [13]:
# Display the stem -> words mapping built above.
check_words

{'rubbl': ['rubble'],
 'unord': ['unordered'],
 'focus': ['focus'],
 'ocher': ['ocher'],
 'rematch': ['rematch'],
 'amplifi': ['amplify'],
 'repositori': ['repository'],
 'unitari': ['unitary'],
 'typic': ['typical'],
 'comput': ['computation', 'computer'],
 'notebook': ['notebook'],
 'rand': ['rand'],
 'algorithm': ['algorithm'],
 'fictiti': ['fictitiously'],
 'hanna': ['hanna'],
 'encod': ['encode'],
 'protagonist': ['protagonist'],
 'millennia': ['millennia'],
 'unscrambl': ['unscramble'],
 'copyright': ['copyright'],
 'browser': ['browser'],
 'specif': ['specifically'],
 'modulus': ['modulus'],
 'shor': ['shor'],
 'feedback': ['feedback'],
 'paperback': ['paperback'],
 'skid': ['skidding'],
 'photon': ['photon'],
 'electron': ['electronic'],
 'slowpok': ['slowpoke'],
 'encrypt': ['encrypt'],
 'cipher': ['cipher'],
 'uncloth': ['unclothed'],
 'coincident': ['coincidental', 'coincidentally'],
 'travi': ['travis'],
 'bibliographi': ['bibliography']}

In [14]:
# Example lookup: is the stem of "modest" among the corpus stems?
check_fairy_word("modest")

True