# N-order Markov Model

## Imports

In [1]:
from glob import glob
from tqdm import tqdm

from re import sub, split
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('words')
# nltk.download('averaged_perceptron_tagger')
# English stopword list from the NLTK corpus (needs the 'stopwords' download above).
stopwords = nltk.corpus.stopwords.words('english')
# Word list from NLTK's 'words' corpus (needs the 'words' download above).
word_list = nltk.corpus.words.words()
# English Snowball stemmer; its stems are used as grouping keys in later cells.
stemmer = nltk.stem.snowball.SnowballStemmer("english")

from numpy import random, array
from urllib import request

## Load Text

In [2]:
# Read every file matching corpus/* and concatenate the contents, adding a
# newline after each file.
# Paths are sorted because glob() returns them in arbitrary order, which would
# make the concatenation order differ between runs and machines.
corpus_parts = []
for file in sorted(glob('corpus/*')):
    # The context manager closes each handle as soon as the file is read.
    with open(file, 'r', encoding='utf8') as corpus_file:
        corpus_parts.append(corpus_file.read() + '\n')
# Join once at the end instead of growing the string inside the loop.
text = ''.join(corpus_parts)

## Clean Text

In [3]:
def clean(string):
    """Reduce raw text to lowercase alphabetic tokens joined by single spaces.

    The input is tokenised with ``nltk.word_tokenize``. Tokens for which
    ``str.isalpha()`` is false (numbers, punctuation, mixed tokens) are
    dropped; the remaining tokens are lowercased.

    Returns:
        A single string of the kept tokens separated by one space each.
    """
    kept = []
    for token in nltk.word_tokenize(string):
        if token.isalpha():
            kept.append(token.lower())
    return ' '.join(kept)

## Word List

In [4]:
# Build the set of stems occurring in a second corpus; the word-count cell
# below skips any word whose stem is in this set.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of the files before running.
scifi_dir = '/Users/splch/Desktop/opposite'

scifi_text = ''
for file in sorted(glob(scifi_dir + '/*')):
    # Close each file deterministically instead of leaking the handle.
    with open(file, 'r', encoding='utf8') as scifi_file:
        scifi_text += scifi_file.read() + '\n'
scifi_text = clean(scifi_text)
# Stem each distinct token once; the set() removes duplicates before stemming.
scifi_stems = set(map(stemmer.stem, map(str.lower, set(scifi_text.split(' ')))))

In [5]:
# For every stem, count how often each (lowercased word, POS tag) pair occurs
# in the corpus. Stopwords, non-alphabetic tokens and words whose stem is in
# scifi_stems are skipped.
stopword_set = set(stopwords)  # set membership instead of scanning the list per token
words = {}
for sent in tqdm(nltk.sent_tokenize(text)):
    # Collapse runs of whitespace (including newlines) before tokenising.
    s = sub(r'\s+', ' ', sent)
    s = nltk.pos_tag(nltk.word_tokenize(s))
    for word, pos in s:
        lowered = word.lower()
        # Test the lowercased form: the tagger sees original-case tokens, so a
        # sentence-initial "The" would otherwise not match a lowercase
        # stopword entry and would be counted.
        if lowered in stopword_set or not word.isalpha():
            continue
        stem = stemmer.stem(word).lower()
        if stem in scifi_stems:
            continue
        variants = words.setdefault(stem, {})
        variants[(lowered, pos)] = variants.get((lowered, pos), 0) + 1
words_dict = words

100%|██████████| 196671/196671 [05:40<00:00, 578.32it/s]


In [6]:
# Map word -> (POS tag, count), taking for each stem its most frequent
# (word, POS) variant. When two stems pick the same word, the larger count wins.
final_list = {}
for variants in words_dict.values():
    top_pair, top_count = ('', 0), 0
    for pair, occurrences in variants.items():
        # Strict comparison: the first variant reaching the maximum is kept.
        if occurrences > top_count:
            top_pair, top_count = pair, occurrences

    top_word, top_pos = top_pair
    _, recorded_count = final_list.get(top_word, ('', 0))
    if top_count > recorded_count:
        final_list[top_word] = (top_pos, top_count)

In [7]:
# Write "word: POS, count" lines, most frequent first, for every word whose
# count exceeds min_count.
min_count = 10
ranked = sorted(final_list.items(), key=lambda x: -x[1][1])
# Explicit utf8 matches the encoding used when reading the corpus; the platform
# default encoding may not be able to represent every alphabetic token.
with open('fairy_tale_words.txt', 'w', encoding='utf8') as f:
    for word, (pos, count) in ranked:
        if count > min_count:
            f.write(f'{word}: {pos}, {count}\n')

In [8]:
# Replace the raw corpus with its cleaned form (lowercase alphabetic tokens
# joined by spaces). This overwrites `text`: the earlier cell that runs
# nltk.sent_tokenize(text) expects the raw corpus, so re-running it after this
# cell will operate on the cleaned text instead.
text = clean(text)

## Markov Model

In [9]:
# markov.py

from itertools import tee


def update(d, keys, value):
    """Increment the counter for ``value`` stored under the nested path ``keys``.

    Descends one level of ``d`` per key, creating an empty dict wherever a key
    is missing, then adds 1 to the entry for ``value`` in the innermost dict
    (a missing entry counts as 0). ``d`` is modified in place; nothing is
    returned.
    """
    node = d
    for key in keys:
        # setdefault hands back the existing child or installs a fresh dict.
        node = node.setdefault(key, {})
    node[value] = node.get(value, 0) + 1


def marginalizeF(fname, window):
    """Read the utf8 text file ``fname`` and return ``marginalize`` of its contents.

    ``window`` is passed through to ``marginalize`` unchanged.
    """
    with open(fname, mode='r', encoding='utf8') as handle:
        contents = handle.read()
    return marginalize(contents, window)


def marginalize(text, window):
    """Build a nested count table of ``window``-word sequences in ``text``.

    ``text`` is split on single spaces. For every run of ``window``
    consecutive tokens, the first ``window - 1`` tokens form the path into the
    nested dict and the last token is the counted entry at the end of that
    path.

    Returns:
        The nested dict of counts.
    """
    counts = dict()
    tokens = text.split(' ')
    for gram in slide(tokens, window):
        *context, following = gram
        update(counts, context, following)
    return counts


def slide(iterable, size):
    """Return an iterator over every run of ``size`` consecutive items.

    ``size`` independent copies of ``iterable`` are made with ``tee``; the
    k-th copy is advanced by k items, and the copies are then zipped together.
    Each yielded tuple is therefore one window of ``size`` adjacent items, and
    iteration stops when the furthest-advanced copy is exhausted.
    """
    copies = tee(iterable, size)
    for offset, copy in enumerate(copies):
        for _ in range(offset):
            next(copy, None)
    return zip(*copies)

## Create Model

In [10]:
# Length of each word sequence counted by the model: the last word of a
# sequence is the counted entry, the preceding window - 1 words are its context.
window = 4

# Nested count table built from the corpus text.
d = marginalize(text, window)

## Generate Sentences

In [11]:
# Generate n_sents chains, each starting from seed_words and extended by up to
# n_words words sampled in proportion to the counts stored in `d`.
n_words = 25
n_sents = 5
seed_words = ['once', 'upon', 'a', 'time']

# Each step looks up the last window - 1 words, so the seed must supply them.
if len(seed_words) < window - 1:
    raise ValueError(
        f'seed needs at least {window - 1} words for window={window}'
    )

for i in range(n_sents):
    chain = list(seed_words)
    for j in range(n_words):
        # Walk the nested table along the most recent window - 1 words.
        context = chain[-(window - 1):] if window > 1 else []
        d_t = d
        for context_word in context:
            d_t = d_t.get(context_word, {})
        if not d_t:
            # Context never seen in the corpus: there is nothing to sample
            # from, so end this chain early instead of raising.
            break
        candidates = list(d_t.keys())
        vals = array(list(d_t.values()))
        word = random.choice(candidates, p=vals / vals.sum())
        # random.choice returns a numpy string; store a plain str in the chain.
        chain.append(str(word))
    print(str(i+1)+':', ' '.join(chain), '\n')

1: once upon a time an old woman putting out her lanterns you will light the candle for you and then he said o heavens why have not my mother 

2: once upon a time there was a fierce beast is in my iwanich thanked the old man thought in his mind while people were asleep he went out cut 

3: once upon a time there was a fire in my veins doth flow yet i laugh and sing the nurse asked the boatman can you take the fellow out 

4: once upon a time there lived an emperor whose name was souci and he had not gone far before he met a squirrel who bowed and hid it in 

5: once upon a time there was a big palmyra tree lying on the road the snow was so wet it certainly must have come from and how you came 



## Check Style

In [12]:
# Download the document to check as plain text and split it into cleaned words.
doc_url = 'https://docs.google.com/document/export?format=txt&id=1nlGzXv09roHMtTjlJQhJ6ZnwWMDHeGKi_Xnk8mygjEw'

# The context manager closes the HTTP response; the timeout keeps the cell from
# hanging indefinitely when the server does not answer.
with request.urlopen(doc_url, timeout=30) as response:
    doc = response.read().decode('utf-8')
# Normalise line endings and strip byte-order marks.
doc = doc.replace('\r\n', '\n').replace('\ufeff', '')
doc_words = clean(doc).split(' ')

In [13]:
# For every position in the document, look up the preceding window - 1 words in
# the model and record the current word as an outlier when the model has never
# seen it follow that context.
chain = list(doc_words)  # same final value the word-by-word append produced
outliers = {}
context_len = window - 1

for i in range(len(doc_words) - context_len):
    context = doc_words[i:i + context_len]
    # The target index is computed directly from the window size, not from a
    # loop variable left over after the context walk, so it is correct for
    # every window (including window == 2, where that loop never runs).
    target = doc_words[i + context_len]
    d_t = d
    for context_word in context:
        d_t = d_t.get(context_word, {})
    if target not in d_t:
        outliers[target] = outliers.get(target, 0) + 1

In [14]:
# Count every document word once up front; calling doc_words.count() for each
# outlier rescans the whole document per key.
doc_word_counts = {}
for doc_word in doc_words:
    doc_word_counts[doc_word] = doc_word_counts.get(doc_word, 0) + 1
stopword_lookup = set(stopwords)

# Keep the non-stopwords whose outlier count equals their total number of
# occurrences in the document.
outlier_words = set()
for key, value in outliers.items():
    # Integer comparison replaces the float test value / count == 1.
    if value == doc_word_counts.get(key, 0) and key not in stopword_lookup:
        outlier_words.add(key)
# print(outlier_words)

## Test Vocab

In [15]:
# Stems of every distinct space-separated token in the corpus text.
fairy_stems = {stemmer.stem(token) for token in set(text.split(' '))}

def check_fairy_word(word):
    """Return True when the stem of ``word`` is one of ``fairy_stems``."""
    stem = stemmer.stem(word)
    return stem in fairy_stems

In [16]:
# Group, by stem, the outlier words that appear in word_list but whose stem is
# not among the corpus stems.
known_words = set(word_list)  # set membership instead of scanning the list per word
check_words = {}
for word in outlier_words:
    stem = stemmer.stem(word)
    if stem not in fairy_stems and word in known_words:
        check_words.setdefault(stem, []).append(word)

In [17]:
# Display the stem -> words mapping built in the previous cell.
check_words

{'protagonist': ['protagonist'],
 'electron': ['electronic'],
 'modulus': ['modulus'],
 'cipher': ['cipher'],
 'hanna': ['hanna'],
 'unitari': ['unitary'],
 'encod': ['encode'],
 'comput': ['computation', 'computer'],
 'rand': ['rand'],
 'coincident': ['coincidentally', 'coincidental'],
 'amplifi': ['amplify'],
 'rubbl': ['rubble'],
 'travi': ['travis'],
 'repositori': ['repository'],
 'copyright': ['copyright'],
 'uncloth': ['unclothed'],
 'ocher': ['ocher'],
 'paperback': ['paperback'],
 'rematch': ['rematch'],
 'millennia': ['millennia'],
 'browser': ['browser'],
 'introspect': ['introspective'],
 'skid': ['skidding'],
 'algorithm': ['algorithm'],
 'feedback': ['feedback'],
 'focus': ['focus'],
 'encrypt': ['encrypt'],
 'shor': ['shor'],
 'unscrambl': ['unscramble'],
 'edit': ['edition'],
 'slowpok': ['slowpoke'],
 'unord': ['unordered'],
 'fictiti': ['fictitiously'],
 'photon': ['photon'],
 'typic': ['typical'],
 'notebook': ['notebook'],
 'specif': ['specifically'],
 'bibliographi

In [18]:
# Spot check: is the stem of "breakfast" among the corpus stems?
check_fairy_word("breakfast")

True