# Markov Chain 

## Load Text Sources

In [1]:
import glob
import os
import numpy as np

In [5]:
# Root folder of the speech text files; load_speeches joins a category
# name onto this path to find that category's files.
speeches_dir = 'data/speeches/'

In [14]:
from gensim.parsing.preprocessing import preprocess_string, \
                                         strip_non_alphanum, strip_tags

def clean(text):
    """
       Run ``text`` through gensim's ``strip_tags`` filter and then its
       ``strip_non_alphanum`` filter.

       :param text: raw text of one speech
       :return: the filtered text
    """
    return strip_non_alphanum(strip_tags(text))

def load_speeches(category, filename='*.txt'):
    """
       Lazily yield ``(path, cleaned_text)`` for every speech file of a
       category. Nothing is read from disk until the generator is iterated.

       :param category: What type of speeches to load
                        - women or comedians
       :param filename: The filename pattern
       :return: generator of ``(path, text)`` tuples in sorted path order,
                where ``text`` has been passed through ``clean``
    """
    category_dir = os.path.join(speeches_dir, category)
    pattern = os.path.join(category_dir, filename)
    # glob returns matches in arbitrary order; sorting keeps the order of
    # the speeches (and so of the resulting corpus) reproducible.
    for path in sorted(glob.glob(pattern)):
        # A distinct loop name avoids shadowing the ``filename`` parameter.
        with open(path, encoding='latin-1') as f:
            yield path, clean(f.read())

In [7]:
# load_speeches is a generator function, so these are lazy, single-pass
# iterators: no file is opened until they are iterated, and each can be
# consumed only once.
womens_speeches = load_speeches('women')
comedian_speeches = load_speeches('comedians')

In [8]:
from nltk.tokenize import word_tokenize


def load_corpus(speeches):
    """
       Tokenise every speech and concatenate the tokens into one flat list.

       :param speeches: iterable of ``(filename, text)`` pairs
       :return: list of tokens from all speeches, in iteration order
    """
    corpus = []
    for filename, speech in speeches:
        print(f'Loading speech {filename}')
        # extend() grows the list in place; ``corpus = corpus + tokens``
        # would copy the whole accumulated corpus again for every speech.
        corpus.extend(word_tokenize(speech))
    return corpus

In [9]:
# Flat token list for all of the women's speeches; building it consumes
# the womens_speeches generator.
corpus = load_corpus(womens_speeches)

Loading speech data/speeches/women\AintIAWoman-SojournerTruth.txt
Loading speech data/speeches/women\FreedomFromFear-AungSuuKyi.txt
Loading speech data/speeches/women\FreedomOrDeath-EmmelinePankhurst.txt
Loading speech data/speeches/women\MisogynySpeech-JuliaGillard.txt
Loading speech data/speeches/women\PulseOfTheMorning-MayaAngelou.txt
Loading speech data/speeches/women\RoomOfOnesOwn-VirginiaWoolf.txt
Loading speech data/speeches/women\SpeechToTheTroopsAtTillsbury-ElizabethI.txt
Loading speech data/speeches/women\WellesleyCommencement-NoraEphron.txt


In [10]:
def make_pairs(corpus):
    """
       Lazily yield each ``(token, next_token)`` tuple of adjacent items.

       :param corpus: sliceable sequence of tokens
       :return: generator of 2-tuples; empty when ``corpus`` has fewer
                than two items
    """
    # Pairing the sequence with itself shifted by one gives every
    # neighbouring pair in order.
    yield from zip(corpus, corpus[1:])
          
def load_word_dict(corpus):
    """
       Build the transition table of a first-order Markov chain.

       :param corpus: sequence of tokens
       :return: plain dict mapping each token to the list of tokens seen
                immediately after it; duplicates are kept, so a uniform
                draw from a list follows the observed frequencies. A token
                that is never followed by another one gets no entry.
    """
    word_dict = {}
    for word_1, word_2 in make_pairs(corpus):
        # setdefault creates the successor list on first sight of word_1
        # and otherwise returns the existing one, in a single lookup.
        word_dict.setdefault(word_1, []).append(word_2)
    return word_dict
            
def load_markov_dict(category, filename='*.txt'):
    """
       Load a category of speeches and turn it into a successor-list dict.

       :param category: speech category passed on to ``load_speeches``
       :param filename: filename pattern passed on to ``load_speeches``
       :return: the result of ``load_word_dict`` over the category's tokens
    """
    tokens = load_corpus(load_speeches(category, filename))
    return load_word_dict(tokens)

In [11]:
# Successor-list dictionary for the women's speeches; load_markov_dict
# reads the files again rather than reusing an earlier generator.
womens_speeches_word_dict = load_markov_dict('women')

Loading speech data/speeches/women\AintIAWoman-SojournerTruth.txt
Loading speech data/speeches/women\FreedomFromFear-AungSuuKyi.txt
Loading speech data/speeches/women\FreedomOrDeath-EmmelinePankhurst.txt
Loading speech data/speeches/women\MisogynySpeech-JuliaGillard.txt
Loading speech data/speeches/women\PulseOfTheMorning-MayaAngelou.txt
Loading speech data/speeches/women\RoomOfOnesOwn-VirginiaWoolf.txt
Loading speech data/speeches/women\SpeechToTheTroopsAtTillsbury-ElizabethI.txt
Loading speech data/speeches/women\WellesleyCommencement-NoraEphron.txt


In [15]:
# Successor-list dictionary for the comedians' speeches.
comedians_word_dict = load_markov_dict('comedians')

Loading speech data/speeches/comedians\Dartmouth-Conan.txt
Loading speech data/speeches/comedians\HardvardLawSchool-MindyKaling.txt
Loading speech data/speeches/comedians\Harvard-AmyPoehler.txt
Loading speech data/speeches/comedians\HarvardU-WillFerrell.txt
Loading speech data/speeches/comedians\TulaneGraduation-MayaRudolph.txt
Loading speech data/speeches/comedians\UniversityOfVirginia-StephenColbert.txt
Loading speech data/speeches/comedians\WilliamAndMary-JonStewart.txt


In [12]:
def get_sentence(word_dict, n_words=15):
    """
       Generate a random word chain by walking ``word_dict``.

       The first word is drawn uniformly from the keys of ``word_dict``
       that are not all-lowercase (falling back to any key when there is
       no such key). Each following word is drawn uniformly from the
       successor list of the previous word.

       :param word_dict: mapping of word -> list of words observed after it
       :param n_words: maximum number of words appended after the first;
                       the chain stops early at a word with no successors
       :return: the chain joined with single spaces
       :raises ValueError: if ``word_dict`` is empty
    """
    if not word_dict:
        raise ValueError('word_dict is empty; cannot generate a sentence')

    # Pick the starting word from this dictionary's own keys, never from a
    # module-level corpus: a word taken from another corpus may not be a
    # key here, and the walk would then fail on its first step.
    starters = [word for word in word_dict if not word.islower()]
    if not starters:
        # No capitalised/non-lowercase key exists; any key is a valid start.
        starters = list(word_dict)

    chain = [np.random.choice(starters)]
    for _ in range(n_words):
        followers = word_dict.get(chain[-1])
        if not followers:
            # Dead end: this word was never followed by another one
            # (e.g. it is the final token of the corpus).
            break
        chain.append(np.random.choice(followers))
    return ' '.join(chain)

In [20]:
# Sample one random chain from the women's speeches dictionary.
get_sentence(womens_speeches_word_dict)

'Union New Jersey accent I arrived at this event of it and proves its smooth gliding'

In [19]:
# Sample one random chain from the comedians' speeches dictionary.
get_sentence(comedians_word_dict)

'U Hey that will say you Excuse me on my class of courtesy If your life'