In [30]:
import numpy as np
from random import randint, choice
import re
from collections import Counter, defaultdict

from nltk.corpus import stopwords 
from pprint import pprint

stop_words = set(stopwords.words('english')) 

In [81]:
with open("./trump_corpus.txt", "r") as f:
    text = f.read().split("\n")

def word_freq(text_list, remove_stopwords = False):
    cnt = Counter()
    for text in text_list:
        for word in text.split():
            word = re.sub("[^\w\d]", "", word)
            word = word.lower()
            if word:
                if (remove_stopwords and word not in stop_words) or (not remove_stopwords):
                    cnt[word] += 1      
    return cnt

def ngram(text_list, window = 3):
    ng = defaultdict(lambda : defaultdict(int))
    for text in text_list:
        text_words = [re.sub("[^\w\d]", "", word).lower() for word in text.split()]
        if len(text_words) < window:
            continue
        for i in range(window, len(text_words)):
            ng[' '.join(text_words[i-3: i])][text_words[i]] += 1
    return ng

cnt = word_freq(text, remove_stopwords = True)
print('\n10 MOST COMMON (NON STOPWORD) WORDS:\n', '='*35)
pprint(cnt.most_common(10))

ngram_common = Counter()
ng = ngram(text)
for k, v in ng.items():
    n = 0
    for v_ in v.values():
        n += v_
    ngram_common[k] += n

print('\n10 MOST COMMON NGRAMS (WINDOW SIZE = 3):\n', '='*35)
pprint(ngram_common.most_common(10))


10 MOST COMMON (NON STOPWORD) WORDS:
[('great', 377),
 ('person', 222),
 ('us', 185),
 ('people', 181),
 ('trade', 146),
 ('many', 143),
 ('country', 139),
 ('big', 137),
 ('democrats', 129),
 ('news', 113)]

10 MOST COMMON NGRAMS (WINDOW SIZE = 3):
[('the united states', 51),
 ('the fake news', 51),
 ('the white house', 34),
 ('fake news media', 31),
 ('will be a', 28),
 ('loves our military', 28),
 ('all of the', 26),
 ('has my full', 26),
 ('strong on crime', 26),
 ('there was no', 25)]


In [91]:
def simple_text_generator(ngram, window = 3, length=10):
    word_list = choice(list(ngram.keys())).split()
    for i in range(30):
        word_dict = ngram[' '.join(word_list[-window:])]
        if not word_dict:
            # this is necessary for non-contigous text
            word_list[-1] = word_list[-1] + "."
            word_list += choice(list(ngram.keys())).split()
        sample = randint(1, sum(ngram[' '.join(word_list[-window:])].values()))
        for word, count in ngram[' '.join(word_list[-window:])].items():
            sample -= count
            if sample <= 0: word_list.append(word); break
    return ' '.join(word_list)

print('\nSIMPLE TRUMP TEXT GENERATOR USING N GRAM\n')
for _ in range(5):
    print(f'TRUMP: {simple_text_generator(ng)}\n')


SIMPLE TRUMP TEXT GENERATOR USING N GRAM

TRUMP: and incredible help in getting our massive tax cut bill everyone is talking really nice to see. parkland we are determined to turn our grief into action full meeting. are starting to find out that it was indeed the

TRUMP: only he and bob mueller the leader of the 13 angry democrats and people that worked for obama for 8 years stop they have found no collussion with russia no obstruction but they

TRUMP: with meddling in our election where is the dnc server and why didnt the 13 angry democrats plus people who worked 8 years for obama working on the rigged russia witch hunt will

TRUMP: believe that with all of the wellwishers. author uses every trick in the book to demean and belittle i wish the people could see the real facts  and our country is doing great best financial

TRUMP: the second time with physical assault he doesnt know me but he would go down fast and hard crying all the way for ted in the upcoming primary  he will never l