In [15]:
import numpy as np
from random import randint, choice
import re
from collections import Counter, defaultdict

from nltk.corpus import stopwords 
from pprint import pprint

stop_words = set(stopwords.words('english')) 

In [55]:
with open("./tolstoy.txt", "r") as f:
    text = f.read().split("\n")

def word_freq(text_list, remove_stopwords = False):
    cnt = Counter()
    for text in text_list:
        for word in text.split():
            word = re.sub("[^\w\d]", "", word)
            word = word.lower()
            if word:
                if (remove_stopwords and word not in stop_words) or (not remove_stopwords):
                    cnt[word] += 1      
    return cnt

def ngram(text_list, window = 3):
    """
    "This is a long weekend" -> "This is a" -> "long"
                             -> "is a long" -> "weekend"
                             -> "a long weekend" -> ""
    
    
    "This is a" : Counter("long" : 123, "bad" : 12, ...)
    """
    ng = defaultdict(lambda : defaultdict(int))
    for text in text_list:
        text_words = [re.sub("[^\w\d]", "", word).lower() for word in text.split()]
        if len(text_words) < window:
            continue
        for i in range(window, len(text_words)):
            ng[' '.join(text_words[i-window: i])][text_words[i]] += 1
    return ng

cnt = word_freq(text, remove_stopwords = True)
print('\n10 MOST COMMON (NON STOPWORD) WORDS:\n', '='*35)
pprint(cnt.most_common(10))

ngram_common_1 = Counter()
ng = ngram(text)
for k, v in ng.items():
    n = 0
    for v_ in v.values():
        n += v_
    ngram_common_1[k] += n

print('\n10 MOST COMMON NGRAMS (WINDOW SIZE = 3):\n', '='*35)
pprint(ngram_common_1.most_common(10))


10 MOST COMMON (NON STOPWORD) WORDS:
[('said', 2834),
 ('one', 2045),
 ('prince', 1886),
 ('pierre', 1784),
 ('would', 1362),
 ('could', 1110),
 ('natásha', 1092),
 ('man', 1064),
 ('andrew', 1039),
 ('time', 921)]

10 MOST COMMON NGRAMS (WINDOW SIZE = 3):
[('he did not', 193),
 ('one of the', 146),
 ('out of the', 146),
 ('as soon as', 127),
 ('that he was', 121),
 ('that it was', 104),
 ('up to the', 104),
 ('he could not', 104),
 ('which he had', 98),
 ('did not know', 96)]


In [54]:
with open("./trump_corpus.txt", "r") as f:
    text = f.read().split("\n")

def word_freq(text_list, remove_stopwords = False):
    cnt = Counter()
    for text in text_list:
        for word in text.split():
            word = re.sub("[^\w\d]", "", word)
            word = word.lower()
            if word:
                if (remove_stopwords and word not in stop_words) or (not remove_stopwords):
                    cnt[word] += 1      
    return cnt

def ngram(text_list, window = 3):
    """
    "This is a long weekend" -> "This is a" -> "long"
                             -> "is a long" -> "weekend"
                             -> "a long weekend" -> ""
    
    
    "This is a" : Counter("long" : 123, "bad" : 12, ...)
    """
    ng = defaultdict(lambda : defaultdict(int))
    for text in text_list:
        text_words = [re.sub("[^\w\d]", "", word).lower() for word in text.split()]
        if len(text_words) < window:
            continue
        for i in range(window, len(text_words)):
            ng[' '.join(text_words[i-window: i])][text_words[i]] += 1
    return ng

cnt = word_freq(text, remove_stopwords = True)
print('\n10 MOST COMMON (NON STOPWORD) WORDS:\n', '='*35)
pprint(cnt.most_common(10))

ngram_common = Counter()
ng = ngram(text)
for k, v in ng.items():
    n = 0
    for v_ in v.values():
        n += v_
    ngram_common[k] += n

print('\n10 MOST COMMON NGRAMS (WINDOW SIZE = 3):\n', '='*35)
pprint(ngram_common.most_common(10))


10 MOST COMMON (NON STOPWORD) WORDS:
[('great', 377),
 ('person', 222),
 ('us', 185),
 ('people', 181),
 ('trade', 146),
 ('many', 143),
 ('country', 139),
 ('big', 137),
 ('democrats', 129),
 ('news', 113)]

10 MOST COMMON NGRAMS (WINDOW SIZE = 3):
[('the united states', 51),
 ('the fake news', 51),
 ('the white house', 34),
 ('fake news media', 31),
 ('will be a', 28),
 ('loves our military', 28),
 ('all of the', 26),
 ('has my full', 26),
 ('strong on crime', 26),
 ('there was no', 25)]


In [56]:
ngram_common & ngram_common_1

Counter({'i want to': 8,
         'has been a': 3,
         'been a great': 1,
         'with him in': 1,
         'him in his': 1,
         'the attempt to': 1,
         'attempt to stop': 1,
         'to stop the': 3,
         'have decided that': 1,
         'of the wall': 1,
         'will not be': 6,
         'in the center': 2,
         'the center of': 2,
         'of this great': 1,
         'the world of': 1,
         'world of the': 1,
         'of the power': 1,
         'the power of': 2,
         'of prayer and': 1,
         'prayer and the': 1,
         'the gift of': 1,
         'at a small': 1,
         'in a single': 1,
         'more than two': 1,
         'men and women': 10,
         'of love and': 1,
         'and hope to': 1,
         'it was an': 3,
         'the loss of': 2,
         'of so many': 1,
         'are determined to': 1,
         'yesterday at the': 1,
         'the white house': 1,
         'and many others': 1,
         'we must not': 1,
         '

In [38]:
def simple_text_generator(ngram, window = 5, length=10):
    word_list = choice(list(ngram.keys())).split()
    for i in range(30):
        word_dict = ngram[' '.join(word_list[-window:])]
        if not word_dict:
            # this is necessary for non-contigous text
            word_list[-1] = word_list[-1] + "."
            word_list += choice(list(ngram.keys())).split()
        sample = randint(1, sum(ngram[' '.join(word_list[-window:])].values()))
        for word, count in ngram[' '.join(word_list[-window:])].items():
            sample -= count
            if sample <= 0: word_list.append(word); break
    return ' '.join(word_list)

print('\nSIMPLE TRUMP TEXT GENERATOR USING N GRAM\n')
for _ in range(5):
    print(f'TOLSTOY: {simple_text_generator(ng)}\n')


SIMPLE TRUMP TEXT GENERATOR USING N GRAM

TOLSTOY: to be used as a curtain and behind that screen. at all shamefaced as pierre expected to find. suite all exchanged rapid looks that expressed. planks while others stood about doing nothing. is it my dear do you want me to go and tell him. his quarters the little old man with. the blackguards have looted there see what that one has behind. learned that despite the loyalty with which

TOLSTOY: sitting on the outer sill was being forced out by two footmen who. soldiers and officers bore a large. the conversation which again became general the. natásha did she could not see people unconcernedly but had to send. prince andrew turned away and began pacing the room. interested prince vasíli just then and. the pain though at night his feet were more. the stone entrance gates of the drive and

TOLSTOY: the enemy was the smoke of a cannon rose. performing one of the most solemn duties of. you exclaimed deeply moved voices as. stood with her slen

In [None]:
trump = {'us': 5}
tolstoy = {
    'war': 1313
}

trump + tolstoy -> most occuring word is 'war': 1313
                -> most occuring word is 'us': 5 ---> 5 to 1313 -> 5 * (1313/5)
