## Reference:
* ECE-684 Notes and TA Office Hours
* ECE-684 HW 3 (Fremgen)
* ECE-684 HW 3 (Antoun)
* ECE-684 HW 3 (Singh)

## Group Members: Richard Fremgen, Johnny Antoun, Jaskaran Singh

## Generative Probabilistic Model
### Language Model: Text Generation

### Load Packages

In [1]:
import nltk
import re
import pickle
import numpy as np
import random
from nltk.tokenize import word_tokenize

### Define Text Generation Function

In [2]:
def _followers(corpus, context):
    """Return the tokens that directly follow each occurrence of ``context``.

    Parameters
    ----------
    corpus : sequence of str
        Tokenized training text.
    context : non-empty sequence of str
        The tokens to look for.

    Returns
    -------
    list of str
        One entry per occurrence of ``context`` in ``corpus`` that has a
        following token, in corpus order (duplicates kept).
    """
    width = len(context)
    target = tuple(context)
    head = target[0]
    # Stopping at len(corpus) - width skips an occurrence sitting at the very
    # end of the corpus, where there is no following token to report.
    return [
        corpus[i + width]
        for i in range(len(corpus) - width)
        if corpus[i] == head and tuple(corpus[i:i + width]) == target
    ]


def finish_sentence(sentence, n, corpus, deterministic=True, max_len=10,
                    end_tokens=(".", "?", "!")):
    """Bare-bones Markov (n-gram) text generator with back-off.

    Starting from ``sentence``, tokens are appended one at a time. Each new
    token is chosen from the tokens that follow the last ``n - 1`` generated
    tokens in ``corpus``. If that context never occurs (with a successor), the
    model backs off to a shorter context, down to plain unigram frequencies,
    and returns to the full order ``n`` for the next token.

    Parameters
    ----------
    sentence : sequence of str
        Seed tokens. The input is not modified.
    n : int
        Order of the n-gram model (1 = unigram). Must be >= 1.
    corpus : sequence of str
        Tokenized training text.
    deterministic : bool, default True
        If True, pick the most frequent successor; ties go to the successor
        that appears first in the corpus. If False, sample a successor with
        probability proportional to its count using ``np.random.choice``
        (seed with ``np.random.seed`` for repeatable output).
    max_len : int, default 10
        Generation stops once the sentence has this many tokens.
    end_tokens : collection of str, default (".", "?", "!")
        Generation also stops right after one of these tokens is appended.

    Returns
    -------
    list of str
        The seed tokens followed by the generated tokens.

    Raises
    ------
    ValueError
        If ``n < 1``, or if a token has to be generated from an empty corpus.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    best_match = list(sentence)
    unigram_stats = None  # np.unique result for the whole corpus, computed once
    order = n

    while len(best_match) < max_len:

        # A context longer than the text produced so far can never match,
        # so skip straight to the longest usable order.
        order = min(order, len(best_match) + 1)

        if order == 1:
            # Unigram: every corpus token is a candidate successor.
            if len(corpus) == 0:
                raise ValueError("corpus must contain at least one token")
            if unigram_stats is None:
                unigram_stats = np.unique(corpus, return_index=True,
                                          return_counts=True)
            followers = corpus
            words, first_seen, counts = unigram_stats
        else:
            followers = _followers(corpus, best_match[-(order - 1):])

            # Context not found in corpus - back off to a shorter context
            if not followers:
                order -= 1
                continue

            words, first_seen, counts = np.unique(followers, return_index=True,
                                                  return_counts=True)

        if deterministic:
            # Most frequent successor; among ties, the earliest occurrence.
            tied = counts == counts.max()
            new_word = followers[first_seen[tied].min()]
        else:
            probs = counts / counts.sum()
            # str() turns the NumPy string scalar into a plain Python str.
            new_word = str(np.random.choice(words, size=1, p=probs)[0])

        best_match.append(new_word)

        # Return to the full model order after any back-off (including a
        # back-off all the way down to unigram).
        order = n

        if new_word in end_tokens:
            break

    return best_match

### Load Training Data and Pre-process

In [4]:
# Read the pickled movie plots; the context manager closes the file afterwards.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("plots_text.pickle", "rb") as pickle_in:
    movie_plots = pickle.load(pickle_in)

# Clean text: lowercase first so capital letters are kept (as lowercase)
# instead of being deleted by the filter, then drop every character that is
# not a-z, an apostrophe or a space. Lowercasing is a no-op on text that is
# already lowercase.
movie_plots = [re.sub("[^a-z' ]", "", i.lower()) for i in movie_plots]

# Tokenize each plot into a list of word tokens
new_corpus = [word_tokenize(movie) for movie in movie_plots]

# Flatten the per-plot token lists into one token stream
corpus = [item for sublist in new_corpus for item in sublist]

### Train on Real Data - Deterministic

In [18]:
# Bigram model on the real corpus, deterministic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 2, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl who is a new york city and the


In [19]:
# Trigram model on the real corpus, deterministic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 3, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl who is a good thing but the other


In [20]:
# 4-gram model on the real corpus, deterministic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 4, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl who 'd appeared and disappeared so mysteriously during


In [21]:
# 5-gram model on the real corpus, deterministic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 5, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl who 'd appeared and disappeared so mysteriously during


In [34]:
# Bigram model on the real corpus, deterministic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 2, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told that he is a new york city


In [35]:
# Trigram model on the real corpus, deterministic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 3, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told that the man who has been in


In [36]:
# 4-gram model on the real corpus, deterministic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 4, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told that the jewellery was given to gordon


In [37]:
# 5-gram model on the real corpus, deterministic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 5, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told that the jewellery was given to him


In [43]:
# Bigram model on the real corpus, deterministic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 2, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the film ends with the film ends with


In [44]:
# Trigram model on the real corpus, deterministic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 3, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the whole thing but the other hand is


In [45]:
# 4-gram model on the real corpus, deterministic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 4, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the whole family is being neglected by the


In [46]:
# 5-gram model on the real corpus, deterministic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 5, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the whole family is being neglected by the


In [51]:
# Bigram model on the real corpus, deterministic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 2, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have been a new york city and the film


In [52]:
# Trigram model on the real corpus, deterministic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 3, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have been a martial arts competition the international kindergarten


In [53]:
# 4-gram model on the real corpus, deterministic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 4, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have been waiting ten years for manslaughter eva tells


In [54]:
# 5-gram model on the real corpus, deterministic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 5, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have been waiting ten years for this he says


In [63]:
# Bigram model on the real corpus, deterministic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 2, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car and the film ends with the film ends


In [64]:
# Trigram model on the real corpus, deterministic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 3, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car accident with no apparent disapproval from the city


In [65]:
# 4-gram model on the real corpus, deterministic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 4, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car accident with no child to tie them together


In [66]:
# 5-gram model on the real corpus, deterministic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 5, True
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car accident with no child to tie them together


### Train on Real Data - Non-Deterministic

In [22]:
# Bigram model on the real corpus, stochastic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 2, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl should stay in a surprise birthday before he


In [23]:
# Trigram model on the real corpus, stochastic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 3, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl finally accepts and starts insulting ray by talking


In [24]:
# 4-gram model on the real corpus, stochastic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 4, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl 's trust he trades her a shiny silver


In [25]:
# 5-gram model on the real corpus, stochastic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 5, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the girl who saved his life eric rushes to kiss


In [33]:
# Bigram model on the real corpus, stochastic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 2, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told that jeff poindexter a medical clearance to


In [27]:
# Trigram model on the real corpus, stochastic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 3, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told the chairman concludes the hearing and praetorius


In [28]:
# 4-gram model on the real corpus, stochastic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 4, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told that since gauri has been convicted of


In [30]:
# 5-gram model on the real corpus, stochastic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 5, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

the man told to begin with words on why he


In [38]:
# Bigram model on the real corpus, stochastic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 2, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the movie has managed to armsized creatures in


In [39]:
# Trigram model on the real corpus, stochastic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 3, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the whole thing screams in anguish without thinking


In [40]:
# 4-gram model on the real corpus, stochastic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 4, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the whole family is now on french soil


In [42]:
# 5-gram model on the real corpus, stochastic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 5, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

as soon the whole family is being neglected by the


In [47]:
# Bigram model on the real corpus, stochastic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 2, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have transferred them dad ali lectures frida he even


In [48]:
# Trigram model on the real corpus, stochastic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 3, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have been located by the th century castle originally


In [49]:
# 4-gram model on the real corpus, stochastic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 4, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have been waiting ten years for manslaughter eva tells


In [50]:
# 5-gram model on the real corpus, stochastic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 5, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

i have been waiting ten years for this he says


In [62]:
# Bigram model on the real corpus, stochastic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 2, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car into the mountains while training in his suspicions


In [58]:
# Trigram model on the real corpus, stochastic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 3, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car chase that follows the fight back and forth


In [59]:
# 4-gram model on the real corpus, stochastic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 4, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car accident ambujakshi realizes to her horror that the


In [60]:
# 5-gram model on the real corpus, stochastic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 5, False
test = finish_sentence(sentence, n, corpus, deterministic=deterministic)
print(" ".join(test))

a car containing a shipment of gold bullion robert 's


### Generate Synthetic Data

In [4]:
# Deterministic Synthetic Data

# Fixed seed so the same 500 prompts are drawn on every run.
random.seed(123)

# Each prompt is a window of three consecutive corpus tokens whose start index
# is drawn uniformly from [3, len(corpus) - 3].
save_corpus = [
    corpus[begin:begin + 3]
    for begin in (random.randint(3, len(corpus) - 3) for _ in range(500))
]

# Extend every prompt with the deterministic 4-gram model.
save_data = [
    finish_sentence(sentence=prompt, n=4, corpus=corpus, deterministic=True)
    for prompt in save_corpus
]

# One space-joined string per generated sentence.
syn_det_data = [' '.join(tokens) for tokens in save_data]

# with open('det_syn_data.pkl', 'wb') as f:
#     pickle.dump(syn_det_data, f) 


In [5]:
# Non-Deterministic Synthetic Data

# finish_sentence samples with np.random, which is independent of the `random`
# module seeded for the prompt selection, so seed NumPy as well to make this
# cell produce the same sentences on every run.
np.random.seed(123)

save_data2 = [] 

# Extend every prompt in save_corpus with the stochastic 4-gram model.
for sent in save_corpus:
    new_sent2 = finish_sentence(sentence=sent, n=4, corpus=corpus, deterministic=False)
    save_data2.append(new_sent2)

# One space-joined string per generated sentence.
syn_det_data2 = [' '.join(val) for val in save_data2]

# with open('non_det_syn_data.pkl', 'wb') as f:
#     pickle.dump(syn_det_data2, f) 

### Train on Synthetic Data - Deterministic

In [67]:
# Load Synthetic Data
import pickle  # also imported in the first cell; kept so this cell runs on its own

# The context manager closes the file once the data has been read.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("det_syn_data.pkl", "rb") as pickle_in:
    syn_det_data = pickle.load(pickle_in)

# Tokenize each synthetic sentence into a list of word tokens
new_corpus2 = [word_tokenize(d) for d in syn_det_data]

# Flatten the per-sentence token lists into one token stream
syn_det_corpus = [item for sublist in new_corpus2 for item in sublist]

In [68]:
# Bigram model on the deterministic synthetic corpus: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 2, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

the girl attending a new 'master ' who is a


In [69]:
# Trigram model on the deterministic synthetic corpus: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 3, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

the girl attending a college a long way from home


In [70]:
# 5-gram model on the deterministic synthetic corpus: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 5, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

the girl attending a college a long way from home


In [79]:
# Bigram model on the deterministic synthetic corpus: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 2, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

the man told to the three children for the three


In [80]:
# Trigram model on the deterministic synthetic corpus: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 3, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

the man told to wait back holds his games refuse


In [81]:
# 5-gram model on the deterministic synthetic corpus: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 5, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

the man told to wait back holds his games refuse


In [85]:
# Bigram model on the deterministic synthetic corpus: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 2, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

as soon the three children for the three children for


In [86]:
# Trigram model on the deterministic synthetic corpus: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 3, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

as soon the three men conducted a guerrilla war against


In [87]:
# 5-gram model on the deterministic synthetic corpus: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 5, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

as soon the three men conducted a guerrilla war against


In [91]:
# Bigram model on the deterministic synthetic corpus: complete "i have".
sentence, n, deterministic = ['i', 'have'], 2, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

i have to the three children for the three children


In [92]:
# Trigram model on the deterministic synthetic corpus: complete "i have".
sentence, n, deterministic = ['i', 'have'], 3, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

i have to murdered girls are buried in anand 's


In [93]:
# 5-gram model on the deterministic synthetic corpus: complete "i have".
sentence, n, deterministic = ['i', 'have'], 5, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

i have to murdered girls are buried in anand 's


In [97]:
# Bigram model on the deterministic synthetic corpus: complete "a car".
sentence, n, deterministic = ['a', 'car'], 2, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

a car around the three children for the three children


In [98]:
# Trigram model on the deterministic synthetic corpus: complete "a car".
sentence, n, deterministic = ['a', 'car'], 3, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

a car around boys before the festival the pool is


In [99]:
# 5-gram model on the deterministic synthetic corpus: complete "a car".
sentence, n, deterministic = ['a', 'car'], 5, True
test = finish_sentence(sentence, n, syn_det_corpus, deterministic=deterministic)
print(" ".join(test))

a car around boys before the festival the pool is


### Train on Synthetic Data - Non-Deterministic (stochastically generated corpus; completions below use deterministic=True)

In [72]:
# Load Synthetic Data
import pickle  # also imported in the first cell; kept so this cell runs on its own

# The context manager closes the file once the data has been read.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("non_det_syn_data.pkl", "rb") as pickle_in:
    syn_det_data2 = pickle.load(pickle_in)

# Tokenize each synthetic sentence into a list of word tokens
new_corpus3 = [word_tokenize(d) for d in syn_det_data2]

# Flatten the per-sentence token lists into one token stream
syn_non_det_corpus = [item for sublist in new_corpus3 for item in sublist]

In [73]:
# Bigram model on the stochastic synthetic corpus, deterministic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 2, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

the girl attending a young lady beatrice russo who is


In [74]:
# Trigram model on the stochastic synthetic corpus, deterministic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 3, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

the girl 's in the back where she recognizes her


In [75]:
# 5-gram model on the stochastic synthetic corpus, deterministic: complete "the girl".
sentence, n, deterministic = ['the', 'girl'], 5, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

the girl 's in the spring unfortunately evil magician murgatroyd


In [76]:
# Bigram model on the stochastic synthetic corpus, deterministic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 2, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

the man told to the film also shocked as the


In [77]:
# Trigram model on the stochastic synthetic corpus, deterministic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 3, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

the man told to wait back holds his games refuse


In [78]:
# 5-gram model on the stochastic synthetic corpus, deterministic: complete "the man told".
sentence, n, deterministic = ['the', 'man', 'told'], 5, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

the man told to wait back holds his games refuse


In [82]:
# Bigram model on the stochastic synthetic corpus, deterministic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 2, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

as soon the film also shocked as the film also


In [83]:
# Trigram model on the stochastic synthetic corpus, deterministic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 3, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

as soon the film also chika puts her furs back


In [84]:
# 5-gram model on the stochastic synthetic corpus, deterministic: complete "as soon the".
sentence, n, deterministic = ['as', 'soon', 'the'], 5, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

as soon the film also chika puts her furs back


In [88]:
# Bigram model on the stochastic synthetic corpus, deterministic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 2, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

i have to the film also shocked as the film


In [89]:
# Trigram model on the stochastic synthetic corpus, deterministic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 3, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

i have to murdered girls are buried in anand 's


In [90]:
# 5-gram model on the stochastic synthetic corpus, deterministic: complete "i have".
sentence, n, deterministic = ['i', 'have'], 5, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

i have to murdered girls are buried in anand 's


In [94]:
# Bigram model on the stochastic synthetic corpus, deterministic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 2, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

a car around boys before the film also shocked as


In [95]:
# Trigram model on the stochastic synthetic corpus, deterministic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 3, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

a car compelled to tell her he a raft they


In [96]:
# 5-gram model on the stochastic synthetic corpus, deterministic: complete "a car".
sentence, n, deterministic = ['a', 'car'], 5, True
test = finish_sentence(sentence, n, syn_non_det_corpus, deterministic=deterministic)
print(" ".join(test))

a car compelled to tell anna of jacob now the
