# In [None]:
#---------------------------------------------------------------------
#This generates Trump style sentences
#---------------------------------------------------------------------

import random
from ngram import NgramModel #to run this import the nltk model branch is needed: https://github.com/nltk/nltk/tree/model
from nltk.probability import MLEProbDist
from collections import Counter
from numpy.random import choice

def do_ngram(model, x_gram):
    """Generate one tag sequence from an n-gram model.

    The sequence is seeded with ``x_gram - 2`` empty-string padding entries
    followed by "START"; new tags are then drawn from the model until it
    emits "END".

    Args:
        model: object exposing ``generate_one(context)``, where ``context`` is
            the list of the last ``x_gram - 1`` entries generated so far.
        x_gram: order of the n-gram model (intended range: 2-4).

    Returns:
        The generated tags as a list that begins with "START" and ends with
        "END"; the padding entries are stripped.
    """
    padding = x_gram - 2  # number of "" entries placed in front of "START"
    sentence = [""] * padding
    sentence.append("START")

    #generate new tags until "END" is reached
    while sentence[-1] != "END":
        context = sentence[-(x_gram - 1):]
        sentence.append(model.generate_one(context))

    # Strip exactly the padding that was added. (A fixed slice of 2 is only
    # right for x_gram == 4; for smaller orders it would also cut off "START".)
    return sentence[padding:]

#---------------------------------------------------------------------
#Settings
#---------------------------------------------------------------------

#Sampling settings; all draw ranges have to be >= 1
#number of most frequent candidate words from which the first word of a sentence is drawn
draw_range_start = 2
#number of most frequent candidate words from which every following word is drawn
draw_range = 4
#number of emission-dictionary entries to draw from when no candidate word was found in the topic corpus
draw_range_random = 3
#threshold p on a 0-100 scale: a candidate position gets a Trump style word when a random
#draw exceeds p, i.e. with roughly (100 - p) % probability
trump_word_insert_prob = 25
#number of sentences to generate (attempts that fail are retried and not counted)
number_of_sentences = 100

#---------------------------------------------------------------------
#Loading all required files
#---------------------------------------------------------------------

# NOTE(security): eval() executes whatever Python code these files contain.
# Only run this script on data files you generated yourself. If the files hold
# plain literals only, ast.literal_eval would be a safer replacement - TODO confirm.
#
# Every file is opened in a `with` block so the handle is closed right after
# reading; the file-object names stay bound (to the closed handles).

#load the tagged Trump corpus (iterated below as sentences of (word, tag) pairs)
with open("..\\data\\stanford_tagged_trump_corpus.txt", encoding= "utf-8") as trump_corpus_file:
    trump_corpus_tagged = eval(trump_corpus_file.read())

#load the emission probability dictionary (indexed by tag below; each value is a word -> weight mapping)
with open("..\\data\\emission_dict.txt", encoding= "utf-8") as emission_dict_file:
    prob_emission_dict = eval(emission_dict_file.read())

#open and load topic corpus (iterated below as sentences of (word, tag) pairs)
# NOTE(review): unlike the two files above, this one is read with the platform
# default encoding - confirm whether it should be utf-8 as well.
with open("..\\data\\stanford_tagged_topic_corpus.txt") as topic_file_tagged:
    topic_text_tagged = eval(topic_file_tagged.read())

#open and load style dictionary (keys are tag sequences; each value is a word -> count mapping)
# NOTE(review): also read with the platform default encoding - confirm.
with open("..\\data\\style_dict.txt") as style_dict_file:
    final_style_dict = eval(style_dict_file.read())

#---------------------------------------------------------------------
#Generate the sentences
#---------------------------------------------------------------------

#reduce every tagged sentence to its bare tag sequence, framed by the
#"START" and "END" markers the n-gram model is trained on
tag_corpus = []
for sentence in trump_corpus_tagged:
    tag_sentence = ["START"] + [pos_tag for _, pos_tag in sentence] + ["END"]
    tag_corpus.append(tag_sentence)

#train the 4-gram tag model (option to experiment with different models)
model = NgramModel(4, tag_corpus, MLEProbDist)

def find_sequence(subseq, seq):
    """Return the index at which the list *subseq* first occurs in *seq*, or -1.

    Brute-force search: every position holding ``subseq[0]`` is tried as a
    start, and the slice of ``seq`` beginning there is compared with
    ``subseq``. Both arguments should be lists, because the comparison is a
    plain ``==`` against a slice of ``seq``.

    Raises:
        IndexError: if *subseq* is empty.
    """
    i, n, m = -1, len(seq), len(subseq)
    try:
        while True:
            # list.index raises ValueError once no further start position exists
            i = seq.index(subseq[0], i + 1, n - m + 1)
            if subseq == seq[i:i + m]:
                return i
    except ValueError:
        return -1


sentence_count = 0
# Number of times the emission-probability fallback was used. It has to be
# initialised here: it used to be incremented without ever being defined,
# which raised a NameError the first time the fallback ran.
count = 0

while sentence_count != number_of_sentences:
    # An IndexError anywhere in one attempt (e.g. indexing past the end of a
    # short sentence) discards the attempt; it is retried and not counted.
    try:
        #create tag sentence; it starts with "START" and ends with "END"
        tag_sentence = do_ngram(model, 4)

        #---------------------------------------------------------------------
        #Generate output
        #---------------------------------------------------------------------

        #generate first word: collect the opening words of all topic sentences
        #whose first two tags equal the first two generated tags
        output_sentence = []
        collector = []
        for sentence in topic_text_tagged:
            if sentence[0][1] == tag_sentence[1] and sentence[1][1] == tag_sentence[2]:
                collector.append(sentence[0][0])

        #keep only the N most frequent candidates and normalise their counts to probabilities
        top = Counter(collector).most_common()[:draw_range_start]
        words = [word for word, _ in top]
        total = sum(freq for _, freq in top)
        counts = [freq * (1 / total) for _, freq in top]
        try:
            output_sentence.extend(list(choice(words, 1, p=counts)))
        except ValueError:
            #no candidate for the first word: start over with a new tag sentence
            continue

        #generate the remaining words; output_sentence[k] belongs to tag_sentence[k + 1]
        for idx, tag in enumerate(tag_sentence):
            if idx < 2 or tag == "END":
                continue

            previous = (output_sentence[-1], tag_sentence[idx - 1])
            collector = []
            for sentence in topic_text_tagged:
                for pair_index, pair in enumerate(sentence):
                    if pair_index == 0 or pair != previous:
                        continue
                    #same word with same tag as the word generated last
                    try:
                        next_tag = sentence[pair_index + 1][1]
                        # NOTE(review): the second condition compares the tag two
                        # positions ahead with next_tag itself, so a candidate is
                        # only accepted when it is followed by a word carrying the
                        # same tag. Possibly a look-ahead to tag_sentence[idx + 1]
                        # was intended - confirm before changing.
                        if next_tag == tag and sentence[pair_index + 2][1] == next_tag:
                            collector.append(sentence[pair_index + 1][0])
                    except IndexError:
                        #match too close to the end of the sentence
                        continue

            top = Counter(collector).most_common()[:draw_range]
            words = [word for word, _ in top]
            total = sum(freq for _, freq in top)
            counts = [freq * (1 / total) for _, freq in top]
            try:
                output_sentence.extend(list(choice(words, 1, p=counts)))
            except ValueError:
                #no candidate found: fall back to the emission probabilities of the tag
                try:
                    count += 1
                    # Takes the first N entries in the dict's iteration order;
                    # assumes the dict is stored sorted by descending
                    # probability - TODO confirm.
                    emissions = list(prob_emission_dict[tag].items())[:draw_range_random]
                    words = [word for word, _ in emissions]
                    total = sum(weight for _, weight in emissions)
                    counts = [weight * (1 / total) for _, weight in emissions]
                    output_sentence.extend(list(choice(words, 1, p=counts)))
                except KeyError:
                    #tag unknown to the emission dictionary: no word is emitted for it
                    continue

        #---------------------------------------------------------------------
        #Check if instances where Trump style words could be inserted exist
        #---------------------------------------------------------------------

        #each entry is [start index of the match in tag_sentence, tag sequence];
        #because of the leading "START" tag, that index addresses the 2nd word
        #of the matched sequence in output_sentence
        occurance = []
        for tag_sequence in final_style_dict:
            position = find_sequence(list(tag_sequence), tag_sentence)
            if position != -1:
                occurance.append([position, tag_sequence])

        #-----------------------------------------------------------------------------
        #Randomly replace selected words in the output sentence with Trump style words
        #-----------------------------------------------------------------------------

        for position, tag_sequence in occurance:
            # randint(1, 100) is uniform over 100 values, so the word is
            # exchanged with exactly (100 - trump_word_insert_prob) % probability
            if random.randint(1, 100) > trump_word_insert_prob:
                style_words = final_style_dict[tag_sequence]
                if len(style_words) > 1:
                    #draw one style word, weighted by its count
                    words = list(style_words)
                    weights = list(style_words.values())
                    total = sum(weights)
                    weights = [weight / total for weight in weights]
                    output_sentence[position] = "".join(list(choice(words, 1, p=weights)))
                else:
                    #at most one style word: use it directly
                    for word in style_words:
                        output_sentence[position] = word
        sentence_count += 1

        print(" ".join(output_sentence), '\n')

    except IndexError:
        continue