# In [1]:
import nltk

class Generator:
    """Random sentence generator backed by MLE n-gram language models.

    Training sentences are flattened into one token stream in which every
    sentence is wrapped in ``begin``/``end`` markers.  One conditional model
    is trained per context length, and ``generate`` walks those models from
    the ``begin`` marker until the ``end`` marker is produced.

    A "token" is whatever iterating a sentence yields: items of a list of
    words, or single characters when the sentence is a plain string (which
    results in a character-level model).
    """

    # Tokens removed from every training sentence.
    IGNORED = ['"', '\'']

    # Tokens that `formation` glues onto the preceding output chunk.
    NO_SPACE_BEFORE = [',', '.', '?', ':', ';', ')', '!', "n't", "''", "'t"]
    # A token starting with one of these characters is glued backwards too.
    NO_SPACE_BEFORE_PREFIX = ['.', '\'']
    # Tokens that `formation` holds back and glues onto the following token.
    NO_SPACE_AFTER = ['(', '``']
    # Sentence boundary markers inserted around every training sentence.
    begin = '<s>'
    end = '</s>'

    def __init__(self, sentences, ngram=2):
        """Train one model per context length from 1 up to ``ngram``.

        Args:
            sentences: Iterable of sentences, each itself an iterable of
                tokens (a plain string is iterated character by character).
            ngram: Longest context, in tokens, used to predict the next
                token.  Must be at least 1.

        Raises:
            ValueError: If ``ngram`` is smaller than 1.
        """
        if ngram < 1:
            raise ValueError("ngram must be at least 1, got %r" % (ngram,))

        # langmod[k] maps a context of k tokens to the distribution of the
        # next token.  langmod[1] is keyed by the bare token; every longer
        # context is keyed by a tuple of tokens.
        self.langmod = {}
        self.ngram = ngram
        words = self._clean_sentences(sentences)

        self.langmod[1] = self._bigram_mle_model(words)
        for i in range(2, ngram + 1):
            # Predicting from i tokens of context requires (i + 1)-grams.
            self.langmod[i] = self._ngram_mle_model(words, i + 1)

    @staticmethod
    def _unigram_mle_model(words):
        """Return an MLE model of single tokens, conditioned on ``()``."""
        ngrams = nltk.ngrams(words, 1)
        cfdist = nltk.ConditionalFreqDist(((), x[0]) for x in ngrams)
        return nltk.ConditionalProbDist(cfdist, nltk.MLEProbDist)

    @staticmethod
    def _bigram_mle_model(words):
        """Return an MLE model of the next token given the previous token.

        Conditions are bare tokens, not 1-tuples.
        """
        cfdist = nltk.ConditionalFreqDist(nltk.bigrams(words))
        return nltk.ConditionalProbDist(cfdist, nltk.MLEProbDist)

    @staticmethod
    def _ngram_mle_model(words, n):
        """Return an MLE model of the n-th token given the ``n - 1`` before it.

        Conditions are tuples holding the first ``n - 1`` tokens of each
        n-gram found in ``words``.
        """
        ngrams = nltk.ngrams(words, n)
        cfdist = nltk.ConditionalFreqDist(
            (tuple(x[:n - 1]), x[n - 1]) for x in ngrams)
        return nltk.ConditionalProbDist(cfdist, nltk.MLEProbDist)

    @staticmethod
    def _clean_sentences(sents):
        """Flatten ``sents`` into one token list with boundary markers.

        Every sentence contributes ``begin``, its tokens minus those listed
        in ``IGNORED``, then ``end``.
        """
        result = []
        for sent in sents:
            result.append(Generator.begin)
            result.extend(
                word for word in sent if word not in Generator.IGNORED)
            result.append(Generator.end)
        return result

    @staticmethod
    def formation(sentence):
        """Join generated tokens into a single string.

        Tokens in ``NO_SPACE_AFTER`` are held back and prefixed to the next
        token.  Tokens in ``NO_SPACE_BEFORE``, or starting with a character
        in ``NO_SPACE_BEFORE_PREFIX``, are appended to the previous chunk.
        Chunks are concatenated without a separator.  Held-back tokens that
        are never followed by another token are not emitted.

        Args:
            sentence: Iterable of string tokens.

        Returns:
            The assembled string.
        """
        result = []
        buf = ""
        for word in sentence:
            if word in Generator.NO_SPACE_AFTER:
                buf += word
                continue
            # word[:1] instead of word[0]: an empty token must not raise.
            attaches_back = (
                word in Generator.NO_SPACE_BEFORE
                or word[:1] in Generator.NO_SPACE_BEFORE_PREFIX)
            if attaches_back and result:
                result[-1] += buf + word
            else:
                result.append(buf + word)
            buf = ""
        return "".join(result)

    def generate(self, as_list=False):
        """Generate one random sentence.

        Starting from the ``begin`` marker, the next token is repeatedly
        drawn from the model matching the current context length until the
        ``end`` marker is drawn.  The context grows up to ``self.ngram``
        tokens and then slides, so the highest-order model that was trained
        is the one used for the bulk of the sentence.

        Args:
            as_list: When true, return the raw token list instead of the
                string built by ``formation``.

        Returns:
            A list of tokens (without boundary markers) or the joined string.
        """
        sentence = []
        context = [Generator.begin]
        while context[-1] != Generator.end:
            if len(context) == 1:
                # The one-token model is keyed by the token itself.
                dist = self.langmod[1][context[0]]
            else:
                dist = self.langmod[len(context)][tuple(context)]
            cur = dist.generate()

            context.append(cur)
            # Keep at most `ngram` tokens so the lookup above never asks for
            # a model order that __init__ did not train.
            if len(context) > self.ngram:
                context.pop(0)

            if cur != Generator.end:
                sentence.append(cur)

        if as_list:
            return sentence
        return self.formation(sentence)


def read_sentences_from_file(file_path):
    """Read a UTF-8 text file and split its lower-cased text into sentences.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The list of sentence strings produced by ``nltk.sent_tokenize`` for
        the lower-cased file content.
    """
    # The context manager closes the file even when reading raises.
    with open(file_path, 'r', encoding="utf8") as file_obj:
        text = file_obj.read()
    return nltk.sent_tokenize(text.lower())


if __name__ == '__main__':
    # Demo: train on the sentences read from speeches.txt, then print five
    # generated sentences.
    tokens = read_sentences_from_file("speeches.txt")
    obj = Generator(tokens, 6)

    for _ in range(5):
        print(obj.generate())


# Output:
# nafta.
# i loves than part of it.
# but i feel good such nice to buy than ever to pay a big excavators, go again – forget, who are all-time, it’s right.
# and important, i guess around the way, it to this is get 2 million.
# but...i called local leading in the middle class is out and they’re doing.
