## KneyserNey  Model

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from math import log10

In [10]:
class kneyserNey():
    
    '''import pandas as pd
    import numpy as np
    import nltk
    from math import log10
    '''
    def __init__(self):
        #setting in fit
        self.ngram_order = None
        self.all_gram = None
        self.vocab = None
        
    def make_ngrams(self, text, n):
        '''
        takes a text and n gram and creates an n-gram for it
        '''
        # Parse text into sentences
        sent_text = sent_tokenize(text)
        # Get n-grams
        dict_ngram = {}
        for sentence in sent_text:
            sentence = (n-1)*"<s> " + sentence # create n-1 pseudo tokens
            n_grams = nltk.ngrams(sentence.split(), n)
            for grams in n_grams:
                new_gram = []
                # Change infrequent words into unknown
                for word in grams:
                    word = word.strip(".").strip("?").strip("!").strip(";").strip(":").strip('"')
                    wLower = word.lower()
                    new_gram.append(wLower)
                new_gram = tuple(new_gram)
                if new_gram in dict_ngram:
                    dict_ngram[new_gram] = dict_ngram[new_gram] + 1 
                else:
                    dict_ngram[new_gram] = 1
        return dict_ngram

    def fit(self, text, ngram_order):
        '''
        Create the fit database based on the order
        '''
        all_gram = {}
        for gram in range(1, ngram_order+1):
            all_gram[gram] = self.make_ngrams(text, gram)
        vocab = len(all_gram[1]) -1 # -1 to take care of start phrase
        self.ngram_order = ngram_order
        self.all_gram = all_gram
        self.vocab = vocab
        return self
    
    def score(self, phrase, d):
        '''
        Performs basic checks before proceeding to calculate score of the phrase
        '''
        phrase = tuple(phrase.lower().split())
        if (d <= 0 ) or (d >= 1):
            return "Please discounting a value between 0 and 1"
        elif (len(phrase) < self.ngram_order) or (len(phrase) > self.ngram_order):
            return ("Please give the ngram order as %d for your current phrase" %(len(phrase)))
        else:
            return self.calculate_score(phrase, d)
            
        
    def calculate_score(self, phrase, d):
        '''
        Calculate the calculate_score based on the phrase of reference
        '''
        ngram_len = len(phrase)
        all_gram = self.all_gram
        ngram_order = self.ngram_order
        vocab = self.vocab
        if ngram_len == 1: # base case
            if phrase in all_gram[ngram_len]:
                probability = all_gram[ngram_len][phrase]/vocab
                return log10(probability) 
            else:
                return log10(1/(vocab + 1)) # the word does not exist
        else: #recursive case    
            if ngram_len == ngram_order: ##counting  case
                if phrase in all_gram[ngram_len]:
                    num_1 = max(all_gram[ngram_len][phrase] - d, 0)
                    num_2 = len([each for each in all_gram[ngram_len] if phrase[:-1] == each[:-1]])
                    denom = sum([all_gram[ngram_len][each] for each in all_gram[ngram_len] if phrase[:-1] == each[:-1]])
                    probability = num_1/denom + d*num_2/denom*self.calculate_score(phrase[1:], d)
                    return probability
                else:
                    probability = self.calculate_score(phrase[1:], d) # we check for one lower gram
                    return probability

            else: #continuous counting case
                if phrase in all_gram[ngram_len]:
                    num_1 = max(len([each for each in all_gram[ngram_len+1] if phrase == each[1:]]) - d, 0)
                    num_2 = len([each for each in all_gram[ngram_len] if phrase[:-1] == each[:-1]])
                    denom = len([each for each in all_gram[ngram_len + 1 ] if phrase[:-1] == each[1:-1]])
                    probability = num_1/denom + d*num_2/denom*self.calculate_score(phrase[1:], d)
                    return probability
                else:
                    probability = self.calculate_score(phrase[1:], d) # we check for one lower gram
                    return probability

### Example Use Case for KneyserNey Model

In [11]:
chapter = ""
with open ('Austen_Pride.txt','r') as f:
    for line in f:
        chapter += line
chapter = chapter.replace('\n', ' ').replace("ï»¿", "").strip("'").strip("`")

phrase='truth universally hated'
ngram_order = 3
d = 0.75
prideKN = kneyserNey()
prideKN.fit(chapter, ngram_order)
prideKN.score(phrase, d)

-3.3802564783641027

# Testing the KneyserNey  Model on brown corpus

In [6]:
from nltk.corpus import brown
#nltk.download('brown')

In [7]:
# For Brown corpus processing
def make_sentence(text):
    '''
    Converts the corpus into a text with sentences
    '''
    text_as_sentence = ""
    for word in text:
        if word.isalpha():
            text_as_sentence = text_as_sentence + " " + word
        else:
            text_as_sentence = text_as_sentence + word
    return text_as_sentence        

In [8]:
def predict_genre(phrase, ngram_order, d):
    '''
    inputs : phrase (str) : the phrase who's score you are trying to predict
             ngram_order ()
    '''
    #generate categories
    scifi = make_sentence(brown.words(categories='science_fiction'))
    rom = make_sentence(brown.words(categories='romance'))
    myst = make_sentence(brown.words(categories='mystery'))
    #make predictions
    category = kneyserNey()
    predict = {"Science Fiction" : category.fit(scifi, ngram_order).score(phrase, d), \
               "Romance" : category.fit(rom, ngram_order).score(phrase, d), \
               "Mystery" : category.fit(myst, ngram_order).score(phrase, d)}
    #Get max value
    return max(predict, key=predict.get)

### Example Use Case to choose between the genres of a particular text

In [12]:
predict_genre('i love you', 3, 0.5)

'Romance'

In [13]:
predict_genre('this is scary', 3, 0.75)

'Science Fiction'

In [22]:
predict_genre('the mysterious crime', 3, 0.75)

'Mystery'