# NLTK with Language Model

https://www.nltk.org/api/nltk.lm.html

https://www.nltk.org/_modules/nltk/test/unit/lm/test_models.html

In [129]:
from nltk.lm import MLE
from nltk.tokenize import word_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm.models import Laplace, KneserNeyInterpolated, WittenBellInterpolated
import os
import pandas as pd
import string

In [130]:
def tokenize(phrase):
    tokens = word_tokenize(phrase, language='spanish')
    i_offset = 0
    for i, t in enumerate(tokens):
        i -= i_offset
        if (t == '%' or t == '$' or t == '€')and i > 0:
            left = tokens[:i-1]
            joined = [tokens[i - 1] + t]
            right = tokens[i + 1:]
            tokens = left + joined + right
            i_offset += 1

    news_tokens = []
    for t in tokens:
        if t.startswith('¿') or t.startswith('¡'):
            news_tokens.append(t[0])
            news_tokens.append(t[1:])
        else:
            news_tokens.append(t)

    return news_tokens

In [131]:
def prepare_data(csv):
    data = pd.read_csv(csv)
    list_titles = data.title.values
    
    list_titles_return = []
    
    for title in list_titles:
       
        tokens_title = tokenize(title)
        tokens_title = [token for token in tokens_title if not token.isdigit()]
        tokens_title = [token.lower() for token in tokens_title if not token in string.punctuation]

        list_titles_return.append(tokens_title)
    
    return list_titles_return

In [None]:
train_sentences = [['an', 'apple'], ['an', 'orange']]
test_sentences = [['an', 'an'], [ "orange"]]


files_train = '../03_treatment_text/dataframes/df_specialty_title_abstract/'
files_test = '../03_treatment_text/dataframes/df_specialty_title_abstract_case_report/'

for file_train in os.listdir(files_train):

    if not 'general_practice' in file_train:
    
        train_sentences = prepare_data(files_train + file_train)

        print('\n{:30} {:25} {:6s}   {:6s}   {:6s}'.format("Train", "Test", "Laplace1Gram", "Laplace2Gram", "Laplace3Gram"))
        for file_test in os.listdir(files_test):

            print('\n{:30} {:25} '.format(file_train.split("_", 1)[1], file_test.split("_", 1)[1])  , end = '')

            test_sentences = prepare_data(files_test + file_test)


            for n in range(1, 4):

                '''
                    MLE
                train_data, padded_vocab = padded_everygram_pipeline(n, train_sentences)
                test_data, _ = padded_everygram_pipeline(n, test_sentences)

                # Lets train a N-grams model, previously we set n=1 or 2 or 3
                model = MLE(n) # MLE / Laplace add1 / KneserNeyInterpolated / WittenBellInterpolated / 
                # fit on padded vocab that the model know the new tokens added to vocab (<s>, </s>, UNK etc)
                model.fit(train_data, padded_vocab)

                sum = 0
                cont = 0
                for index, test in enumerate(test_data):
                    pp = model.perplexity(test)

                    #print(index, test_sentences[index])
                    #print("PP (model MLE) {}-gram: {}".format( n , pp))

                    cont += 1
                    sum += pp

                print("PP (model MLE) {}-gram: {}".format( n , sum/cont))
                '''



                '''
                    Lappace
                '''

                train_data, padded_vocab = padded_everygram_pipeline(n, train_sentences)
                test_data, _ = padded_everygram_pipeline(n, test_sentences)

                model = Laplace(n) # MLE / Laplace add1 / KneserNeyInterpolated / WittenBellInterpolated / 
                model.fit(train_data, padded_vocab)

                sum = 0
                cont = 0
                for index, test in enumerate(test_data):
                    pp = model.perplexity(test)
                    cont += 1
                    sum += pp

                print("{:6f}   ".format(sum/cont) , end = '')


                '''
                    KneserNeyInterpolated


                train_data, padded_vocab = padded_everygram_pipeline(n, train_sentences)
                test_data, _ = padded_everygram_pipeline(n, test_sentences)

                model = KneserNeyInterpolated(n) # MLE / Laplace add1 / KneserNeyInterpolated / WittenBellInterpolated / 
                model.fit(train_data, padded_vocab)

                sum = 0
                cont = 0
                for index, test in enumerate(test_data):
                    try:
                        pp = model.perplexity(test)
                    except ZeroDivisionError:
                        pp = float('inf')

                    cont += 1
                    sum += pp

                print("PP (model KneserNeyInterpolated) {}-gram: {}".format( n , sum/cont))

                '''
                    #WittenBellInterpolated
                '''

                train_data, padded_vocab = padded_everygram_pipeline(n, train_sentences)
                test_data, _ = padded_everygram_pipeline(n, test_sentences)

                model = WittenBellInterpolated(n) # MLE / Laplace add1 / KneserNeyInterpolated / WittenBellInterpolated / 
                model.fit(train_data, padded_vocab)

                sum = 0
                cont = 0
                for index, test in enumerate(test_data):
                    pp = model.perplexity(test)
                    cont += 1
                    sum += pp

                print("PP (model WittenBellInterpolated) {}-gram: {}".format( n , sum/cont))
                '''


Train                          Test                      Laplace1Gram   Laplace2Gram   Laplace3Gram

medical_oncology.csv           general_practice.csv      1403.805670   1385.993641   1257.148111   
medical_oncology.csv           medical_oncology.csv      4653.949746   