In [1]:
from collections import Counter
import math as calc
from nltk import word_tokenize
import pandas as pd
import string
import os

In [2]:
class nGramProbability():
    """Count-based n-gram language model (orders 1 to 5) with Laplace add-1 smoothing.

    The corpus is read from a CSV file with a ``title`` column. Every title is
    tokenized, stripped of digit-only and punctuation tokens, lower-cased and
    wrapped in ``<s>`` / ``</s>`` markers. All titles are concatenated into one
    flat token list (``self.words``), so n-grams may span title boundaries.
    Counts are stored in ``collections.Counter`` objects keyed by the
    space-joined n-gram string (``self.unigram`` ... ``self.pentigram``).

    n: highest n-gram order to build in the constructor (1-5). Orders that were
       not built up front are built on first use by ``sentence_probability_mle``.
    corpus_file: path to the corpus CSV file.
    """

    # Attribute that stores the counts of each supported order. The matching
    # builder method is always named ``create_<attribute>``.
    _MODEL_ATTRS = {1: 'unigram', 2: 'bigram', 3: 'trigram', 4: 'quadrigram', 5: 'pentigram'}

    def __init__(self, n=1, corpus_file=None):
        """Load the corpus from ``corpus_file`` and build the count tables for orders 1..n."""
        self.words = []
        self.load_corpus(corpus_file)
        self.unigram = self.bigram = self.trigram = self.quadrigram = self.pentigram = None
        self.create_unigram()
        if n >= 2:
            self.create_bigram()
        if n >= 3:
            self.create_trigram()
        if n >= 4:
            self.create_quadrigram()
        if n >= 5:
            self.create_pentigram()

    def tokenize(self, phrase):
        """Tokenize ``phrase`` with NLTK's Spanish ``word_tokenize`` and post-process the tokens.

        See ``_postprocess_tokens`` for the adjustments applied to the raw tokens.
        Returns a list of strings.
        """
        return self._postprocess_tokens(word_tokenize(phrase, language='spanish'))

    @staticmethod
    def _postprocess_tokens(tokens):
        """Glue '%', '$', '€' onto the previous token and split off a leading '¿' / '¡'.

        tokens: iterable of raw token strings (not modified).
        Returns a new list. A symbol with no preceding token is kept as its own
        token. A token consisting only of '¿' or '¡' is kept as-is; no empty
        string is emitted for its (missing) remainder.
        """
        merged = []
        for token in tokens:
            if token in ('%', '$', '€') and merged:
                # Attach the symbol to whatever token precedes it ("50", "%" -> "50%").
                merged[-1] += token
            else:
                merged.append(token)

        result = []
        for token in merged:
            if token.startswith(('¿', '¡')):
                result.append(token[0])
                # Only keep the remainder when there is one; a lone '¿' would
                # otherwise produce an empty-string token.
                if len(token) > 1:
                    result.append(token[1:])
            else:
                result.append(token)
        return result

    def read_sentences_from_file(self, file_path):
        """Read the ``title`` column of the CSV at ``file_path`` and return one flat token list.

        Each title is tokenized, digit-only tokens are dropped, tokens that are
        substrings of ``string.punctuation`` are dropped, the rest is lower-cased,
        and the title is wrapped in ``<s>`` / ``</s>``.
        """
        data = pd.read_csv(file_path)
        corpus_tokens = []

        for title in data.title.values:
            tokens = [token for token in self.tokenize(title) if not token.isdigit()]
            # NOTE: ``in string.punctuation`` is a substring test on the punctuation
            # string, not a per-character test; kept as-is so counts stay comparable
            # with text pre-processed the same way elsewhere.
            tokens = [token.lower() for token in tokens if token not in string.punctuation]

            corpus_tokens.append("<s>")
            corpus_tokens.extend(tokens)
            corpus_tokens.append("</s>")

        return corpus_tokens

    def load_corpus(self, file_name):
        """Load the corpus CSV ``file_name`` into ``self.words``."""
        print("Loading Corpus from data file")
        self.words = self.read_sentences_from_file(file_name)
        print("Processing Corpus")

    def _ngram_strings(self, order):
        """Return every run of ``order`` consecutive corpus tokens, joined by single spaces.

        Returns an empty list when the corpus holds fewer than ``order`` tokens.
        """
        words = self.words
        return [' '.join(words[start:start + order])
                for start in range(len(words) - order + 1)]

    def _model(self, order):
        """Return the count table for ``order``, building it first if it is missing."""
        attr = self._MODEL_ATTRS[order]
        if getattr(self, attr, None) is None:
            getattr(self, 'create_' + attr)()
        return getattr(self, attr)

    def create_unigram(self):
        """Count single tokens of the corpus into ``self.unigram``."""
        print("Creating Unigram Model")
        print("Calculating Count for Unigram Model")
        self.unigram = Counter(self.words)

    def create_bigram(self):
        """Count 2-token sequences of the corpus into ``self.bigram``."""
        print("Creating Bigram Model")
        biwords = self._ngram_strings(2)
        print("Calculating Count for Bigram Model")
        self.bigram = Counter(biwords)

    def create_trigram(self):
        """Count 3-token sequences of the corpus into ``self.trigram``."""
        print("Creating Trigram Model")
        triwords = self._ngram_strings(3)
        print("Calculating Count for Trigram Model")
        self.trigram = Counter(triwords)

    def create_quadrigram(self):
        """Count 4-token sequences of the corpus into ``self.quadrigram``."""
        print("Creating Quadrigram Model")
        quadriwords = self._ngram_strings(4)
        print("Calculating Count for Quadrigram Model")
        self.quadrigram = Counter(quadriwords)

    def create_pentigram(self):
        """Count 5-token sequences of the corpus into ``self.pentigram``."""
        print("Creating pentigram Model")
        pentiwords = self._ngram_strings(5)
        print("Calculating Count for pentigram Model")
        self.pentigram = Counter(pentiwords)

    def sentence_probability_mle(self, sentence, n=1, form='antilog'):
        """Return the add-1 smoothed n-gram probability of ``sentence``.

        The sentence is tokenized with ``self.tokenize`` and scored as given: no
        ``<s>`` / ``</s>`` markers are added and no lower-casing or filtering is
        done here. For each n-gram the term is

            (count(ngram) + 1) / (count(context) + V)

        where V is the number of distinct unigrams and ``context`` is the first
        n-1 tokens of the n-gram (for n == 1 the context count is the total
        number of corpus tokens). The terms are summed in log space.

        sentence: text to score.
        n: n-gram order, 1 to 5.
        form: 'log' for the natural-log probability, 'antilog' for the probability.

        Returns 0 ('log') or 1.0 ('antilog') when the sentence has fewer than
        ``n`` tokens, since there is then no n-gram to score.
        Raises ValueError for an unsupported ``n`` or ``form``.
        """
        if n not in self._MODEL_ATTRS:
            raise ValueError("n must be one of 1, 2, 3, 4, 5; got {!r}".format(n))
        if form not in ('log', 'antilog'):
            raise ValueError("form must be 'log' or 'antilog'; got {!r}".format(form))
        n = int(n)

        ngram_counts = self._model(n)
        context_counts = self._model(n - 1) if n > 1 else None
        vocabulary_size = len(self._model(1))

        words = self.tokenize(sentence)
        P = 0
        for start in range(len(words) - n + 1):
            ngram = ' '.join(words[start:start + n])
            if context_counts is None:
                context_total = len(self.words)
            else:
                context_total = context_counts[' '.join(words[start:start + n - 1])]

            # laplace add-1
            P += calc.log((ngram_counts[ngram] + 1) / (context_total + vocabulary_size))

        if form == 'log':
            return P
        return calc.pow(calc.e, P)

In [14]:
# Cross-evaluation: train one 5-gram model per training CSV and report, for every
# test CSV, the mean log-probability of its titles under orders 1 to 5.
# NOTE(review): these are absolute, machine-specific paths; consider moving them to
# a configuration cell relative to the repository root.
files_train = '/home/plubeda/git_repo/specialist-lexicon/03_treatment_text/dataframes/df_specialty_title_abstract/'
files_test = '/home/plubeda/git_repo/specialist-lexicon/03_treatment_text/dataframes/df_specialty_title_abstract_case_report/'

# Sorted so that the table order is the same on every run (os.listdir order is arbitrary).
for file_train in sorted(os.listdir(files_train)):

    ng_train = nGramProbability(5, os.path.join(files_train, file_train))
    # Text after the first underscore of the file name; the whole name if there is none.
    train_label = file_train.split("_", 1)[-1]

    # Header uses the same widths as the rows for the two name columns.
    print('{:30} {:30} {:6s}   {:6s}   {:6s}   {:6s}   {:6s} '.format("Train", "Test", "Prob1Gram", "Prob2Gram", "Prob3Gram", "Prob4Gram", "Prob5Gram" ))
    for file_test in sorted(os.listdir(files_test)):

        test_label = file_test.split("_", 1)[-1]
        data_test = pd.read_csv(os.path.join(files_test, file_test))
        list_titles = data_test.title.values

        if len(list_titles) == 0:
            # Nothing to average; avoid dividing by zero below.
            print('{:30} {:30} (no titles)'.format(train_label, test_label))
            continue

        # One list of per-title log-probabilities for each n-gram order.
        log_probs = {order: [] for order in (1, 2, 3, 4, 5)}

        for title in list_titles:

            # pre-processing as in training (remove digits, remove punctuation, convert to lowercase)
            list_tokens = ng_train.tokenize(title)
            list_tokens = [token for token in list_tokens if not token.isdigit()]
            list_tokens = [token.lower() for token in list_tokens if not token in string.punctuation]
            title = ' '.join(list_tokens)

            for order, results in log_probs.items():
                results.append(ng_train.sentence_probability_mle(sentence=title, n=order, form='log'))

        unigram, bigram, trigram, quadigram, pentigram = (
            sum(results) / len(results) for results in log_probs.values()
        )

        print('{:30} {:30} {:06.3f}   {:06.3f}   {:06.3f}   {:06.3f}   {:06.3f} '.format(train_label, test_label, unigram, bigram, trigram, quadigram, pentigram ))

Loading Corpus from data file
Processing Corpus
Creating Unigram Model
Calculating Count for Unigram Model
Creating Bigram Model
Calculating Count for Bigram Model
Creating Trigram Model
Calculating Count for Trigram Model
Creating Quadrigram Model
Calculating Count for Quadrigram Model
Creating pentigram Model
Calculating Count for pentigram Model
Train                          Test                      Prob1Gram   Prob2Gram   Prob3Gram   Prob4Gram   Prob5Gram 
general_practice.csv           general_practice.csv           -63.043   -58.587   -55.387   -49.532   -42.928 
general_practice.csv           medical_oncology.csv           -62.465   -55.010   -49.618   -43.148   -36.872 
general_practice.csv           forensic_medicine.csv          -58.061   -52.331   -47.433   -40.774   -34.812 
general_practice.csv           ophthalmology.csv              -65.275   -58.199   -52.981   -46.418   -39.924 
general_practice.csv           surgery_plastic.csv            -76.232   -69.707   -64.893

KeyboardInterrupt: 