## 1. Smoothing models

In [86]:
import math
import os
import string
import time
from collections import Counter
from collections import deque
from itertools import islice

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk import word_tokenize, sent_tokenize
from nltk.util import ngrams

In [91]:
def tokenize_unigram(phrase):
    """Split ``phrase`` into cleaned word tokens.

    Steps, in order:

    1. Tokenize with NLTK's ``word_tokenize`` using ``language='spanish'``.
    2. Glue a stand-alone ``%``, ``$`` or ``€`` token onto the token that
       precedes it (a symbol in first position is left alone).
    3. Split a leading ``¿`` or ``¡`` off the token it is attached to.
    4. Drop tokens that are a single ``string.punctuation`` character,
       tokens made only of digits, and empty strings.

    :param phrase: raw text to tokenize
    :return: list of the remaining tokens, in their original order
    """
    attachable_symbols = ('%', '$', '€')
    opening_marks = ('¿', '¡')
    punctuation_chars = set(string.punctuation)

    # Step 2: append each symbol to the previously kept token, if there is one.
    merged = []
    for raw_token in word_tokenize(phrase, language='spanish'):
        if merged and raw_token in attachable_symbols:
            merged[-1] += raw_token
        else:
            merged.append(raw_token)

    # Steps 3 and 4: split opening marks, then filter each resulting piece.
    cleaned = []
    for token in merged:
        if token.startswith(opening_marks):
            pieces = (token[0], token[1:])
        else:
            pieces = (token,)
        for piece in pieces:
            if not piece or piece in punctuation_chars or piece.isdigit():
                continue
            cleaned.append(piece)

    return cleaned

In [69]:
def word_freq(tokens):
    """Count how many times each item occurs in ``tokens``.

    The items only need to be hashable, so the function works both for plain
    word tokens and for n-gram tuples.

    :param tokens: iterable of hashable items (words or n-gram tuples)
    :return: plain ``dict`` mapping each distinct item to its number of
             occurrences, keyed in order of first appearance
    """
    # Counter does the tallying; convert back so callers keep getting a dict.
    return dict(Counter(tokens))

In [70]:
def unigram_V(training_tokens, unk_threshold):
    """Build the unigram vocabulary (with counts) from the training tokens.

    Every word whose count is ``<= unk_threshold`` is removed from the
    vocabulary and its occurrences are credited to ``UNK_token`` instead.
    ``STOP_token`` and ``UNK_token`` themselves are never folded, so their
    counts are not added to the unknown bucket a second time.

    Side effects on module globals:

    * ``total_words_len`` is set to ``len(training_tokens)``.
    * ``replaced_tokens_train`` is set to a new list holding the training
      tokens with every folded word replaced by ``UNK_token``. The list passed
      in as ``training_tokens`` is left untouched.

    :param training_tokens: list of tokens from the training text
    :param unk_threshold: words seen at most this many times become UNK
    :return: dict ``{word: count}`` including an ``UNK_token`` entry
    """
    global total_words_len
    global replaced_tokens_train

    total_words_len = len(training_tokens)

    # Seed UNK first so the entry always exists, even when nothing is folded.
    counts = {UNK_token: 0}
    for token in training_tokens:
        counts[token] = counts.get(token, 0) + 1

    # Reserved tokens stay in the vocabulary whatever their frequency.
    reserved = (STOP_token, UNK_token)
    unk_words = {
        word
        for word, count in counts.items()
        if count <= unk_threshold and word not in reserved
    }

    # Move the mass of each rare word into the UNK bucket.
    for word in unk_words:
        counts[UNK_token] += counts.pop(word)

    # Build the UNK-substituted token stream without mutating the input list.
    replaced_tokens_train = [
        UNK_token if token in unk_words else token
        for token in training_tokens
    ]
    return counts

In [84]:
def ngrams_gen(tokens, n):
    """Yield the n-grams of ``tokens`` as tuples, sentence by sentence.

    The tokens are joined with spaces, a newline is inserted after every
    ``STOP_token`` and the result is passed to ``sent_tokenize``; a sliding
    window of width ``n`` is then moved over the whitespace-split words of
    each returned sentence, so no n-gram spans two sentences.

    Only full windows are produced: a sentence with fewer than ``n`` words
    contributes nothing, and each n-gram occurrence is yielded exactly once.

    :param tokens: iterable of string tokens (training data, test data or a
                   single sentence)
    :param n: size of the n-grams
    :return: generator of ``n``-tuples of words
    """
    text = ' '.join(tokens)
    text = text.replace(STOP_token, STOP_token + '\n')

    # NOTE(review): whether the inserted newline actually forces a sentence
    # break depends on sent_tokenize's model - confirm against NLTK.
    for sentence in sent_tokenize(text):
        words = iter(sentence.split())
        window = deque(islice(words, n), maxlen=n)
        if len(window) < n:
            # Sentence too short to hold a single n-gram.
            continue
        yield tuple(window)
        for word in words:
            # maxlen=n makes append() drop the oldest word automatically.
            window.append(word)
            yield tuple(window)

In [72]:
def unigrams_prob(uni_count_dict):
    """Compute the MLE probability of every unigram.

    Each count is divided by the module-level ``total_words_len``. The result
    is a freshly created dict; ``uni_count_dict`` is not modified.

    :param uni_count_dict: dict ``{word: count}`` for the training vocabulary
    :return: dict ``{word: count / total_words_len}``
    :raises ZeroDivisionError: if ``total_words_len`` is still 0
    """
    total = float(total_words_len)
    return {word: count / total for word, count in uni_count_dict.items()}

In [73]:
def ngram_prob(n, tokens, unigram_count):
    """Estimate MLE conditional probabilities for the n-grams of ``tokens``.

    The n-grams are produced by ``ngrams_gen(tokens, n)`` and counted with
    ``word_freq``. Each n-gram count is then divided by the count of its
    history:

    * ``n == 2``: the history is the first word of the bigram and its count
      is looked up in ``unigram_count`` (q(w | v) = c(v, w) / c(v)).
    * ``n > 2``: the history is the leading (n-1)-gram and its count comes
      from counting ``ngrams_gen(tokens, n - 1)``.

    :param n: n-gram order
    :param tokens: token sequence handed to ``ngrams_gen``
    :param unigram_count: dict ``{word: count}``; only read when ``n == 2``
    :return: dict ``{ngram_tuple: probability}``, or ``None`` when ``n < 2``
    :raises KeyError: if a history is missing from the count table used
    """
    gram_counts = word_freq(list(ngrams_gen(tokens, n)))

    if n == 2:
        return {
            gram: freq / unigram_count[gram[0]]
            for gram, freq in gram_counts.items()
        }

    if n > 2:
        history_counts = word_freq(list(ngrams_gen(tokens, n - 1)))
        return {
            gram: freq / history_counts[gram[:n - 1]]
            for gram, freq in gram_counts.items()
        }

    # Orders below 2 are not handled by this function.
    return None

In [74]:
def logprob(word, prob_dict):
    """Return the negative base-2 log probability of ``word``.

    :param word: key to look up (a word, or an n-gram tuple)
    :param prob_dict: dict mapping keys to their probability
    :return: ``-log2(prob_dict[word])``
    :raises KeyError: if ``word`` is not a key of ``prob_dict``
    :raises ValueError: if the stored probability is not positive
    """
    # log2 is exact for powers of two, unlike the two-argument math.log form.
    return -math.log2(prob_dict[word])

In [75]:
def entropy(test_test, n, prob_dict, smooth_type):
    """Compute the per-token cross-entropy (in bits) of a text under a model.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is tokenized with ``tokenize_unigram``.

    * ``n == 1``: every word contributes ``logprob`` of the word itself, or
      of ``UNK_token`` when the word is not a key of ``prob_dict``.
    * ``n > 1``: the sentence's n-grams come from ``ngrams_gen``; the first
      ``n - 1`` of them are skipped. A known n-gram contributes its
      ``logprob``. For an unknown n-gram the result depends on
      ``smooth_type``: ``NO_SMOOTHING`` makes the entropy infinite,
      ``ADD_K_SMOOTHING`` uses the ``UNSEEN_NGRAM`` entry of ``prob_dict``,
      and any other value contributes nothing.

    The sum is divided by ``len(tokens) - (n - 1) * number_of_sentences``,
    where ``tokens`` is the tokenization of the whole text.

    :param test_test: raw test/dev text
    :param n: model order
    :param prob_dict: dict mapping words (n == 1) or n-gram tuples (n > 1)
                      to probabilities
    :param smooth_type: one of the module's smoothing identifiers
    :return: cross-entropy as a float; ``math.inf`` for an unseen n-gram
             under ``NO_SMOOTHING``
    :raises ZeroDivisionError: if the normalising token count is 0
    :raises KeyError: if ``UNK_token`` / ``UNSEEN_NGRAM`` is needed but is
                      not in ``prob_dict``
    """
    # Number of words in the whole text.
    text_len = len(tokenize_unigram(test_test))

    sentences = sent_tokenize(test_test)
    sent_num = len(sentences)

    entr = 0.0

    if n == 1:
        for sent in sentences:
            for word in tokenize_unigram(sent):
                # Out-of-vocabulary words are scored as the unknown token.
                key = word if word in prob_dict else UNK_token
                entr += logprob(key, prob_dict)
    elif n > 1:
        for sent in sentences:
            sentence_ngrams = tuple(ngrams_gen(tokenize_unigram(sent), n))

            # NOTE(review): ngrams_gen already starts with a full window, so
            # skipping n-1 items may drop valid n-grams - confirm the intent.
            for gram in sentence_ngrams[n - 1:]:
                if gram in prob_dict:
                    entr += logprob(gram, prob_dict)
                elif smooth_type == NO_SMOOTHING:
                    # An unsmoothed model gives the n-gram probability 0,
                    # i.e. -log2(0): the entropy is infinite.
                    return math.inf
                elif smooth_type == ADD_K_SMOOTHING:
                    entr += logprob(UNSEEN_NGRAM, prob_dict)

    return entr / float(text_len - (n - 1) * sent_num)

In [76]:
def perplexity(test_text, n, prob_dict, smooth_type):
    """Return the perplexity of ``test_text`` under an n-gram model.

    Perplexity is ``2 ** H`` where ``H`` is the value returned by
    ``entropy`` for the same arguments.

    :param test_text: raw test/dev text
    :param n: model order
    :param prob_dict: probability table passed through to ``entropy``
    :param smooth_type: smoothing identifier passed through to ``entropy``
    :return: perplexity as a float
    """
    # Use a distinct local name: assigning to ``entropy`` here would make it
    # a local variable and the call below would raise UnboundLocalError.
    cross_entropy = entropy(test_text, n, prob_dict, smooth_type)
    return math.pow(2.0, cross_entropy)

In [77]:
# GLOBALS

# Special tokens.
STOP_token = '_STOP_'
UNK_token = '_UNK_'
# Key for n-grams that appear in dev/test data but not in training data.
UNSEEN_NGRAM = '_UNSEEN_'

# Smoothing identifiers.
NO_SMOOTHING = 'no smoothing'
ADD_K_SMOOTHING = 'add_k_smoothing'
LINER_INT = 'liner interpolation'

# Threshold handed to unigram_V for folding rare words into UNK_token.
unk_threshold = 1

# Shared state, filled in while training.
# Total number of word tokens in the training data (duplicates included).
total_words_len = 0
# Training tokens.
replaced_tokens_train = []
vocabulary = set()

In [92]:
## Trial run on a dataframe: train the unigram model on its titles.
df = pd.read_csv('/home/plubeda/git_repo/specialist-lexicon/03_treatment_text/dataframes/df_specialty_title_abstract/H02.403.600_neurology.csv')

# Lower-case every title and join them into one training string.
# Missing titles are skipped and non-string values are converted, so a stray
# NaN cannot break .lower(); str.join avoids rebuilding the string per row.
titles = df['title'].dropna().astype(str).str.lower()
text_train = ' '.join(titles).strip()
train_token = tokenize_unigram(text_train)

print('tokenization done!')

unigram_count = unigram_V(train_token,unk_threshold)

# the set of words in the unigram vocabulary
vocabulary = set(unigram_count.keys())

# generate unigram probability dict (from a copy of the counts)
uni_prob_dict = unigram_count.copy()

unigrams_prob_dict = unigrams_prob(uni_prob_dict)

V = len(vocabulary)

print("Vocabulary lenth",V)
print('total_words_len',total_words_len)
print("training unigram finished")

# generate trigram probability dict
#trigram_prob_dict = ngram_prob(3,replaced_tokens_train, unigram_count)
#print("training trigram finished")

# generate bigram probability dict
#bigram_prob_dict = ngram_prob(2,replaced_tokens_train, unigram_count)
#print("training bigram finished")

tokenization done!
Vocabulary lenth 10189
total_words_len 253895
training unigram finished


In [93]:
# Notebook display cell: echo the unigram probability table built above.
unigrams_prob_dict

{'facomatosis': 1.9693180251678844e-05,
 'hipercolesterolemia': 1.5754544201343076e-05,
 'riluzol': 7.877272100671538e-06,
 'diagnosticados': 0.00010240453730872998,
 'evolución': 0.0010279840091376355,
 'supraórticos': 7.877272100671538e-06,
 'familiares': 0.00017329998621477382,
 'profunda': 0.0001260363536107446,
 'conductuales': 9.452726520805845e-05,
 'amnesias': 7.877272100671538e-06,
 'paramediano': 7.877272100671538e-06,
 'original': 7.877272100671538e-06,
 'motilidad': 2.757045235235038e-05,
 'castellano': 7.089544890604384e-05,
 'sociolaboral': 7.877272100671538e-06,
 'paraneoplásicas': 1.1815908151007306e-05,
 'alejandro': 1.1815908151007306e-05,
 'paroxística': 3.544772445302192e-05,
 'codepeh': 1.1815908151007306e-05,
 'saber': 2.757045235235038e-05,
 'alcance': 2.3631816302014612e-05,
 'existen': 3.544772445302192e-05,
 'línea': 3.150908840268615e-05,
 'pediátricas': 5.120226865436499e-05,
 'ada': 7.877272100671538e-06,
 'neurocognitiva': 1.5754544201343076e-05,
 'radioin

In [94]:
# Get the unigram perplexity for each test dataframe.
# A low perplexity indicates the probability distribution is good at predicting the sample.
# Perplexity in NLP captures the degree of 'uncertainty' a model has in
# predicting (assigning probabilities to) some text.
print('perplexity for unsmoothed unigram:')
path_df_test = '/home/plubeda/git_repo/specialist-lexicon/03_treatment_text/dataframes/df_specialty_title_abstract_case_report/'
list_specialties = os.listdir(path_df_test)

for specialty in list_specialties:
    df = pd.read_csv(os.path.join(path_df_test, specialty))

    # Lower-case every title and join them into one test string.
    # Missing titles are skipped and non-string values are converted, so a
    # stray NaN cannot break .lower().
    titles = df['title'].dropna().astype(str).str.lower()
    text_test = ' '.join(titles).strip()

    score = perplexity(text_test,1,unigrams_prob_dict,NO_SMOOTHING)
    print("%-60s %-20s" % (specialty, score))

perplexity for unsmoothed unigram:
H02.403.340_general_practice.csv                             302.3258154749782   
H02.403.429.515_medical_oncology.csv                         466.537765641947    
H02.403.330_forensic_medicine.csv                            395.4829650161554   
H02.403.810.468_ophthalmology.csv                            481.7630471379692   
H02.403.810.788_surgery_plastic.csv                          405.507889018929    
H02.403.429.480_infectious_disease_medicine.csv              395.02479693797795  
H02.403.600_neurology.csv                                    575.5783551904927   
H02.403.763_reproductive_medicine.csv                        333.18267429777023  
H02.403.690_psychiatry.csv                                   420.55973732973695  
H02.403.429.730_rheumatology.csv                             463.514047747573    
H02.403.740_radiology.csv                                    514.5686163238114   
H02.403.044.500_immunochemistry.csv                          43