In [3]:
import math
import random
import numpy as np
import pandas as pd
import re
import nltk
nltk.data.path.append('.')

In [4]:
with open("setopatiopinion.txt", "r", encoding="utf-8") as f:
    data = f.read()

In [5]:
def split_to_sentences(data):
    #get stripped sentences in array removing empty sentences 
    sentences = re.sub(r'[^क-नःप-रलव-हा-ृेैोौ्ँंॐ‍ अ-ऌएऐओऔ।]', "",data)
    sentences = sentences.split('।')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    
    return sentences    

In [10]:
def tokenize_sentences(sentences):
    #get array of tokens/words for each of the sentences
    tokenized_sentences = []
    
    for sentence in sentences:

        tokenized =  nltk.word_tokenize(sentence)

        tokenized_sentences.append(tokenized)
    
    return tokenized_sentences

In [11]:
def get_tokenized_data(data):
    #Combined effect of 
        # 1. Splitting into sentences 
        # 2. Tokenizeing sentences
    sentences = split_to_sentences(data)
    tokenized_sentences = tokenize_sentences(sentences)
    
    return tokenized_sentences

In [12]:
get_tokenized_data('विस्तारै त्यसको हिसाब खोज्न थालियो। विस्तारै प्रश्न सोध्न थालियो। विस्तारै उपनिवेशहरू जाग्न थाले।')

[['विस्तारै', 'त्यसको', 'हिसाब', 'खोज्न', 'थालियो'],
 ['विस्तारै', 'प्रश्न', 'सोध्न', 'थालियो'],
 ['विस्तारै', 'उपनिवेशहरू', 'जाग्न', 'थाले']]

# Tokenize sentences and Divide into TEST and TRAIN

In [13]:
#1. tokenize the data 
#2. suffle
#3. split data into train(80% data) and remaining 20% into test_data
tokenized_data = get_tokenized_data(data)
print(tokenized_data[0])
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

['पहिला', 'पहिला', 'शक्तिमा', 'हुनेहरूले', 'आफ्नो', 'पोट्रेट', 'बनाउन', 'लगाउने', 'चलन', 'थियो']


In [17]:
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

print("First training sample:")
print(train_data[0])
      
print("First test sample")
print(test_data[0])

114985 data are split into 91988 train and 22997 test set
First training sample:
['अर्कातिर', 'दोस्रो', 'विश्वयुद्ध', 'पछि', 'बनेको', 'विश्वव्यवस्थामा', 'परिवर्तन', 'आइरहेको', 'छ']
First test sample
['यो', 'कर्मकाण्डले', 'नै', 'मुलुकको', 'लोकतान्त्रिक', 'भविष्य', 'निर्धारण', 'गर्छ', 'भन्ने', 'नेहरूको', 'मान्यता', 'छ']


In [18]:
def get_words_ftable(tokenized_sentences):
   # get dictionary containing all tokens(words) with their frequecy
    word_counts = {}
    
    for sentence in tokenized_sentences: 

        for token in sentence: 


            if token not in word_counts :
                word_counts[token] = 1
            
            else:
                word_counts[token] += 1

    
    return word_counts

In [19]:
def get_vocab_above_threshold(tokenized_sentences, threshold):
    
    closed_vocab = []
    
    # Get the word couts of the tokenized sentences
    word_counts = get_words_ftable(tokenized_sentences)
    
    for word, cnt in word_counts.items(): 
        
        if cnt >= threshold  :
            closed_vocab.append(word)

    return closed_vocab

In [20]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    # tokenized sentences with unk
    vocabulary = set(vocabulary)
    
    # Initialize a list that will hold the sentences after less frequent words are replaced by the unknown token
    replaced_tokenized_sentences = []
    
    for sentence in tokenized_sentences:
        
        # Initialize the list that will contain a single sentence with "unknown_token" replacements
        replaced_sentence = []

        for token in sentence: 

            if token in vocabulary: 

                replaced_sentence.append(token)
            else:

                replaced_sentence.append(unknown_token)
     
        # Append the list of tokens to the list of lists
        replaced_tokenized_sentences.append(replaced_sentence)
        
    return replaced_tokenized_sentences

In [21]:
def preprocess_data(train_data, test_data, threshold):

    # Get the closed vocabulary using the train data
    vocabulary = get_vocab_above_threshold (train_data, threshold)
    
    # For the train data, replace less common words with "<unk>"
    train_data_replaced = replace_oov_words_by_unk(train_data, vocabulary, unknown_token="<unk>")
    
    # For the test data, replace less common words with "<unk>"
    test_data_replaced = replace_oov_words_by_unk(test_data, vocabulary, unknown_token="<unk>")

    return train_data_replaced, test_data_replaced, vocabulary

### Preprocess the train and test data

In [22]:
#gives vocabulary/ (closed vocabulary with threshold)
#      train_data_replaced with <unk> using vocab
#      test_data_replaced with <unk> using vocab
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, test_data, threshold=2)

In [23]:
print("First preprocessed training sample:")
print(train_data_processed[0], len(train_data_processed))
print()
print("First preprocessed test sample:")
print(test_data_processed[0],len(test_data_processed))
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['अर्कातिर', 'दोस्रो', 'विश्वयुद्ध', 'पछि', 'बनेको', '<unk>', 'परिवर्तन', 'आइरहेको', 'छ'] 91988

First preprocessed test sample:
['यो', '<unk>', 'नै', 'मुलुकको', 'लोकतान्त्रिक', 'भविष्य', 'निर्धारण', 'गर्छ', 'भन्ने', 'नेहरूको', 'मान्यता', 'छ'] 22997

First 10 vocabulary:
['अर्कातिर', 'दोस्रो', 'विश्वयुद्ध', 'पछि', 'बनेको', 'परिवर्तन', 'आइरहेको', 'छ', 'देवदार', 'पवित्रता']

Size of vocabulary: 46638


In [24]:
def get_ngrams_ftable(tokenized_sentences, n, start_token='<s>', end_token = '<e>'):

    # Initialize dictionary of n-grams and their counts
    n_grams = {}
    
    # Go through each sentence in the data
    for sentence in tokenized_sentences: 
        
        # prepend start token n times, and  append <e> one time
        sentence = n* [start_token] + sentence +  [end_token ]
        
        # convert list to tuple
        # So that the sequence of words can be used as a key in the dictionary
        sentence = tuple (sentence)
        
    #count frequency of each ngram
        for i in range(len(sentence) - n + 1 ): 

            # Get the n-gram from i to i+n
            n_gram =   sentence [i: i + n]

            if n_gram in n_grams : 

                n_grams[n_gram] + = 1
            else:

                n_grams[n_gram] = 1
                
            #n_grams[n_gram]=n_grams.get(ngram,0)+1
    

    return n_grams

In [25]:
def estimate_probability(word, current_ngram_chunk, 
                         ngram_ftable, nplus1_gram_ftable, vocab_size, k=1.0):
    #for finding, given current_chunk what is the probability of the word...
    # convert list to tuple to use it as a dictionary key
    current_ngram_chunk = tuple(current_ngram_chunk)
    
    
    current_ngram_freq =  ngram_ftable.get(current_ngram_chunk, 0) 
    denominator = current_ngram_freq + (k * vocab_size)

    # current chunk with word
    nplus1_gram_chunk = current_ngram_chunk + (word,)

    nplus1_gram_freq = nplus1_gram_ftable.get(nplus1_gram_chunk, 0)
        
    numerator = nplus1_gram_freq + k

    probability = numerator / denominator
    
    return probability

### Estimate probabilities for all words

The function defined below loops over all words in vocabulary to calculate probabilities for all possible words.


In [26]:
def estimate_probabilities(current_ngram, ngram_ftable, nplus1_gram_ftable, vocabulary, k=1.0):

    # convert list to tuple to use it as a dictionary key
    current_ngram = tuple(current_ngram)
    
    # add <e> <unk> to the vocabulary
    # <s> is not needed since it should not appear as the next word
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, current_ngram, 
                                           ngram_ftable, nplus1_gram_ftable, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities

In [27]:
import math

In [28]:
def calculate_perplexity(sentence, ngram_ftable, nplus1_gram_ftable, vocabulary_size,n, k=1.0):
   
    # prepend <s> and append <e>
    sentence = ["<s>"] * n + sentence + ["<e>"]
    sentence = tuple(sentence)
    
    # length of sentence (after adding <s> and <e> tokens)
    N = len(sentence)
    
    # The variable p will hold the product
    # that is calculated inside the n-root
    # Update this in the code below
    product_pi = 1.0
    
    summation=0
    # Index t ranges from 0 to N - n, inclusive on both ends
    for t in range(0, N-n): # complete this line

        # get the n-gram preceding the word at position t
        ngram = sentence[t:t+n]
        
        # get the word at position t
        word = sentence[t+n]

        probability =estimate_probability(word, ngram, ngram_ftable, nplus1_gram_ftable, vocabulary_size, k=k)
        summation+=math.log(probability)
#         product_pi *=  (1 / probability)

#     perplexity = product_pi ** (1 / N)
#     perplexity = math.exp(-summation/ N)

    return summation, N

In [29]:
def suggest_a_word(previous_words, ngram_ftable, nplus1_gram_ftable, vocabulary, n, k=1.0, start_with=None):
       
    
    # get the most recent 'n' words from previous words as the previous n-gram
    previous_ngram = previous_words[-n:]

    # Estimate the probabilities that each word in the vocabulary is the next word
    
    probabilities = estimate_probabilities(previous_ngram,
                                           ngram_ftable, nplus1_gram_ftable,
                                           vocabulary, k=k)

    suggestion = None
    
    max_prob = 0

    for word, prob in probabilities.items(): 
        
        if start_with is not None:   
            if   not word.startswith(start_with):
                continue 
        
        if prob > max_prob :
            
            suggestion = word
            max_prob = prob
  
    return suggestion, max_prob

In [30]:
# def calculate_prediction_values(sentence, ngram_ftable, nplus1_gram_ftable, vocabulary,n, k=1.0):
   
#     # length of previous words
    
#     # prepend <s> and append <e>
#     sentence = ["<s>"] * n + sentence + ["<e>"]
#     sentence = tuple(sentence)
    
#     # length of sentence (after adding <s> and <e> tokens)
#     N = len(sentence)
    
#     # The variable p will hold the product
#     # that is calculated inside the n-root
#     # Update this in the code below
#     count = 1    
    
#     # Index t ranges from 0 to N - n, inclusive on both ends
#     for t in range(0, N-n): # complete this line

#         # get the n-gram preceding the word at position t
#         ngram = sentence[t:t+n]
        
#         # get the word at position t
#         word = sentence[t+n]
        

#         predicted_word =suggest_a_word(ngram, ngram_ftable, nplus1_gram_ftable, vocabulary, n, k=k)
        
#         if word==predicted_word:
#             count+=1

#     total_prediction = N-n+1
#     correct_prediction = count
    
#     return correct_prediction , total_prediction

### Get multiple suggestions

In [47]:
def get_suggestions(previous_tokens, ngram_ftable_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(previous_tokens)
    suggestions = []
    for i in range(model_counts):
        ngram_ftable = ngram_ftable_list[i]
        nplus1_gram_ftable = ngram_ftable_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, ngram_ftable,
                                    nplus1_gram_ftable, vocabulary,
                                    k=k,n=i+1, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

### Suggest multiple words using n-grams of varying length

Using n-grams of varying lengths (unigrams, bigrams, trigrams, 4-grams,5-grams).

In [32]:
ngram_ftable_list = []
for n in range(1, 6):
    ngram_ftable = get_ngrams_ftable(train_data_processed, n)
    print(list(ngram_ftable.items())[5])
    ngram_ftable_list.append(ngram_ftable)


(('बनेको',), 378)
(('पछि', 'बनेको'), 2)
(('विश्वयुद्ध', 'पछि', 'बनेको'), 1)
(('दोस्रो', 'विश्वयुद्ध', 'पछि', 'बनेको'), 1)
(('अर्कातिर', 'दोस्रो', 'विश्वयुद्ध', 'पछि', 'बनेको'), 1)


In [33]:
def calculate_pp(test_sentences, ngram_ftable_list, vocab_size, k=1.0):
    ngram_accuracy_list=[]
    for i in range(4):
        count=0
        total=0
        for sentence in test_sentences:
            c,t=calculate_perplexity(sentence,ngram_ftable_list[i],ngram_ftable_list[i+1],vocab_size,n=i+1,k=k)
            count+=c
            total+=t
        ngram_accuracy_list.append(math.exp(count/total))
    return ngram_accuracy_list

In [34]:
calculate_pp(test_data_processed, ngram_ftable_list, len(vocabulary), k=1.0)

[0.00026906287491775574,
 0.00017160021411939935,
 0.0002484017651750977,
 0.00039325419330332723]

In [54]:
# def calculate_accuracy(test_sentences, ngram_ftable_list, vocabulary, k=1.0):
#     ngram_accuracy_list=[]
#     for i in range(4):
#         count=0
#         total=0
#         for sentence in test_sentences:
#             c,t=calculate_prediction_values(sentence,ngram_ftable_list[i],ngram_ftable_list[i+1],vocabulary,n=i+1,k=k)
#             count+=c
#             total+=t
#         ngram_accuracy_list.append(count/total)
#         print(count/total)
#     return ngram_accuracy_list

In [48]:
previous_tokens = ["कम्प्युटर"]
tmp_suggest8 = get_suggestions(previous_tokens, ngram_ftable_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)

The previous words are ['कम्प्युटर'], the suggestions are:


[('इन्जिनियरिङको', 0.00014994109456999035)]

In [52]:
previous_tokens = ["कालो"]
tmp_suggest8 = get_suggestions(previous_tokens, ngram_ftable_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)

The previous words are ['कालो'], the suggestions are:


[('चारकोलले', 0.0010878375495925942)]