# Assignment - 2

Setting up the notebook

In [1]:
import concurrent
import concurrent.futures
import os
import pickle
import sys

import nltk
from nltk.corpus import brown


from Custom_Utils.processor import adding_unknown, tokenizer, load_birkbeck_file,evaluate_models
from Custom_Utils.ngram_model import SimpleNGram



In [2]:
def save_model(model):
    """Persist ``model.n_gram_counts`` as a pickle under ``model.model_loc``.

    The directory is created when it does not exist yet. The file is named
    ``<n>-gram-counts.pickle`` after ``model.n`` and is overwritten if it is
    already present.

    Args:
        model: object exposing ``model_loc``, ``n`` and ``n_gram_counts``.
    """
    target_dir = model.model_loc
    os.makedirs(target_dir, exist_ok=True)

    counts_path = os.path.join(target_dir, f'{model.n}-gram-counts.pickle')
    with open(counts_path, 'wb') as sink:
        pickle.dump(model.n_gram_counts, sink, protocol=pickle.HIGHEST_PROTOCOL)


def load_model(model):
    """Restore ``model.n_gram_counts`` from its pickle under ``model.model_loc``.

    Reads ``<n>-gram-counts.pickle`` (``n`` taken from ``model.n``), i.e. the
    file name ``save_model`` writes, and assigns the unpickled object to
    ``model.n_gram_counts``.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        that were produced locally by ``save_model``.

    Args:
        model: object exposing ``model_loc`` and ``n``; mutated in place.
    """
    counts_path = os.path.join(model.model_loc, f'{model.n}-gram-counts.pickle')
    with open(counts_path, 'rb') as source:
        restored_counts = pickle.load(source)
    model.n_gram_counts = restored_counts

Downloading the Brown corpus (used to train the models) and the NLTK `punkt` package

In [3]:
# Fetch the NLTK resources used by this notebook. Packages that are already
# installed are only reported as up-to-date, so re-running this cell is cheap.
nltk.download('brown')  # corpus read below through nltk.corpus.brown
nltk.download('punkt')  # NOTE(review): presumably required by the project tokenizer — confirm


[nltk_data] Downloading package brown to
[nltk_data]     /Users/prithvirao/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prithvirao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Training the models using brown corpus and testing all models

In [20]:
# Build the training data from the Brown corpus sentences. `vocabulary` and
# `ns` defined here are reused by the evaluation cells further down.
new_list = brown.sents()
tokenized_sent, vocabulary = tokenizer(new_list)
model_train_data = adding_unknown(tokenized_sent, vocabulary)

# Context used for a quick sanity-check prediction after each fit.
# NOTE(review): "The" is capitalised while the printed predictions are all
# lower-case — confirm get_suggestions normalises case itself.
test_sentence_tokens = ["The", "morning"]

# Orders of the n-gram models to train.
ns = [1, 2, 3, 5, 10]

for n in ns:
    # Fit one model per order and persist its counts so that later cells can
    # reload them with load_model() instead of re-training.
    model = SimpleNGram(n=n, model_loc='models', vocabulary=vocabulary)
    model.fit(model_train_data)
    save_model(model)

    prediction = model.get_suggestions(test_sentence_tokens)
    print(f"{n}-gram model prediction: {prediction}")
    print()

1-gram model prediction: {1: [('the', 0.054305789145672016)], 5: [('the', 0.054305789145672016), ('<unknown>', 0.049470570749800345), (',', 0.04527565013508343), ('<s>', 0.044502636086562054), ('<e>', 0.044502636086562054)], 10: [('the', 0.054305789145672016), ('<unknown>', 0.049470570749800345), (',', 0.04527565013508343), ('<s>', 0.044502636086562054), ('<e>', 0.044502636086562054), ('.', 0.038988313983405035), ('of', 0.028260027645341777), ('and', 0.02239334773291899), ('to', 0.02030170831448012), ('a', 0.018014487028343074)]}

2-gram model prediction: {1: [(',', 0.0039904229848363925)], 5: [(',', 0.0039904229848363925), ('.', 0.002154828411811652), ('``', 0.0010375099760574621), ('and', 0.0010375099760574621), ('of', 0.0009577015163607343)], 10: [(',', 0.0039904229848363925), ('.', 0.002154828411811652), ('``', 0.0010375099760574621), ('and', 0.0010375099760574621), ('of', 0.0009577015163607343), ('when', 0.00047885075818036713), ('the', 0.0003990422984836393), ('at', 0.00039904229

# Loading a small Birkbeck APPLING1DAT validation file with a handful of entries, running it through all models, and evaluating them

In [9]:
# Load the small validation split and push its context column through the
# same tokenizer / unknown-token steps that were applied to the training data.
testing_data = load_birkbeck_file(file_loc='Data/APPLING1DAT_validting.643')

previous_tokens = testing_data['previous-tokens'].values.tolist()
# The second return value is not used afterwards; the training `vocabulary`
# is what gets passed to adding_unknown() below.
tokenized_sent, a = tokenizer(previous_tokens, remove_empty=False)

model_testing_data = adding_unknown(tokenized_sent, vocabulary)

# Keep the processed contexts next to the raw rows for evaluate_models().
testing_data['final-test'] = model_testing_data

In [10]:
# Show the processed validation contexts (one token list per query); this is
# only readable because the validation file holds a handful of rows.
print(model_testing_data)

[['i', 'felt', 'very'], ['when', 'the'], ['in', 'the'], ['i', 'thought', 'it', 'was', 'a'], ['everything'], ['the', 'morning'], ['the', 'hunters']]


Using the same evaluation method as in the previous spell-check model

In [11]:
# Evaluate every saved n-gram model on the validation contexts.
#
# Relies on names created by earlier cells: `ns`, `vocabulary`,
# `testing_data` and `model_testing_data`.

# One accumulator dict per model order; both are handed to evaluate_models().
queries = [{} for _ in ns]
results_eval = [{} for _ in ns]

# ProcessPoolExecutor() starts one worker per CPU by default.
n_workers = os.cpu_count() or 1

for i, n in enumerate(ns):
    # Rebuild the model shell and restore the counts written during training.
    model = SimpleNGram(n=n, model_loc='models', vocabulary=vocabulary)
    load_model(model)
    argument_list = model_testing_data

    # Every task sent to a worker carries the bound method
    # `model.get_suggestions`, i.e. the model state along with it. With the
    # default chunksize of 1 that happens once per query; batching the
    # queries cuts that overhead without changing the results or their order.
    chunksize = max(1, len(argument_list) // (4 * n_workers))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        suggestions = list(
            executor.map(model.get_suggestions, argument_list, chunksize=chunksize)
        )

    query = queries[i]
    result_eval = results_eval[i]
    evaluate_models(query, result_eval, suggestions, testing_data, model_testing_data, n)

100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 7414.17it/s]

S@k average of 1-gram model
******************************
success_1 average: 0.0
success_10 average: 0.0
success_5 average: 0.0



100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 7245.84it/s]


S@k average of 2-gram model
******************************
success_1 average: 0.0
success_10 average: 0.14285714285714285
success_5 average: 0.14285714285714285


100%|███████████████████████████████████████████| 7/7 [00:00<00:00, 5456.26it/s]

S@k average of 3-gram model
******************************
success_1 average: 0.0
success_10 average: 0.0
success_5 average: 0.0





# Loading complete Birkbeck APPLING1DAT file and evaluating it

The 3-gram, 5-gram, and 10-gram models take a very long time to evaluate even with multiprocessing, and we lack the computational power to run them on the complete dataset. We therefore show only the 1-gram and 2-gram models here, using the same code that was used for the validation dataset.

In [12]:
# Load the complete Birkbeck file and prepare its context column exactly like
# the validation split. This rebinds `testing_data` and `model_testing_data`.
testing_data = load_birkbeck_file(file_loc='Data/APPLING1DAT.643')

previous_tokens = testing_data['previous-tokens'].values.tolist()
# The second return value is not used afterwards; the training `vocabulary`
# is what gets passed to adding_unknown() below.
tokenized_sent, a = tokenizer(previous_tokens, remove_empty=False)

model_testing_data = adding_unknown(tokenized_sent, vocabulary)

# Keep the processed contexts next to the raw rows for evaluate_models().
testing_data['final-test'] = model_testing_data

Comment out the single line of code below in order to evaluate all the models.

In [15]:
# Restrict the full-dataset evaluation to the 1-gram and 2-gram models; this
# overrides the `ns` list defined in the training cell.
ns = [1, 2]

In [14]:
# Evaluate the selected n-gram models on the complete Birkbeck contexts.
#
# Relies on names created by earlier cells: `ns`, `vocabulary`,
# `testing_data` and `model_testing_data`.

# One accumulator dict per model order; both are handed to evaluate_models().
queries = [{} for _ in ns]
results_eval = [{} for _ in ns]

# ProcessPoolExecutor() starts one worker per CPU by default.
n_workers = os.cpu_count() or 1

for i, n in enumerate(ns):
    # Rebuild the model shell and restore the counts written during training.
    model = SimpleNGram(n=n, model_loc='models', vocabulary=vocabulary)
    load_model(model)
    argument_list = model_testing_data

    # Every task sent to a worker carries the bound method
    # `model.get_suggestions`, i.e. the model state along with it. With the
    # default chunksize of 1 that happens once per query; batching the
    # queries cuts that overhead without changing the results or their order.
    chunksize = max(1, len(argument_list) // (4 * n_workers))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        suggestions = list(
            executor.map(model.get_suggestions, argument_list, chunksize=chunksize)
        )

    query = queries[i]
    result_eval = results_eval[i]
    evaluate_models(query, result_eval, suggestions, testing_data, model_testing_data, n)
        

100%|██████████████████████████████████████| 198/198 [00:00<00:00, 11843.42it/s]

S@k average of 1-gram model
******************************
success_1 average: 0.0
success_10 average: 0.0
success_5 average: 0.0



100%|██████████████████████████████████████| 198/198 [00:00<00:00, 11494.90it/s]

S@k average of 2-gram model
******************************
success_1 average: 0.007407407407407408
success_10 average: 0.014814814814814815
success_5 average: 0.007407407407407408



