## Testing models

In [1]:
import numpy as np
import os
import re
import random

c:\Users\USUARIO\anaconda3\envs\anteia\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\USUARIO\anaconda3\envs\anteia\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
n_grams_folder = "n_grams"
group_code = "santiagomartinez_camilocastaneda" 

In [3]:
def custom_tokenizer(text):
    # Use regular expression to split on spaces while preserving '<s>' and '</s>' as separate tokens
    tokens = re.split(r'(\s|<s>|</s>)', text)
    
    # Remove empty tokens and tokens containing only special characters
    tokens = [token for token in tokens if token.strip() and not re.match(r'^[^a-zA-Z0-9]+$', token)]
    
    # Remove trailing '>' characters from tokens like "the>"
    tokens = [re.sub(r'(.*[^>])>', r'\1', token) if token not in ['<s>','</s>', '<UNK>'] else token for token in tokens]
    
    return tokens

# Define a function to calculate perplexity with Laplace smoothing
def calculate_perplexity(tokens, model_file, n):
    N = len(tokens)
    log_prob = 0.0
    
    with open(model_file, "r", encoding="utf-8") as file:
        model_data = file.read().splitlines()
    
    model = {}
    for line in model_data:
        ngram, probability = line.split('\t')
        model[ngram] = float(probability)
    for i in range(1, N):
        # Construct the n-gram (unigram, bigram, or trigram)
        ngram = " ".join(tokens[max(0, i - n + 1):i + 1])
        # Calculate the conditional probability of the n-gram
        prob = model.get(ngram, 0)  # Use Laplace smoothing with a default of 0
        
        if prob == 0:
            # Laplace smoothing: add 1 to the count of unseen n-grams
            prob = 1 / (len(model) + 1)
        
        # Update the log probability
        log_prob += np.log(prob)
    
    # Calculate perplexity
    perplexity = np.exp(-log_prob / N)
    return perplexity


In [4]:
# Paths to output files
output_file_20N_unigrams = f"20N_{group_code}_unigrams.txt"
output_file_20N_bigrams = f"20N_{group_code}_bigrams.txt"
output_file_20N_trigrams = f"20N_{group_code}_trigrams.txt"

output_file_BAC_unigrams = f"BAC_{group_code}_unigrams.txt"
output_file_BAC_bigrams = f"BAC_{group_code}_bigrams.txt"
output_file_BAC_trigrams = f"BAC_{group_code}_trigrams.txt"

In [5]:
unigrams_model = os.path.join(n_grams_folder, output_file_20N_unigrams)
bigrams_model = os.path.join(n_grams_folder, output_file_20N_bigrams)
trigrams_model = os.path.join(n_grams_folder, output_file_20N_trigrams)

test_dataset = f"20N_{group_code}_testing.txt"
# Load the test dataset
with open(test_dataset, "r", encoding="utf-8") as file:
    test_data = file.read()

# Tokenize the test dataset using your custom tokenizer
test_tokens = custom_tokenizer(test_data)

# Calculate perplexity for the unigrams, bigrams, and trigrams models
perplexity_unigrams = calculate_perplexity(test_tokens, unigrams_model, n=1)
perplexity_bigrams = calculate_perplexity(test_tokens, bigrams_model, n=2)
perplexity_trigrams = calculate_perplexity(test_tokens, trigrams_model, n=3)

# Print the perplexity results
print("Results for 20N dataset")
print("Perplexity for Unigrams Model:", perplexity_unigrams)
print("Perplexity for Bigrams Model:", perplexity_bigrams)
print("Perplexity for Trigrams Model:", perplexity_trigrams)

Results for 20N dataset
Perplexity for Unigrams Model: 1304.3970894236732
Perplexity for Bigrams Model: 77362.21812197147
Perplexity for Trigrams Model: 622158.1869173201


In [6]:
unigrams_model = os.path.join(n_grams_folder, output_file_BAC_unigrams)
bigrams_model = os.path.join(n_grams_folder, output_file_BAC_bigrams)
trigrams_model = os.path.join(n_grams_folder, output_file_BAC_trigrams)

test_dataset = f"BAC_{group_code}_testing.txt"
# Load the test dataset
with open(test_dataset, "r", encoding="utf-8") as file:
    test_data = file.read()

# Tokenize the test dataset using your custom tokenizer
test_tokens = custom_tokenizer(test_data)

# Calculate perplexity for the unigrams, bigrams, and trigrams models
perplexity_unigrams = calculate_perplexity(test_tokens, unigrams_model, n=1)
perplexity_bigrams = calculate_perplexity(test_tokens, bigrams_model, n=2)
perplexity_trigrams = calculate_perplexity(test_tokens, trigrams_model, n=3)

# Print the perplexity results
print("Results for BAC dataset")
print("Perplexity for Unigrams Model:", perplexity_unigrams)
print("Perplexity for Bigrams Model:", perplexity_bigrams)
print("Perplexity for Trigrams Model:", perplexity_trigrams)

Results for BAC dataset
Perplexity for Unigrams Model: 766.3209107647922
Perplexity for Bigrams Model: 64707.8327112743
Perplexity for Trigrams Model: 1041388.7212299752


## Generar Frases

In [19]:
# Load the n-gram model
def load_ngram_model(model_file):
    ngram_model = {}
    with open(model_file, "r", encoding="utf-8") as file:
        for line in file:
            ngram, probability = line.strip().split("\t")
            ngram_model[ngram] = float(probability)
    return ngram_model

# Function to generate the next word based on the n-gram model
def generate_next_word(ngram_model, current_words):
    candidates = []
    probabilities = []
    for key in ngram_model.keys():
        if key.startswith(current_words):
            candidates.append(key.split(" ")[-1])
            probabilities.append(ngram_model.get(key))
    if not candidates:
        return None

    # Normalize the probabilities to ensure they sum to 1
    total_probability = sum(probabilities)

    probabilities = [prob / total_probability for prob in probabilities]

    # Use the calculated and normalized probabilities to randomly select the next word
    next_word = np.random.choice(candidates, p=probabilities)

    return next_word

# Function to generate a sentence using unigrams
def generate_sentence_unigram(start_word, unigram_model, max_length=50):
    sentence = start_word
    while len(sentence) < max_length:
        next_word = generate_next_word(unigram_model, sentence[-1])
        if next_word is None:
            break
        
        sentence.append(next_word)
        
        if next_word == "</s>":
            break
    
    return " ".join(sentence)

# Function to generate a sentence
def generate_sentence(start_word, ngram_model, n, max_length=50):
    if n==1:
        sentence = [start_word.lower()]
        
        while len(sentence) < max_length:
            next_word = generate_sentence_unigram(ngram_model, sentence[-1])
            if next_word is None:
                break
            
            sentence.append(next_word)
            
            if next_word == "</s>":
                break
        
        return " ".join(sentence)
    else:
        sentence = [start_word.lower()]
        
        while len(sentence) < max_length:
            current_words = " ".join(sentence[-(n-1):])
            next_word = generate_next_word(ngram_model, current_words)
            if next_word is None:
                break
            
            sentence.append(next_word)
            
            if next_word == "</s>":
                break
        
        return " ".join(sentence)

In [20]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"BAC_{group_code}_trigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=3)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we a really good quantitative NUM verbal </s>


In [27]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"20N_{group_code}_trigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=3)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we today call for a pronounced effect than for files </s>


In [10]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"BAC_{group_code}_bigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=2)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we that do n't believe he can just some utility knife from the tanning your love languages do n't move from a it times past eleven 's amp attending a so will to make i at the notion i covers things to go grill and fun use in become a


In [11]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"20N_{group_code}_bigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=2)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we ed dictionary arguments i severed at me </s>


In [28]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"BAC_{group_code}_unigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=1)

print("Generated Sentence:")
print(generated_sentence)

AttributeError: 'dict' object has no attribute 'lower'