## Testing models

In [11]:
import numpy as np
import os
import re
import random

In [8]:
n_grams_folder = "n_grams"
group_code = "group_code" 

In [38]:
def custom_tokenizer(text):
    # Use regular expression to split on spaces while preserving '<s>' and '</s>' as separate tokens
    tokens = re.split(r'(\s|<s>|</s>)', text)
    
    # Remove empty tokens and tokens containing only special characters
    tokens = [token for token in tokens if token.strip() and not re.match(r'^[^a-zA-Z0-9]+$', token)]
    
    # Remove trailing '>' characters from tokens like "the>"
    tokens = [re.sub(r'(.*[^>])>', r'\1', token) if token not in ['<s>','</s>', '<UNK>'] else token for token in tokens]
    
    return tokens

# Define a function to calculate perplexity
def calculate_perplexity(tokens, model_file, n):
    N = len(tokens)
    log_prob = 0.0
    
    with open(model_file, "r", encoding="utf-8") as file:
        model_data = file.read().splitlines()
    
    model = {}
    for line in model_data:
        ngram, probability = line.split('\t')
        model[ngram] = float(probability)
    for i in range(1, N):
        # Construct the n-gram (unigram, bigram, or trigram)
        ngram = " ".join(tokens[max(0, i - n + 1):i + 1])
        # Calculate the conditional probability of the n-gram
        prob = model.get(ngram, model.get("<UNK>", 0))

        if prob==0:
            continue
        # Update the log probability
        log_prob += np.log(prob)
    
    # Calculate perplexity
    perplexity = np.exp(-log_prob / N)
    return perplexity

In [39]:
# Paths to output files
output_file_20N_unigrams = f"20N_{group_code}_unigrams.txt"
output_file_20N_bigrams = f"20N_{group_code}_bigrams.txt"
output_file_20N_trigrams = f"20N_{group_code}_trigrams.txt"

output_file_BAC_unigrams = f"BAC_{group_code}_unigrams.txt"
output_file_BAC_bigrams = f"BAC_{group_code}_bigrams.txt"
output_file_BAC_trigrams = f"BAC_{group_code}_trigrams.txt"

In [40]:
unigrams_model = os.path.join(n_grams_folder, output_file_20N_unigrams)
bigrams_model = os.path.join(n_grams_folder, output_file_20N_bigrams)
trigrams_model = os.path.join(n_grams_folder, output_file_20N_trigrams)

test_dataset = "20N_group_code_testing.txt"
# Load the test dataset
with open(test_dataset, "r", encoding="utf-8") as file:
    test_data = file.read()

# Tokenize the test dataset using your custom tokenizer
test_tokens = custom_tokenizer(test_data)

# Calculate perplexity for the unigrams, bigrams, and trigrams models
perplexity_unigrams = calculate_perplexity(test_tokens, unigrams_model, n=1)
perplexity_bigrams = calculate_perplexity(test_tokens, bigrams_model, n=2)
perplexity_trigrams = calculate_perplexity(test_tokens, trigrams_model, n=3)

# Print the perplexity results
print("Results for 20N dataset")
print("Perplexity for Unigrams Model:", perplexity_unigrams)
print("Perplexity for Bigrams Model:", perplexity_bigrams)
print("Perplexity for Trigrams Model:", perplexity_trigrams)

Results for 20N dataset
Perplexity for Unigrams Model: 1247.3996950676096
Perplexity for Bigrams Model: 11637.520439027003
Perplexity for Trigrams Model: 3460.6417600480177


In [41]:
unigrams_model = os.path.join(n_grams_folder, output_file_BAC_unigrams)
bigrams_model = os.path.join(n_grams_folder, output_file_BAC_bigrams)
trigrams_model = os.path.join(n_grams_folder, output_file_BAC_trigrams)

test_dataset = "BAC_group_code_testing.txt"
# Load the test dataset
with open(test_dataset, "r", encoding="utf-8") as file:
    test_data = file.read()

# Tokenize the test dataset using your custom tokenizer
test_tokens = custom_tokenizer(test_data)

# Calculate perplexity for the unigrams, bigrams, and trigrams models
perplexity_unigrams = calculate_perplexity(test_tokens, unigrams_model, n=1)
perplexity_bigrams = calculate_perplexity(test_tokens, bigrams_model, n=2)
perplexity_trigrams = calculate_perplexity(test_tokens, trigrams_model, n=3)

# Print the perplexity results
print("Results for BAC dataset")
print("Perplexity for Unigrams Model:", perplexity_unigrams)
print("Perplexity for Bigrams Model:", perplexity_bigrams)
print("Perplexity for Trigrams Model:", perplexity_trigrams)

Results for BAC dataset
Perplexity for Unigrams Model: 759.2631866374661
Perplexity for Bigrams Model: 14322.288777939446
Perplexity for Trigrams Model: 4315.062816364823


## Generar Frases

In [114]:
# Load the n-gram model
def load_ngram_model(model_file):
    ngram_model = {}
    with open(model_file, "r", encoding="utf-8") as file:
        for line in file:
            ngram, probability = line.strip().split("\t")
            ngram_model[ngram] = float(probability)
    return ngram_model

# Function to generate the next word based on the n-gram model
def generate_next_word(ngram_model, current_words):
    candidates = []
    probabilities = []
    for key in ngram_model.keys():
        if key.startswith(current_words):
            candidates.append(key.split(" ")[-1])
            probabilities.append(ngram_model.get(key))
    if not candidates:
        # rwg = random.choice(list(ngram_model.keys())) 
        return None # No candidates found, returns random word in vocab
    
    # Calculate probabilities for each candidate word
    # probabilities = [ngram_model.get(f"{current_words} {candidate}", 0.0) for candidate in candidates]

    # Normalize the probabilities to ensure they sum to 1
    total_probability = sum(probabilities)

    probabilities = [prob / total_probability for prob in probabilities]

    # Use the calculated and normalized probabilities to randomly select the next word
    next_word = np.random.choice(candidates, p=probabilities)

    return next_word

def generate_next_word_unigram(ngram_model):
    candidates = [word for word in ngram_model.keys()]
    if not candidates:
        return None  # No candidates found, returns random word in vocab

    # Calculate probabilities for each candidate word
    probabilities = [prob for prob in ngram_model.values()]

    # Normalize the probabilities to ensure they sum to 1
    total_probability = sum(probabilities)
    probabilities = [prob / total_probability for prob in probabilities]

    # Use the calculated and normalized probabilities to randomly select the next word
    next_word = np.random.choice(candidates, p=probabilities)

    return next_word

# Function to generate a sentence
def generate_sentence(start_word, ngram_model, n, max_length=50):
    sentence = [start_word.lower()]
    if n==1:
        while len(sentence) < max_length:
            next_word = generate_next_word_unigram(ngram_model)
            sentence.append(next_word)
            
            if next_word == "</s>":
                break

    else:
        while len(sentence) < max_length:
            current_words = " ".join(sentence[-(n-1):])
            next_word = generate_next_word(ngram_model, current_words)
            if next_word is None:
                break
            
            sentence.append(next_word)
            
            if next_word == "</s>":
                break
        
    return " ".join(sentence)

In [123]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"BAC_{group_code}_trigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=3)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we to this holiday </s>


In [124]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"BAC_{group_code}_bigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=2)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we with a i forgot the messiness in my front even make the drinks hypnotic state as usual when slipknot they gave my life </s>


In [125]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"20N_{group_code}_trigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=3)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we in the swiss but NUM million european jews </s>


In [126]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"20N_{group_code}_bigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=2)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we have seen it your but he me </s>


In [127]:
# Choose the n-gram model to use (e.g., trigrams)
n_gram_model_file = os.path.join(n_grams_folder, f"BAC_{group_code}_unigrams.txt")
# Load the selected n-gram model
ngram_model = load_ngram_model(n_gram_model_file)

# Example usage:
start_word = "we"  # Replace with your desired start word
generated_sentence = generate_sentence(start_word, ngram_model, n=1)

print("Generated Sentence:")
print(generated_sentence)

Generated Sentence:
we i will and pick love i all the does get occasions and shy be nightmare systems <s> dismay and eh my have avoid by living NUM a them them homeless and this one here sapped </s>
