In [1]:
import nltk
import numpy as np
import re

In [2]:
nltk.download('brown')
from nltk.corpus import brown

def ensure_nltk_resources():
  required_resources = ['punkt', 'brown', 'punkt_tab']
  for resource in required_resources:
    try:
      if resource == 'punkt_tab':
        nltk.data.find(f'tokenizers/{resource}/english/')
      else:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
      nltk.download(resource, download_dir='C://Users//caden/nltk_data')


[nltk_data] Downloading package brown to
[nltk_data]     /Users/pranavdhinakar/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
ensure_nltk_resources()

[nltk_data] Downloading package punkt to C://Users//caden/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to C://Users//caden/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C://Users//caden/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
# Get the first 10 sentences from the Brown corpus
sentences = brown.sents()[:10]

print("Sample sentences from the Brown corpus:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. {' '.join(sentence)}")

print("\nCategories in the Brown corpus:")
print(brown.categories())

print("\nSample words from different categories:")
for category in brown.categories()[:5]:  # First 5 categories
    print(f"\n{category.capitalize()}:")
    print(' '.join(brown.words(categories=category)[:20]))  # First 20 words

Sample sentences from the Brown corpus:
1. The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .
2. The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
3. The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible `` irregularities '' in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr. .
4. `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
5. The jury said it did find that many of Georgia's registration and election laws `` are outmoded or inadequate and often ambiguous '' .
6

# N-gram Model Process Example

Let's use a bigram (2-gram) model to process the first sentence from the Brown corpus and demonstrate how it predicts new tokens.

Sentence: "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced no evidence that any irregularities took place ."

## 1. Tokenization

First, we tokenize the sentence:

["The", "Fulton", "County", "Grand", "Jury", "said", "Friday", "an", "investigation", "of", "Atlanta's", "recent", "primary", "election", "produced", "no", "evidence", "that", "any", "irregularities", "took", "place", "."]

## 2. Creating Bigrams

We create bigrams from this sequence:

("The", "Fulton"), ("Fulton", "County"), ("County", "Grand"), ..., ("took", "place"), ("place", ".")

## 3. Counting Bigrams

We count the occurrences of each bigram in our corpus. Let's assume we've processed the entire corpus and have these counts (simplified for this example):

- Count("The", "Fulton") = 10
- Count("Fulton", "County") = 15
- ...
- Count("The") = 1000 (total occurrences of "The" as the first word in any bigram)

## 4. Calculating Probabilities

For each bigram, we calculate the probability:

$P(w_2|w_1) = \frac{Count(w_1, w_2)}{Count(w_1)}$

For example:
$P(\text{"Fulton"}|\text{"The"}) = \frac{Count(\text{"The", "Fulton"})}{Count(\text{"The"})} = \frac{10}{1000} = 0.01$

## 5. Predicting Next Token

To predict the next token after "The", we would:

1. Find all bigrams starting with "The"
2. Calculate their probabilities
3. Choose the one with the highest probability

Let's say we have these bigrams and probabilities:

- $P(\text{"Fulton"}|\text{"The"}) = 0.01$
- $P(\text{"Grand"}|\text{"The"}) = 0.02$
- $P(\text{"investigation"}|\text{"The"}) = 0.03$

The model would predict "investigation" as the next token after "The", as it has the highest probability.

## 6. Generating Text

To generate text, we would:

1. Start with a token (e.g., "The")
2. Predict the next token based on probabilities
3. Add the predicted token to our sequence
4. Repeat steps 2-3, using the last token as the new starting point

For example:
"The" → "investigation" → "of" → "Atlanta's" → ...

## 7. Handling Unseen Bigrams

If we encounter a bigram that wasn't in our training data, we need a smoothing technique. A simple method is add-one (Laplace) smoothing:

$P(w_2|w_1) = \frac{Count(w_1, w_2) + 1}{Count(w_1) + V}$

Where $V$ is the size of our vocabulary.

This ensures that even unseen bigrams have a small, non-zero probability.

In [5]:
from model_builder import NGram

[nltk_data] Downloading package brown to
[nltk_data]     /Users/pranavdhinakar/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [6]:
# Initialize the NGram model
n = 2  # for bigram model
ngram_model = NGram(n)

# Prepare the corpus
corpus = brown.words()[:10000]  # Using first 10000 words from Brown corpus

In [7]:
# Train the model
print("Training the model...")
ngram_model.train(corpus)
print("Model training complete.")

# Print some information about the trained model
print(f"Vocabulary size: {len(ngram_model.vocab)}")
print(f"First 10 words in vocabulary: {ngram_model.vocab[:10]}")
print(f"Shape of counts array: {ngram_model.counts.shape}")

Training the model...
Model training complete.
Vocabulary size: 2475
First 10 words in vocabulary: ['6', 'privilege', 'scattered', 'firmer', 'h.', 'pressure', 'disappointment', 'criticized', 'each', 'franker']
Shape of counts array: (2475, 2475)


In [8]:
# Test words
test_contexts = ['the', 'a', 'to']
test_words = ['man', 'woman', 'child', 'court', 'judge']

# Calculate and print probabilities
for context in test_contexts:
    for word in test_words:
        try:
            if context in ngram_model.vocab and word in ngram_model.vocab:
                prob = ngram_model.prob([context], word)
                print(f"P({word}|{context}) = {prob:.6f}")
            else:
                print(f"Either '{context}' or '{word}' is not in the vocabulary.")
        except Exception as e:
            print(f"Error calculating P({word}|{context}): {str(e)}")

# Find the most probable word after 'the'
if 'the' in ngram_model.vocab:
    try:
        most_probable = max(ngram_model.vocab, key=lambda w: ngram_model.prob(['the'], w))
        prob = ngram_model.prob(['the'], most_probable)
        print(f"\nMost probable word after 'the': '{most_probable}' with P({most_probable}|the) = {prob:.6f}")
    except Exception as e:
        print(f"Error finding most probable word after 'the': {str(e)}")
else:
    print("'the' is not in the vocabulary.")

# Print some vocabulary information
print("\nVocabulary size:", len(ngram_model.vocab))
print("First 10 words in vocabulary:", ngram_model.vocab[:10])
print("'the' in vocabulary:", 'the' in ngram_model.vocab)

P(man|the) = 0.000178
Either 'the' or 'woman' is not in the vocabulary.
P(child|the) = 0.000356
P(court|the) = 0.000533
P(judge|the) = 0.000178
P(man|a) = 0.000194
Either 'a' or 'woman' is not in the vocabulary.
P(child|a) = 0.000194
P(court|a) = 0.000194
P(judge|a) = 0.000194
P(man|to) = 0.000192
Either 'to' or 'woman' is not in the vocabulary.
P(child|to) = 0.000192
P(court|to) = 0.000192
P(judge|to) = 0.000192

Most probable word after 'the': 'state' with P(state|the) = 0.003378

Vocabulary size: 2475
First 10 words in vocabulary: ['6', 'privilege', 'scattered', 'firmer', 'h.', 'pressure', 'disappointment', 'criticized', 'each', 'franker']
'the' in vocabulary: True


In [13]:
# CSDS497 Programming Exercise 1

## Setup and Imports

import nltk
from nltk.corpus import brown, gutenberg
import numpy as np
import re
from typing import List, Tuple
import matplotlib.pyplot as plt

# Import your NGram class here
from model_builder import NGram

nltk.download('brown')
nltk.download('gutenberg')

## 1. Different runs of your code (n=2)

# Training data
#train_data = brown.words()[:50000]  # Use first 50,000 words for training
#test_data = brown.words()[50000:60000]  # Use next 10,000 words for testing
train_data = brown.words()[:5000]# Use first 5,000 words for training
test_data = brown.words()[5000:5500]# Use next 500 words for testing


# Train the model
bigram_model = NGram(2)
bigram_model.train(train_data)



## 2. Test with different source

#gutenberg_data = gutenberg.words()[:10000]  # Use first 10,000 words from Gutenberg corpus
gutenberg_data = gutenberg.words()[:1000]  # Use first 1,000 words from Gutenberg corpus


# Test the model on Gutenberg data
# (We'll implement testing here)

## 3. Compare n-gram models (n=1,2,3)

# Train and test models for n=1,2,3
# (We'll implement this comparison here)

## 4. Compare models with and without smoothing

# Implement versions of NGram with and without smoothing
# Test on vocabulary from lexicon and vocabulary not from lexicon
# (We'll implement this comparison here)

[nltk_data] Downloading package brown to
[nltk_data]     /Users/pranavdhinakar/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/pranavdhinakar/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [14]:
def calculate_perplexity(model: NGram, test_data: List[str]) -> float:
    log_likelihood = 0
    N = len(test_data)
    
    for i in range(N - model.n + 1):
        context = test_data[i:i+model.n-1]
        word = test_data[i+model.n-1]
        prob = model.prob(context, word)
        log_likelihood += np.log2(prob)
    
    perplexity = 2 ** (-1/N * log_likelihood)
    return perplexity

def test_model(model: NGram, test_data: List[str]) -> Tuple[float, List[Tuple[str, str, float]]]:
    perplexity = calculate_perplexity(model, test_data)
    
    # Get some example probabilities
    examples = []
    # Adjust sampling interval based on dataset size
    sample_interval = max(1, len(test_data) // 5)  # Get about 5 examples
    
    for i in range(0, len(test_data) - model.n + 1, sample_interval):
        context = test_data[i:i+model.n-1]
        word = test_data[i+model.n-1]
        prob = model.prob(context, word)
        examples.append((" ".join(context), word, prob))
    
    return perplexity, examples

# Test the bigram model
print("Training bigram model...")
bigram_model = NGram(2)
bigram_model.train(train_data)

print("\nTesting on Brown corpus test data...")
perplexity, examples = test_model(bigram_model, test_data)

print(f"Perplexity: {perplexity:.2f}")
print("\nExample probabilities from test data:")
print("Format: P(next_word|context) = probability")
print("-" * 50)
for context, word, prob in examples:
    print(f"P({word}|{context}) = {prob:.6f}")

print("\nTesting on Gutenberg corpus...")
gutenberg_perplexity, gutenberg_examples = test_model(bigram_model, gutenberg_data)

print(f"Perplexity: {gutenberg_perplexity:.2f}")
print("\nExample probabilities from Gutenberg data:")
print("Format: P(next_word|context) = probability")
print("-" * 50)
for context, word, prob in gutenberg_examples:
    print(f"P({word}|{context}) = {prob:.6f}")

# Print some statistics about the data
print("\nDataset Statistics:")
print(f"Training data size: {len(train_data)} words")
print(f"Test data size: {len(test_data)} words")
print(f"Gutenberg test data size: {len(gutenberg_data)} words")
print(f"Vocabulary size: {len(bigram_model.vocab)} words")

Training bigram model...

Testing on Brown corpus test data...
Perplexity: 2356.02

Example probabilities from test data:
Format: P(next_word|context) = probability
--------------------------------------------------
P(evidence|said) = 0.000326
P(not|did) = 0.000990
P(Cook|of) = 0.000313
P(of|precincts) = 0.000330
P(.|them) = 0.000330

Testing on Gutenberg corpus...
Perplexity: 2734.81

Example probabilities from Gutenberg data:
Format: P(next_word|context) = probability
--------------------------------------------------
P(Emma|[) = 0.000330
P(mildness|the) = 0.000298
P(father|her) = 0.000330
P(last|the) = 0.000597
P(her|for) = 0.000325

Dataset Statistics:
Training data size: 5000 words
Test data size: 500 words
Gutenberg test data size: 1000 words
Vocabulary size: 1513 words


In [None]:
# Comprehensive NGram Analysis
import pandas as pd

def run_experiment(train_size: int, test_size: int, n: int) -> dict:
    # Get data
    train_data = brown.words()[:train_size]
    test_brown = brown.words()[train_size:train_size+test_size]
    test_gutenberg = gutenberg.words()[:test_size]
    
    # Train model
    model = NGram(n)
    model.train(train_data)
    
    # Test on both corpora
    brown_perp, brown_examples = test_model(model, test_brown)
    gut_perp, gut_examples = test_model(model, test_gutenberg)
    
    return {
        'n': n,
        'train_size': train_size,
        'test_size': test_size,
        'vocab_size': len(model.vocab),
        'brown_perplexity': brown_perp,
        'gutenberg_perplexity': gut_perp,
        'example_prob_brown': brown_examples[0][2] if brown_examples else None,
        'example_prob_gutenberg': gut_examples[0][2] if gut_examples else None
    }

# Run experiments with different configurations
experiments = []
for train_size in [5000, 10000]:  # Try different training sizes
    for n in [1, 2, 3]:  # Try different n-gram sizes
        result = run_experiment(train_size, 500, n)
        experiments.append(result)

# Create a DataFrame and display results
results_df = pd.DataFrame(experiments)
print("\nExperiment Results:")
print(results_df.to_string(float_format=lambda x: '{:.2f}'.format(x)))

# Plot perplexity comparison
plt.figure(figsize=(10, 6))
for train_size in [5000, 10000]:
    mask = results_df['train_size'] == train_size
    plt.plot(results_df[mask]['n'], 
             results_df[mask]['brown_perplexity'], 
             label=f'Brown (train={train_size})',
             marker='o')
    plt.plot(results_df[mask]['n'], 
             results_df[mask]['gutenberg_perplexity'], 
             label=f'Gutenberg (train={train_size})',
             marker='s')

plt.xlabel('n-gram size')
plt.ylabel('Perplexity')
plt.title('Perplexity vs n-gram size')
plt.legend()
plt.grid(True)
plt.show()

# Print analysis
print("\nAnalysis:")
print("1. Effect of n-gram size:")
for n in [1, 2, 3]:
    mask = results_df['n'] == n
    avg_brown = results_df[mask]['brown_perplexity'].mean()
    avg_gut = results_df[mask]['gutenberg_perplexity'].mean()
    print(f"  n={n}:")
    print(f"    Average Brown perplexity: {avg_brown:.2f}")
    print(f"    Average Gutenberg perplexity: {avg_gut:.2f}")

print("\n2. Effect of training size:")
for size in [5000, 10000]:
    mask = results_df['train_size'] == size
    avg_brown = results_df[mask]['brown_perplexity'].mean()
    avg_gut = results_df[mask]['gutenberg_perplexity'].mean()
    print(f"  Training size={size}:")
    print(f"    Average Brown perplexity: {avg_brown:.2f}")
    print(f"    Average Gutenberg perplexity: {avg_gut:.2f}")