# CSC 820 Homework 4  
Andrew Dahlstrom  
2/28/2024

In [1]:
import nltk
from nltk.corpus import gutenberg
import pandas as pd
import Levenshtein 

# Load Lewis Carroll's Alice in Wonderland from project Gutenberg library into a list like object
corpus = gutenberg.words('carroll-alice.txt') 
corpus_name = "Alice in Wonderland"

# Use the built-in set method to automatically create a list of unique words from the corpus
types = set(corpus)

# Total number of words (tokens) in corpus
N = len(corpus)

In [2]:
# Create a unigram dataframe to hold the types from the corpus
unigram_table = pd.DataFrame(types, columns=['word'])

# Create a dataframe to store all the tokens from the corpus in a column
token_table = pd.DataFrame(corpus, columns=['word'])

# Create a series to hold the frequency counts of each word from the token dataframe
word_freq = token_table['word'].value_counts()

# Map the word frequency to the corresponding word in the type table under
# the new frequency column
unigram_table['frequency'] = unigram_table['word'].map(word_freq)

# Create a new column that contains the probability of the word occuring in the corpus
# by dividing each word frequency by the total number of words in the corpus.
unigram_table['probability'] = unigram_table['frequency'] / N

unigram_table.head()

Unnamed: 0,word,frequency,probability
0,ring,2,5.9e-05
1,cardboard,1,2.9e-05
2,sounds,3,8.8e-05
3,Mind,2,5.9e-05
4,figures,1,2.9e-05


In [3]:
# Create bigram dataframe

# Use NLTK bigrams method to extract bigrams from the corpus
bigrams = list(nltk.bigrams(corpus))

# create pandas dataframe to sort and count the bigrams               
bigram_table = pd.DataFrame(bigrams, columns=['word1', 'word2'])

bigram_table['frequency'] = bigram_table.groupby(['word1', 'word2'])['word2'].transform('count')

bigram_table.head(20)

Unnamed: 0,word1,word2,frequency
0,[,Alice,1
1,Alice,',11
2,',s,195
3,s,Adventures,1
4,Adventures,in,1
5,in,Wonderland,2
6,Wonderland,by,1
7,by,Lewis,1
8,Lewis,Carroll,1
9,Carroll,1865,1


In [4]:
# Add probability column to bigram_table

# First create a merged dataframe of bigram and unigram frequencies
merged_table = pd.merge(bigram_table, unigram_table, left_on='word1', right_on='word', how='inner')

# Use the merged_table to calculate the probability column for the bigram_table
bigram_table['probability'] = merged_table['frequency_x'] / merged_table['frequency_y']
# bigram_table.info()

In [5]:
# Part 2

# Create a loop to continue checking for next possible words
while True:
    # Get an input word from user
    inpW = input("Enter a word (or type '%%%%' to exit): ")
    
    # Check if the user wants to exit
    if inpW == "%%%%":
        break

    # Find all bigrams where the user input word in the first word
    inpw_bigrams = bigram_table[bigram_table['word1'] == inpW]
    
    # Consolidate duplicates by grouping and select max probability
    inpw_bigrams_con = inpw_bigrams.groupby('word2')['probability'].max().reset_index()
    
    # Return the possible next words from the bigram model or no next words found
    if inpw_bigrams_con.empty:
        print(f"No next words found for '{inpW}'")
        continue
    else: 
        # Sort input word bigrams
        
        sorted_bigrams = inpw_bigrams_con.sort_values(by='probability', ascending=False)
        print("\nPossible next words:")
        for index, row in sorted_bigrams.iterrows():
            # Print next word and its probability
            print(f"{row['word2']}: {row['probability']}")
        
    

Enter a word (or type '%%%%' to exit):  new



Possible next words:
idea: 0.5
kind: 0.25
pair: 0.25


Enter a word (or type '%%%%' to exit):  half



Possible next words:
hoping: 0.13636363636363635
of: 0.13636363636363635
to: 0.13636363636363635
an: 0.09090909090909091
-: 0.045454545454545456
afraid: 0.045454545454545456
believed: 0.045454545454545456
down: 0.045454545454545456
expecting: 0.045454545454545456
high: 0.045454545454545456
my: 0.045454545454545456
no: 0.045454545454545456
shut: 0.045454545454545456
the: 0.045454545454545456
those: 0.045454545454545456


Enter a word (or type '%%%%' to exit):  the



Possible next words:
Queen: 0.04060248853962017
King: 0.0366732154551408
Gryphon: 0.03339882121807466
Mock: 0.032089063523248196
Hatter: 0.030124426981008513
Duchess: 0.024230517354289455
Dormouse: 0.019646365422396856
March: 0.01899148657498363
other: 0.0183366077275704
Caterpillar: 0.015717092337917484
same: 0.0137524557956778
Mouse: 0.0137524557956778
Rabbit: 0.0137524557956778
door: 0.01309757694826457
White: 0.01309757694826457
Cat: 0.012442698100851343
little: 0.011787819253438114
jury: 0.010478061558611657
court: 0.0091683038637852
time: 0.008513425016371971
whole: 0.008513425016371971
right: 0.007858546168958742
way: 0.007858546168958742
first: 0.007858546168958742
house: 0.007858546168958742
words: 0.007858546168958742
Dodo: 0.007858546168958742
end: 0.007203667321545514
Pigeon: 0.007203667321545514
next: 0.007203667321545514
cook: 0.007203667321545514
dance: 0.006548788474132285
baby: 0.006548788474132285
air: 0.006548788474132285
sea: 0.006548788474132285
garden: 0.00589390

Enter a word (or type '%%%%' to exit):  %%%%
