##Feature Creation from Text: N-Grams


1. Load the "brown" corpus from nltk and remove the punctuations


In [3]:
# Step 1: Import necessary libraries
import nltk
import re
import string
import pandas as pd
import collections
from nltk.util import ngrams

In [5]:
# Step 2: Download and load the "brown" corpus
import nltk
import string
from nltk.corpus import brown

# Download the brown corpus if not already downloaded
nltk.download('brown')

# Step 3: Load and preprocess the text
# Convert the lazy corpus loader object to a list to avoid 'read of closed file' error
tokenized_text = list(brown.words())

# Remove punctuation from tokens
tokenized_text = [word for word in tokenized_text if word not in string.punctuation]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Define a function that returns a dataframe with n-grams and their frequencies, given a value of "n"


In [6]:
# Function to get n-gram frequencies
def get_ngrams_frequency(tokens, n):
    list_ngrams = ngrams(tokens, n)
    ngram_freq = collections.Counter(list_ngrams)
    df_ngrams = pd.DataFrame.from_records(list(dict(ngram_freq).items()), columns=['n_gram', 'count'])
    df_ngrams['n_gram'] = df_ngrams['n_gram'].apply(lambda x: ' '.join(x).lower())
    return df_ngrams.sort_values(by='count', ascending=False).reset_index(drop=True)


In [7]:
df_bigrams = get_ngrams_frequency(tokenized_text, 2)
df_bigrams.head()


Unnamed: 0,n_gram,count
0,of the,9628
1,in the,5550
2,to the,3430
3,on the,2302
4,and the,2137


Define a function to return a top "k" proposal of next words, based on n-grams, given a word or sequence of words, the values of "k" and "n"


In [8]:
# Function to get top k next-word suggestions based on n-grams
def predict_next_words(tokens, n, input_seq, k=5):
    ngram_list = list(ngrams(tokens, n))
    ngram_freq = collections.Counter(ngram_list)

    input_seq = tuple(input_seq.lower().split())

    # Match last (n-1) words of input with n-gram prefix
    candidates = {ng: freq for ng, freq in ngram_freq.items() if ng[:n-1] == input_seq}
    sorted_candidates = sorted(candidates.items(), key=lambda item: item[1], reverse=True)

    top_k = [ng[-1] for ng, _ in sorted_candidates[:k]]
    return top_k


In [9]:
# Example: Predict next words for "the"
predict_next_words(tokenized_text, n=2, input_seq="the", k=5)

# Example: For trigrams, input must be 2 words
predict_next_words(tokenized_text, n=3, input_seq="the united", k=5)


['people']