In [1]:
import nltk
import spacy
import string

from nltk.corpus import webtext
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer

from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE, KneserNeyInterpolated

from sklearn.model_selection import train_test_split

nltk.download('webtext')
nltk.download('punkt')
nltk.download('stopwords')

stop_words = stopwords.words('english')
sp = string.punctuation
spacy_nlp = spacy.load('en_core_web_sm')
all_stopwords = spacy_nlp.Defaults.stop_words

[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
train, test = train_test_split(webtext.sents(), test_size = 0.3)

In [3]:
len(train), len(test)

(17969, 7701)

In [4]:
def preprocess(sentences):
  punc = r"""!()-[]{};:'"\, <>./?@#$%^&*_~"""
  read = " ".join(sentences)
  for ele in read:
      if ele in punc:
          read = read.replace(ele, " ") 
  read = read.lower()

  # This will convert the word into tokens
  text_tokens = word_tokenize(read)

  # Remove all the stopwords from the tokens
  tokens_without_sw = [
        word for word in text_tokens if not word in stopwords.words("english")
    ]
  # Initialize the stemmer
  ps = PorterStemmer()

  # Stem all the words
  tokens_without_sw_stem = [ps.stem(word) for word in tokens_without_sw]  

  pre_text = [i for i in tokens_without_sw_stem if not i.isnumeric()]
  return pre_text


In [5]:
train_sentences = [preprocess(i) for i in train]
test_sentences =  [preprocess(i) for i in test]

In [6]:
def language_model(n, tokenized_text, test_sentences):
    average_perplexity = 0.0
    train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
    lm_model = MLE(n)
    lm_model.fit(train_data, padded_vocab)
    print(lm_model.counts)
    
    test_data, _ = padded_everygram_pipeline(n, test_sentences)

    for test in list(test_data):
        ngrams = list(test)
        p = lm_model.perplexity(ngrams)
        if p != float('inf'):
          average_perplexity += p

    sentence_count = len(test_sentences)
    average_perplexity /= sentence_count
    return average_perplexity

In [7]:
n = 2
bigram = language_model(n, train_sentences, test_sentences)
print("Average Perplexity for Bigram model Webtext Corpus:", bigram)

<NgramCounter with 2 ngram orders and 299617 ngrams>
Average Perplexity for Bigram model Webtext Corpus: 16.18012418991258


In [8]:
n = 3
trigram = language_model(n, train_sentences, test_sentences)
print("Average Perplexity for Trigram model Webtext Corpus:", trigram)

<NgramCounter with 3 ngram orders and 530286 ngrams>
Average Perplexity for Trigram model Webtext Corpus: 2.15000497089262


In [9]:
n = 4
quadgram = language_model(n, train_sentences, test_sentences)
print("Average Perplexity for Quadgram model Webtext Corpus:", quadgram)

<NgramCounter with 4 ngram orders and 814862 ngrams>
Average Perplexity for Quadgram model Webtext Corpus: 0.9840472996107742
