In [141]:
import functools
import math

import sys
sys.setrecursionlimit(10 ** 5)

from sklearn.model_selection import train_test_split

# Downloading brown corpus
import nltk 
nltk.download('brown')
nltk.download("stopwords")
nltk.download('punkt')

from nltk.corpus import webtext
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer

from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE, KneserNeyInterpolated

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [142]:
def splitPairs(word):
   return [(word[:i+1], word[i+1:]) for i in range(len(word))]

In [143]:
def segment(word):
   if not word: return []
   allSegmentations = [[first] + segment(rest)
                       for (first, rest) in splitPairs(word)]
   return max(allSegmentations, key = wordSeqFitness)

In [144]:
class OneGramDist(dict):
   def __init__(self):
      self.gramCount = 0
      for line in open('one-grams.txt'):
         (word, count) = line[:-1].split('\t')
         self[word] = int(count)
         self.gramCount += self[word]

   def __call__(self, word):
      if word in self:
         return float(self[word]) / self.gramCount
      else:
         return 1.0 / self.gramCount


In [145]:
singleWordProb = OneGramDist()
def wordSeqFitness(words):
   return functools.reduce(lambda x,y: x+y,
     (math.log10(singleWordProb(w)) for w in words))

In [146]:
segment('hellosir')

['hello', 'sir']

In [147]:
segment('howareyou')

['how', 'are', 'you']

In [148]:
segment('himynameis')

['himynameis']

Improvement Version

In [149]:
def splitPairs(word):
   return [(word[:i+1], word[i+1:]) for i in range(len(word))]

In [150]:
class OneGramDist(dict):
   def __init__(self):
      self.gramCount = 0
      for line in open('one-grams.txt'):
         (word, count) = line[:-1].split('\t')
         self[word] = int(count)
         self.gramCount += self[word]

   def __call__(self, key):
      if key in self:
         return float(self[key]) / self.gramCount
      else:
         return 1.0 / (self.gramCount * 10**(len(key) - 2))

In [151]:
singleWordProb = OneGramDist()
def wordSeqFitness(words):
   return functools.reduce(lambda x,y: x+y,
     (math.log10(singleWordProb(w)) for w in words))

In [152]:
def memoize(f):
   cache = {}

   def memoizedFunction(*args):
      if args not in cache:
         cache[args] = f(*args)
      return cache[args]

   memoizedFunction.cache = cache
   return memoizedFunction

@memoize
def segment(word):
   if not word: return []
   word = word.lower() # change to lower case
   allSegmentations = [[first] + segment(rest) for (first,rest) in splitPairs(word)]
   return max(allSegmentations, key = wordSeqFitness)

def splitPairs(word, maxLen=20):
   return [(word[:i+1], word[i+1:]) for i in range(max(len(word), maxLen))]

@memoize
def segmentWithProb(word):
   segmented = segment(word)
   return (wordSeqFitness(segmented), segmented)

In [153]:
segment('todayisanauspiciousday')

['today', 'is', 'an', 'auspicious', 'day']

In [154]:
segment('hewonthecricketmatch')

['he', 'won', 'the', 'cricket', 'match']

In [155]:
segment('Myunscientificfrienddoesnotbelievethathumanstatureismeasurableintermsofspeed')

['my',
 'unscientific',
 'friend',
 'does',
 'not',
 'believe',
 'that',
 'human',
 'stature',
 'is',
 'measurable',
 'in',
 'terms',
 'of',
 'speed']

In [156]:
segment('Myhostwentoverandstaredoutthewindowathispeacocks')

['my',
 'host',
 'went',
 'over',
 'and',
 'stared',
 'out',
 'the',
 'window',
 'a',
 'this',
 'peacocks']

In [157]:
segment('Indiawoncricketmatchonauspiciousdayofhittingacenturyandafifty')

['india',
 'won',
 'cricket',
 'match',
 'on',
 'auspicious',
 'day',
 'of',
 'hitting',
 'a',
 'century',
 'and',
 'a',
 'fifty']

In [158]:
# Create list of all sentences present in "brown" coupus
all_texts = nltk.corpus.brown.fileids()
full_brown = []

for text in all_texts:
    para = nltk.corpus.brown.sents(text)
    full_brown += [list(i) for i in para]

print("Total number of sentences in Brown corpus :", len(full_brown))

Total number of sentences in Brown corpus : 57340


In [159]:
train, test = train_test_split(full_brown, test_size=0.2)

In [160]:
print(test[:10])

[['Hence', '**zg', 'must', 'have', 'either', 'a', 'regulus', 'of', 'Af-fold', 'secants', 'or', 'a', 'regulus', 'of', 'Af-fold', 'secants', '.'], ['It', 'was', 'small', 'comfort', 'for', 'insomnia', '.'], ['Whether', 'he', 'sang', 'well', 'or', 'badly', 'had', 'nothing', 'to', 'do', 'with', 'it', '.'], ['A', 'trained', 'marksman', 'shooting', 'five', 'rounds', 'at', 'a', 'target', ',', 'all', 'under', 'practically', 'the', 'same', 'conditions', ',', 'may', 'hit', 'the', "bull's-eye", 'from', '0', 'to', '5', 'times', '.'], ['Yes', '!', '!'], ['Softer', 'knives', 'would', 'blunt', 'very', 'rapidly', ',', 'making', 'the', 'value', 'of', 'Af', 'inexact', '.'], ['A', '``', 'cattaloe', "''", 'was', 'a', 'hybrid', 'offspring', 'of', 'buffalo', 'and', 'cattle', '.'], ['No', 'more', '.'], ['``', 'Those', 'damn', 'punks', '--', 'taking', 'work', 'away', 'from', 'men', 'who', 'need', 'it', "''", '.'], ['Backs', 'higher', 'fees']]


In [161]:
len(test)

11468

In [162]:
def preprocess(sentences):
  punc = r"""!()-[]{};:'"\, ``<>./?@#$%^&*_~"""
  read = " ".join(sentences)
  for ele in read:
      if ele in punc:
          read = read.replace(ele, " ") 
  read = read.lower()

  # This will convert the word into tokens
  text_tokens = word_tokenize(read)

  # Remove all the stopwords from the tokens
  tokens_without_sw = [
        word for word in text_tokens if not word in stopwords.words("english")
    ]
  # Initialize the stemmer
  ps = PorterStemmer()

  # Stem all the words
  tokens_without_sw_stem = [ps.stem(word) for word in tokens_without_sw]  

  pre_text = [i for i in tokens_without_sw_stem if not i.isnumeric()]
  return pre_text

In [163]:
ptrain = [preprocess(i) for i in train]
ptest = [preprocess(i) for i in test]

In [164]:
n = 3
train_data, padded_vocab = padded_everygram_pipeline(n, ptrain)
lm_model = MLE(n)
lm_model.fit(train_data, padded_vocab)
print(lm_model.counts)
test_data, _ = padded_everygram_pipeline(n, ptest)

<NgramCounter with 3 ngram orders and 1693218 ngrams>


In [165]:
acc = 0
co = 0
for i in ptest:  
  co += 1
  if not i:
    continue
  sent = ''.join(i)
  try:
    seg = segment(sent)
  except:
    print(co, i)
    pass
  acc += sum([1 for j in seg if j in i]) / len(i)


267 ['white', 'colonnad', 'cedar', 'roof', 'southern', 'mansion', 'directli', 'traceabl', 'via', 'grey', 'buff', 'stone', 'grey', 'ski', 'england', 'golden', 'stucco', 'one', 'particular', 'part', 'blue', 'south', 'palladian', 'orbit', 'stretch', 'vicenza', 'old', 'mind', 'andrea', 'palladio', 'still', 'smile', 'behind', 'mani', 'old', 'rock', 'chair', 'southern', 'porch', 'deep', 'friez', 'architecton', 'music', 'rise', 'firm', 'shallow', 'freez', 'kitchen', 'feel', 'light', 'shade', 'bring', 'glitter', 'tall', 'mint', 'julep', 'sens', 'column', 'frame', 'warm', 'velvet', 'night', 'brought', 'togeth', 'million', 'coupl', 'mate', 'lip']
2155 ['suck', 'breath', 'kept', 'quiet', 'killpath', 'laid', 'sheet', 'wound', 'gold', 'wire', 'stem', 'glass', 'around', 'ear', 'eye', 'report', 'lay', 'desk', 'inton', 'act', 'lieuten', 'gunnar', 'matson', 'one', 'fail', 'see', 'station', 'keeper', 'properli', 'reliev', 'two', 'absent', 'throughout', 'entir', 'watch', 'without', 'check', 'station', 'a

In [166]:
accp = acc/len(test)
print("Accuracy of the Language Model:", accp*100, "%")

Accuracy of the Language Model: 74.97050729996356 %
