In [None]:
# for output text wrap
from IPython.display import HTML, display

def set_css(info=None):
  """Display a <style> element that makes <pre> blocks wrap their text.

  This function is registered as a 'pre_run_cell' callback. IPython's
  documented signature for that event passes one positional argument (an
  execution-info object), so it is accepted here and ignored. It stays
  optional so the function can still be called with no arguments.

  Args:
    info: Optional argument supplied by the triggering event; unused.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Hook set_css into the 'pre_run_cell' event so the style is emitted for every cell.
get_ipython().events.register('pre_run_cell', set_css)

# Merging Text Files

In [None]:
import os
# Concatenate every regular file found in `directory` into one corpus file.
directory = 'archive'

# Collect the pieces in a list and join once at the end; growing a string with
# `+` inside the loop re-copies everything accumulated so far on each pass.
chunks = []

# os.listdir() returns entries in arbitrary order, so sort them to get the
# same merged corpus on every run.
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    if os.path.isfile(f):
        with open(f, 'r') as file:
            try:
                text = file.read()
            except (UnicodeDecodeError, OSError):
                # Best effort: report the unreadable file and carry on. Only
                # read/decode failures are caught, so Ctrl-C and programming
                # errors still surface.
                print(f)
            else:
                # Every file's text is prefixed with a single space separator.
                chunks.append(" " + text)

big_data = "".join(chunks)

with open('big_data.txt', 'w') as fw:
    fw.write(big_data)

# Loading Dataset

In [None]:
# Corpus file to load (a commented-out alternative used "The_dataset.txt").
file_path = "big_data.txt"

# Read the whole file and lower-case it in one step.
with open(file_path, 'r') as file:
  text = file.read().lower()
# print(text)

In [None]:
import nltk
nltk.download('punkt')
from collections import defaultdict

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Getting Sentences

In [None]:
# Split the lower-cased corpus into sentences; the n-gram counters iterate over
# this list and tokenize one sentence at a time.
sentences = nltk.sent_tokenize(text)
# print(sentences)

# Word tokenizer that removes punctuation

In [None]:
# Tokens are the matches of `\w+`, i.e. runs of word characters; anything else
# (punctuation, whitespace) is not part of any token.
tokenizer = nltk.RegexpTokenizer(r'\w+')

# Getting Vocabulary/ Unigram Counts

In [None]:
vocab = defaultdict(int)
def getVocab():
  """Tally how often each token of the corpus occurs.

  Tokenizes the module-level `text` with `tokenizer` and adds one to the
  module-level `vocab` entry of every token encountered.
  """
  for token in tokenizer.tokenize(text):
    # defaultdict(int) starts unseen tokens at 0, so a plain increment suffices.
    vocab[token] += 1
# Fill `vocab` from the loaded corpus.
getVocab()
# print(vocab)

# Getting Bigram Counts

In [None]:
bigram_counts = defaultdict(int)
def getBigramCounts():
  """Count every pair of adjacent tokens, one sentence at a time.

  Each entry of the module-level `sentences` list is tokenized on its own and
  every 2-gram, joined with a single space, is tallied in `bigram_counts`.
  """
  for sent in sentences:
    tokens = tokenizer.tokenize(sent)
    for gram in nltk.ngrams(tokens, 2):
      # Build the "w1 w2" key once; defaultdict(int) supplies the initial 0.
      bigram_counts[" ".join(gram)] += 1
# Fill `bigram_counts` from the tokenized sentences.
getBigramCounts()
# print(bigram_counts)

# Getting Trigram Counts

In [None]:
trigram_counts = defaultdict(int)
def getTrigramCounts():
  """Count every run of three adjacent tokens, one sentence at a time.

  Each entry of the module-level `sentences` list is tokenized on its own and
  every 3-gram, joined with single spaces, is tallied in `trigram_counts`.
  """
  for sent in sentences:
    tokens = tokenizer.tokenize(sent)
    for gram in nltk.ngrams(tokens, 3):
      # Build the "w1 w2 w3" key once; defaultdict(int) supplies the initial 0.
      trigram_counts[" ".join(gram)] += 1
# Fill `trigram_counts` from the tokenized sentences.
getTrigramCounts()
# print(trigram_counts)

# Getting Quadgram Counts

In [None]:
quadgram_counts = defaultdict(int)
def getQuadgramCounts():
  """Count every run of four adjacent tokens, one sentence at a time.

  Each entry of the module-level `sentences` list is tokenized on its own and
  every 4-gram, joined with single spaces, is tallied in `quadgram_counts`.
  """
  for sent in sentences:
    tokens = tokenizer.tokenize(sent)
    for gram in nltk.ngrams(tokens, 4):
      # Build the "w1 w2 w3 w4" key once; defaultdict(int) supplies the initial 0.
      quadgram_counts[" ".join(gram)] += 1
# Fill `quadgram_counts` from the tokenized sentences.
getQuadgramCounts()
# print(quadgram_counts)

# Getting Bigram Probabilities

In [None]:
# Values are lists of [probability, next_word], so the default must be a list
# (not int): an unseen key then yields an empty candidate list instead of 0.
bigram_prob = defaultdict(list)
def findBigramProb():
  """Build, for every word, the list of words seen right after it.

  Uses add-one smoothing,
      P(w2 | w1) = (count(w1 w2) + 1) / (count(w1) + V),
  where V is the number of distinct words in `vocab`. Each result is appended
  to the module-level `bigram_prob` under key ``w1`` as ``[probability, w2]``.
  """
  V = len(vocab)
  for bigram, count in bigram_counts.items():
    words = bigram.split()
    unigram = words[0]
    # .get() keeps this a pure read even when `vocab` is a defaultdict.
    prob = (count + 1)/(vocab.get(unigram, 0) + V)
    bigram_prob[unigram].append([prob, words[-1]])
# Fill `bigram_prob` from the bigram and unigram counts.
findBigramProb()
# print(bigram_prob)

# Getting Trigram Probabilities

In [None]:
# Values are lists of [probability, next_word], so the default must be a list
# (not int): an unseen key then yields an empty candidate list instead of 0.
trigram_prob = defaultdict(list)
def findTrigramProb():
  """Build, for every word pair, the list of words seen right after it.

  Uses add-one smoothing,
      P(w3 | w1 w2) = (count(w1 w2 w3) + 1) / (count(w1 w2) + V),
  where V is the number of distinct words in `vocab`. Each result is appended
  to the module-level `trigram_prob` under key ``"w1 w2"`` as
  ``[probability, w3]``.
  """
  V = len(vocab)
  for trigram, count in trigram_counts.items():
    words = trigram.split()
    bigram = " ".join(words[:2])
    # .get() keeps this a pure read even when `bigram_counts` is a defaultdict.
    prob = (count + 1)/(bigram_counts.get(bigram, 0) + V)
    trigram_prob[bigram].append([prob, words[-1]])
# Fill `trigram_prob` from the trigram and bigram counts.
findTrigramProb()
# print(trigram_prob)

# Getting Quadgram Probabilities

In [None]:
# Values are lists of [probability, next_word], so the default must be a list
# (not int): an unseen key then yields an empty candidate list instead of 0.
quadgram_prob = defaultdict(list)
def findQuadgramProb():
  """Build, for every word triple, the list of words seen right after it.

  Uses add-one smoothing,
      P(w4 | w1 w2 w3) = (count(w1 w2 w3 w4) + 1) / (count(w1 w2 w3) + V),
  where V is the number of distinct words in `vocab`. Each result is appended
  to the module-level `quadgram_prob` under key ``"w1 w2 w3"`` as
  ``[probability, w4]``.
  """
  V = len(vocab)
  for quadgram, count in quadgram_counts.items():
    words = quadgram.split()
    trigram = " ".join(words[:3])
    # .get() keeps this a pure read even when `trigram_counts` is a defaultdict.
    prob = (count + 1)/(trigram_counts.get(trigram, 0) + V)
    quadgram_prob[trigram].append([prob, words[-1]])
# Fill `quadgram_prob` from the quadgram and trigram counts.
findQuadgramProb()
# print(quadgram_prob)

# Sorting words based on probabilities

In [None]:
def sortProbDicts():
  """Order each context's candidate list from most to least probable.

  Every value in `bigram_prob`, `trigram_prob` and `quadgram_prob` is a list of
  ``[probability, word]`` pairs. Lists holding more than one pair are replaced
  by a copy sorted in descending order (equal probabilities fall back to the
  word); single-entry lists are left untouched.
  """
  for table in (bigram_prob, trigram_prob, quadgram_prob):
    for context, candidates in table.items():
      if len(candidates) > 1:
        # Rebinding an existing key is safe while iterating over the dict.
        table[context] = sorted(candidates, reverse=True)

# Rank the candidate lists in all three probability tables.
sortProbDicts()

# Getting possible word choices from the probability dicts

In [None]:
def getWordChoices(sentence):
  """Collect candidate next words for the text typed so far.

  The sentence is lower-cased and tokenized; its last one, two and three tokens
  are then looked up in `bigram_prob`, `trigram_prob` and `quadgram_prob`
  respectively, and the matching candidate lists are concatenated in that
  order.

  Args:
    sentence: Text that precedes the word to be predicted.

  Returns:
    A new list of ``[probability, word]`` pairs (a word may appear more than
    once). The list is empty when the sentence has no tokens or none of its
    trailing contexts is known.
  """
  tokens = tokenizer.tokenize(sentence.lower())
  choices = []
  # (context length, table) pairs, shortest context first.
  for width, table in ((1, bigram_prob), (2, trigram_prob), (3, quadgram_prob)):
    if len(tokens) < width:
      # Too few typed words for this context, and so for any longer one.
      # This also covers the empty sentence, which used to raise IndexError.
      break
    context = " ".join(tokens[-width:])
    # Test membership instead of indexing so a miss never inserts a key.
    if context in table:
      choices += table[context]

  return choices

# getWordChoices('But ek institution me 3 account hi open kar sakte hai')

# Importing module for finding word similarities

In [None]:
!pip install jaro-winkler
# !pip install jaro
import jaro

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jaro-winkler
  Downloading jaro_winkler-2.0.3-py3-none-any.whl (33 kB)
Installing collected packages: jaro-winkler
Successfully installed jaro-winkler-2.0.3


# Making Predictions

In [None]:
def doPredictions(sentence):
  """Suggest up to four words that complete or follow the typed text.

  If the text does not end in whitespace, its last word is treated as a
  partially typed prefix. The prefix is removed from the context, every word
  in `vocab` becomes a candidate, and candidates are ranked by Jaro-Winkler
  similarity to the prefix first and by probability second. Otherwise the
  candidates are the words returned by `getWordChoices`, ranked by probability
  alone.

  A candidate's probability is the sum of its add-one-smoothed unigram, bigram,
  trigram and quadgram estimates. Each estimate is conditioned on the words
  that precede the candidate, and only the orders the available context can
  support (at most three preceding words) are included.

  Args:
    sentence: Text typed so far.

  Returns:
    A list of at most four words, best first. It is empty when the text is
    empty, or when it ends in whitespace and offers no known context.
  """
  # Nothing typed at all: there is no last character to inspect.
  if not sentence:
    return []

  end_word = None
  incomplete = not sentence[-1].isspace()
  # If the last word is still being typed, split it off and predict from the
  # words before it.
  if incomplete:
    typed = sentence.split()
    # The corpus is lower-cased, so compare the prefix in lower case too.
    end_word = typed[-1].lower()
    sentence = " ".join(typed[:-1])

  # Normalize the context the same way the count tables were built
  # (lower-cased, tokenized); the quadgram model needs only the last 3 words.
  context = tokenizer.tokenize(sentence.lower())[-3:]

  # Without any context word there is nothing to look up.
  choices = {choice[1] for choice in getWordChoices(sentence)} if context else set()

  V = len(vocab)
  num_words = sum(vocab.values())

  # Merge the choices with every word of the training corpus.
  if incomplete:
    choices |= set(vocab)

  def smoothed(ngram_table, context_table, ngram):
    """Add-one estimate of P(last token of ngram | the tokens before it)."""
    # .get() avoids inserting zero-count keys into the defaultdict tables,
    # which would otherwise grow on every call and leak into the JSON export.
    seen = ngram_table.get(" ".join(ngram), 0)
    seen_context = context_table.get(" ".join(ngram[:-1]), 0)
    return (seen + 1) / (seen_context + V)

  predictions = []
  for word in choices:
    ngram = context + [word]

    # Sum the unigram, bigram, trigram and quadgram probabilities.
    prob = (vocab.get(word, 0) + 1) / (num_words + V)
    if len(ngram) >= 2:
      prob += smoothed(bigram_counts, vocab, ngram[-2:])
    if len(ngram) >= 3:
      prob += smoothed(trigram_counts, bigram_counts, ngram[-3:])
    if len(ngram) >= 4:
      prob += smoothed(quadgram_counts, trigram_counts, ngram[-4:])

    # Rank by similarity to the typed prefix first when a word is incomplete.
    if incomplete:
      similarity = jaro.jaro_winkler_metric(end_word, word)
      predictions.append([similarity, prob, word])
    else:
      predictions.append([prob, word])

  # Sorts on similarity (if present), then probability, then the word itself.
  predictions = sorted(predictions, reverse=True)
  best_preds = [pred[-1] for pred in predictions[:4]]
  return best_preds

# Demo: predictions while a word is being typed and after it is finished.
sentence = 'But ek institution me teen account hi open kar sakte hai'
demo_inputs = [
    'But ek institution me teen acc',
    'But ek institution me teen acco',
    'But ek institution me teen accou',
    'But ek institution me teen account',
    'But ek institution me teen account ',
    'But ek institution me teen account open',
]
for typed_text in demo_inputs:
  print(doPredictions(typed_text))

['ac', 'accha', 'acchi', 'access']
['accout', 'account', 'accounts', 'according']
['accout', 'account', 'accounts', 'according']
['account', 'accounts', 'accout', 'accent']
['hai', 'ke', 'aur', 'se']
['open', 'opener', 'pen', 'opening']


# Export dictionaries as json files for api

In [None]:
import json

# Write each table to its own JSON file, in the order listed.
exports = {
    "bigram_counts.json": bigram_counts,
    "trigram_counts.json": trigram_counts,
    "quadgram_counts.json": quadgram_counts,
    'bigram_prob.json': bigram_prob,
    'trigram_prob.json': trigram_prob,
    'quadgram_prob.json': quadgram_prob,
    "vocab.json": vocab,
}
for json_path, table in exports.items():
  with open(json_path, 'w') as f:
    json.dump(table, f)