# Natural Language Processing –
# Project 2: **OCR Error Correction using Character Based Language Modeling**

Student Name - Renuka Lalit Patil

Student no. - R00195785

## Installation of required packages

In [None]:
# Shell escape: install TensorFlow with pip (no version pin, so whatever pip resolves at run time is used).
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Part 2** : Implementation of a character based Language Model for word suggestion

Please refer to run.py for Part 1 – decontraction and tokenization of the OCR output.

In [73]:
import tensorflow as tf
import numpy as np
import string
import re

In [74]:
# using Europarl english corpus - http://www.statmt.org/europarl/

import nltk
# 'punkt' is presumably the tokenizer data needed by nltk.word_tokenize (used in Tokenize below) - TODO confirm
nltk.download('punkt')
from nltk.corpus.europarl_raw import english
# download the 'europarl_raw' package that the `english` reader imported above reads from
nltk.download('europarl_raw')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package europarl_raw to /root/nltk_data...
[nltk_data]   Package europarl_raw is already up-to-date!


True

In [75]:
# Read the raw Europarl text, append the sample word 'Unitek' and lowercase the result.
# NOTE(review): the word is appended without a separating space - confirm the raw
# corpus ends in whitespace, otherwise it fuses with the corpus' last token.
raw_corpus = english.raw()
text = (raw_corpus + 'Unitek').lower()
print(len(text))

3059995


In [98]:
def Tokenize(text):
  """Tokenize a string (a word or a phrase) with NLTK.

  Args:
    text: the string to split.

  Returns:
    A new list holding the tokens produced by ``nltk.word_tokenize``.
  """
  return list(nltk.word_tokenize(text))

In [77]:
def clean_text(text):
  """Pre-process raw text into purely alphabetic word tokens.

  Double hyphens and newlines are turned into spaces, the result is tokenized
  with ``Tokenize`` and every token that is not fully alphabetic is dropped.

  Args:
    text: the raw input string.

  Returns:
    A ``(cleaned_text, tokens)`` tuple: the kept tokens joined by single
    spaces, and the list of kept tokens itself.
  """
  flattened = text.replace('--', ' ').replace('\n', ' ')
  tokens = [tok for tok in Tokenize(flattened) if tok.isalpha()]
  return ' '.join(tokens), tokens

In [78]:
# Clean the corpus and keep both the cleaned string and its token list.
# NOTE: `text` is rebound to the cleaned string here, so re-running this cell
# cleans the already-cleaned text instead of the raw corpus.
text, tokens = clean_text(text)
print(len(tokens))

494840


## **LSTM model training**

In [None]:
# Collect the distinct characters of the cleaned corpus in sorted order
chars = sorted(set(text))
# Map every character to its position in `chars`; enumerate() yields the same
# indices as chars.index(char) without rescanning the list for each character
char_indx = {char: idx for idx, char in enumerate(chars)}
vocab_size = len(chars)
print(f'number of characters:{len(chars)}')

number of characters:49


In [None]:
max_len = 15

# Encode the whole corpus once; the original looked every character up again
# for each of the max_len windows that contain it.
encoded_text = [char_indx[char] for char in text]

# Sliding windows with stride 1: each feature sequence is max_len encoded
# characters and its target is the encoded character right after the window.
n_sequences = max(len(encoded_text) - max_len, 0)
sequences = [encoded_text[start: start + max_len] for start in range(n_sequences)]   # feature seq
next_chars = encoded_text[max_len:]                                                   # targets

print(f'Number of sequences: {len(sequences)}')

Number of sequences: 2901656


In [None]:
# one hot encoding
# The builtin `bool` replaces the `np.bool` alias, which has been deprecated
# since NumPy 1.20 and is no longer available in newer NumPy releases.
x = np.zeros((len(sequences), max_len, len(chars)), dtype=bool)
y = np.zeros((len(sequences), len(chars)), dtype=bool)
for i, (sent, target) in enumerate(zip(sequences, next_chars)):
    # mark the character index at every time step of the window
    for t, char in enumerate(sent):
        x[i, t, char] = 1
    # mark the index of the character that follows the window
    y[i, target] = 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# define model: one LSTM layer over the one-hot character windows followed by
# a softmax layer with one output per character of the vocabulary

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(64, input_shape=(max_len, vocab_size)))
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                29184     
                                                                 
 dense (Dense)               (None, 49)                3185      
                                                                 
Total params: 32,369
Trainable params: 32,369
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# train model

In [None]:

# Fit the model on the one-hot arrays x (windows) and y (next characters) for
# 6 epochs with a batch size of 64; the value returned by fit() is kept in `history`.
history = model.fit(x, y, batch_size=64, epochs = 6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# saving the model and character mapping dictionary
from pickle import dump
model.save('/content/ocr_text-gen-lstm.h5')
# use a context manager so the pickle file is flushed and closed deterministically
with open('/content/mapping.pkl', 'wb') as mapping_file:
    dump(char_indx, mapping_file)

# 3.1) Check if the proposed word is an English word, if not, suggest possible words.

In [89]:
# https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/

import tensorflow as tf
import numpy as np
import string
import re
from pickle import load
 
def _next_char_probs(model, mapping, seq_length, context):
  """Return the model's probability vector for the character following `context`."""
  encoded = [mapping[char] for char in context]
  # keep at most the last `seq_length` ids (pad_sequences fills shorter inputs on the left)
  encoded = tf.keras.preprocessing.sequence.pad_sequences([encoded], maxlen=seq_length, truncating='pre')
  # one hot encode
  encoded = tf.keras.utils.to_categorical(encoded, num_classes=len(mapping))
  return model.predict(encoded, verbose=0)[0]


# generate word suggestions with a character language model
def generate_seq(model, mapping, reversed_map, seq_length, seed_text, n_chars, top_k=10):
  """Suggest completions of `seed_text` using a character-level model.

  The `top_k` most probable next characters each start one candidate. Every
  candidate is then extended greedily (most probable character at each step)
  for at most ``n_chars - 1`` further steps, stopping early when the model
  predicts a space. A space among the first `top_k` characters starts a
  candidate equal to `seed_text` itself, which is then extended the same way.

  Args:
    model: object whose ``predict(batch, verbose=0)`` returns one probability
      row per input sequence.
    mapping: dict from character to integer index.
    reversed_map: dict from integer index back to character.
    seq_length: length of the window fed to the model.
    seed_text: prefix to complete; every character must be a key of `mapping`.
    n_chars: upper bound on the number of characters appended to the seed.
    top_k: number of first-character alternatives to expand (default 10,
      the value that used to be hard-coded).

  Returns:
    A list of candidate strings ordered by decreasing probability of their
    first appended character. Duplicates are possible.

  Raises:
    KeyError: if the text contains a character that is not in `mapping`.
    ValueError: if `top_k` is smaller than 1.
  """
  if top_k < 1:
    raise ValueError("top_k must be at least 1")

  # selecting the best `top_k` first characters
  first_probs = _next_char_probs(model, mapping, seq_length, seed_text)
  best_fit = first_probs.argsort()[::-1][:top_k]

  sugg_words = []
  for first in best_fit:
    first_char = reversed_map[first]
    # a predicted space adds nothing to the seed
    candidate = seed_text if first_char == " " else seed_text + first_char

    # iterating further for each selected character
    for _ in range(n_chars - 1):
      probs = _next_char_probs(model, mapping, seq_length, candidate)
      next_char = reversed_map[np.argmax(probs)]
      if next_char == " ":
        break
      candidate += next_char

    sugg_words.append(candidate)
  return sugg_words
  
# ----------------------------------------------------------------------------------


In [93]:

# Artifacts written by the "saving the model and character mapping dictionary"
# cell. The previous '... (2)' file names only existed as re-uploaded copies,
# so a fresh Restart & Run All could not find them.
MODEL_PATH = '/content/ocr_text-gen-lstm.h5'
MAPPING_PATH = '/content/mapping.pkl'

# load the model
model = tf.keras.models.load_model(MODEL_PATH)
# load the mapping
# NOTE: unpickling can execute arbitrary code - only load files this notebook produced
with open(MAPPING_PATH, 'rb') as mapping_file:
  mapping = load(mapping_file)

# reverse the mapping (index -> character)
reversed_map = {index: char for char, index in mapping.items()}

# Read text detected by OCR model (kept out of `x`, which holds the training array)
with open("OCR_output.txt", "r") as text_ocr:
  ocr_text = text_ocr.read()
words = [w.lower() for w in ocr_text.split()]
print(f'List of all OCR words - {words}')

# Check if the word is an english word, i.e. occurs among the corpus tokens;
# a set makes each lookup constant-time instead of scanning the whole token list
vocabulary = set(tokens)
not_eng = []
not_eng_indx = []
for position, ocr_word in enumerate(words):
  if ocr_word not in vocabulary:
    not_eng.append(ocr_word)
    not_eng_indx.append(position)
print(f'List of non english words - {not_eng}')

# Suggest english words using loaded model
pred_list_all = []
for word in not_eng:
  suggestions = []
  # feed growing prefixes of the word (1 .. len(word)-1 characters) to the model
  for end in range(1, len(word)):
    suggestions.extend(generate_seq(model, mapping, reversed_map, 15, word[:end], len(word)))
  pred_list_all.append(suggestions)

print(f'suggested english words from corpus - {pred_list_all}')


List of all OCR words - ['unetek']
List of non english words - ['unetek']
suggested english words from corpus - [['us', 'up', 'ur', 'untelly', 'udgenin', 'uch', 'umpooct', 'ually', 'uedes', 'utalial', 'untelly', 'under', 'unally', 'unoll', 'union', 'untelly', 'unfoutt', 'unglican', 'unemple', 'unryies', 'unemple', 'unesport', 'unemple', 'unea', 'unelardic', 'uned', 'unepar', 'unequalit', 'unera', 'uneelly', 'unetrested', 'unetion', 'unetreste', 'uneterial', 'uneth', 'unettrest', 'uneta', 'unetgor', 'unetper', 'unetly', 'uneterial', 'uneterial', 'unetely', 'uneted', 'unetect', 'unetes', 'uneteer', 'unetemer', 'uneteg', 'unetear']]


# 3.2) Choose the word with the shortest weighted edit distance

In [None]:
# Shell escape: install the weighted-levenshtein package (provides `lev`, imported in the next cell); no version pin.
!pip install weighted-levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [97]:
# https://github.com/infoscout/weighted-levenshtein


from weighted_levenshtein import lev
# NOTE(review): every non-english word is scored against this single hard-coded
# reference word - confirm whether the OCR word itself should be used instead.
actual_word = 'Unitek'
actual_word = actual_word.lower()

final_out_list = []

for word_pos, candidates in zip(not_eng_indx, pred_list_all):
  if not candidates:
    # no suggestion was generated (e.g. for a one-character word): keep the OCR
    # word as it is instead of letting np.argmin fail on an empty list
    final_out_list.append(words[word_pos])
    continue
  cost_list = [lev(actual_word, pred) for pred in candidates]
  # selecting minimum distance suggested word (first one wins on ties)
  final_out_list.append(candidates[np.argmin(cost_list)])

print(f'Best fitting suggestions - {final_out_list}')


# replacing non english words in OCR text
for word_pos, replacement in zip(not_eng_indx, final_out_list):
  words[word_pos] = replacement

final_text = ' '.join(words)
print(f'final text after replacing non english words - {final_text}')
# writing replaced output text in text file for contraction
with open("Sugg_output.txt", "w") as text_file:
    text_file.write(final_text)

Best fitting suggestions - ['uneted']
final text after replacing non english words - uneted


Please refer to contract.py for Part 3.