In [None]:
# Mount Google Drive at /content/drive; the dataset path and the model save
# path used later in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import string
import re
import io
import numpy as np
from unicodedata import normalize
import keras, tensorflow
from keras.models import Model
from keras.layers import Input, LSTM, Dense

## Reading data

In [None]:
def read_data(file, encoding='utf-8'):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that accented characters are decoded the same way regardless of
            the platform's default encoding.

    Returns:
        list[str]: One entry per line, with leading/trailing whitespace
        (including the newline) removed.
    """
    # A distinct name for the handle avoids shadowing the `file` path argument.
    with io.open(file, 'r', encoding=encoding) as handle:
        return [entry.strip() for entry in handle]

In [None]:
# Load every line of the bilingual pairs file from the mounted Drive folder.
data = read_data('/content/drive/MyDrive/Colab Notebooks/NLP/pr-10/bilingual_pairs.txt')

## Some basics about our dataset

In [None]:
data[139990:140000]

['Never choose a vocation just because the hours are short.\tNe choisissez jamais une profession juste parce que les heures y sont courtes.',
 "No other mountain in the world is so high as Mt. Everest.\tAucune montagne au monde n'atteint la hauteur du Mont Everest.",
 "No sooner had he met his family than he burst into tears.\tÀ peine avait-il rencontré sa famille qu'il éclata en sanglots.",
 "Nothing is more disappointing than to lose in the finals.\tRien n'est plus décevant que de perdre en finale.",
 "Now that he is old, it is your duty to go look after him.\tÀ présent qu'il est vieux, c'est ton devoir de veiller sur lui.",
 "Now that you've decided to quit your job, you look happy.\tMaintenant que vous avez décidé de quitter votre emploi, vous avez l'air heureux.",
 "Now that you've decided to quit your job, you look happy.\tMaintenant que tu as décidé de quitter ton emploi, tu as l'air heureux.",
 "Now that you've decided to quit your job, you look happy.\tMaintenant que vous avez

In [None]:
len(data)

145437

In [None]:
# Keep only the first 140000 lines; the remaining lines of the file are dropped.
data = data[:140000]

## Splitting our data into English and French sentences

In [None]:
def build_english_french_sentences(data):
    """Split tab-separated lines into parallel English and French lists.

    Each entry is expected to look like "<english>\\t<french>[\\t<extra>...]".
    Only the first two tab-separated fields are used; any further fields are
    ignored.

    Args:
        data: Iterable of raw text lines.

    Returns:
        tuple[list[str], list[str]]: (english_sentences, french_sentences),
        index-aligned. Entries without a tab (for example blank lines) are
        skipped so that the two lists always stay the same length.
    """
    english_sentences = []
    french_sentences = []
    for data_point in data:
        # Split once and reuse the fields instead of splitting per column.
        fields = data_point.split("\t")
        if len(fields) < 2:
            # Blank or malformed line: there is no French half to pair with.
            continue
        english_sentences.append(fields[0])
        french_sentences.append(fields[1])
    return english_sentences, french_sentences

In [None]:
# Split the raw tab-separated lines into two index-aligned lists.
english_sentences, french_sentences = build_english_french_sentences(data)

In [None]:
len(english_sentences)

140000

In [None]:
len(french_sentences)

140000

## Data cleaning

Normalizes characters
Removes punctuation
Performs case-folding
Removes non-printable characters
Keeps only alphabetic words

In [None]:
# Demonstrates the accent-stripping idiom: NFD-normalize the text, encode it to
# ASCII while ignoring anything that cannot be encoded, then decode back to str.
sentence = 'My name is kalam..$@# :) assuming I am scientist!!!'
cleaned_sent = normalize('NFD', sentence).encode('ascii', 'ignore')
print(cleaned_sent)  # a bytes object at this point
cleaned_sent = cleaned_sent.decode('UTF-8')
print(cleaned_sent)  # back to a str
# This sample sentence is already pure ASCII, so both prints show it unchanged.

b'My name is kalam..$@# :) assuming I am scientist!!!'
My name is kalam..$@# :) assuming I am scientist!!!


In [None]:
def clean_sentences(sentence):
    """Normalize one sentence into lowercase, space-separated ASCII words.

    Steps, in order:
      1. Spell out the ligatures oe/ae, which have no Unicode decomposition
         and would otherwise be lost in step 2.
      2. NFD-normalize and drop every non-ASCII code point, so an accented
         letter keeps its base letter ('é' -> 'e', 'ç' -> 'c') instead of
         disappearing from the word.
      3. Split on whitespace and lowercase each word.
      4. Remove ASCII punctuation and non-printable characters. Punctuation
         is deleted, not replaced, so elided forms are joined
         ("j'en" -> "jen").
      5. Keep only purely alphabetic words (numbers and empty leftovers are
         dropped).

    Args:
        sentence: Raw sentence text.

    Returns:
        str: The cleaned words joined by single spaces ('' if nothing is left).
    """
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    # ligatures do not decompose under NFD, so expand them explicitly
    ligatures = str.maketrans({'œ': 'oe', 'Œ': 'OE', 'æ': 'ae', 'Æ': 'AE'})

    cleaned_sent = sentence.translate(ligatures)
    # Transliterate accented characters to ASCII rather than deleting them.
    cleaned_sent = normalize('NFD', cleaned_sent).encode('ascii', 'ignore')
    cleaned_sent = cleaned_sent.decode('UTF-8')

    cleaned_sent = cleaned_sent.split()
    cleaned_sent = [word.lower() for word in cleaned_sent]
    cleaned_sent = [word.translate(table) for word in cleaned_sent]
    cleaned_sent = [re_print.sub('', w) for w in cleaned_sent]
    cleaned_sent = [word for word in cleaned_sent if word.isalpha()]
    return ' '.join(cleaned_sent)

In [None]:
def build_clean_english_french_sentences(english_sentences, french_sentences):
    """Apply clean_sentences to every English and every French sentence.

    Args:
        english_sentences: Iterable of raw English sentences.
        french_sentences: Iterable of raw French sentences.

    Returns:
        tuple[list[str], list[str]]: (cleaned English, cleaned French), each in
        the same order as its input.
    """
    french_sentences_cleaned = [clean_sentences(raw) for raw in french_sentences]
    english_sentences_cleaned = [clean_sentences(raw) for raw in english_sentences]
    return english_sentences_cleaned, french_sentences_cleaned

In [None]:
# Clean both sides of the corpus; the two result lists stay index-aligned with the inputs.
english_sentences_cleaned, french_sentences_cleaned = build_clean_english_french_sentences(english_sentences,
                                                                                           french_sentences)

In [None]:
english_sentences_cleaned[40884]

'i think i can fix this'

In [None]:
french_sentences_cleaned[40884]

'je pense que je peux arranger a'

## Building our input and target datasets

In [None]:
def build_data(english_sentences_cleaned, french_sentences_cleaned):
    """Assemble the model's input/target texts and their character vocabularies.

    French sentences are the inputs and are used as-is. English sentences are
    the targets and are wrapped as "\\t" + sentence + "\\n", so every target
    starts with a tab and ends with a newline.

    Args:
        english_sentences_cleaned: Cleaned English sentences (targets).
        french_sentences_cleaned: Cleaned French sentences (inputs).

    Returns:
        tuple: (input_dataset, target_dataset, input_characters,
        target_characters), where the two character collections are sorted
        lists of the distinct characters found in the respective dataset.
    """
    input_dataset = [french_sentence for french_sentence in french_sentences_cleaned]
    target_dataset = ["\t" + english_sentence + "\n"
                      for english_sentence in english_sentences_cleaned]

    input_characters = {char for text in input_dataset for char in text}
    target_characters = {char for text in target_dataset for char in text}

    return input_dataset, target_dataset, sorted(input_characters), sorted(target_characters)

In [None]:
# build_data uses the French sentences as inputs and the English sentences as targets.
input_dataset, target_dataset, input_characters, target_characters = build_data(english_sentences_cleaned,
                                                                                french_sentences_cleaned)

In [None]:
len(input_characters)

27

In [None]:
len(target_characters)

29

In [None]:
print(input_characters)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
print(target_characters)

['\t', '\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Defining metadata for our data structures and model to work with

In [None]:
def build_metadata(input_dataset, target_dataset, input_characters, target_characters):
    """Compute and print vocabulary sizes and maximum sequence lengths.

    Args:
        input_dataset: Input texts.
        target_dataset: Target texts.
        input_characters: Distinct input characters.
        target_characters: Distinct target characters.

    Returns:
        tuple[int, int, int, int]: (num_encoder_tokens, num_decoder_tokens,
        max_encoder_seq_length, max_decoder_seq_length).

    Raises:
        ValueError: If either dataset is empty (max() of an empty sequence).
    """
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max(map(len, input_dataset))
    max_decoder_seq_length = max(map(len, target_dataset))

    report = (
        ('Number of data points:', len(input_dataset)),
        ('Number of unique input tokens:', num_encoder_tokens),
        ('Number of unique output tokens:', num_decoder_tokens),
        ('Maximum sequence length for inputs:', max_encoder_seq_length),
        ('Maximum sequence length for outputs:', max_decoder_seq_length),
    )
    for label, value in report:
        print(label, value)

    return num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length, max_decoder_seq_length

In [None]:
# Vocabulary sizes and maximum lengths computed here fix the shapes of the one-hot arrays built below.
num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length, max_decoder_seq_length = build_metadata(input_dataset,
                                                                                                        target_dataset,
                                                                                                        input_characters,
                                                                                                        target_characters)

Number of data points: 140000
Number of unique input tokens: 27
Number of unique output tokens: 29
Maximum sequence length for inputs: 109
Maximum sequence length for outputs: 58


## Developing mappings for character to index and vice-versa

In [None]:
def build_indices(input_characters, target_characters):
    """Build character<->index lookup tables for both vocabularies.

    A character's index is its position in the given sequence.

    Args:
        input_characters: Ordered input characters.
        target_characters: Ordered target characters.

    Returns:
        tuple[dict, dict, dict, dict]: (input_char_to_idx, input_idx_to_char,
        target_char_to_idx, target_idx_to_char).
    """
    input_idx_to_char = dict(enumerate(input_characters))
    input_char_to_idx = {char: idx for idx, char in input_idx_to_char.items()}

    target_idx_to_char = dict(enumerate(target_characters))
    target_char_to_idx = {char: idx for idx, char in target_idx_to_char.items()}

    return input_char_to_idx, input_idx_to_char, target_char_to_idx, target_idx_to_char

# These four mappings are module-level names; later cells look them up directly.
input_char_to_idx, input_idx_to_char, target_char_to_idx, target_idx_to_char = build_indices(input_characters,
                                                                                             target_characters)

## Building data structures to accommodate our data

In [None]:
def build_data_structures(length_input_dataset, max_encoder_seq_length, max_decoder_seq_length, num_encoder_tokens, num_decoder_tokens):
    """Allocate the zero-filled float32 arrays that will hold the one-hot data.

    Args:
        length_input_dataset: Number of samples.
        max_encoder_seq_length: Time dimension of the encoder input array.
        max_decoder_seq_length: Time dimension of both decoder arrays.
        num_encoder_tokens: Last dimension of the encoder input array.
        num_decoder_tokens: Last dimension of both decoder arrays.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray]: (encoder_input_data,
        decoder_input_data, decoder_target_data), three independent arrays.
    """
    encoder_shape = (length_input_dataset, max_encoder_seq_length, num_encoder_tokens)
    decoder_shape = (length_input_dataset, max_decoder_seq_length, num_decoder_tokens)

    encoder_input_data = np.zeros(encoder_shape, dtype='float32')
    decoder_input_data = np.zeros(decoder_shape, dtype='float32')
    decoder_target_data = np.zeros(decoder_shape, dtype='float32')

    for label, array in (
        ("Dimensionality of encoder input data is : ", encoder_input_data),
        ("Dimensionality of decoder input data is : ", decoder_input_data),
        ("Dimensionality of decoder target data is : ", decoder_target_data),
    ):
        print(label, array.shape)

    return encoder_input_data, decoder_input_data, decoder_target_data

# Allocate the three zero-filled arrays; they are populated in the next section.
encoder_input_data, decoder_input_data, decoder_target_data = build_data_structures(len(input_dataset),
                                                                                    max_encoder_seq_length,
                                                                                    max_decoder_seq_length,
                                                                                    num_encoder_tokens,
                                                                                    num_decoder_tokens)

Dimensionality of encoder input data is :  (140000, 109, 27)
Dimensionality of decoder input data is :  (140000, 58, 29)
Dimensionality of decoder target data is :  (140000, 58, 29)


## Adding data to the built data structures

In [None]:
def add_data_to_data_structures(input_dataset, target_dataset, encoder_input_data, decoder_input_data, decoder_target_data,
                                input_char_index=None, target_char_index=None):
    """One-hot encode the text pairs into the pre-allocated arrays, in place.

    For sample i and character position t:
      * encoder_input_data[i, t, k] = 1 for the input character's index k;
      * decoder_input_data[i, t, k] = 1 for the target character's index k;
      * decoder_target_data[i, t - 1, k] = 1 for every t > 0, i.e. the target
        array holds the same sequence shifted one step earlier and therefore
        does not contain the first (start) character.
    Positions past the end of a text are left untouched.

    Args:
        input_dataset: Input texts.
        target_dataset: Target texts, index-aligned with input_dataset.
        encoder_input_data: Array indexed as [sample, timestep, token].
        decoder_input_data: Array indexed as [sample, timestep, token].
        decoder_target_data: Array indexed as [sample, timestep, token].
        input_char_index: Optional mapping from input character to index.
            Defaults to the notebook-level ``input_char_to_idx``.
        target_char_index: Optional mapping from target character to index.
            Defaults to the notebook-level ``target_char_to_idx``.

    Returns:
        tuple: The same three arrays that were passed in, now populated.

    Raises:
        KeyError: If a character is missing from the corresponding mapping.
    """
    # Fall back to the notebook globals so existing five-argument calls still work.
    if input_char_index is None:
        input_char_index = input_char_to_idx
    if target_char_index is None:
        target_char_index = target_char_to_idx

    for i, (input_data_point, target_data_point) in enumerate(zip(input_dataset, target_dataset)):
        for t, char in enumerate(input_data_point):
            encoder_input_data[i, t, input_char_index[char]] = 1.
        for t, char in enumerate(target_data_point):
            token = target_char_index[char]
            decoder_input_data[i, t, token] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, token] = 1.
    return encoder_input_data, decoder_input_data, decoder_target_data

In [None]:
# Fill the pre-allocated arrays with the one-hot encoded sentence pairs.
encoder_input_data, decoder_input_data, decoder_target_data = add_data_to_data_structures(input_dataset,
                                                                                          target_dataset,
                                                                                          encoder_input_data,
                                                                                          decoder_input_data,
                                                                                          decoder_target_data)

## Defining our model hyperparameters

In [None]:
batch_size = 256  # samples per gradient update, passed to model.fit
# NOTE(review): the sample translations at the end of this notebook are the
# same string for every input, which suggests two epochs is too few — confirm
# and consider training longer.
epochs = 2  # passes over the training data, passed to model.fit
latent_dim = 256  # number of units in both the encoder and decoder LSTM

## Encoder Definition

Think about how you would start an iteration of the LSTM. You have a cell state c and an input x, but you also need the previous output h (the hidden state), which is concatenated with x. The LSTM therefore has two state tensors that need to be initialized: c and h. Because h is the output of the previous step, it is passed in together with c. When you set `return_state=True`, both h and c are returned in addition to the output, so you receive 3 tensors: the output, h, and c.

In [None]:
# Encoder input: sequences of one-hot vectors of width num_encoder_tokens;
# the time dimension is left unspecified (None).
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer return its final states along with its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Only the two states are kept here; they become the decoder's initial state.
encoder_states = [state_h, state_c]

## Decoder Definition

With `return_sequences=True`, the LSTM outputs a sequence of the same length as its input (one vector per timestep); with `return_sequences=False`, the output is just one vector (the last timestep).

In [None]:
# Decoder input: sequences of one-hot vectors of width num_decoder_tokens.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# return_sequences=True yields an output per timestep; return_state=True also
# exposes the states, which are discarded here but used again for inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final [state_h, state_c].
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

## Building our model

In [None]:
# Training model: takes [encoder_inputs, decoder_inputs] and produces the
# per-timestep softmax output of decoder_dense.
model = Model(inputs=[encoder_inputs, decoder_inputs],
              outputs=decoder_outputs)

In [None]:
# categorical_crossentropy pairs with the one-hot decoder_target_data and the
# softmax output layer.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None, 27)]   0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None, 29)]   0           []                               
                                                                                                  
 lstm_2 (LSTM)                  [(None, 256),        290816      ['input_3[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                            

## Training the model

In [None]:
# The decoder is fed decoder_input_data and trained to predict
# decoder_target_data, which is the same sequence shifted one step earlier.
# NOTE(review): validation_split presumably holds out the trailing 20% of the
# arrays without shuffling; if the pairs file is ordered (e.g. by sentence
# length) the validation set will not be representative — confirm.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fecc2b1f880>

## Saving the model

In [None]:
# Persist the trained model to the mounted Drive folder (requires the Drive mount from the first cell).
model.save('/content/drive/MyDrive/Colab Notebooks/NLP/pr-10/neural_machine_translation_french_to_english.h5')

## Preparing our model for inferencing

In [None]:
# Inference-time encoder: maps an input sequence to the encoder's [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# Inputs through which the decoder's incoming states are supplied at each
# decoding step; each has width latent_dim.
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input_h = Input(shape=(latent_dim,))
# Order is [h, c], matching the order used for encoder_states.
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the decoder_lstm and decoder_dense layer objects from training, now
# driven by explicit state inputs. Note that this rebinds decoder_outputs,
# state_h and state_c, which earlier cells bound to training-graph tensors.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inference-time decoder: (decoder input, h, c) -> (token probabilities, new h, new c).
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)

In [None]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into text.

    The encoder states seed the decoder, which is then run one character at a
    time: each step feeds the previously chosen character (starting with the
    tab character) and picks the most probable next character.

    Args:
        input_seq: One-hot encoded input passed to encoder_model.predict.

    Returns:
        str: The decoded characters, including the terminating newline when
        one is produced. Decoding stops after a newline or once the text is
        longer than max_decoder_seq_length.
    """
    states = encoder_model.predict(input_seq)
    next_index = target_char_to_idx['\t']
    pieces = []

    while True:
        # One-hot vector for the character fed into this decoding step.
        step_input = np.zeros((1, 1, num_decoder_tokens))
        step_input[0, 0, next_index] = 1.

        output_tokens, h, c = decoder_model.predict([step_input] + states)

        next_index = np.argmax(output_tokens[0, -1, :])
        next_char = target_idx_to_char[next_index]
        pieces.append(next_char)

        if next_char == '\n' or len(pieces) > max_decoder_seq_length:
            break

        # Carry the updated states into the next step.
        states = [h, c]

    return ''.join(pieces)

## Let's translate some French sentences to English

In [None]:
def decode(seq_index):
    """Translate the training sample at seq_index and print input and output.

    Args:
        seq_index: Position of the sample in encoder_input_data / input_dataset.
    """
    # Slice (rather than index) so the leading batch dimension of size 1 is kept.
    sample = encoder_input_data[seq_index:seq_index + 1]
    translation = decode_sequence(sample)
    print('-')
    print('Input sentence:', input_dataset[seq_index])
    print('Decoded sentence:', translation)

In [None]:
decode(55000)

-
Input sentence: hier tait une bonne journe
Decoded sentence: i want tou  a o  o   o  o  o  o   o  o  o   o  o  o   o  o 


In [None]:
decode(10000)

-
Input sentence: jen ai ras le bol
Decoded sentence: i want tou  a o  o   o  o  o  o   o  o  o   o  o  o   o  o 


In [None]:
decode(200)

-
Input sentence: soyez calmes
Decoded sentence: i want tou  a o  o   o  o  o  o   o  o  o   o  o  o   o  o 


In [None]:
decode(3000)

-
Input sentence: je me sens affreusement mal
Decoded sentence: i want tou  a o  o   o  o  o  o   o  o  o   o  o  o   o  o 


In [None]:
decode(40884)

-
Input sentence: je pense que je peux arranger a
Decoded sentence: i want tou  a o  o   o  o  o  o   o  o  o   o  o  o   o  o 


## OCR

In [None]:
# Install the Tesseract OCR engine as a system package (pytesseract is a wrapper around it).
!sudo apt install tesseract-ocr

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 24 not upgraded.
Need to get 4,850 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1 [1,598 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr amd64 4.1.1-2build2 [262 kB]
Fetched 4,850 kB in 2s (3,146 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/Fro

In [None]:
# Install the Python wrapper for Tesseract. The version is not pinned here.
!pip install pytesseract

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [None]:
import io
import pytesseract
import shutil
import os
import random
try:
    from PIL import Image
except ImportError:
    import Image
from google.colab import files

# files.upload() opens a file picker and returns a dict of the uploaded files
# (file name -> file content as bytes).
uploaded = files.upload()
if not uploaded:
    raise ValueError('No image was uploaded.')

# OCR the bytes that were just uploaded instead of reopening a hard-coded
# path: a repeated upload is saved under a new name (e.g. "Capture (2).PNG"),
# so '/content/Capture.PNG' would silently point at an older file.
uploaded_bytes = next(iter(uploaded.values()))
# NOTE(review): no `lang` is passed, so Tesseract presumably uses its default
# language model; French accents may be lost in the result — confirm.
extractedInformation = pytesseract.image_to_string(Image.open(io.BytesIO(uploaded_bytes)))

Saving Capture.PNG to Capture (2).PNG


In [None]:
extractedInformation

'Bonjour a tous !\n\x0c'

In [None]:
# Pin Pillow to 9.0.0 (the install log shows it replacing 8.4.0).
# NOTE(review): PIL was already imported in an earlier cell, so the running
# kernel presumably keeps the previously loaded version until the runtime is
# restarted — confirm whether this pin is needed and, if so, move it before
# the first PIL import.
!pip install Pillow==9.0.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Pillow==9.0.0
  Downloading Pillow-9.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Pillow
  Attempting uninstall: Pillow
    Found existing installation: Pillow 8.4.0
    Uninstalling Pillow-8.4.0:
      Successfully uninstalled Pillow-8.4.0
Successfully installed Pillow-9.0.0


In [None]:
import cv2
from pytesseract import image_to_string

# Same OCR as above, but loading the image through OpenCV.
img_cv = cv2.imread(r'/content/Capture.PNG')
if img_cv is None:
    # cv2.imread signals a missing or unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    raise FileNotFoundError('Could not read image: /content/Capture.PNG')
# OpenCV loads images in BGR channel order, while pytesseract assumes RGB.
img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
print(image_to_string(img_rgb))

Bonjour a tous !

