In [1]:
import pathlib
import random
import string
import re
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Bidirectional,GRU,LSTM,Embedding
from tensorflow.keras.layers import Dense,MultiHeadAttention,LayerNormalization,Embedding,Dropout,Layer
from tensorflow.keras import Sequential,Input
from tensorflow.keras.callbacks import ModelCheckpoint
import os 
from nltk.translate.bleu_score import sentence_bleu

In [2]:
    # %cd C:/Users/Lenovo/Desktop/CODING/major_project/parallel-corpus/sanskrit-english
    # %ls

In [3]:
file_directory = r'C://Users//sampa//Downloads/sanskrit-english-20240406T035746Z-001/sanskrit-english'

import os

# Function to read lines from a file and remove newline characters
def read_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]

# Paths to English and Sanskrit files
english_files = [
    file_directory  +'//bhagvadgita_english.txt',
    file_directory  +'//bible_english.txt',
    file_directory  +'//manu_english.txt',
    file_directory  +'//ramayan_english.txt',
    file_directory  +'//rigveda_english.txt'
]

sanskrit_files = [
    file_directory  +'//bhagvadgita_sanskrit.txt',
    file_directory  +'//bible_sanskrit.txt',
    file_directory  +'//manu_sanskrit.txt',
    file_directory  + '//ramayan_sanskrit.txt',
    file_directory  +'//rigveda_sanskrit.txt'
]

# Read English and Sanskrit lines from respective files
eng_lines = []
sanskrit_lines = []

for eng_file, sanskrit_file in zip(english_files, sanskrit_files):
    eng_lines.extend(read_lines(os.path.join(file_directory, eng_file)))
    sanskrit_lines.extend(read_lines(os.path.join(file_directory, sanskrit_file)))

# Create pairs of English and Sanskrit sentences
sentence_pairs = []

for eng_sentence, sanskrit_sentence in zip(eng_lines, sanskrit_lines):
    sentence_pairs.append((eng_sentence, '[start] ' + sanskrit_sentence + ' [end]'))

print("Number of sentence pairs created:", len(sentence_pairs))


Number of sentence pairs created: 34374


In [4]:
import random
print(random.choice(sentence_pairs))

('lightly with piercing ends as \x92twere two ranks of heroes ranged for fight.', '[start] नेमधितान पौंस्या वर्थेव विष्टान्ता| [end]')


In [5]:
import pandas as pd

# Assuming 'sentence_pairs' contains pairs of English and Sanskrit sentences
# Split the pairs into two lists: one for English and one for Sanskrit
english_sentences = [pair[0] for pair in sentence_pairs]
sanskrit_sentences = [pair[1] for pair in sentence_pairs]

# Create a DataFrame with these lists
df = pd.DataFrame({
    'English': english_sentences,
    'Sanskrit': sanskrit_sentences
})
# Remove '[start]' and '[end]' tokens from the 'Sanskrit' column
df['Sanskrit'] = df['Sanskrit'].str.replace('\[start\] ', '')  # Remove '[start]' and the following space
df['Sanskrit'] = df['Sanskrit'].str.replace(' \[end\]', '')  # Remove '[end]' and the preceding space

# Display the first few rows of the DataFrame to verify the changes
print(df.head())



                                             English  \
0  Dhrtarastra said O Sanjaya ! What did my men a...   
1  Sanjaya said Seeing the army of the sons of Pa...   
2  O teacher ! Behold this mighty army of the son...   
3  The heroes and mighty archers, comparable in w...   
4  Dhrstaketu, Cekitana and the valourous king of...   

                                            Sanskrit  
0  [start] धृतराष्ट्र उवाच धर्मक्षेत्रे कुरुक्षेत...  
1  [start] सञ्जय उवाच दृष्ट्वा तु पाण्डवानीकं व्य...  
2  [start] पश्यैतां पाण्डुपुत्राणामाचार्य महतीं च...  
3  [start] अत्र शूरा महेष्वासा भीमार्जुनसमा युधि।...  
4  [start] धृष्टकेतुश्चेकितानः काशिराजश्च वीर्यवा...  


In [6]:
df

Unnamed: 0,English,Sanskrit
0,Dhrtarastra said O Sanjaya ! What did my men a...,[start] धृतराष्ट्र उवाच धर्मक्षेत्रे कुरुक्षेत...
1,Sanjaya said Seeing the army of the sons of Pa...,[start] सञ्जय उवाच दृष्ट्वा तु पाण्डवानीकं व्य...
2,O teacher ! Behold this mighty army of the son...,[start] पश्यैतां पाण्डुपुत्राणामाचार्य महतीं च...
3,"The heroes and mighty archers, comparable in w...",[start] अत्र शूरा महेष्वासा भीमार्जुनसमा युधि।...
4,"Dhrstaketu, Cekitana and the valourous king of...",[start] धृष्टकेतुश्चेकितानः काशिराजश्च वीर्यवा...
...,...,...
34369,and sought the somapourers home.,[start] अगछःसोमिनो गर्हम| [end]
34370,venya that mortal man hast thou for Āstrabudhn...,[start] तवं तयमिन्द्र मर्त्यमास्त्रबुध्नाय वेन...
34371,o indra many a time set free.,[start] मुहुःश्रथ्ना मनस्यवे| [end]
34372,bring indra to the east again that sun who now...,[start] तवं तयमिन्द्र सूर्यं पश्चा सन्तं पुरस ...


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize Sanskrit sentences
sanskrit_tokenizer = Tokenizer(char_level=False)
sanskrit_tokenizer.fit_on_texts(df['Sanskrit'])
sanskrit_sequences = sanskrit_tokenizer.texts_to_sequences(df['Sanskrit'])

# Tokenize English sentences
english_tokenizer = Tokenizer(char_level=False, filters='')
english_tokenizer.fit_on_texts(df['English'])
english_sequences = english_tokenizer.texts_to_sequences(df['English'])

# Vocabulary sizes
sanskrit_vocab_size = len(sanskrit_tokenizer.word_index) + 1
english_vocab_size = len(english_tokenizer.word_index) + 1

# Padding sequences
sanskrit_padded = pad_sequences(sanskrit_sequences, padding='post')
english_padded = pad_sequences(english_sequences, padding='post')

# Decoder Input Data
decoder_input_data = english_padded[:, :-1]

# Decoder Target Data
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]


In [8]:
import numpy as np

def one_hot_encode(sequences, num_classes, dtype=np.float32):
    # ... (rest of the function)
    one_hot_output = np.zeros((len(sequences), max(len(sequence) for sequence in sequences), num_classes), dtype=dtype)
    # ... (rest of the function)


In [9]:
def one_hot_encode_sparse(sequences, num_classes, dtype=np.float32):
    rows = []
    cols = []
    data = []
    max_sequence_length = max(len(sequence) for sequence in sequences)
    
    for i, sequence in enumerate(sequences):
        for t, word_index in enumerate(sequence):
            if word_index > 0:  # Skip 0 padding
                rows.append(i)
                cols.append(t)
                data.append(1.0)
    
    one_hot_output = csr_matrix((data, (rows, cols)), shape=(len(sequences), max_sequence_length, num_classes), dtype=dtype)
    return one_hot_output
decoder_target_one_hot = one_hot_encode(decoder_target_data, english_vocab_size)


MemoryError: Unable to allocate 1.24 TiB for an array with shape (34374, 226, 43947) and data type float32

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model

latent_dim = 256  # Dimensionality of the encoding space

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=sanskrit_vocab_size, output_dim=latent_dim)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding(encoder_inputs))
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=latent_dim)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding(decoder_inputs), initial_state=encoder_states)
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    32082432    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    11250432    ['input_2[0][0]']                
                                                                                              

In [None]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Switch to sparse_categorical_crossentropy and provide integer targets
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# Make sure your target data is integer-coded, not one-hot encoded
decoder_target_data = np.argmax(decoder_target_one_hot, axis=-1)

model.fit([sanskrit_padded, decoder_input_data], decoder_target_data,
          batch_size=64,
          epochs=100,
          validation_split=0.2)


Epoch 1/100


ResourceExhaustedError: Graph execution error:

Detected at node 'model/dense/Tensordot/MatMul' defined at (most recent call last):
    File "c:\Users\sampa\anaconda3\envs\ml\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "c:\Users\sampa\anaconda3\envs\ml\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\sampa\anaconda3\envs\ml\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "c:\Users\sampa\anaconda3\envs\ml\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "c:\Users\sampa\anaconda3\envs\ml\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\ipkernel.py", line 359, in execute_request
      await super().execute_request(stream, ident, parent)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\ipkernel.py", line 446, in do_execute
      res = shell.run_cell(
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 3048, in run_cell
      result = self._run_cell(
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 3103, in _run_cell
      result = runner(coro)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 3308, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 3490, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\sampa\AppData\Local\Temp\ipykernel_24880\1564568496.py", line 7, in <module>
      model.fit([sanskrit_padded, decoder_input_data], decoder_target_data,
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\sampa\anaconda3\envs\ml\lib\site-packages\keras\layers\core\dense.py", line 244, in call
      outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
Node: 'model/dense/Tensordot/MatMul'
OOM when allocating tensor with shape[14464,43947] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/dense/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_5788]

In [None]:
# model.fit([sanskrit_padded, decoder_input_data], decoder_target_one_hot,
#           batch_size=32,
#           epochs=100,  # Use an appropriate number of epochs
#           validation_split=0.2)


In [None]:
encoder_model = Model(encoder_inputs, encoder_states)


In [None]:
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)
