In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re 

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from tensorflow.keras.layers import LSTM,Dense,Embedding,Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from sklearn.model_selection import train_test_split





In [3]:
data=pd.read_csv("https://raw.githubusercontent.com/SayamAlt/English-to-French-Language-Translation-using-Seq2Seq-Modeling/main/eng-french.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   English words/sentences  175621 non-null  object
 1   French words/sentences   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [4]:
wordnet= WordNetLemmatizer()

corpus=[]

for i in range(0,len(data)):
    review=re.sub('[^a-zA-Z]',' ',data['English words/sentences'][i])
    review=review.lower()
    review=review.split()
    review=[wordnet.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)

In [5]:
corpus

['hi',
 'run',
 'run',
 '',
 'wow',
 'fire',
 'help',
 'jump',
 'stop',
 'stop',
 'stop',
 'wait',
 'wait',
 'go',
 'go',
 'go',
 'hello',
 'hello',
 'see',
 'try',
 '',
 '',
 '',
 'oh',
 'attack',
 'attack',
 'cheer',
 'cheer',
 'cheer',
 'cheer',
 'get',
 'go',
 'go',
 'go',
 'got',
 'got',
 'got',
 'got',
 'got',
 'hop',
 'hop',
 'hug',
 'hug',
 'fell',
 'fell',
 'know',
 'left',
 'left',
 'lied',
 'lost',
 'paid',
 '',
 'ok',
 'ok',
 'listen',
 'way',
 'way',
 'way',
 'way',
 'way',
 'way',
 'way',
 'way',
 'way',
 'really',
 'really',
 'really',
 'thanks',
 'try',
 '',
 '',
 '',
 '',
 'ask tom',
 'awesome',
 'calm',
 'calm',
 'calm',
 'cool',
 'fair',
 'fair',
 'fair',
 'fair',
 'fair',
 'fair',
 'kind',
 'nice',
 'nice',
 'nice',
 'nice',
 'nice',
 'nice',
 'beat',
 'call',
 'call',
 'call u',
 'call u',
 'come',
 'come',
 'come',
 'come',
 'come',
 'come',
 'come',
 'come',
 'drop',
 'drop',
 'drop',
 'drop',
 'get tom',
 'get',
 'get',
 'get',
 'get',
 'get',
 'go away',
 'go a

### Onehot Representation

    If we set voc_size=10000, it means that you are limiting your vocabulary to the 10,000 most frequent words in our dataset

In [9]:
voc_size=10000

onehot_rep=[one_hot(word,voc_size) for word in corpus]
print(onehot_rep[:10])

[[4029], [2714], [2714], [], [7840], [4398], [6163], [7444], [9218], [9218]]


### Padding Sequence

In [12]:
sent_length=20
embedded_docs=pad_sequences(onehot_rep,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...    0    0 4029]
 [   0    0    0 ...    0    0 2714]
 [   0    0    0 ...    0    0 2714]
 ...
 [   0    0    0 ... 6280  102 4540]
 [4612 4694 5087 ...  800 8179 7417]
 [6030   53 9342 ... 9342   98 6540]]


In [13]:
len(embedded_docs)

175621

In [14]:
print(embedded_docs[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 4029]


In [15]:
num_features=20

### Encoders and Decoders Model 

    The dimensionality of the embedding space. This is the size of the dense vector to represent each word. For example, if you set embedding_vector_features to 100, each word in your vocabulary will be represented as a 100-dimensional vector.

In [16]:
# Encoder model
embeding_vector_features=40

encoder_model=Sequential()

encoder_model.add(Embedding(voc_size,embeding_vector_features,input_length=sent_length))
encoder_model.add(LSTM(100))




In [18]:
#Decoder Model 

Decoder_model=Sequential()

Decoder_model.add(Embedding(voc_size,embeding_vector_features,input_length=sent_length))
Decoder_model.add(LSTM(100))
Decoder_model.add(Dense(voc_size,activation='softmax'))


In [23]:
encoder_decoder_model=Sequential(
            [encoder_model,
            Decoder_model])
encoder_decoder_model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

print(encoder_decoder_model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 100)               456400    
                                                                 
 sequential_2 (Sequential)   (None, 10000)             1466400   
                                                                 
Total params: 1922800 (7.33 MB)
Trainable params: 1922800 (7.33 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [24]:
y=data['French words/sentences']

X_final=np.array(embedded_docs)
y_final=np.array(y)

X_final.shape,y_final.shape

((175621, 20), (175621,))

In [25]:
X_train,X_test,y_train,y_test=train_test_split(X_final,y_final,random_state=0,test_size=0.3)

In [26]:
encoder_decoder_model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10)

Epoch 1/10


ValueError: in user code:

    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\z004vc9h\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_4' (type Sequential).
    
    Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 20), found shape=(None, 100)
    
    Call arguments received by layer 'sequential_4' (type Sequential):
      • inputs=tf.Tensor(shape=(None, 20), dtype=int32)
      • training=True
      • mask=None
