# Week 7
## 1. Importing the modules

In [46]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Input, Dense, Embedding, AdditiveAttention, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

## 2. Reading the file

In [47]:
df = pd.read_csv('fra.txt', delimiter='\t')
cols = df.columns

In [48]:
df.drop(cols[-1], axis=1, inplace=True)

In [49]:
df = df.sample( 10000)
df.columns = ['English', 'French']

## 3. Cleaning any weird things

In [50]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

df['English'] = df['English'].apply(clean_text)
df['French'] = df['French'].apply(clean_text)

## 4. Tokenizing, padding and sequencing

In [51]:
src_tokenizer = Tokenizer(filters='')
src_tokenizer.fit_on_texts(df['English'])
src_seq = src_tokenizer.texts_to_sequences(df['English'])
src_padded = pad_sequences(src_seq, padding='post')
src_len = len(src_tokenizer.word_index) + 1

tgt_tokenizer = Tokenizer(filters='')
tgt_tokenizer.fit_on_texts(df['French'])
tgt_seq = tgt_tokenizer.texts_to_sequences(df['French'])
tgt_padded = pad_sequences(tgt_seq, padding='post')
tgt_len = len(tgt_tokenizer.word_index) + 1

print(f"Source length is {src_len} and target length is {tgt_len}")

Source length is 4829 and target length is 7409


## 5. Train-test split

In [52]:
X_train, X_test, y_train, y_test = train_test_split(src_padded, tgt_padded, random_state=42, test_size=0.2)

In [54]:
max_src_len = X_train.shape[1]
max_tgt_len = y_train.shape[1]
print(f"Max_src_len : {max_src_len} and max_tgt_len : {max_tgt_len}")

Max_src_len : 27 and max_tgt_len : 28


## 6. Model arch

In [55]:
embedding_dim = 32

In [60]:
enc_input = Input(shape=(None, ))
enc_embedding = Embedding(src_len, embedding_dim)(enc_input)
enc_lstm = LSTM(8, return_sequences=True, return_state=True)
enc_outputs, h_state, c_state = enc_lstm(enc_embedding)

dec_input = Input(shape=(None, ))
dec_embedding = Embedding(tgt_len, embedding_dim)(dec_input)
dec_lstm = LSTM(8, return_sequences=True, return_state=True)
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=[h_state, c_state])

attention = AdditiveAttention()
attn_outputs = attention([dec_outputs, enc_outputs])
dec_input_conc = Concatenate(axis=-1)([dec_outputs, attn_outputs])

dec_dense = Dense(tgt_len, activation='softmax')
dec_outputs = dec_dense(dec_input_conc)

model = Model([enc_input, dec_input], dec_outputs)
model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
model.summary()

## 7. Model training

In [61]:
print(f"Shape of y_train is {y_train.shape}")

dec_input = y_train[:, :-1]
dec_target = y_train[:, 1:]
dec_test_input = y_test[:, :-1]
dec_test_target = y_test[:, 1:]

print(f"Shape of dec_input : {dec_input.shape} and dec_target : {dec_target.shape}")

Shape of y_train is (8000, 28)
Shape of dec_input : (8000, 27) and dec_target : (8000, 27)


In [62]:
history = model.fit(
    [X_train, dec_input],
    np.expand_dims(dec_target, -1),
    validation_split = 0.2,
    batch_size=16,
    epochs=10
)

Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 51ms/step - accuracy: 0.7982 - loss: 3.9799 - val_accuracy: 0.7976 - val_loss: 1.7059
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 50ms/step - accuracy: 0.8054 - loss: 1.5041 - val_accuracy: 0.7998 - val_loss: 1.5001
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 54ms/step - accuracy: 0.8088 - loss: 1.3975 - val_accuracy: 0.8034 - val_loss: 1.4540
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 47ms/step - accuracy: 0.8109 - loss: 1.3531 - val_accuracy: 0.8041 - val_loss: 1.4319
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 50ms/step - accuracy: 0.8122 - loss: 1.3234 - val_accuracy: 0.8050 - val_loss: 1.4184
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 58ms/step - accuracy: 0.8136 - loss: 1.2991 - val_accuracy: 0.8058 - val_loss: 1.4071
Epoch 7/10
[1m4

In [66]:
model.evaluate([X_test, dec_test_input], np.expand_dims(dec_test_target, -1))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.8137 - loss: 1.3567


[1.356658935546875, 0.8136851191520691]