In [1]:
pip install torchtext==0.16.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install portalocker==2.8.2

Note: you may need to restart the kernel to use updated packages.


In [3]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [4]:
import torch
import torchtext

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer



import spacy

en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

import numpy as np
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split

2024-09-24 17:32:15.114331: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
data = torchtext.datasets.Multi30k(split = 'train')

In [6]:
de_text = []
en_text = []

for data_idx,data in enumerate(data):
  en = ' '.join(['<sos>',data[1],'<eos>'])
  de_text.append(data[0])
  en_text.append(en)

In [7]:
class Text_Tokenizer:
  def __init__(self,nlp,tokenizer):
    self.nlp = nlp
    self.tokenizer = tokenizer

  def _spacy_tokenizer(self,text):
    return [token.lemma_ for token in self.nlp(text)]

  def _preprocess(self,text):
      text = self._spacy_tokenizer(text)
      return '|'.join(text)

  def tokenize(self,doc,maxlen,fit = None):
    doc = [self._preprocess(text) for text in doc]
    if fit == None:
      self.tokenizer.fit_on_texts(doc)

    tokenized_text = self.tokenizer.texts_to_sequences(doc)

    return tokenized_text

In [8]:
class Encoder(tf.keras.Model):
  def __init__(self,encoder_vocab_size,embedding_size,hidden_size):
    super().__init__()
    self.encoder_vocab_size = encoder_vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size

    self.embedding = tf.keras.layers.Embedding(encoder_vocab_size,embedding_size,mask_zero = True)
    self.lstm = tf.keras.layers.LSTM(hidden_size,return_sequences = True,return_state = True)

  def call(self,X):
    #print(f'Encoder')
    X = self.embedding(X)
    #print(f'X: {X.shape}')
    encoder_hidden_states,encoder_hidden,encoder_cell = self.lstm(X)
    #print(f'Encoder hidden states: {encoder_hidden_states.shape}')
    #print(f'Encoder hidden: {encoder_hidden.shape}')
    #print(f'Encoder cell: {encoder_cell.shape}')

    return encoder_hidden_states,encoder_hidden,encoder_cell

In [9]:
class Decoder(tf.keras.Model):
  def __init__(self,decoder_vocab_size,embedding_size,hidden_size):
    super().__init__()
    self.decoder_vocab_size = decoder_vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size

    self.embedding = tf.keras.layers.Embedding(decoder_vocab_size,embedding_size,mask_zero = True)
    self.lstm = tf.keras.layers.LSTM(hidden_size,return_sequences = True,return_state = True)
    self.linear = tf.keras.layers.Dense(decoder_vocab_size,activation = 'softmax')

  def call(self,X,encoder_states):
    #print(f'\n\nDecoder')
    encoder_hidden_states,encoder_hidden,encoder_cell = encoder_states
    X = self.embedding(X)
    #print(f'X: {X.shape}')

    decoder_hidden_states,decoder_hidden,decoder_cell = self.lstm(X,(encoder_hidden,encoder_cell))
    self.decoder_hidden = decoder_hidden
    self.decoder_cell = decoder_cell
    #print(f'Decoder hidden states: {decoder_hidden_states.shape}')
    #print(f'Decoder hidden: {decoder_hidden.shape}')
    #print(f'Decoder cell: {decoder_cell.shape}')

    output = self.linear(decoder_hidden_states)
    #print(f'Output: {output.shape}')

    return output

In [10]:
class Seq2Seq(tf.keras.Model):
  def __init__(self,encoder,decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def call(self,data):
    de_text,en_text = data
    encoder_hidden_states,encoder_hidden,encoder_cell = self.encoder(de_text)
    output = self.decoder(en_text,(encoder_hidden_states,encoder_hidden,encoder_cell))

    return output

In [11]:
class Train_Model:
    def __init__(self,model,epochs,batch_size = 32):
        self.model = model
        self.epochs = epochs
        self.batch_size = batch_size
        
    def _accuracy(self,y,y_pred):
        from sklearn.metrics import accuracy_score as accuracy
        y = np.array(y).reshape(-1)
        y_pred = np.array(y_pred).reshape(-1)
        
        accaptable_idxs = np.argwhere(y != 0).reshape(-1)
        
        y = y[accaptable_idxs]
        y_pred = y_pred[accaptable_idxs]
        
        prediction_accuracy = accuracy(y,y_pred)
        
        return prediction_accuracy
        
         
    def fit(self,X,y):
        model = self.model
        model.compile(
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
            optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01),
        )

        print(f'Loss on Train Data:')
        model.fit(X,y,batch_size = self.batch_size,epochs = self.epochs)
        self.model = model
        train_prediction = tf.argmax(model(X),-1)
        
        train_accuracy = self._accuracy(y,train_prediction)
        print(f'Accuracy on Train Data: {train_accuracy}')
        
        return model
    
    def eval(self,X,y):
        model = self.model
        test_prediction = tf.argmax(model(X),-1)
        test_accuracy = self._accuracy(y,test_prediction)
        print(f'Accuracy on Test Data: {test_accuracy}')

In [12]:
maxlen = len(max(de_text,key = len))

de_tokenizer = Tokenizer(oov_token = '<unk>',split = '|')
en_tokenizer = Tokenizer(oov_token = '<unk>',split = '|')


de_text_tokenizer = Text_Tokenizer(de_nlp,de_tokenizer)
en_text_tokenizer = Text_Tokenizer(en_nlp,en_tokenizer)

de_text_tokenized = de_text_tokenizer.tokenize(de_text[:1000],maxlen)
en_text_tokenized = en_text_tokenizer.tokenize(en_text[:1000],maxlen)


In [13]:
en_text_input_tokenized = [s[:-1] for s in en_text_tokenized]
en_text_target_tokenized = [s[1:] for s in en_text_tokenized]

de_text_padded = tf.keras.utils.pad_sequences(de_text_tokenized,maxlen = maxlen,padding = 'post',truncating = 'post')
en_text_input_padded = tf.keras.utils.pad_sequences(en_text_input_tokenized,maxlen = maxlen,padding = 'post',truncating = 'post')
en_text_target_padded = tf.keras.utils.pad_sequences(en_text_target_tokenized,maxlen = maxlen,padding = 'post',truncating = 'post')


In [14]:
encoder_vocab_size = len(de_text_tokenizer.tokenizer.word_index) + 1
decoder_vocab_size = len(en_text_tokenizer.tokenizer.word_index) + 1
embedding_size = 128
hidden_size = 128

encoder = Encoder(encoder_vocab_size,embedding_size,hidden_size)
decoder = Decoder(decoder_vocab_size,embedding_size,hidden_size)

model = Seq2Seq(encoder,decoder)

model((de_text_padded,en_text_input_padded)).shape

de_text_padded.shape

(1000, 254)

In [17]:
X_train = [de_text_padded,en_text_input_padded]
y_train = en_text_target_padded

Trainer = Train_Model(model,epochs = 50)
Trainer.fit(X_train,y_train)
Trainer.eval(X_train,y_train)

Loss on Train Data:
Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 421ms/step - loss: 6.0244
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 438ms/step - loss: 4.5140
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 452ms/step - loss: 3.9853
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 557ms/step - loss: 3.6830
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 573ms/step - loss: 3.3910
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 538ms/step - loss: 3.2259
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 541ms/step - loss: 3.0393
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 518ms/step - loss: 2.8629
Epoch 9/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 541ms/step - loss: 2.6837
Epoch 10/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [None]:
def translate(sentence,source_tokenizer,encoder,target_tokenizer,decoder,max_translated_len = 30):
    input_seq = source_tokenizer.tokenize([sentence],maxlen)
    print(f'Input seq: {input_seq}')
    tokenized = source_tokenizer.tokenizer.sequences_to_texts(input_seq)
    
    input_seq = tf.keras.utils.pad_sequences(input_seq,maxlen = maxlen,padding = 'post')
    encoder_output,state_h,state_c = encoder.predict(input_seq)
    
    current_word = 'sos'
    decoded_sentence = []
    
    while len(decoded_sentence) < max_translated_len:
        target_seq = np.zeros((1,1))
        target_seq[0,0] = target_tokenizer.tokenizer.word_index[current_word]
    
        prediction = decoder(target_seq,[encoder_output,state_h,state_c])
        state_h = decoder.decoder_hidden
        state_c = decoder.decoder_cell
    
        current_token_idx = np.argmax(prediction[0])
    
        current_word = target_tokenizer.tokenizer.index_word[current_token_idx]
    
        if (current_word == 'eos'):
            break
    
        decoded_sentence.append(current_word)
    
    return tokenized[0],' '.join(decoded_sentence)

In [None]:
translate(de_text[5],de_text_tokenizer,model.encoder,en_text_tokenizer,model.decoder)

Input seq: [[2, 5, 4, 40, 29, 2, 100, 31, 3, 45, 5, 13, 25, 249]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step


('ein mann in grün halten ein gitarre während der anderer mann sein hemd ansehen',
 'a woman in a red shirt be under a blanket and sleep in a vehicle')

In [None]:
de_text_tokenizer.tokenizer.sequences_to_texts([[2, 5, 4, 40, 29, 2, 100, 31, 3, 45, 5, 13, 25, 249]])

['ein mann in grün halten ein gitarre während der anderer mann sein hemd ansehen']