In [7]:
import re
from gensim.models import Word2Vec
import numpy as np
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import pathlib
import random
import string
import re
import numpy as np

import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

import keras
from keras import layers
from keras import ops
from keras.layers import TextVectorization
import string




text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"


with open(text_file) as f:
    lines = f.read().split("\n")[:-1]
english_words = []
spanish_words = []

def remove_punctuation(sentence):
    return sentence.translate(str.maketrans('', '', string.punctuation + '¿¡' ))

def preprocess_sentence(sentence):
    return remove_punctuation(sentence).lower()

for line in lines:
    eng, spa = line.split("\t")
    english_words.append(preprocess_sentence(eng))
    spanish_words.append(preprocess_sentence(spa))

def preprocess_data(text):
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().split()


embedding_dimension = 100

english_words = [preprocess_data(word) for word in english_words]
spanish_words = [preprocess_data(pair) for pair in spanish_words]
# Flatten the lists of tokens
english_tokens = [token for sentence in english_words for token in sentence]
spanish_tokens = [token for sentence in spanish_words for token in sentence]

print('English word tokens: ', (list(set(english_tokens)))[:15])
print('Spanish word token:: ',  (list(set(spanish_tokens)))[:15])

english_cbow_model = Word2Vec(sentences=english_words, vector_size=embedding_dimension, window=5, sg=0, min_count=1, epochs=50)
spanish_cbow_model = Word2Vec(sentences=spanish_words, vector_size=embedding_dimension, window=5, sg=0, min_count=1, epochs=50)

def map_words_to_vectors(sentences, cbow_model):
    vectors = []
    for sentence in sentences:
        vector = np.mean([cbow_model.wv[word] for word in sentence if word in cbow_model.wv], axis=0)
        vectors.append(vector)
    return vectors

english_vectors = map_words_to_vectors(english_words, english_cbow_model)
spanish_vectors = map_words_to_vectors(spanish_words, spanish_cbow_model)

# print("English Word to Index Dictionary:", english_word_to_index)
# print("Spanish Word to Index Dictionary:", spanish_word_to_index)
 
for i in range(5):
    print("Word:", english_words[i*4])
    print("Vector:", english_vectors[i])
    print()
print("First 10 Spanish Word Vectors:")
for i in range(10):
    print("Word:", spanish_words[i*4])
    print("Vector:", spanish_vectors[i])
    print()

English word tokens:  ['settled', 'ladies', 'szeged', 'insecure', 'caps', 'track', 'bounces', 'statistics', 'screw', 'rises', 'was', 'unpacked', 'overconfident', 'impressions', 'blond']
Spanish word token::  ['bravura', 'cerrara', 'disparéis', 'parase', 'superaron', 'perspicaz', 'tréboles', 'obsesionado', 'mantuve', 'endulzante', 'esté', 'comienzo', 'equivalía', 'bisabuelo', 'quemaron']
Word: ['go']
Vector: [-0.83642954 -2.205397   -1.9177172  -2.7171035  -2.904938    1.1876247
  0.9511666  -3.216733   -4.2320185   2.8309457   1.9119867   0.43954247
 -1.2282664   0.06773516  0.8859739  -1.240249   -1.1569941  -2.2372835
  0.8699846   1.5118818   0.45527455  5.882317    1.5936309   1.9781195
 -0.42226812  1.7007952  -2.7771466   0.02887512  2.7459216  -1.652935
  0.87697184  2.2297688  -0.14299941 -0.16993853 -1.2915593   0.3302774
 -1.2066706   1.7788837  -0.07550288  1.5663531  -1.8266319  -1.294214
 -0.60424817 -0.59220487 -2.3545537  -1.264424    2.3770056   3.328467
 -2.5673213  -0

In [8]:



max_eng_seq_length = max(len(sentence) for sentence in english_words)
max_spa_seq_length = max(len(sentence) for sentence in spanish_words)
english_vocab_size = len(set(english_tokens))
spanish_vocab_size = len(set(spanish_tokens))
print('Total english tokens: ', len(set(spanish_tokens)))
print('Spanish vocab size: ', spanish_vocab_size)
print("Max sequence length for English:", max_eng_seq_length)
print("Max sequence length for Spanish:", max_spa_seq_length)

Total english tokens:  26025
Spanish vocab size:  26025
Max sequence length for English: 47
Max sequence length for Spanish: 49


In [9]:
target_words = ["girl", "dress", "test", "walk", "food"]

for target_word in target_words:
    similar_words_english = english_cbow_model.wv.most_similar(target_word, topn=5)
    print(f"Similar words to {target_word} in English:", similar_words_english)
print('-----------------------------------------------------------')
target_words = ["bien", "hola", "marineros", "alimento", "ropa"]
for target_word in target_words:
    similar_words_spanish = spanish_cbow_model.wv.most_similar(target_word, topn=5)
    print(f"Similar words to {target_word} in English:", similar_words_spanish)




Similar words to girl in English: [('boy', 0.7343618273735046), ('woman', 0.6738501191139221), ('man', 0.5900071263313293), ('doll', 0.5262684226036072), ('child', 0.5247960686683655)]
Similar words to dress in English: [('skirt', 0.6811878681182861), ('sweater', 0.677156388759613), ('hat', 0.6525558829307556), ('shirt', 0.6519303917884827), ('jacket', 0.6336907744407654)]
Similar words to test in English: [('exam', 0.6777861714363098), ('examination', 0.618669331073761), ('subject', 0.45732221007347107), ('student', 0.4531395733356476), ('picnic', 0.449260413646698)]
Similar words to walk in English: [('swim', 0.5474985241889954), ('drive', 0.5462285876274109), ('run', 0.5331001281738281), ('taxi', 0.5215353965759277), ('shower', 0.5137079358100891)]
Similar words to food in English: [('fruit', 0.570177435874939), ('rice', 0.5175455212593079), ('vegetables', 0.5147916078567505), ('sushi', 0.5082530975341797), ('wine', 0.49756258726119995)]
---------------------------------------------

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import RMSprop



# Reshape the input data to include a third dimension
english_vectors_np = np.expand_dims(english_vectors, axis=-1)
spanish_vectors_np = np.expand_dims(spanish_vectors, axis=-1)
# Split the data into train and test sets
english_train, english_test, spanish_train, spanish_test = train_test_split(english_vectors_np, spanish_vectors_np, test_size=0.3)
latent_dim = 256

encoder_inputs = Input(shape=(100, 1))

# Encoder Layer
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs1, state_h1, state_c1 = encoder_lstm1(encoder_inputs)

encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm2(encoder_outputs1)
encoder_states = [state_h, state_c]

# Decoder Input
decoder_inputs = Input(shape=(100, 1))

# Decoder Layer
decoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs1, _, _ = decoder_lstm1(decoder_inputs, initial_state=encoder_states)

# Attention 
attention_layer = Attention()
attention_out = attention_layer([decoder_outputs1, encoder_outputs])
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs1, attention_out])

# Output Layer
decoder_dense = Dense(spanish_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
optimizer = RMSprop(learning_rate=0.1)
model.compile(optimizer=optimizer, loss='mse')

model.fit([english_train, spanish_train], spanish_train,
          batch_size=32,
          epochs=30,
          validation_split=0.1)

# loss = model.evaluate([english_test, spanish_test], spanish_test)
# print("Test loss:", loss)

Epoch 1/30
[1m  22/2343[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:30:13[0m 2s/step - loss: 0.3643