<a href="https://colab.research.google.com/github/siddugoud6966/NLP_2024-2025/blob/main/NLP_ASS_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Lambda, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K
import numpy as np

# Multilingual dataset provided
data = {
    'English': [
        "What is your name?", "Where do you live?", "How old are you?",
        "What do you do for a living?", "What is your favorite color?"
    ],
    'Telugu': [
        "నీ పేరు ఏమిటి?", "నువ్వు ఎక్కడ నివసిస్తున్నావు?", "మీరు ఎంత వయస్సు ఉన్నారు?",
        "మీరు జీవించడానికి ఏమి చేస్తున్నారు?", "మీ ఇష్ట రంగు ఏది?"
    ],
    'Hindi': [
        "आपका नाम क्या है?", "आप कहाँ रहते हैं?", "आपकी उम्र क्या है?",
        "आप क्या करते हैं?", "आपका पसंदीदा रंग क्या है?"
    ],
    'Tamil': [
        "உங்கள் பெயர் என்ன?", "நீங்கள் எங்கு வாழ்கிறீர்கள்?", "உங்கள் வயது என்ன?",
        "நீங்கள் என்ன வேலை செய்கிறீர்கள்?", "உங்கள் விருப்பமான நிறம் என்ன?"
    ]
}

# Combine all sentences
all_sentences = []
for lang in data:
    all_sentences.extend(data[lang])

# Tokenize and preprocess
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(all_sentences)
sequences = tokenizer.texts_to_sequences(all_sentences)
word_index = tokenizer.word_index

# Pad sequences to make them uniform in length
max_sequence_length = max(len(seq) for seq in sequences)
padded_data = pad_sequences(sequences, maxlen=max_sequence_length)
vocab_size = len(word_index) + 1

# Parameters
embedding_dim = 64
latent_dim = 16


# **Define the VAE Model**

In [2]:
 #Encoder Model
encoder_inputs = Input(shape=(max_sequence_length,))
x = Embedding(vocab_size, embedding_dim, input_length=max_sequence_length)(encoder_inputs)
x = Bidirectional(LSTM(64, return_sequences=False))(x)
z_mean = Dense(latent_dim)(x)
z_log_sigma = Dense(latent_dim)(x)

# Reparameterization trick
def sampling(args):
    z_mean, z_log_sigma = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_sigma) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_sigma])

# Define the encoder model
encoder = Model(encoder_inputs, [z_mean, z_log_sigma, z], name='encoder')
encoder.summary()




# VAE Decoder

In [3]:
from tensorflow.keras.layers import Reshape, RepeatVector

# Decoder Model
decoder_inputs = Input(shape=(latent_dim,))
x = Dense(64, activation='relu')(decoder_inputs)

# Ensure that the Dense layer output matches max_sequence_length * embedding_dim
x = Dense(max_sequence_length * embedding_dim, activation='relu')(x)

# Reshape the output to be compatible with LSTM input
x = Reshape((max_sequence_length, embedding_dim))(x)
x = LSTM(64, return_sequences=True)(x)
x = Dense(vocab_size, activation='softmax')(x)

decoder = Model(decoder_inputs, x, name='decoder')
decoder.summary()


# **VAE Loss Function**

In [4]:
# VAE Loss Function (KL Divergence + Reconstruction Loss)
def vae_loss(y_true, y_pred):
    # Reconstruction loss
    recon_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    recon_loss = K.mean(recon_loss)

    # KL Divergence
    kl_loss = 1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma)
    kl_loss = K.mean(kl_loss) * -0.5

    return recon_loss + kl_loss




#  **Compile and Train the VAE**



In [5]:
from tensorflow.keras.layers import Input, LSTM, Dense, Reshape, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
# Assuming you have defined encoder, decoder, z_mean, z_log_sigma, latent_dim, max_sequence_length, embedding_dim, vocab_size
# ... (Your encoder and decoder definitions from previous cells) ...

# Combine Encoder and Decoder

In [6]:
# Check the shape of padded_data
print("Shape of padded_data:", padded_data.shape)

# Train the model
try:
    vae.fit(padded_data, padded_data, epochs=20, batch_size=8)
except Exception as e:
    print("Error during training:", e)

Shape of padded_data: (20, 7)
Error during training: name 'vae' is not defined


# **Step 3: Generating New Text with the Trained VAE**

In [7]:
# Generate new text
def generate_text(num_samples=5):
    for _ in range(num_samples):
        z_sample = np.random.normal(size=(1, latent_dim))
        generated_sequence = decoder.predict(z_sample)
        generated_indices = np.argmax(generated_sequence[0], axis=-1)
        generated_text = tokenizer.sequences_to_texts([generated_indices])
        print("Generated Text:", generated_text[0])

# Generate new samples
generate_text(5)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step
Generated Text: करते do வாழ்கிறீர்கள் நிறம் நிறம் color color
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Generated Text: color color color color color color நிறம்
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Generated Text: color where color நிறம் color color color
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Generated Text: are color color color color color color
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Generated Text: are you you you where color நிறம்
