In [1]:
import os
import json
import re
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, 
    LSTM, 
    Dense, 
    Embedding, 
    Dropout, 
    TimeDistributed, 
    Masking,
    LayerNormalization,
    Concatenate
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import (
    EarlyStopping, 
    ReduceLROnPlateau,
    ModelCheckpoint
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Initial simple LSTM

In [2]:
MAX_QUESTION_LENGTH = 50
MAX_ANSWER_LENGTH = 100
EMBEDDING_DIM = 256
LATENT_DIM = 512
BATCH_SIZE = 32
EPOCHS = 50
VALIDATION_SPLIT = 0.2
TEST_SPLIT = 0.1

## Data Preprocessing

In [3]:
def load_data(dataset_path):
    data = []
    for file in sorted(os.listdir(dataset_path)):
        if file.endswith(".json"):
            with open(os.path.join(dataset_path, file), "r", encoding="utf-8") as f:
                content = json.load(f)
                qa_pairs = content.get("qa_pairs", [])
                data.extend(qa_pairs)
    return pd.DataFrame(data)

In [4]:
def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r'[^a-zA-Z0-9.,!?\'"]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = f"<START> {text} <END>"
    return text

In [5]:
def preprocess_data(df):
    # Clean text
    df["question"] = df["question"].apply(clean_text)
    df["answer"] = df["answer"].apply(clean_text)
    
    # Create tokenizers
    question_tokenizer = Tokenizer(oov_token="<OOV>", filters='')
    answer_tokenizer = Tokenizer(oov_token="<OOV>", filters='')
    
    # Fit tokenizers
    question_tokenizer.fit_on_texts(df["question"])
    answer_tokenizer.fit_on_texts(df["answer"])
    
    # Convert to sequences
    question_sequences = question_tokenizer.texts_to_sequences(df["question"])
    answer_sequences = answer_tokenizer.texts_to_sequences(df["answer"])
    
    # Pad sequences
    question_padded = pad_sequences(question_sequences, maxlen=MAX_QUESTION_LENGTH, padding='post')
    answer_padded = pad_sequences(answer_sequences, maxlen=MAX_ANSWER_LENGTH, padding='post')
    
    return question_padded, answer_padded, question_tokenizer, answer_tokenizer

## Model building

In [18]:
def build_model(vocab_size_q, vocab_size_a):
    # Encoder
    encoder_inputs = Input(shape=(MAX_QUESTION_LENGTH,), name='encoder_input')
    encoder_embedding = Embedding(vocab_size_q, EMBEDDING_DIM, mask_zero=True, name='encoder_embedding')
    encoder_embed = encoder_embedding(encoder_inputs)
    encoder_dropout = Dropout(0.2, name='encoder_dropout')(encoder_embed)
    encoder_lstm = LSTM(LATENT_DIM, return_state=True, name='encoder_lstm')
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_dropout)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(MAX_ANSWER_LENGTH-1,), name='decoder_input')
    decoder_embedding = Embedding(vocab_size_a, EMBEDDING_DIM, mask_zero=True, name='decoder_embedding')
    decoder_embed = decoder_embedding(decoder_inputs)
    decoder_dropout1 = Dropout(0.2, name='decoder_dropout1')(decoder_embed)
    decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_dropout1, initial_state=encoder_states)
    decoder_dropout2 = Dropout(0.2, name='decoder_dropout2')(decoder_outputs)
    decoder_dense = Dense(vocab_size_a, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_dropout2)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(clipnorm=1.0),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    model.summary()
    
    return model

## Training and evaluation

In [23]:
def create_inference_models(model, vocab_size_a):
    # Get relevant layers from the training model
    encoder_lstm = None
    decoder_lstm = None
    decoder_dense = None
    encoder_embedding = None
    decoder_embedding = None
    
    # Find the LSTM and Dense layers
    for layer in model.layers:
        if isinstance(layer, LSTM):
            if encoder_lstm is None:
                encoder_lstm = layer
            else:
                decoder_lstm = layer
        elif isinstance(layer, Dense):
            decoder_dense = layer
        elif isinstance(layer, Embedding):
            if encoder_embedding is None:
                encoder_embedding = layer
            else:
                decoder_embedding = layer

    # Create encoder model
    encoder_inputs = Input(shape=(MAX_QUESTION_LENGTH,))
    x = encoder_embedding(encoder_inputs)
    x = Dropout(0.2)(x)
    _, state_h, state_c = encoder_lstm(x)
    encoder_model = Model(encoder_inputs, [state_h, state_c])

    # Create decoder model
    decoder_inputs = Input(shape=(1,))
    decoder_state_input_h = Input(shape=(LATENT_DIM,))
    decoder_state_input_c = Input(shape=(LATENT_DIM,))
    
    x = decoder_embedding(decoder_inputs)
    decoder_outputs, state_h, state_c = decoder_lstm(
        x, 
        initial_state=[decoder_state_input_h, decoder_state_input_c]
    )
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = Model(
        [decoder_inputs, decoder_state_input_h, decoder_state_input_c],
        [decoder_outputs, state_h, state_c]
    )

    return encoder_model, decoder_model

In [24]:
def evaluate_model(encoder_model, decoder_model, q_test, a_test, answer_tokenizer):
    smooth = SmoothingFunction().method1
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    
    total_bleu = 0
    total_rouge = 0
    
    for i in range(len(q_test)):
        # Encode input sequence
        states_value = encoder_model.predict(q_test[i:i+1], verbose=0)
        
        # Generate answer
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = answer_tokenizer.word_index.get('<start>', 0)
        
        decoded_tokens = []
        while len(decoded_tokens) < MAX_ANSWER_LENGTH:
            output_tokens, h, c = decoder_model.predict(
                [target_seq] + states_value,
                verbose=0
            )
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = answer_tokenizer.index_word.get(sampled_token_index, '')
            
            if sampled_word == '<end>' or sampled_word == '':
                break
                
            decoded_tokens.append(sampled_word)
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]
        
        # Get reference answer
        reference_tokens = [answer_tokenizer.index_word.get(idx, '') 
                          for idx in a_test[i] if idx != 0]
        reference_tokens = [token for token in reference_tokens 
                          if token not in ['<start>', '<end>']]
        
        # Calculate BLEU score
        bleu = sentence_bleu([reference_tokens], decoded_tokens, smoothing_function=smooth)
        total_bleu += bleu
        
        # Calculate ROUGE score
        rouge_scores = rouge_scorer_instance.score(
            ' '.join(reference_tokens),
            ' '.join(decoded_tokens)
        )
        total_rouge += rouge_scores['rougeL'].fmeasure
    
    return total_bleu/len(q_test), total_rouge/len(q_test)

In [None]:
df = load_data("dataset")
q_data, a_data, q_tokenizer, a_tokenizer = preprocess_data(df)
    
# Split data
indices = np.arange(len(q_data))
np.random.shuffle(indices)
q_data = q_data[indices]
a_data = a_data[indices]
    
num_val = int(len(q_data) * VALIDATION_SPLIT)
num_test = int(len(q_data) * TEST_SPLIT)
    
q_train = q_data[:-num_val-num_test]
a_train = a_data[:-num_val-num_test]
q_val = q_data[-num_val-num_test:-num_test]
a_val = a_data[-num_val-num_test:-num_test]
q_test = q_data[-num_test:]
a_test = a_data[-num_test:]
    
# Build and train model
model = build_model(len(q_tokenizer.word_index) + 1, len(a_tokenizer.word_index) + 1)
    
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
]
    
history = model.fit(
    [q_train, a_train[:, :-1]], a_train[:, 1:],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=([q_val, a_val[:, :-1]], a_val[:, 1:]),
    callbacks=callbacks
)
    
# Create inference models
encoder_model, decoder_model = create_inference_models(
    model, 
    len(a_tokenizer.word_index) + 1
)
    
# Evaluate
bleu_score, rouge_score = evaluate_model(
    encoder_model, 
    decoder_model, 
    q_test, 
    a_test, 
    a_tokenizer
)
    
print("\nEvaluation Results:")
print(f"BLEU Score: {bleu_score:.4f}")
print(f"ROUGE Score: {rouge_score:.4f}")

## Testing on Random questions

In [27]:
def ask_question(question, encoder_model, decoder_model, q_tokenizer, a_tokenizer, max_length=50):
    # Preprocess the question
    # Clean text (same as training)
    question = question.lower().strip()
    question = re.sub(r'[^a-zA-Z0-9.,!?\'"]', ' ', question)
    question = re.sub(r'\s+', ' ', question)
    question = f"<START> {question} <END>"
    
    # Tokenize and pad the question
    q_seq = q_tokenizer.texts_to_sequences([question])
    q_seq = pad_sequences(q_seq, maxlen=MAX_QUESTION_LENGTH, padding='post')
    
    # Encode the input sequence
    states_value = encoder_model.predict(q_seq, verbose=0)
    
    # Generate answer tokens
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = a_tokenizer.word_index.get('<start>', 0)
    
    # Collect decoded tokens
    decoded_tokens = []
    
    while len(decoded_tokens) < max_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = a_tokenizer.index_word.get(sampled_token_index, '')
        
        if sampled_word == '<end>' or sampled_word == '':
            break
            
        decoded_tokens.append(sampled_word)
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return ' '.join(decoded_tokens)


In [28]:
def test_random_question(df, encoder_model, decoder_model, q_tokenizer, a_tokenizer):
    # Select random question
    idx = np.random.randint(0, len(df))
    original_q = df.iloc[idx]['question']
    original_a = df.iloc[idx]['answer']
    
    # Get model's answer
    model_answer = ask_question(original_q, encoder_model, decoder_model, q_tokenizer, a_tokenizer)
    
    print("Question:", original_q)
    print("Original Answer:", original_a)
    print("Model Answer:", model_answer)
    print("-" * 50)

In [None]:
test_random_question(df, encoder_model, decoder_model, q_tokenizer, a_tokenizer)

In [None]:
print("Answer:", ask_question("What is your question?", encoder_model, decoder_model, q_tokenizer, a_tokenizer))

# Tuned LSTM

In [21]:
MAX_QUESTION_LENGTH = 50
MAX_ANSWER_LENGTH = 100
EMBEDDING_DIM = 256  # Increased from 256
LATENT_DIM = 512    # Increased from 512
BATCH_SIZE = 64     # Increased from 32
EPOCHS = 100
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
LEARNING_RATE = 3e-4
CLIP_NORM = 1.0

## Preprocessing

In [22]:
def augment_data(question, answer):
    augmented_pairs = []
    # Original pair
    augmented_pairs.append((question, answer))
    
    # Remove punctuation version
    q_no_punct = re.sub(r'[.,!?]', '', question)
    augmented_pairs.append((q_no_punct, answer))
    
    # Shuffle words slightly (maintaining rough meaning)
    words = question.split()
    if len(words) > 3:
        for i in range(min(3, len(words)-1)):
            shuffled = words.copy()
            shuffled[i], shuffled[i+1] = shuffled[i+1], shuffled[i]
            augmented_pairs.append((' '.join(shuffled), answer))
    
    return augmented_pairs

# Modify your data loading:
def load_data(dataset_path):
    data = []
    for file in sorted(os.listdir(dataset_path)):
        if file.endswith(".json"):
            with open(os.path.join(dataset_path, file), "r", encoding="utf-8") as f:
                content = json.load(f)
                qa_pairs = content.get("qa_pairs", [])
                for pair in qa_pairs:
                    augmented = augment_data(pair["question"], pair["answer"])
                    for q, a in augmented:
                        data.append({"question": q, "answer": a})
    return pd.DataFrame(data)

def improved_clean_text(text):
    """Enhanced text cleaning with better special character handling"""
    text = text.lower().strip()
    # Preserve more meaningful punctuation and symbols
    text = re.sub(r'[^\w\s.,!?\'"-:;$%#@&*()]', ' ', text)
    # Normalize numbers
    text = re.sub(r'\d+', 'NUM', text)
    # Normalize spaces
    text = re.sub(r'\s+', ' ', text)
    return f"<START> {text} <END>"

def preprocess_data(df):
    """Improved data preprocessing"""
    # Clean text
    df["question"] = df["question"].apply(improved_clean_text)
    df["answer"] = df["answer"].apply(improved_clean_text)
    
    # Create tokenizers with additional special tokens
    question_tokenizer = Tokenizer(oov_token="<UNK>", filters='')
    answer_tokenizer = Tokenizer(oov_token="<UNK>", filters='')
    
    # Add padding token
    question_tokenizer.word_index['<PAD>'] = 0
    answer_tokenizer.word_index['<PAD>'] = 0
    
    # Fit tokenizers
    question_tokenizer.fit_on_texts(df["question"])
    answer_tokenizer.fit_on_texts(df["answer"])
    
    # Convert to sequences
    question_sequences = question_tokenizer.texts_to_sequences(df["question"])
    answer_sequences = answer_tokenizer.texts_to_sequences(df["answer"])
    
    # Pad sequences
    question_padded = pad_sequences(question_sequences, maxlen=MAX_QUESTION_LENGTH, padding='post')
    answer_padded = pad_sequences(answer_sequences, maxlen=MAX_ANSWER_LENGTH, padding='post')
    
    return question_padded, answer_padded, question_tokenizer, answer_tokenizer

## Model building

In [23]:
def build_improved_model(vocab_size_q, vocab_size_a):
    """Build improved seq2seq model with optimized architecture"""
    # Encoder
    encoder_inputs = Input(shape=(MAX_QUESTION_LENGTH,), name='encoder_input')
    
    # Improved embedding with proper initialization
    encoder_embedding = Embedding(
        vocab_size_q, 
        EMBEDDING_DIM,
        mask_zero=True,
        embeddings_initializer='glorot_uniform',
        name='encoder_embedding'
    )
    encoder_embed = encoder_embedding(encoder_inputs)
    
    # Increase dropout rate
    encoder_dropout1 = Dropout(0.5)(encoder_embed)  # from 0.3 to 0.5
    
    # Add L2 regularization to LSTM layers
    encoder_lstm = LSTM(
        LATENT_DIM,
        return_state=True,
        kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        kernel_regularizer=tf.keras.regularizers.l2(1e-4),
        recurrent_regularizer=tf.keras.regularizers.l2(1e-4),
        name='encoder_lstm'
    )
    
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_dropout1)
    
    # Add layer normalization
    state_h = LayerNormalization()(state_h)
    state_c = LayerNormalization()(state_c)
    
    # Decoder
    decoder_inputs = Input(shape=(MAX_ANSWER_LENGTH-1,), name='decoder_input')
    
    decoder_embedding = Embedding(
        vocab_size_a,
        EMBEDDING_DIM,
        mask_zero=True,
        embeddings_initializer='glorot_uniform',
        name='decoder_embedding'
    )
    decoder_embed = decoder_embedding(decoder_inputs)
    
    # Add dropout
    decoder_dropout1 = Dropout(0.5)(decoder_embed)  # from 0.3 to 0.5
    
    decoder_lstm = LSTM(
        LATENT_DIM,
        return_sequences=True,
        return_state=True,
        kernel_initializer='glorot_uniform',
        recurrent_initializer='orthogonal',
        name='decoder_lstm'
    )
    
    decoder_outputs, _, _ = decoder_lstm(decoder_dropout1, initial_state=[state_h, state_c])
    
    # Add layer normalization
    decoder_outputs = LayerNormalization()(decoder_outputs)
    
    # Final dropout before dense layer
    decoder_dropout2 = Dropout(0.5)(decoder_outputs)  # from 0.3 to 0.5
    
    # Dense layer with proper initialization
    decoder_dense = Dense(
        vocab_size_a,
        activation='softmax',
        kernel_initializer='glorot_uniform',
        name='decoder_dense'
    )
    decoder_outputs = decoder_dense(decoder_dropout2)
    
    # Define model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    # Custom Adam optimizer with gradient clipping
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=LEARNING_RATE,
        clipnorm=CLIP_NORM,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7
    )

    def sparse_categorical_crossentropy_with_smoothing(y_true, y_pred, smoothing=0.1):
        num_classes = tf.cast(tf.shape(y_pred)[-1], tf.float32)
        y_true = tf.cast(y_true, tf.int32)
        y_true_one_hot = tf.one_hot(y_true, tf.shape(y_pred)[-1])
        
        # Apply label smoothing
        y_true_smooth = (1.0 - smoothing) * y_true_one_hot + smoothing / num_classes
        
        return tf.reduce_mean(
            tf.reduce_sum(-y_true_smooth * tf.math.log(y_pred + 1e-7), axis=-1)
        )

    # Then in the model.compile():
    model.compile(
        optimizer=optimizer,
        loss=sparse_categorical_crossentropy_with_smoothing,
        metrics=['accuracy']
    )

    print(model.summary())
    
    return model

In [24]:
def create_inference_models(model, vocab_size_a):
    """Create separate encoder and decoder models for inference"""
    # Get relevant layers
    encoder_lstm = None
    decoder_lstm = None
    decoder_dense = None
    encoder_embedding = None
    decoder_embedding = None
    
    for layer in model.layers:
        if isinstance(layer, LSTM):
            if encoder_lstm is None:
                encoder_lstm = layer
            else:
                decoder_lstm = layer
        elif isinstance(layer, Dense):
            decoder_dense = layer
        elif isinstance(layer, Embedding):
            if encoder_embedding is None:
                encoder_embedding = layer
            else:
                decoder_embedding = layer

    # Create encoder model
    encoder_inputs = Input(shape=(MAX_QUESTION_LENGTH,))
    x = encoder_embedding(encoder_inputs)
    x = Dropout(0.3)(x)
    _, state_h, state_c = encoder_lstm(x)
    state_h = LayerNormalization()(state_h)
    state_c = LayerNormalization()(state_c)
    encoder_model = Model(encoder_inputs, [state_h, state_c])

    # Create decoder model
    decoder_inputs = Input(shape=(1,))
    decoder_state_input_h = Input(shape=(LATENT_DIM,))
    decoder_state_input_c = Input(shape=(LATENT_DIM,))
    
    x = decoder_embedding(decoder_inputs)
    x = Dropout(0.3)(x)
    decoder_outputs, state_h, state_c = decoder_lstm(
        x, 
        initial_state=[decoder_state_input_h, decoder_state_input_c]
    )
    decoder_outputs = LayerNormalization()(decoder_outputs)
    decoder_outputs = Dropout(0.3)(decoder_outputs)
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = Model(
        [decoder_inputs, decoder_state_input_h, decoder_state_input_c],
        [decoder_outputs, state_h, state_c]
    )

    return encoder_model, decoder_model

## Model Evalutation

In [25]:
def evaluate_model(encoder_model, decoder_model, q_test, a_test, answer_tokenizer):
    """Evaluate the model using BLEU and ROUGE scores"""
    smooth = SmoothingFunction().method1
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    
    total_bleu = 0
    total_rouge = 0
    
    for i in range(len(q_test)):
        states_value = encoder_model.predict(q_test[i:i+1], verbose=0)
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = answer_tokenizer.word_index.get('<start>', 0)
        
        decoded_tokens = []
        while len(decoded_tokens) < MAX_ANSWER_LENGTH:
            output_tokens, h, c = decoder_model.predict(
                [target_seq] + states_value,
                verbose=0
            )
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word = answer_tokenizer.index_word.get(sampled_token_index, '')
            
            if sampled_word == '<end>' or sampled_word == '':
                break
                
            decoded_tokens.append(sampled_word)
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]
        
        reference_tokens = [answer_tokenizer.index_word.get(idx, '') 
                          for idx in a_test[i] if idx != 0]
        reference_tokens = [token for token in reference_tokens 
                          if token not in ['<start>', '<end>', '<pad>']]
        
        bleu = sentence_bleu([reference_tokens], decoded_tokens, smoothing_function=smooth)
        total_bleu += bleu
        
        rouge_scores = rouge_scorer_instance.score(
            ' '.join(reference_tokens),
            ' '.join(decoded_tokens)
        )
        total_rouge += rouge_scores['rougeL'].fmeasure
    
    return total_bleu/len(q_test), total_rouge/len(q_test)


In [26]:
def ask_question(question, encoder_model, decoder_model, q_tokenizer, a_tokenizer, max_length=50):
    """Ask a question to the trained model and get its response"""
    question = improved_clean_text(question)
    
    q_seq = q_tokenizer.texts_to_sequences([question])
    q_seq = pad_sequences(q_seq, maxlen=MAX_QUESTION_LENGTH, padding='post')
    
    states_value = encoder_model.predict(q_seq, verbose=0)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = a_tokenizer.word_index.get('<start>', 0)
    
    decoded_tokens = []
    
    while len(decoded_tokens) < max_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = a_tokenizer.index_word.get(sampled_token_index, '')
        
        if sampled_word == '<end>' or sampled_word == '':
            break
            
        decoded_tokens.append(sampled_word)
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return ' '.join(decoded_tokens)

## Training and performance

In [27]:
class WarmUpLearningRateScheduler(tf.keras.callbacks.Callback):
    def __init__(self, warmup_steps, initial_lr):
        super().__init__()
        self.warmup_steps = warmup_steps
        self.initial_lr = initial_lr
        self.step = 0
        
    def on_batch_begin(self, batch, logs=None):
        self.step += 1
        if self.step <= self.warmup_steps:
            lr = (self.step / self.warmup_steps) * self.initial_lr
            self.model.optimizer.learning_rate.assign(lr)

callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=20,
        restore_best_weights=True,
        min_delta=1e-4
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=12,
        min_lr=1e-6,
        verbose=1
    ),
    ModelCheckpoint(
        'best_model.keras',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    )
]

callbacks.append(WarmUpLearningRateScheduler(warmup_steps=100, initial_lr=5e-4))

In [28]:
df = load_data("dataset")
q_data, a_data, q_tokenizer, a_tokenizer = preprocess_data(df)
    
# Split data
indices = np.arange(len(q_data))
np.random.shuffle(indices)
q_data = q_data[indices]
a_data = a_data[indices]
    
num_val = int(len(q_data) * VALIDATION_SPLIT)
num_test = int(len(q_data) * TEST_SPLIT)
    
q_train = q_data[:-num_val-num_test]
a_train = a_data[:-num_val-num_test]
q_val = q_data[-num_val-num_test:-num_test]
a_val = a_data[-num_val-num_test:-num_test]
q_test = q_data[-num_test:]
a_test = a_data[-num_test:]
    
# Build and train model
model = build_improved_model(len(q_tokenizer.word_index) + 1, len(a_tokenizer.word_index) + 1)
    
history = model.fit(
    [q_train, a_train[:, :-1]],
    a_train[:, 1:],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=([q_val, a_val[:, :-1]], a_val[:, 1:]),
    callbacks=callbacks
)

# Create inference models
encoder_model, decoder_model = create_inference_models(model, len(a_tokenizer.word_index) + 1)
    
# Evaluate
bleu_score, rouge_score = evaluate_model(encoder_model, decoder_model, q_test, a_test, a_tokenizer)
    
print("\nEvaluation Results:")
print(f"BLEU Score: {bleu_score:.4f}")
print(f"ROUGE Score: {rouge_score:.4f}")

None
Epoch 1/100
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 761ms/step - accuracy: 0.2187 - loss: 7.2295      
Epoch 1: val_loss improved from inf to 5.25599, saving model to best_model.keras
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 788ms/step - accuracy: 0.2192 - loss: 7.2234 - val_accuracy: 0.3521 - val_loss: 5.2560 - learning_rate: 5.0000e-04
Epoch 2/100
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 749ms/step - accuracy: 0.3494 - loss: 5.1978  
Epoch 2: val_loss improved from 5.25599 to 4.56926, saving model to best_model.keras
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 770ms/step - accuracy: 0.3495 - loss: 5.1968 - val_accuracy: 0.4037 - val_loss: 4.5693 - learning_rate: 5.0000e-04
Epoch 3/100
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 686ms/step - accuracy: 0.3993 - loss: 4.5216  
Epoch 3: val_loss improved from 4.56926 to 4.01193, saving model to best_model.keras

In [35]:
print("Answer:", ask_question("what is blockchain ?", encoder_model, decoder_model, q_tokenizer, a_tokenizer))

Answer: blockchain is a decentralized digital currency that operates without a central authority or banks. it enables peer-to-peer transactions on a global scale through a network of computers running the bitcoin protocol. transactions are verified by network nodes through cryptography and recorded in a public distributed ledger called a blockchain. bitcoin


# Final Tuned LSTM

## Data Preprocessing

In [33]:
def advanced_augment_data(question, answer, augmentation_factor=3):
    augmented_pairs = [(question, answer)] 
    
    translator = Translator()
    
    def get_synonyms(word):
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                if lemma.name() != word and '_' not in lemma.name():
                    synonyms.append(lemma.name())
        return list(set(synonyms))
    
    def back_translate(text, intermediate_langs=['fr', 'de', 'es']):
        try:
            lang = random.choice(intermediate_langs)
            intermediate = translator.translate(text, dest=lang).text
            return translator.translate(intermediate, dest='en').text
        except:
            return text
    
    # 1. Synonym replacement
    words = question.split()
    for _ in range(min(3, len(words))):
        new_words = words.copy()
        idx = random.randint(0, len(words)-1)
        synonyms = get_synonyms(words[idx])
        if synonyms:
            new_words[idx] = random.choice(synonyms)
            augmented_pairs.append((' '.join(new_words), answer))
    
    # 2. Back translation
    if len(question.split()) > 3:  # Only for longer questions
        translated = back_translate(question)
        if translated != question:
            augmented_pairs.append((translated, answer))
    
    # 3. Random deletion (with probability)
    if len(words) > 4:
        new_words = [word for word in words if random.random() > 0.2]
        if new_words:
            augmented_pairs.append((' '.join(new_words), answer))
    
    return augmented_pairs[:augmentation_factor] 