<a href="https://colab.research.google.com/github/polugariteja/531-Training-3-2/blob/main/Transformer_TASK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import tensorflow as tf
import kagglehub

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Dense, LayerNormalization, Dropout,
    MultiHeadAttention, GlobalAveragePooling1D,
    Concatenate, Embedding
)
from tensorflow.keras.models import Model


In [3]:
path = kagglehub.dataset_download(
    "roblexnana/the-babi-tasks-for-nlp-qa-system"
)
print("Dataset downloaded at:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/roblexnana/the-babi-tasks-for-nlp-qa-system?dataset_version_number=1...


100%|██████████| 16.7M/16.7M [00:00<00:00, 129MB/s]

Extracting files...





Dataset downloaded at: /root/.cache/kagglehub/datasets/roblexnana/the-babi-tasks-for-nlp-qa-system/versions/1


In [4]:
def find_file(base_path, filename):
    for root, _, files in os.walk(base_path):
        if filename in files:
            return os.path.join(root, filename)
    raise FileNotFoundError(filename)

train_file = find_file(path, "qa1_single-supporting-fact_train.txt")
test_file  = find_file(path, "qa1_single-supporting-fact_test.txt")

print("Train file:", train_file)
print("Test file :", test_file)

Train file: /root/.cache/kagglehub/datasets/roblexnana/the-babi-tasks-for-nlp-qa-system/versions/1/tasks_1-20_v1-2/hn/qa1_single-supporting-fact_train.txt
Test file : /root/.cache/kagglehub/datasets/roblexnana/the-babi-tasks-for-nlp-qa-system/versions/1/tasks_1-20_v1-2/hn/qa1_single-supporting-fact_test.txt


In [5]:
def parse_babi(file_path):
    stories, questions, answers = [], [], []
    story = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            nid, text = line.split(" ", 1)

            if nid == "1":
                story = []

            if "\t" in text:
                q, a, _ = text.split("\t")
                stories.append(" ".join(story))
                questions.append(q)
                answers.append(a)
            else:
                story.append(text)

    return stories, questions, answers

train_stories, train_questions, train_answers = parse_babi(train_file)
test_stories, test_questions, test_answers = parse_babi(test_file)

print("Train samples:", len(train_stories))

Train samples: 1000


In [17]:

custom_filters = '!' + '"#$%&()*+,-./:;<=>?@[\]`{|}~\t\n'
tokenizer = Tokenizer(filters=custom_filters)

processed_train_answers_for_tokenizer = [ans.replace(' ', '_') for ans in train_answers]
processed_test_answers_for_tokenizer = [ans.replace(' ', '_') for ans in test_answers]

tokenizer.fit_on_texts(train_stories + train_questions + processed_train_answers_for_tokenizer + processed_test_answers_for_tokenizer)

vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)

max_story_len = max(len(s.split()) for s in train_stories)
max_question_len = max(len(q.split()) for q in train_questions)

def vectorize(stories, questions, answers):
    s = tokenizer.texts_to_sequences(stories)
    q = tokenizer.texts_to_sequences(questions)
    a = np.array([tokenizer.word_index[x.replace(' ', '_')] for x in answers])

    s = pad_sequences(s, maxlen=max_story_len)
    q = pad_sequences(q, maxlen=max_question_len)

    return s, q, a

x_story, x_question, y = vectorize(
    train_stories, train_questions, train_answers
)
x_story_test, x_question_test, y_test = vectorize(
    test_stories, test_questions, test_answers
)

Vocabulary size: 26


  custom_filters = '!' + '"#$%&()*+,-./:;<=>?@[\]`{|}~\t\n'


In [11]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = Embedding(vocab_size, embed_dim)
        self.pos_emb = Embedding(max_len, embed_dim)

    def call(self, x):
        positions = tf.range(start=0, limit=tf.shape(x)[1])
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [12]:
def transformer_encoder(x, head_size, num_heads, ff_dim):
    attn = MultiHeadAttention(
        num_heads=num_heads,
        key_dim=head_size
    )(x, x)

    x = LayerNormalization(epsilon=1e-6)(x + attn)

    ffn = Dense(ff_dim, activation="relu")(x)
    ffn = Dense(head_size)(ffn)

    return LayerNormalization(epsilon=1e-6)(x + ffn)


In [16]:
embed_dim = 64

story_input = Input(shape=(max_story_len,))
question_input = Input(shape=(max_question_len,))

story_embed = PositionalEmbedding(
    max_story_len, vocab_size, embed_dim
)(story_input)

question_embed = PositionalEmbedding(
    max_question_len, vocab_size, embed_dim
)(question_input)

ATTN_HEAD_SIZE = embed_dim
NUM_HEADS = 1
FFN_DIM = 64

story_encoded = transformer_encoder(
    story_embed, head_size=ATTN_HEAD_SIZE, num_heads=NUM_HEADS, ff_dim=FFN_DIM
)

question_encoded = transformer_encoder(
    question_embed, head_size=ATTN_HEAD_SIZE, num_heads=NUM_HEADS, ff_dim=FFN_DIM
)

qa_attention = MultiHeadAttention(
    num_heads=2, key_dim=32
)(
    query=question_encoded,
    value=story_encoded,
    key=story_encoded
)

story_vec = GlobalAveragePooling1D()(qa_attention)
question_vec = GlobalAveragePooling1D()(question_encoded)

merged = Concatenate()([story_vec, question_vec])

output = Dense(vocab_size, activation="softmax")(merged)

model = Model(
    inputs=[story_input, question_input],
    outputs=output
)

In [15]:
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

model.fit(
    [x_story, x_question],
    y,
    batch_size=32,
    epochs=30,
    validation_split=0.1
)


# ==============================
# STEP 9: EVALUATE
# ==============================
loss, acc = model.evaluate(
    [x_story_test, x_question_test],
    y_test
)

print("Final Transformer QA Accuracy:", acc)

Epoch 1/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - accuracy: 0.1629 - loss: 2.3345 - val_accuracy: 0.1700 - val_loss: 1.8112
Epoch 2/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.1751 - loss: 1.8321 - val_accuracy: 0.2400 - val_loss: 1.7766
Epoch 3/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.3292 - loss: 1.6494 - val_accuracy: 0.3400 - val_loss: 1.5277
Epoch 4/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.4424 - loss: 1.4389 - val_accuracy: 0.5200 - val_loss: 1.3974
Epoch 5/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.5187 - loss: 1.3256 - val_accuracy: 0.5000 - val_loss: 1.3380
Epoch 6/30
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step - accuracy: 0.4975 - loss: 1.2591 - val_accuracy: 0.5000 - val_loss: 1.2678
Epoch 7/30
[1m29/29[0m [32m━━━━