In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding,LSTM,Dense,Dropout,Bidirectional)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# using nltk toolkit to make tasks like tokenization easier
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on: the tokenizer data behind
# word_tokenize and the English stopword list. Packages that are already present
# are reported as up-to-date rather than downloaded again.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from datasets import load_dataset
# Load the NewsQA dataset from the Hugging Face Hub repository "lucadiliello/newsqa".
ds = load_dataset("lucadiliello/newsqa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


data/train-00000-of-00001-ec54fbe500fc3b(…):   0%|          | 0.00/29.7M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Inspect the loaded dataset object before converting it to DataFrames.
print(ds)

In [None]:
# The Hugging Face dataset is read through its "train" and "validation" splits; each
# one is converted to a pandas DataFrame for easier inspection.
# No test split is used here, so evaluation runs on the validation split for now
# (a held-out test set would have to be carved out manually).
train_df, valid_df = (ds[split_name].to_pandas() for split_name in ("train", "validation"))

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the validation frame.
valid_df.head()

In [None]:
# Build the set of English stopwords. Nothing is removed here; clean_text() uses
# this set to filter tokens (a set keeps the membership test cheap).
stop_words = set(stopwords.words("english"))

In [None]:
#create labels
def is_answerable(ans):
    """Return 1 if `ans` carries a non-blank first answer string, else 0.

    Accepted shapes:
      * a dict with a "text" entry (the only shape handled originally),
      * a plain sequence of answer strings (list, tuple or NumPy array),
      * a single answer string.

    Anything else (None, a dict without "text", an empty sequence, a
    non-string first element) is treated as unanswerable.

    NOTE(review): the `answers` column produced by `to_pandas()` presumably
    holds a list/array of strings rather than a dict -- confirm with
    `train_df["answers"].iloc[0]`. With the dict-only check every such row
    was labelled 0.
    """
    # Unwrap the dict form; a missing "text" key yields None and falls through to 0.
    candidates = ans.get("text") if isinstance(ans, dict) else ans

    # A bare string is a single answer, not a sequence of characters.
    if isinstance(candidates, str):
        candidates = [candidates]

    if not isinstance(candidates, (list, tuple, np.ndarray)) or len(candidates) == 0:
        return 0

    # Only the first answer is inspected, matching the original dict behaviour.
    first = candidates[0]
    return 1 if isinstance(first, str) and first.strip() != "" else 0

# Attach the binary "answerable" label to both splits.
for frame in (train_df, valid_df):
    frame["answerable"] = frame["answers"].apply(is_answerable)


In [None]:
import re
def clean_text(text):
    """Normalise raw text into a space-joined string of lowercase, non-stopword tokens.

    Any input that is not a `str` yields the empty string.
    """
    if isinstance(text, str):
        without_tags = re.sub(r"<.*?>", " ", text)  # blank out <...> markup
        alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", without_tags)  # everything but letters/digits/whitespace becomes a space
        kept = (tok for tok in word_tokenize(alnum_only.lower()) if tok not in stop_words)
        return " ".join(kept)
    return ""

def extract_answer_text(answer):
    """Return the first answer stored in `answer`, or "" when there is none.

    Accepted shapes:
      * a dict with a "spans" entry (the only shape handled originally),
      * a plain sequence of answers (list, tuple or NumPy array),
      * a single answer string, returned unchanged.

    Anything else (None, a dict without "spans", an empty sequence) gives "".

    NOTE(review): the `answers` column produced by `to_pandas()` presumably
    holds a list/array of strings rather than a dict -- confirm with
    `train_df["answers"].iloc[0]`. With the dict-only check every such row
    produced an empty target.
    """
    # Unwrap the dict form; a missing "spans" key yields None and falls through to "".
    if isinstance(answer, dict):
        answer = answer.get("spans")

    # A bare string is already the answer text; indexing it would return one character.
    if isinstance(answer, str):
        return answer

    if isinstance(answer, (list, tuple, np.ndarray)) and len(answer) > 0:
        return answer[0]
    return ""

# Pull the first answer out of the nested "answers" field for both splits.
for frame in (train_df, valid_df):
    frame["answer_text"] = frame["answers"].apply(extract_answer_text)

# Normalise every text column in place, column by column (train first, then validation).
for col in ["context", "question", "answer_text"]:
    for frame in (train_df, valid_df):
        frame[col] = frame[col].apply(clean_text)

In [None]:
#tokenization
def _fit_and_pad(corpus):
    """Fit a fresh Tokenizer on `corpus`; return it with the raw and the padded id sequences."""
    fitted = Tokenizer(num_words=10000, oov_token="<OOV>")
    fitted.fit_on_texts(corpus)
    ids = fitted.texts_to_sequences(corpus)
    # pad / cut at the end so every row has exactly 50 ids
    padded = pad_sequences(ids, maxlen=50, padding='post', truncating='post')
    return fitted, ids, padded


# cleaned questions are the model inputs, cleaned answer text the targets
texts = train_df["question"].astype(str).tolist()
labels = train_df["answer_text"].astype(str).tolist()

# inputs and targets each get their own vocabulary
tokenizer, sequences, sequences_padded = _fit_and_pad(texts)
label_tokenizer, label_sequences, label_padded = _fit_and_pad(labels)


In [None]:
# Model inputs (question ids) and targets (answer ids) as NumPy arrays.
X, y = np.array(sequences_padded), np.array(label_padded)

# Sanity check: one shape per line, inputs first.
print(X.shape, y.shape, sep="\n")

In [None]:
#hyperparameter
# Vocabulary size used for the Embedding input and the softmax output layer; it
# matches the num_words=10000 cap given to both Tokenizers during tokenization.
vocab_size = 10000
# Length of each learned word-embedding vector.
embedding_dimensions = 128
# Sequence length fed to the model; it matches maxlen=50 used in pad_sequences.
max_length = 50

In [None]:
#declaring model
# Maps each of the max_length input token ids to a distribution over vocab_size
# output tokens: the Bi-LSTM returns the full sequence, so the Dropout and both
# Dense layers are applied at every timestep.
model = Sequential([
    # Explicit Input layer instead of Embedding's `input_length` argument, which
    # Keras 3 deprecates. Declaring the input shape also builds the model up front,
    # so model.summary() can report output shapes and parameter counts.
    tf.keras.Input(shape=(max_length,)),
    Embedding(vocab_size, embedding_dimensions, trainable=True),
    Bidirectional(LSTM(512, return_sequences=True)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

In [None]:
# The targets (label_padded) are integer token ids rather than one-hot vectors,
# hence the sparse variant of categorical crossentropy.
# NOTE(review): no masking is configured in the model, so padded positions
# presumably count towards both loss and accuracy -- confirm this is intended.
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
model.summary()
# NOTE(review): the original note here said the output shape is "not required as it
# is not classifying". The final layer is a softmax over vocab_size, so the model
# does emit per-token class probabilities -- confirm what the note was meant to say.

In [None]:
# The full dataset takes too long to train on, so fit on the first 5000 examples only.
X_try, y_try = X[:5000], y[:5000]
history = model.fit(
    x=X_try,
    y=y_try,
    batch_size=64,
    epochs=3,
    validation_split=0.1,
    verbose=1,
)