In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np

print("Loading 'ag_news_subset' dataset...")

# Request the two splits together with the dataset metadata.
#   as_supervised=True -> each example is a (description_text, label) pair
#   with_info=True     -> a second return value carrying dataset metadata
loaded_splits, metadata = tfds.load(
    name='ag_news_subset',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True,
)

# The splits come back in the same order they were requested.
raw_train, raw_test = loaded_splits

print("Dataset loaded successfully.")

In [None]:
# Human-readable class names are stored on the 'label' feature of the metadata.
label_feature = metadata.features['label']
class_names = label_feature.names
print("Class names:", class_names)
# You should see: ['World', 'Sports', 'Business', 'Sci/Tech']

# Sanity check: print a single (text, label) pair from the training split.
print("\nHere's an example article:")
for text_tensor, label_tensor in raw_train.take(1):
    # The text is decoded from UTF-8 bytes into a Python string for display.
    review_text = text_tensor.numpy().decode('utf-8')

    # The label is used as an index into class_names.
    label_index = label_tensor.numpy()
    review_label = class_names[label_index]

    print(f"LABEL: {review_label}")
    # Only the first 500 characters are shown to keep the output short.
    print(f"ARTICLE: {review_text[:500]}...")

In [None]:
VOCAB_SIZE = 10000          # upper bound on vocabulary size (max_tokens)
MAX_SEQUENCE_LENGTH = 100   # fixed output length of every vectorized text
ADAPT_BATCH_SIZE = 1024     # batch size used only while scanning text for the vocabulary

# Create the vectorization layer: raw strings -> fixed-length integer token ids
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH
)

# Adapt the layer to the training text
print("Building the vocabulary...")
# Drop the labels: adapt() only needs the raw text.
train_text = raw_train.map(lambda text, label: text)
# Feed adapt() batches instead of single examples so the vocabulary pass
# does not run one step per article.
# NOTE(review): the vocabulary is built from all of raw_train, which includes
# the examples a later cell takes as the validation split - confirm this is
# acceptable for your evaluation protocol.
vectorize_layer.adapt(train_text.batch(ADAPT_BATCH_SIZE))
print("Vocabulary built.")

In [None]:
# This dictionary will hold our final datasets
datasets = {}

VAL_FRACTION = 0.2           # share of the training split held out for validation
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 10000  # number of examples held in the shuffle buffer
SHUFFLE_SEED = 42            # fixed so the shuffle order is repeatable

# --- Create a validation split (VAL_FRACTION of train data) ---
num_train = metadata.splits['train'].num_examples
num_val = int(num_train * VAL_FRACTION)

# The first num_val examples become validation data; the rest stay as training data.
val_set = raw_train.take(num_val)
train_set = raw_train.skip(num_val)


# --- Create the preprocessing function ---
def vectorize_text(text, label):
    """Replace raw text with integer token ids; the label passes through unchanged.

    Args:
        text: String tensor of raw text (a single example or a batch).
        label: Label tensor belonging to `text`.

    Returns:
        A `(token_ids, label)` tuple, where `token_ids` is the output of
        `vectorize_layer` applied to `text`.
    """
    text = vectorize_layer(text)
    return text, label


def _batch_and_vectorize(dataset, shuffle=False):
    """Build the input pipeline shared by the train, validation and test sets.

    Args:
        dataset: Dataset of `(text, label)` pairs.
        shuffle: If True, shuffle the examples before batching. Intended for
            the training set only, so each epoch sees differently composed
            batches while validation/test order stays fixed.

    Returns:
        A batched, vectorized and prefetched dataset.
    """
    if shuffle:
        dataset = dataset.shuffle(SHUFFLE_BUFFER_SIZE, seed=SHUFFLE_SEED)
    # Batch first so the vectorization layer runs once per batch instead of
    # once per example.
    return (
        dataset
        .batch(BATCH_SIZE)
        .map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
        .prefetch(tf.data.AUTOTUNE)
    )


# --- Apply the function and batch the datasets ---
datasets['train'] = _batch_and_vectorize(train_set, shuffle=True)
datasets['val'] = _batch_and_vectorize(val_set)
datasets['test'] = _batch_and_vectorize(raw_test)

print("All datasets are vectorized and batched.")
print(f"New training set size: {num_train - num_val}")
print(f"New validation set size: {num_val}")

In [None]:
EMBEDDING_DIM = 64
LSTM_UNITS = 64
NUM_CLASSES = len(class_names)  # one output unit per class name

model = tf.keras.Sequential([
    # 0. Declare the input shape up front so the model is built immediately
    #    and model.summary() can report output shapes and parameter counts.
    tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64'),

    # 1. The Embedding layer
    #    mask_zero=True marks token id 0 as padding, so the LSTM skips the
    #    padded tail of short articles instead of reading it as input.
    #    NOTE(review): this relies on the vectorization layer using 0 as its
    #    padding id - the Keras TextVectorization default; confirm if the
    #    layer configuration changes.
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True),

    # 2. The LSTM layer (returns only its final output)
    tf.keras.layers.LSTM(LSTM_UNITS),

    # 3. The classification layers
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),

    # 4. Final output layer
    #    One unit per class, with 'softmax' so the outputs form a
    #    probability distribution over the classes.
    tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')
])

model.summary()

In [None]:
# Training configuration, gathered in one place:
#   optimizer - Adam, selected by name
#   loss      - sparse categorical cross-entropy, selected by name
#   metrics   - accuracy
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

print("Model compiled.")

In [None]:
EPOCHS = 10  # number of passes over the training data

print("Starting training...")

# Fit on the training batches, passing the held-out validation batches as
# validation_data. The return value of fit() is kept in `history`.
fit_options = dict(epochs=EPOCHS, validation_data=datasets['val'])
history = model.fit(datasets['train'], **fit_options)

print("Training finished.")

In [None]:
print("Evaluating on test data...")

# evaluate() returns the loss followed by the compiled metric (accuracy),
# which are unpacked in that order.
test_results = model.evaluate(datasets['test'])
loss, accuracy = test_results

print(f"\nTest Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")