In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping

# --- Load & Prepare Dataset ---
# Root of the BBC corpus: one sub-directory per category, one text file per article.
data_dir = "/kaggle/input/bbc-full-text-document-classification/bbc"
folders = ["business", "entertainment", "politics", "sport", "tech"]
# The working directory is switched to the corpus root; the relative paths
# below are resolved against it.
os.chdir(data_dir)

f_text, f_cat = [], []
for folder in folders:
    # os.listdir() returns entries in arbitrary order. Sorting makes the row
    # order of `df` deterministic, so the seeded splits below are reproducible.
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        if not os.path.isfile(path):
            # Skip anything that is not a regular article file.
            continue
        # latin-1 maps every byte to a character, so decoding cannot fail.
        # Unlike 'unicode_escape' it does not reinterpret backslash sequences
        # that happen to appear in the article text.
        with open(path, encoding='latin-1') as f:
            f_text.append(' '.join(f.readlines()))
            f_cat.append(folder)

# One row per article: raw text plus the name of the folder it came from.
df = pd.DataFrame({'news': f_text, 'category': f_cat})

# Stratified 80/20 split into train+val / test, then 80/20 again into
# train / val, keeping the category proportions in every subset.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=22)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['category'], random_state=22)

# --- Tokenizer & BERT Model ---
# Pretrained checkpoint used for both the tokenizer and the encoder.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

def tokenize_and_embed(texts, max_length=128, batch_size=32):
    """Encode texts into per-token BERT embeddings.

    Each text is tokenized, truncated/padded to exactly ``max_length`` tokens
    and passed through ``bert_model``; the last hidden state of every token
    position (padding positions included) is returned.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of documents).
        max_length: Fixed token length every text is truncated/padded to.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        NumPy array of shape ``(len(texts), max_length, hidden_size)`` where
        ``hidden_size`` is taken from ``bert_model.config``. An empty input
        yields an array with a leading dimension of 0.
    """
    texts = list(texts)
    if not texts:
        # np.stack / np.concatenate reject empty sequences; return a
        # correctly shaped empty array instead of raising.
        return np.empty((0, max_length, bert_model.config.hidden_size), dtype=np.float32)

    device = bert_model.device
    chunks = []
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        # Batch the forward passes instead of running the model once per text.
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(
                batch,
                return_tensors='pt',
                max_length=max_length,
                truncation=True,
                padding='max_length',
            ).to(device)
            hidden = bert_model(**inputs)['last_hidden_state']
            # Move to CPU before converting so this also works when the model
            # lives on an accelerator.
            chunks.append(hidden.cpu().numpy())  # (batch, max_length, hidden_size)
    return np.concatenate(chunks, axis=0)

# --- Embedding with BERT ---
# Embed the 'news' column of each split, in train / val / test order.
train_embed, val_embed, test_embed = (
    tokenize_and_embed(split['news']) for split in (train_df, val_df, test_df)
)

# --- Encode Labels ---
# Category name -> integer class id; the id is the position of the 1 in the
# one-hot vector produced by to_categorical.
label_map = {'sport': 0, 'tech': 1, 'entertainment': 2, 'politics': 3, 'business': 4}
train_labels, val_labels, test_labels = (
    to_categorical(split['category'].map(label_map), num_classes=5)
    for split in (train_df, val_df, test_df)
)

# --- Define Improved Model ---
def get_model(input_shape=(128, 768), hidden_dim=64, num_classes=5):
    """Build and compile the BiLSTM classifier that sits on top of the embeddings.

    Architecture, in order: bidirectional LSTM returning the full sequence,
    flatten, dropout (rate 0.5), a 64-unit ReLU dense layer with L2 weight
    regularization (0.001), and a softmax output layer.

    Args:
        input_shape: Shape of one input sample, excluding the batch axis.
        hidden_dim: Number of units in each LSTM direction.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras model compiled with the Adam optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """
    sequence_input = keras.Input(shape=input_shape)

    # Layers are listed in the exact order they are applied to the input.
    stack = [
        layers.Bidirectional(layers.LSTM(hidden_dim, return_sequences=True)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(num_classes, activation='softmax'),
    ]

    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    classifier = keras.Model(sequence_input, tensor)
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Derive the model's input/output sizes from the prepared arrays so the
# network always matches the data it is trained on.
model = get_model(input_shape=train_embed.shape[1:], num_classes=train_labels.shape[1])

# --- Add EarlyStopping ---
# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# --- Train Model ---
model.fit(train_embed, train_labels,
          validation_data=(val_embed, val_labels),
          epochs=12,
          batch_size=32,
          callbacks=[early_stop])

# --- Evaluate on Test Set ---
loss, acc = model.evaluate(test_embed, test_labels)
print(f"Test accuracy: {acc:.4f}")

# --- Classification Report ---
# Convert predicted probabilities and one-hot targets back to class ids.
y_pred = model.predict(test_embed)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(test_labels, axis=1)

# Order class ids and names explicitly by id instead of relying on the
# insertion order of `label_map`. Passing `labels` keeps rows and columns
# aligned with the names even if a class is absent from the test split
# or from the predictions.
class_ids = sorted(label_map.values())
class_names = sorted(label_map, key=label_map.get)

print("\nClassification Report:")
print(classification_report(y_true_labels, y_pred_labels,
                            labels=class_ids, target_names=class_names))

print("Confusion Matrix:")
print(confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids))


In [None]:
# Write the trained classifier to the Kaggle working directory.
model.save(filepath="/kaggle/working/bert_text_classifier.keras")
