In [2]:
# Cell 1: Imports
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_nlp
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Cell 2: Load and Preprocess Data
# Load your email dataset
df = pd.read_csv('/home/shasank/shasank/ml/projects/email-classifier/dataSets/toy_dataSet.csv')

df.head()
df.rename(columns={'Category': 'label','Message': 'email_text'}, inplace=True)

In [7]:

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])
num_classes = len(label_encoder.classes_)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['email_text'].values, 
    encoded_labels,
    test_size=0.2, 
    random_state=42
)


In [13]:
# Cell 3: Initialize BERT Classifier and Preprocessor
# Display available presets
print("Available BERT presets:")
print(keras_nlp.models.BertClassifier.presets.keys())

# Initialize the classifier
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_base_en",
    num_classes=num_classes
)

# Configure sequence length in preprocessor
classifier.preprocessor.sequence_length = 128

# Preprocess the data
x_train = classifier.preprocessor(X_train)
x_test = classifier.preprocessor(X_test)

# Print shape to verify
print(f"Training data shape: {x_train['token_ids'].shape}")
print(f"Test data shape: {x_test['token_ids'].shape}")

Available BERT presets:
dict_keys(['bert_tiny_en_uncased', 'bert_small_en_uncased', 'bert_medium_en_uncased', 'bert_base_en_uncased', 'bert_base_en', 'bert_base_zh', 'bert_base_multi', 'bert_large_en_uncased', 'bert_large_en', 'bert_tiny_en_uncased_sst2'])
Training data shape: (4457, 128)
Test data shape: (1115, 128)


2025-03-12 18:45:17.216690: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


In [15]:
# Cell 4: Build the Model
# Initialize BERT backbone
bert_backbone = keras_nlp.models.BertBackbone.from_preset("bert_base_en")

# Define input layers
input_ids = Input(shape=(128,), dtype=tf.int32, name="token_ids")
input_mask = Input(shape=(128,), dtype=tf.int32, name="padding_mask")
input_type_ids = Input(shape=(128,), dtype=tf.int32, name="segment_ids")

# Get BERT outputs
sequence_output = bert_backbone([input_ids, input_mask, input_type_ids])
pooled_output = sequence_output[:, 0, :]  # Use [CLS] token output

# Add classification head
x = Dropout(0.1)(pooled_output)
outputs = Dense(num_classes, activation="softmax")(x)

# Create model
model = Model(
    inputs=[input_ids, input_mask, input_type_ids],
    outputs=outputs
)

# Print model summary
model.summary()

KeyError: (slice(None, None, None), 0, slice(None, None, None))

In [None]:
# Cell 5: Compile and Train
# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Add callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        'best_model.h5',
        monitor='val_accuracy',
        save_best_only=True
    )
]

# Train the model
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks
)

In [None]:
# Cell 6: Evaluate and Visualize Results
import matplotlib.pyplot as plt

# Evaluate the model
test_results = model.evaluate(x_test, y_test)
print(f"Test accuracy: {test_results[1]:.4f}")

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Cell 7: Make Predictions
def predict_email(text):
    # Preprocess the input text
    processed_text = preprocessor([text])
    
    # Make prediction
    prediction = model.predict(processed_text)
    predicted_class = label_encoder.inverse_transform([np.argmax(prediction[0])])[0]
    confidence = np.max(prediction[0])
    
    return {
        'predicted_class': predicted_class,
        'confidence': confidence
    }

# Example usage
sample_email = "Your sample email text here"
result = predict_email(sample_email)
print(f"Predicted class: {result['predicted_class']}")
print(f"Confidence: {result['confidence']:.2f}")