In [3]:
# 🔹 Step 0: Install Required Packages (Run once)
# !pip install pandas numpy scikit-learn tensorflow

# 🔹 Step 1: Import Libraries
import pandas as pd
import numpy as np
import re
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 🔹 Step 2: Load Dataset
# Keep only the label column (v1) and the message column (v2), then rename them.
df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']

# 🔹 Step 3: Encode Binary Label (Spam or Not)
# LabelEncoder numbers classes in sorted order, hence ham before spam.
le_bin = LabelEncoder()
df['label'] = le_bin.fit_transform(df['label'])  # ham = 0, spam = 1

# 🔹 Step 4: Assign Dummy Categories for Spam
# NOTE: these categories are random placeholders, not real annotations; the
# category head can only learn "None vs. some spam type" from them.
# A seeded generator keeps the placeholder labels (and therefore the saved
# encoder and model) reproducible from run to run, and one vectorised draw
# replaces a Python-level random call per row.
SPAM_CATEGORIES = ['Phishing', 'Promo', 'Scam', 'Malware']
category_rng = np.random.default_rng(42)
df['category'] = np.where(
    df['label'] == 0,
    'None',
    category_rng.choice(SPAM_CATEGORIES, size=len(df)),
)

# 🔹 Step 5: Encode Category Labels
# Classes are sorted alphabetically, e.g. 'Malware' -> 0, 'None' -> 1, 'Phishing' -> 2.
le_cat = LabelEncoder()
df['cat_encoded'] = le_cat.fit_transform(df['category'])
# Fix the one-hot width to the encoder's class count so the output layer size
# always matches le_cat.classes_.
cat_onehot = to_categorical(df['cat_encoded'], num_classes=len(le_cat.classes_))

# 🔹 Step 6: Clean the Email Text
def clean_text(text):
    """Normalise a message for tokenisation.

    Lower-cases *text*, drops every non-word character (anything the regex
    class ``\\W`` matches, so letters, digits and underscores survive) and
    returns the remaining words separated by single spaces, with no leading
    or trailing whitespace.
    """
    words = re.findall(r'\w+', text.lower())
    return ' '.join(words)

df['cleaned_text'] = df['text'].apply(clean_text)

# 🔹 Step 7: Tokenization and Padding
max_words = 5000  # vocabulary cap passed to the Tokenizer and the Embedding layer
max_len = 100     # every message is padded/truncated to this many token ids

# NOTE: the vocabulary is fitted on every row, including rows that later end
# up in the test split.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned_text'])

X = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(X, maxlen=max_len)

y_bin = df['label'].values        # Binary label: spam or not
y_cat = cat_onehot                # Categorical: spam category

# 🔹 Step 8: Split Data
# Stratify on the binary label so the train and test sets keep the same
# ham/spam ratio; a plain random split can skew the minority class.
X_train, X_test, y_bin_train, y_bin_test, y_cat_train, y_cat_test = train_test_split(
    X, y_bin, y_cat, test_size=0.2, random_state=42, stratify=y_bin)

# 🔹 Step 9: Define Multi-Output LSTM Model
# Shared trunk: token ids -> 32-dim embeddings -> 64-unit LSTM output,
# which feeds both classification heads below.
inputs = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=max_words, output_dim=32)
recurrent_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)
x = recurrent_layer(embedding_layer(inputs))

# Head 1: single sigmoid unit for the spam / not-spam decision
out_bin = Dense(1, activation='sigmoid', name='spam_output')(x)

# Head 2: softmax over the one-hot category columns
out_cat = Dense(y_cat.shape[1], activation='softmax', name='category_output')(x)

# Compile Model
# Losses and metrics are keyed by output-layer name, one entry per head.
head_losses = {
    'spam_output': 'binary_crossentropy',
    'category_output': 'categorical_crossentropy',
}
head_metrics = {
    'spam_output': 'accuracy',
    'category_output': 'accuracy',
}
model = Model(inputs=inputs, outputs=[out_bin, out_cat])
model.compile(optimizer='adam', loss=head_losses, metrics=head_metrics)

model.summary()

# 🔹 Step 10: Train the Model
# Targets are keyed by output-layer name so each head gets its own labels.
train_targets = {
    'spam_output': y_bin_train,
    'category_output': y_cat_train,
}
model.fit(
    x=X_train,
    y=train_targets,
    batch_size=32,
    epochs=5,
    validation_split=0.1,  # 10% of the training rows are held out for validation
)

# 🔹 Step 11: Save Model and Supporting Files
# Create the output folder if it is missing; an existing folder is reused.
os.makedirs("model", exist_ok=True)

# Trained network (HDF5 file)
model.save("model/lstm_model.h5")

# Pickle the preprocessing objects saved alongside the model:
# the fitted tokenizer and the category label encoder.
for pickle_path, artifact in (
    ("model/tokenizer.pkl", tokenizer),
    ("model/category_encoder.pkl", le_cat),
):
    with open(pickle_path, "wb") as handle:
        pickle.dump(artifact, handle)

print("✅ Model, tokenizer, and category encoder saved to /model directory.")


Epoch 1/5
126/126 ━━━━━━━━━━━━━━━━━━━━ 24s 124ms/step - category_output_accuracy: 0.8392 - category_output_loss: 0.8091 - loss: 1.2428 - spam_output_accuracy: 0.8642 - spam_output_loss: 0.4336 - val_category_output_accuracy: 0.8700 - val_category_output_loss: 0.3378 - val_loss: 0.5091 - val_spam_output_accuracy: 0.9529 - val_spam_output_loss: 0.1695
Epoch 2/5
126/126 ━━━━━━━━━━━━━━━━━━━━ 14s 112ms/step - category_output_accuracy: 0.8842 - category_output_loss: 0.2644 - loss: 0.3571 - spam_output_accuracy: 0.9775 - spam_output_loss: 0.0927 - val_category_output_accuracy: 0.8789 - val_category_output_loss: 0.2726 - val_loss: 0.3482 - val_spam_output_accuracy: 0.9776 - val_spam_output_loss: 0.0743
Epoch 3/5
126/126 ━━━━━━━━━━━━━━━━━━━━ 20s 108ms/step - category_output_accuracy: 0.8969 - category_output_loss: 0.2240 - loss: 0.2657 - spam_output_accuracy: 0.9871 - spam_output_loss: 0.0417 -



✅ Model, tokenizer, and category encoder saved to /model directory.
