In [None]:
# 🔹 Step 0: Install Required Packages (Run only once)
!pip install pandas numpy scikit-learn tensorflow

# 🔹 Step 1: Import Libraries
import pandas as pd
import numpy as np
import re
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# 🔹 Step 2: Load and Preview Dataset
# Read the CSV (decoded as latin-1) and keep only columns `v1` and `v2`,
# renamed to `label` and `text` respectively.
df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is shown wherever this line sits.
print(df.head())

# 🔹 Step 3: Label Encoding and Text Cleaning
# Learn the label -> integer mapping from the raw string labels, then
# overwrite the column with the integer codes (ham=0, spam=1).
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])

def clean_text(text):
    """Normalize a message before tokenization.

    The text is lower-cased, every non-word character (anything matched by
    ``\\W``, i.e. not a letter, digit or underscore) is replaced by a space,
    runs of whitespace are collapsed to a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The normalized message.
    """
    # Lower-case first, then blank out punctuation and other non-word chars.
    spaced = re.sub(r'\W', ' ', text.lower())
    # Squeeze the resulting whitespace runs and trim both ends.
    return re.sub(r'\s+', ' ', spaced).strip()

# Store the normalized form of every message in its own column, leaving the
# raw `text` column untouched.
df['cleaned_text'] = df['text'].map(clean_text)

# 🔹 Step 4: Tokenization and Padding
# Vocabulary cap: passed to the Tokenizer as `num_words` and reused later in
# the file as the Embedding layer's `input_dim`.
max_words = 5000
# Fixed length that every id sequence is padded/truncated to.
max_len = 100

# Build the word index from the cleaned messages.
# NOTE(review): the tokenizer is fitted on the whole dataset, i.e. before the
# train/test split further down, so held-out messages also contribute to the
# vocabulary — confirm this is acceptable.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned_text'])

# Features: each message as integer word ids, brought to length `max_len`.
X = pad_sequences(tokenizer.texts_to_sequences(df['cleaned_text']), maxlen=max_len)
# Targets: the integer-encoded labels as a NumPy array.
y = df['label'].to_numpy()

# 🔹 Step 5: Train-Test Split and LSTM Model Definition
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# `stratify=y` keeps the ham/spam proportion identical in both splits, so the
# test set's class balance does not depend on the luck of the shuffle
# (presumably spam is the minority class — TODO confirm).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Binary classifier: word-id embedding -> single LSTM layer -> sigmoid unit.
model = Sequential()
# `input_length` is deprecated on Embedding in Keras 3, so it is not passed
# here; the input shape is supplied through `model.build` below instead.
model.add(Embedding(input_dim=max_words, output_dim=32))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid output: the predicted probability of label 1.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Build explicitly for (batch, max_len) inputs so that `summary()` can report
# output shapes and parameter counts before any data has been seen.
model.build(input_shape=(None, max_len))
model.summary()

# 🔹 Step 6: Train the Model
# Train for 5 epochs; `validation_split=0.1` sets aside a tenth of the
# training data for per-epoch validation metrics.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Score the model on the held-out test split, which is otherwise never used,
# to get an estimate of performance on unseen messages. `evaluate` returns
# the loss followed by the metrics given to `compile` (accuracy).
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} | Test accuracy: {test_accuracy:.4f}")

# 🔹 Step 7: Save the Model and Tokenizer
# Artifacts go into a `model/` folder relative to the current working
# directory; create it if it does not exist yet.
os.makedirs("model", exist_ok=True)

# NOTE(review): `.h5` is Keras' legacy HDF5 format; kept so that existing
# loaders of this path keep working.
model.save("model/lstm_model.h5")

# The fitted tokenizer must be saved alongside the model: inference has to
# map words to the same integer ids that were used for training.
# Only unpickle this file from a trusted source.
with open("model/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# The path is relative ("model/"), not the filesystem root ("/model").
print("✅ Model and tokenizer saved to the model/ directory.")
