# Training Model

## Importing Libraries

In [12]:
import pickle
import random
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


## Data Preparation

In [11]:
# Fetch the NLTK stop-word corpus (a no-op when it is already cached) and keep
# the English list as a set for fast membership tests during preprocessing.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the dataset, give the two raw columns (v1, v2) readable names and
# discard every other column in the file.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .rename(columns={'v1': 'label', 'v2': 'message'})
    [['label', 'message']]
)

# Encode the target numerically: spam -> 1, ham -> 0.
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\verti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Augmentation: Implement Evasive Techniques

In [None]:
# Data Augmentation with Evasive Techniques
def add_evasive_techniques(text, symbol_prob=0.3, rng=None):
    """Obfuscate ``text`` with character substitutions and random noise symbols.

    Two transformations are applied in order:

    1. The text is lower-cased and selected letters are replaced by
       look-alike characters (``a`` -> ``@``, ``e`` -> ``3``, ``s`` -> ``$``, ...).
    2. After each resulting character, one symbol drawn from ``* ~ # ! ?`` is
       appended with probability ``symbol_prob``.

    Args:
        text: The message to obfuscate (a ``str``).
        symbol_prob: Probability, in ``[0, 1]``, of appending a noise symbol
            after each character. ``0`` disables noise entirely; ``1`` adds a
            symbol after every character. The default of ``0.3`` keeps the
            insertion rate of the previous hard-coded ``random.random() > 0.7``.
        rng: Optional generator exposing ``random()`` and ``choice()`` (for
            example ``random.Random(seed)``). When omitted, the module-level
            ``random`` generator is used, so ``random.seed(...)`` controls the
            outcome.

    Returns:
        The obfuscated, lower-cased string.
    """
    source = random if rng is None else rng
    replacements = {
        'a': '@', 'e': '3', 'i': '1', 'o': '0', 's': '$', 'b': '8',
        't': '7', 'g': '6', 'z': '2'
    }
    symbols = ['*', '~', '#', '!', '?']

    pieces = []
    for ch in text.lower():
        ch = replacements.get(ch, ch)
        # Decide first, then sample: a symbol is only drawn from the generator
        # when one is actually going to be inserted.
        if source.random() < symbol_prob:
            ch += source.choice(symbols)
        pieces.append(ch)
    return ''.join(pieces)

# Seed the global generator so the augmented column is identical on every
# Restart & Run All (42 matches the random_state used for the train/test split
# and the random forest in this notebook).
random.seed(42)

# Only spam rows (label == 1) are candidates; each one is obfuscated when its
# random draw exceeds 0.3, otherwise the original message is kept unchanged.
df['augmented_message'] = df.apply(
    lambda row: add_evasive_techniques(row['message'])
    if row['label'] == 1 and random.random() > 0.3
    else row['message'],
    axis=1,
)


## Text Preprocessing

In [None]:
def preprocess_text(text, stopword_set=None):
    """Normalise a message for vectorisation.

    Every non-word character (anything matched by ``\\W``) is replaced with a
    space, the text is lower-cased, and tokens found in the stop-word set are
    dropped. The remaining tokens are joined with single spaces.

    Args:
        text: The raw message. ``None`` and float ``NaN`` are treated as an
            empty message; any other non-string value is converted with
            ``str()`` before cleaning.
        stopword_set: Optional container of lower-case words to remove. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned, lower-cased, space-separated string (possibly empty).
    """
    if stopword_set is None:
        stopword_set = stop_words
    if not isinstance(text, str):
        # Missing values would otherwise make re.sub raise a TypeError.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)
    text = re.sub(r'\W', ' ', text).lower()
    return " ".join(word for word in text.split() if word not in stopword_set)

# NOTE(review): this cleans the original 'message' column, so the
# 'augmented_message' column built in the augmentation cell is not consumed by
# anything visible in this notebook — confirm whether training on the
# un-augmented text is intended.
df['cleaned_message'] = df['message'].apply(preprocess_text)


## Split Data

In [3]:
# Features are the cleaned text, the target is the numeric label. 30% of the
# rows are held out for testing; the fixed random_state keeps the split stable.
X, y = df['cleaned_message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Tokenization and Padding for LSTM Model

In [None]:
# Vocabulary cap for the tokenizer and fixed length of every padded sequence;
# both values are reused when the Embedding layer is built.
max_words, max_len = 5000, 100

# The vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Text -> integer sequences -> arrays padded/truncated to max_len.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

## LSTM Model

In [4]:

# Seed Python, NumPy and TensorFlow together so weight initialisation, dropout
# masks and batch shuffling repeat across runs (as far as the backend allows).
tf.keras.utils.set_random_seed(42)

lstm_model = Sequential([
    # `input_length` is deprecated on Embedding in current Keras; the sequence
    # length is inferred from the padded batches passed to fit().
    Embedding(max_words, 64),
    # The first recurrent layer returns the full sequence so the second LSTM
    # can consume it step by step.
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    LSTM(32),
    Dense(32, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: output is the predicted probability of label 1 (spam).
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split is used as validation data here, and the logged
# run shows val_loss lowest around epoch 3 and rising afterwards. Consider
# early stopping on a validation split carved from the training data so the
# test set stays untouched until the final evaluation.
lstm_model.fit(
    X_train_pad,
    y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_test_pad, y_test),
)

# LSTM Evaluation
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test)
print("LSTM Model Accuracy:", lstm_accuracy)




Epoch 1/15
122/122 ━━━━━━━━━━━━━━━━━━━━ 10s 56ms/step - accuracy: 0.8745 - loss: 0.3672 - val_accuracy: 0.9797 - val_loss: 0.0842
Epoch 2/15
122/122 ━━━━━━━━━━━━━━━━━━━━ 8s 62ms/step - accuracy: 0.9860 - loss: 0.0632 - val_accuracy: 0.9815 - val_loss: 0.0684
Epoch 3/15
122/122 ━━━━━━━━━━━━━━━━━━━━ 9s 77ms/step - accuracy: 0.9973 - loss: 0.0167 - val_accuracy: 0.9844 - val_loss: 0.0651
Epoch 4/15
122/122 ━━━━━━━━━━━━━━━━━━━━ 9s 74ms/step - accuracy: 0.9994 - loss: 0.0052 - val_accuracy: 0.9833 - val_loss: 0.1145
Epoch 5/15
122/122 ━━━━━━━━━━━━━━━━━━━━ 9s 70ms/step - accuracy: 0.9988 - loss: 0.0123 - val_accuracy: 0.9833 - val_loss: 0.1059
Epoch 6/15
122/122 ━━━━━━━━━━━━━━━━━━━━ 9s 70ms/step - accuracy: 0.9976 - loss: 0.0086 - val_accuracy: 0.9844 - val_loss: 0.0941
Epoch 7/15
122/12… [output truncated]

# Comparative Analysis of Machine Learning Models

#### Naive Bayes Model & Random Forest Model

In [5]:
# TF-IDF features for the classical models: the vocabulary (capped at 3000
# terms) is fitted on the training split and reused unchanged on the test
# split. Both matrices are converted to dense arrays.
tfidf = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# Naive Bayes: fit, predict on the held-out split, report accuracy.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))

# Random Forest: 150 trees of depth <= 20, seeded for repeatable results.
rf_model = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    random_state=42,
).fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))



Naive Bayes Accuracy: 0.9748803827751196
Random Forest Accuracy: 0.9617224880382775


In [None]:
# Persist the fitted Multinomial Naive Bayes classifier (`nb_model`; the name
# `mnb` is not defined anywhere in this notebook). The context manager makes
# sure the file handle is flushed and closed.
# NOTE(review): predictions on new text also need the fitted `tfidf`
# vectorizer, which is not saved here — confirm it is persisted elsewhere.
filename = 'spam-sms-mnb-model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(nb_model, model_file)