<a href="https://colab.research.google.com/github/singaravelan/TensorFlow_programming/blob/main/3.%20Spam%20Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

In [None]:
import os
import zipfile
import wget

# Download the zipped SMS spam dataset and unpack it into a local folder.
# download_url is a Dropbox direct-download link; destination_folder is
# relative to the notebook's working directory.
download_url = 'https://www.dropbox.com/scl/fi/rogllgwop25frgcvdbl1c/spam.csv.zip?rlkey=2hk62gqdxtlhtyy3vj1w2m1fa&raw=1'
destination_folder = 'data'

# Create the destination folder; exist_ok makes this safe to re-run
os.makedirs(destination_folder, exist_ok=True)

# Download the archive. wget.download returns the filename it actually wrote,
# so keep that value instead of assuming it matches the requested path.
zip_file_path = wget.download(
    download_url,
    os.path.join(destination_folder, 'spam.csv.zip'),
)

# Extract the contents of the zip file, and always delete the archive
# afterwards so a failed extraction does not leave a stale download behind
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(destination_folder)
finally:
    os.remove(zip_file_path)

print(f'Data downloaded and extracted to: {destination_folder}')

Data downloaded and extracted to: data


In [None]:
# Load the data from the folder the download cell extracted into, rather than
# from an absolute path that only exists on one machine.
# NOTE(review): assumes the archive contains a file named 'spam.csv' — confirm.
data = pd.read_csv(os.path.join(destination_folder, 'spam.csv'))

In [None]:
# Binary target: 1 where Category is exactly 'spam', 0 for everything else
is_spam = data['Category'] == 'spam'
data['Spam'] = is_spam.astype('int64')
data.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Features are the raw message text; the target is the binary 'Spam' column
X, y = data['Message'], data['Spam']

In [None]:
# Tokenize and pad text messages
max_words = 1000  # Passed as num_words; Keras keeps the (num_words - 1) most frequent words
max_len = 150  # Every message is padded or truncated to this many tokens

tokenizer = Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)

# Read the raw text from the DataFrame instead of from X: X is overwritten
# below with the padded integer array, so a second run of this cell would
# otherwise hand token ids (not text) to the tokenizer.
messages = data['Message']
tokenizer.fit_on_texts(messages)
sequences = tokenizer.texts_to_sequences(messages)

# X becomes a 2-D integer array with one row of max_len token ids per message
X = pad_sequences(sequences, maxlen=max_len)

In [None]:
# Show the padded token ids for the message at row index 2
print(X[2, :])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0  47 489   8  19   4 796 901   2 175 659 261  71   2   2 337 489 555
 960  73 390 179 660 391]


In [None]:
# Model Construction: token embedding -> single LSTM layer -> sigmoid unit
# that outputs the spam probability
model = Sequential([
    Embedding(input_dim=max_words, output_dim=32, input_length=max_len),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# single sigmoid output, and accuracy as the reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the model for 5 epochs, holding out 20% of the training rows for
# validation. The returned History object is kept so the per-epoch metrics
# (history.history) can be inspected later, and so the cell does not echo
# the object's repr as its output.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2986e6890>

In [None]:
# Score the trained model on the held-out test split; evaluate returns the
# loss followed by the metrics configured at compile time (accuracy)
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy: {:.2f}%'.format(accuracy * 100))

Test Accuracy: 98.48%


In [None]:
# Classify a couple of hand-written messages with the trained model.
email_messages = [
    'Sounds great! Are you home now?',
    'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES'
]

# Tokenize and pad the messages with the same tokenizer and max_len used
# for the training data so the ids and input width match the model
email_sequences = tokenizer.texts_to_sequences(email_messages)
email_padded = pad_sequences(email_sequences, maxlen=max_len)

# Make predictions using the trained model; the single sigmoid output unit
# yields one probability per message
predictions = model.predict(email_padded)

# Convert predicted probabilities to binary predictions (0 for ham, 1 for spam)
binary_predictions = np.round(predictions).astype(int)

# Display the results. The predictions are flattened so each label is a plain
# scalar, instead of relying on the truthiness of a one-element array row.
for message, label in zip(email_messages, binary_predictions.ravel()):
    prediction_result = 'Spam' if label == 1 else 'Ham'
    print(f"Email: '{message}'\nPredicted Category: {prediction_result}\n")

Email: 'Sounds great! Are you home now?'
Predicted Category: Ham

Email: 'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES'
Predicted Category: Spam

