In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources (the stop-word list is read later by preprocess_text)
nltk.download('stopwords')

# Load the dataset: tab-separated, no header row, columns are label and message text
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', names=['label', 'message'])

# Convert labels to numerical values
df['label_encoded'] = df['label'].map({'ham': 0, 'spam': 1})

# Series.map leaves NaN for any value missing from the dict. Fail here, with the
# offending values named, instead of letting NaN targets reach the split/training code.
unmapped_labels = df.loc[df['label_encoded'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected label values in dataset: {unmapped_labels.tolist()}")

# Text preprocessing function
def preprocess_text(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character that is not an
    ASCII letter or whitespace, collapse whitespace runs, drop English stop
    words, and Porter-stem the remaining words.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The processed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # The stop-word set and stemmer are identical for every call, so they are
    # built once and shared rather than re-created per message.
    stop_words, stemmer = _get_text_resources()

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )


# Filled on first use by _get_text_resources(); left empty at definition time so
# that defining these functions does not itself touch the NLTK corpus.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared (stop_words, stemmer) pair, creating it on the first call."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']

# Apply preprocessing
df['processed_message'] = df['message'].apply(preprocess_text)

# Split the data (stratified on the label so both splits keep the same class ratio)
X = df['processed_message']
y = df['label_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF features over unigrams and bigrams, capped at 5000 terms.
# The vocabulary and IDF weights are fitted on the training split only and then
# applied unchanged to the test split.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Convert the sparse matrices to dense arrays for the Dense layers
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling repeat across Restart & Run All.
keras.utils.set_random_seed(42)

# Feed-forward classifier: three ReLU layers with decreasing dropout, sigmoid output.
# The input shape is declared with an explicit Input layer rather than the
# deprecated `input_shape=` argument on the first Dense layer.
model = keras.Sequential([
    keras.Input(shape=(X_train_dense.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE: the held-out test split doubles as the validation set here, so the
# val_* metrics are for monitoring only and must not drive model selection.
history = model.fit(
    X_train_dense, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_dense, y_test),
    verbose=1
)

# Classify a single raw message with the fitted TF-IDF vectorizer and trained model
def predict_message(message):
    """Classify one raw message as "spam" or "ham".

    The message goes through ``preprocess_text``, is vectorized with the
    fitted ``tfidf`` object, and is scored by ``model``.

    Args:
        message: Raw message text.

    Returns:
        A two-element list ``[confidence, label]``. ``label`` is "spam" when
        the model output exceeds 0.5 and "ham" otherwise. ``confidence`` is
        the model output for "spam" and one minus that output for "ham",
        converted to a Python float.
    """
    # Clean -> TF-IDF -> dense single-row feature matrix
    features = tfidf.transform([preprocess_text(message)]).toarray()

    # One row in, one output unit: the score sits at [0][0]
    prob_spam = model.predict(features, verbose=0)[0][0]

    is_spam = prob_spam > 0.5
    label = "spam" if is_spam else "ham"
    confidence = prob_spam if is_spam else 1 - prob_spam

    return [float(confidence), label]

# Smoke-test predict_message on two spam-like and two ordinary messages
test_messages = [
    "Congratulations! You've won a $1000 Walmart gift card. Click here to claim now!",
    "Hey, are we still meeting for lunch tomorrow?",
    "URGENT: Your bank account needs verification. Please click the link to secure your account.",
    "Can you pick up some milk on your way home?"
]

print("Testing prediction function:")
print("=" * 80)
for msg in test_messages:
    # predict_message returns [confidence, label]
    confidence, label = predict_message(msg)
    print(f"Message: {msg}")
    print(f"Prediction: {label} (confidence: {confidence:.4f})")
    print("-" * 80)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.8379 - loss: 0.4723 - val_accuracy: 0.9596 - val_loss: 0.1565
Epoch 2/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9702 - loss: 0.1032 - val_accuracy: 0.9812 - val_loss: 0.0815
Epoch 3/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9897 - loss: 0.0362 - val_accuracy: 0.9812 - val_loss: 0.0853
Epoch 4/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9969 - loss: 0.0141 - val_accuracy: 0.9821 - val_loss: 0.1003
Epoch 5/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9959 - loss: 0.0141 - val_accuracy: 0.9803 - val_loss: 0.1067
Epoch 6/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9990 - loss: 0.0044 - val_accuracy: 0.9794 - val_loss: 0.1192
Epoch 7/10
[1m140/140

In [None]:
# Final test with the exact project format
def test_predictions():
    """Print the raw ``predict_message`` result for five fixed sample messages.

    Each sample is printed as a "Message:" line, a "Prediction:" line showing
    the returned list as-is, and a blank line.
    """
    samples = (
        "how are you doing today?",
        "sunshine, sunshine, sunshine",
        "congratulations, you've won a free ticket to the Bahamas! click here to claim now.",
        "hey, can we schedule a meeting for tomorrow?",
        "URGENT: your account has been compromised. click here to secure it.",
    )

    for message in samples:
        prediction = predict_message(message)
        # The trailing empty item plus print's own newline gives the blank separator line
        print(f"Message: {message}", f"Prediction: {prediction}", "", sep="\n")

# Banner, then run the fixed sample messages through the classifier
print("Running project test cases:", "=" * 60, sep="\n")
test_predictions()

Running project test cases:
Message: how are you doing today?
Prediction: [0.999858021736145, 'ham']

Message: sunshine, sunshine, sunshine
Prediction: [0.5722014904022217, 'spam']

Message: congratulations, you've won a free ticket to the Bahamas! click here to claim now.
Prediction: [0.99982088804245, 'spam']

Message: hey, can we schedule a meeting for tomorrow?
Prediction: [0.9999999403953552, 'ham']

Message: URGENT: your account has been compromised. click here to secure it.
Prediction: [0.9971715211868286, 'spam']

