<a href="https://colab.research.google.com/github/salahAlawieh/Machine-Learning-with-Python/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1 - Setup Environment and Libraries
# Install the correct TensorFlow version and import required packages

# Uninstall conflicting versions
!pip uninstall -y tensorflow tf-nightly -q

# Install TensorFlow 2.19 and TensorFlow Datasets
!pip install tensorflow==2.19.0 tensorflow-datasets -q

# Import libraries
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import os

# Report which TensorFlow build was imported and which GPUs it can see
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

# Anything other than the pinned version triggers a reminder;
# in Colab the fix is to restart the runtime and rerun this cell
if tf.__version__ != "2.19.0":
    print("\n⚠️ TensorFlow version is not 2.19.0. Restart runtime and rerun this cell.")

In [None]:
# Cell 2 - Download SMS Spam Collection Dataset
# Short: Get training and validation TSV files

!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Cell 3 - Load data using pandas and inspect
# Short: Read both TSV files into DataFrames, then report shapes and label balance

# The files carry no header row, so the two columns are named explicitly
train_df = pd.read_csv(
    train_file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)
test_df = pd.read_csv(
    test_file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

print(f"Training set shape: {train_df.shape}")
print(f"Validation set shape: {test_df.shape}")

# Per-split label counts, to show how many ham and spam rows each split has
for split_name, frame in (("train", train_df), ("validation", test_df)):
    print(f"\nLabel distribution ({split_name}):")
    print(frame['label'].value_counts())

# First few training rows as a sanity check
print("\nSample messages:")
print(train_df.head())

In [None]:
# Cell 4 - Encode labels as 0 (ham) and 1 (spam)
# Short: Turn the text labels into integers and pull out the raw message arrays

# Fit the encoder on the training labels only, then reuse the same mapping
# for the validation labels (ham=0, spam=1)
le = LabelEncoder()
y_train = le.fit_transform(train_df['label'])
y_test = le.transform(test_df['label'])

# Raw message text, one entry per row
X_train = train_df['message'].values
X_test = test_df['message'].values

# Quick look at the encoded labels and the first few messages
print(f"Training labels sample: {y_train[:10]}")
print("\nTraining messages sample:")
print(X_train[:5])

In [None]:
# Cell 5 - Tokenize text messages and pad sequences
# Short: Map words to integer ids and bring every sequence to the same length

vocab_size = 2000     # vocabulary cap passed to the tokenizer (num_words)
max_length = 100      # every sequence is padded or truncated to this many tokens
oov_token = "<OOV>"   # placeholder token for out-of-vocabulary words

# The vocabulary is learned from the training messages only
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token=oov_token,
)
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer ids, training split first, then validation
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad short sequences and cut long ones at the end, so each row has max_length ids
X_train_padded, X_test_padded = (
    keras.preprocessing.sequence.pad_sequences(
        seqs, maxlen=max_length, padding='post', truncating='post'
    )
    for seqs in (X_train_seq, X_test_seq)
)

# Shapes plus the start of the first encoded message
print(f"Training padded shape: {X_train_padded.shape}")
print(f"Validation padded shape: {X_test_padded.shape}")
print("\nExample tokenized & padded sequence (first 20 tokens of first message):")
print(X_train_padded[0][:20])

In [None]:
# Cell 6 - Build Neural Network
# Short: Embedding -> average pooling -> dense layers, ending in a single sigmoid unit

model = keras.Sequential()

# One 16-dimensional vector per token id
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=16))
# Average the token vectors over the sequence axis
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
# Single sigmoid unit for the binary ham/spam decision
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture
model.summary()

In [None]:
# Cell 7 - Train the model with class weighting to handle imbalance
# Short: Fit for 15 epochs, weighting each class with sklearn's 'balanced' scheme

from sklearn.utils import class_weight

# One weight per distinct label value found in y_train
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
# Keras expects a {class index: weight} mapping
class_weights_dict = {index: weight for index, weight in enumerate(class_weights)}

# The validation split is scored after every epoch; verbose=2 prints one line per epoch
history = model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    class_weight=class_weights_dict,
    batch_size=32,
    epochs=15,
    verbose=2,
)

In [None]:
# Cell 8 - Evaluate model performance on validation set
# Short: Report final validation metrics and chart the training history

loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"\nValidation Accuracy: {accuracy:.4f}\nValidation Loss: {loss:.4f}")

# Two side-by-side panels: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))

for position, (metric, title) in enumerate([("accuracy", "Accuracy"), ("loss", "Loss")], start=1):
    plt.subplot(1, 2, position)
    # Training curve, then the matching validation curve stored under the "val_" key
    plt.plot(history.history[metric], label=f"Train {title}")
    plt.plot(history.history[f"val_{metric}"], label=f"Validation {title}")
    plt.title(f"{title} Over Epochs")
    plt.xlabel("Epoch")
    plt.ylabel(title)
    plt.legend()

plt.show()

In [None]:
# Cell 9 - Predict a single SMS message
# Short: Convert text to sequence, pad it, predict probability, return label

def predict_message(pred_text, threshold=0.25):
    """Classify a single SMS message as "ham" or "spam".

    The text goes through the same preprocessing as the training data:
    the fitted ``tokenizer`` turns it into integer ids, which are padded or
    truncated at the end to ``max_length`` before being passed to ``model``.

    Args:
        pred_text: Message text to classify.
        threshold: The message is labelled "spam" when the predicted
            probability is strictly greater than this value. Defaults to
            0.25, the cut-off this notebook uses.

    Returns:
        A two-element list ``[probability, label]`` where ``probability`` is
        the model output as a Python float and ``label`` is "spam" or "ham".
    """
    # Convert text to an integer sequence (batch of one message)
    seq = tokenizer.texts_to_sequences([pred_text])

    # Pad/truncate exactly as the training data was prepared. truncating='post'
    # must be explicit: the Keras default is 'pre', which would drop the start
    # of an over-long message instead of its end.
    padded = keras.preprocessing.sequence.pad_sequences(
        seq, maxlen=max_length, padding='post', truncating='post'
    )

    # The model returns one row with one value per input message
    prob = float(model.predict(padded, verbose=0)[0][0])

    label = "spam" if prob > threshold else "ham"

    return [prob, label]

# Example usage
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Cell 10 - Test predict_message function on sample messages
# Short: Automated test for FreeCodeCamp challenge

def test_predictions():
    """Run predict_message on fixed sample messages and print pass/fail.

    Each message is paired with its expected label. The challenge counts as
    passed only if the label returned by ``predict_message`` (second element
    of its result) matches for every message.
    """
    cases = [
        ("how are you doing today", "ham"),
        ("sale today! to stop texts call 98912460324", "spam"),
        ("i dont want to go. can we try it a different day? available sat", "ham"),
        ("our new mobile video service is live. just install on your phone to start watching.", "spam"),
        ("you have won £1000 cash! call to claim your prize.", "spam"),
        ("i'll bring it tomorrow. don't forget the milk.", "ham"),
        ("wow, is your arm alright. that happened to me one time too", "ham"),
    ]

    # Build the full list first so every message is predicted, even after a mismatch
    outcomes = [predict_message(text)[1] == expected for text, expected in cases]

    if all(outcomes):
        print("You passed the challenge. Great job!")
    else:
        print("You haven't passed yet. Keep trying.")

# Run test
test_predictions()