<a href="https://colab.research.google.com/github/royd2023/HAMvsSPAM/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip uninstall -y tensorflow tf-nightly
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

Preprocessing the Data

In [None]:
import csv


def load_tsv_file(file_path):
  """Read a two-column ``label<TAB>message`` file into two parallel lists.

  The file is first read with pandas. Quote characters are treated as ordinary
  text (``csv.QUOTE_NONE``) so a message that starts with ``"`` cannot swallow
  the lines that follow it, and default NA detection is disabled so messages
  such as ``NA`` or ``null`` stay strings instead of becoming NaN.

  If pandas raises, the file is parsed line by line instead: blank lines are
  skipped, each line is split on its first tab only, and lines without a tab
  are ignored.

  Args:
    file_path: Path of the tab-separated file to read.

  Returns:
    A tuple ``(labels, data)`` of equally long lists. Both lists are empty
    when the file cannot be read by either method.
  """
  try:
    df = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=['label', 'message'],
        dtype=str,
        quoting=csv.QUOTE_NONE,
        keep_default_na=False,
        encoding='utf-8',
    )
    return df['label'].tolist(), df['message'].tolist()
  except Exception as e:
    print(f"Error reading file with pandas: {e}")

  # Method 2: manual parsing as fallback.
  # Start from fresh lists so nothing from the failed pandas attempt leaks in.
  labels = []
  data = []
  try:
    with open(file_path, 'r', encoding='utf-8') as file:
      for line in file:
        line = line.strip()
        if not line:  # Skip empty lines
          continue
        parts = line.split('\t', 1)  # Split on first tab only
        if len(parts) == 2:
          label, message = parts
          labels.append(label)
          data.append(message)
  except Exception as e2:
    print(f"Error with manual parsing: {e2}")
    return [], []

  return labels, data


In [None]:
labels, data = load_tsv_file(train_file_path)

# load_tsv_file reports an unreadable file by returning two empty lists; stop
# here with a clear message instead of failing later in the tokenizer or model.
assert data and len(labels) == len(data), (
    f"No usable rows were loaded from {train_file_path!r}"
)

Now that we have separated the labels and data, we can vectorize the data.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Learn the word index from the training messages (top 10k words are used).
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data)

# Encode every message as a sequence of word indices, then pad all of them
# to a common length of 100.
x = pad_sequences(tokenizer.texts_to_sequences(data), maxlen=100)

# Binary targets: 1 for 'spam', 0 for any other label.
y = [int(label == 'spam') for label in labels]

*  `x` = padded integer sequences (each message is now a list of 100 word indices)
*  `y` = labels: 0 for ham and 1 for spam

Now, let's split the data into training and testing sets.

In [None]:
# Shuffle the data before splitting.
# A seeded generator keeps the split identical across Restart & Run All.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(len(x))

# Converting an existing tensor is harmless, so this cell can be re-run.
x = tf.constant(x)
y = tf.constant(y)

# Apply the same permutation to features and labels so pairs stay aligned.
x_shuffled = tf.gather(x, indices)
y_shuffled = tf.gather(y, indices)

dataset_size = len(x_shuffled)
train_size = int(0.8 * dataset_size)

# Split the SHUFFLED tensors: first 80% for training, the rest for testing.
# Slicing x / y directly would ignore the permutation computed above.
x_train = x_shuffled[:train_size]
x_test = x_shuffled[train_size:]
y_train = y_shuffled[:train_size]
y_test = y_shuffled[train_size:]


Build the Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Input

# Create the model
# The input shape is declared with an explicit Input layer; Embedding's
# `input_length` argument is deprecated in current Keras releases.
model = Sequential([
    Input(shape=(100,)),                    # One padded message of 100 word indices
    Embedding(10000, 16),                   # Word embeddings
    GlobalAveragePooling1D(),               # Average the embeddings
    Dense(16, activation='relu'),           # Hidden layer
    Dense(1, activation='sigmoid')          # Output layer (0 or 1)
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model; validation_split holds out part of x_train for validation.
model.fit(x_train, y_train, epochs=20, validation_split=0.2)

Evaluate the Model

In [None]:
# Score the held-out split and report its accuracy.
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Threshold the model outputs at 0.5 to obtain hard 0/1 classes.
predictions = model.predict(x_test)
predicted_classes = (predictions > 0.5).astype(int)

# Show the first five test examples next to the model's verdict.
for sample_idx in range(5):
    if y_test[sample_idx] == 1:
        actual = "spam"
    else:
        actual = "ham"

    if predicted_classes[sample_idx] == 1:
        predicted = "spam"
    else:
        predicted = "ham"

    confidence = predictions[sample_idx][0]
    print(f"Actual: {actual}, Predicted: {predicted}, Confidence: {confidence:.3f}")

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    """Classify a single message as ham or spam with the trained model.

    Args:
        pred_text: The message text to classify.

    Returns:
        A two-element list ``[probability, label]``: the model output as a
        float, and ``'spam'`` when that value is above 0.5, else ``'ham'``.
    """
    # Encode the text the same way as the training data: word indices padded
    # to a length of 100.
    encoded = pad_sequences(tokenizer.texts_to_sequences([pred_text]), maxlen=100)

    # A single message goes in, so the value of interest is at [0][0].
    spam_score = float(model.predict(encoded)[0][0])

    if spam_score > 0.5:
        return [spam_score, 'spam']
    return [spam_score, 'ham']


pred_text = "sale today! to stop texts call 98912460324"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Each message is passed to predict_message and the second element of its
  result is compared with the expected 'ham'/'spam' answer. A success message
  is printed only when every prediction matches.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # prediction[1] is the label part of the [probability, label] result.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
