In [3]:
# import libraries
import tensorflow as tf
import pandas as pd
from tensorflow.keras import layers
from tensorflow.keras import losses


import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt


print(tf.__version__)

2.19.0


In [4]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-08-13 13:00:57--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-08-13 13:00:57 (6.66 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-08-13 13:00:57--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-08-13 13:00:57 (12.6 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [5]:
train_df = pd.read_csv(train_file_path, sep='\t', header=None, names=['label', 'text'])
train_df['label'] = (train_df['label'] == 'spam').astype('int32')

test_df = pd.read_csv(test_file_path, sep='\t', header=None, names=['label', 'text'])
test_df['label'] = (test_df['label'] == 'spam').astype('int32')

> # Data Preprocessing

In [6]:
batch_size = 32
seed = 42

raw_train_ds  = tf.data.Dataset.from_tensor_slices((train_df['text'].tolist(), train_df['label'].tolist()))
raw_train_ds  = raw_train_ds.shuffle(buffer_size=len(train_df), seed=seed)

train_size = int(0.8 * len(train_df))
val_size = len(train_df) - train_size

train_ds = raw_train_ds.take(train_size).batch(batch_size)
val_ds = raw_train_ds.skip(train_size).batch(batch_size)

In [7]:
test_ds = tf.data.Dataset.from_tensor_slices((test_df['text'].tolist(), test_df['label'].tolist()))
test_ds  = test_ds.shuffle(buffer_size=len(test_df), seed=seed).batch(batch_size)

In [8]:
import re
import string

def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [9]:
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [10]:
train_text = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [11]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [12]:
text_batch, label_batch = next(iter(train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", first_label)
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b"u studying in sch or going home? anyway i'll b going 2 sch later.", shape=(), dtype=string)
Label tf.Tensor(0, shape=(), dtype=int32)
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[  7, 810,   9, 544,  29,  78, 101, 502,  60, 168,  78,  23, 544,
         93,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   

In [13]:
print("1287 ---> ",vectorize_layer.get_vocabulary()[1287])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

1287 --->  bold
 313 --->  two
Vocabulary size: 7258


In [14]:
train_ds = train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

In [15]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

> # creating the model

In [16]:
embedding_dim = 16

In [17]:
model = tf.keras.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.Conv1D(128, 7, activation='relu', padding='same'),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])
model.summary()

In [18]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)])

In [19]:
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - binary_accuracy: 0.8630 - loss: 0.3986 - val_binary_accuracy: 0.8612 - val_loss: 0.3016
Epoch 2/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 0.9393 - loss: 0.1661 - val_binary_accuracy: 0.9785 - val_loss: 0.0825
Epoch 3/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - binary_accuracy: 0.9788 - loss: 0.0747 - val_binary_accuracy: 0.9833 - val_loss: 0.0532
Epoch 4/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - binary_accuracy: 0.9883 - loss: 0.0441 - val_binary_accuracy: 0.9856 - val_loss: 0.0416
Epoch 5/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - binary_accuracy: 0.9907 - loss: 0.0329 - val_binary_accuracy: 0.9892 - val_loss: 0.0330
Epoch 6/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - binary_accuracy: 0.9928 - loss:

In [20]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - binary_accuracy: 0.9849 - loss: 0.0653
Loss:  0.06527070701122284
Accuracy:  0.9849137663841248


In [21]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):

    input_tensor = tf.constant([pred_text])
    vectorize_input = vectorize_layer(input_tensor)
    logits = model.predict(vectorize_input)
    probability = tf.sigmoid(logits)[0][0].numpy()
    label = "spam" if probability > 0.6 else "ham"
    prediction = [probability, label]

    return (prediction)

pred_text = "i dont want to go. can we try it a different day? available sat"

prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step
[np.float32(0.5000202), 'ham']


In [22]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
You passed the challenge. Great job!
