In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

# Load data
train_df = pd.read_csv('train-data.tsv', sep='\t', header=None, names=['label', 'message'])
test_df = pd.read_csv('valid-data.tsv', sep='\t', header=None, names=['label', 'message'])

# Convert labels to numerical values
label_map = {'ham': 0, 'spam': 1}
train_df['label'] = train_df['label'].map(label_map)
test_df['label'] = test_df['label'].map(label_map)

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['message'])
train_sequences = tokenizer.texts_to_sequences(train_df['message'])
test_sequences = tokenizer.texts_to_sequences(test_df['message'])

# Pad sequences to ensure uniform input size
max_length = max(len(x) for x in train_sequences)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Convert to numpy arrays
train_labels = np.array(train_df['label'])
test_labels = np.array(test_df['label'])


--2024-04-17 11:49:38--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2024-04-17 11:49:38 (10.8 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2024-04-17 11:49:38--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2024-04-17 11:49:38 (5.40 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [2]:
# Seed Python, NumPy and TensorFlow before any weights are created so that
# weight initialisation, dropout masks and batch shuffling are repeatable on
# Restart & Run All.
# NOTE(review): seeding alone may not give bit-identical results on GPU.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Bag-of-embeddings classifier: embed every token, average over the sequence,
# then two small dense layers and a sigmoid that scores the message as spam
# (label 1 in label_map).
model = tf.keras.Sequential([
    # +1 because tokenizer indices start at 1 and index 0 is the padding value
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=32, input_length=max_length),
    tf.keras.layers.Dropout(0.2),
    # Collapse (max_length, 32) to a single 32-d vector per message
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(48, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Single sigmoid output with 0/1 targets -> binary cross-entropy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 189, 32)           246176    
                                                                 
 dropout (Dropout)           (None, 189, 32)           0         
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 48)                1584      
                                                                 
 dropout_1 (Dropout)         (None, 48)                0         
                                                                 
 dense_1 (Dense)             (None, 24)                1176      
                                                        

In [3]:
# Train for 10 epochs; the validation split is evaluated at the end of each
# epoch and the per-epoch metrics are returned in `history`.
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [4]:
def predict_message(pred_text):
    """Classify a single message as ham or spam.

    The text is encoded with the notebook-level ``tokenizer``, post-padded to
    ``max_length`` and scored by the notebook-level ``model``.

    Args:
        pred_text: The raw message text.

    Returns:
        A two-item list ``[score, label]``: the model output for the message
        and ``'ham'`` when that score is below 0.5, otherwise ``'spam'``.
    """
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([pred_text]),
        maxlen=max_length,
        padding='post',
    )
    # predict() returns one row per input and one column per output unit
    score = model.predict(encoded)[0][0]
    if score < 0.5:
        label = 'ham'
    else:
        label = 'spam'
    return [score, label]

# Example use: classify one everyday message and show the [score, label] pair
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)


[0.0071972897, 'ham']


In [5]:
def test_predictions():
    test_messages = [
        "how are you doing today",
        "sale today! to stop texts call 98912460324",
        "i dont want to go. can we try it a different day? available sat",
        "our new mobile video service is live. just install on your phone to start watching.",
        "you have won £1000 cash! call to claim your prize.",
        "i'll bring it tomorrow. don't forget the milk.",
        "wow, is your arm alright. that happened to me one time too"
    ]

    test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
    passed = True

    for msg, ans in zip(test_messages, test_answers):
        prediction = predict_message(msg)
        if prediction[1] != ans:
            passed = False

    if passed:
        print("You passed the challenge. Great job!")
    else:
        print("You haven't passed yet. Keep trying.")

# Run the check against the trained model; prints a single pass/fail line
test_predictions()


You passed the challenge. Great job!


In [6]:
from sklearn.metrics import confusion_matrix

# Sample messages and their expected labels (same set as in test_predictions)
test_messages = [
    "how are you doing today",
    "sale today! to stop texts call 98912460324",
    "i dont want to go. can we try it a different day? available sat",
    "our new mobile video service is live. just install on your phone to start watching.",
    "you have won £1000 cash! call to claim your prize.",
    "i'll bring it tomorrow. don't forget the milk.",
    "wow, is your arm alright. that happened to me one time too",
]
test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]

# predict_message returns [score, label]; only the label is needed here
predictions = [label for _, label in map(predict_message, test_messages)]
actuals = test_answers

# Confusion matrix: rows are actual classes, columns are predicted classes,
# both in the order ham, spam
cm = confusion_matrix(y_true=actuals, y_pred=predictions, labels=["ham", "spam"])
print(cm)


[[4 0]
 [0 3]]


In [7]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Relies on `predict_message` from the earlier cell, which returns
# [probability, 'ham'/'spam'] for one message.

test_messages = [
    "how are you doing today",
    "sale today! to stop texts call 98912460324",
    "i dont want to go. can we try it a different day? available sat",
    "our new mobile video service is live. just install on your phone to start watching.",
    "you have won £1000 cash! call to claim your prize.",
    "i'll bring it tomorrow. don't forget the milk.",
    "wow, is your arm alright. that happened to me one time too",
]
test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]

# Classify each message once, keeping the label and the score side by side
predictions, scores = [], []
for msg in test_messages:
    score, label = predict_message(msg)
    predictions.append(label)
    scores.append(score)

# Confusion matrix: rows are actual classes, columns are predicted classes,
# both in the order ham, spam
cm = confusion_matrix(y_true=test_answers, y_pred=predictions, labels=["ham", "spam"])
print("Confusion Matrix:")
print(cm)

# Per-message breakdown: text, predicted label with its score, expected label
print("\nDetailed Classification Report:")
per_message = zip(test_messages, test_answers, predictions, scores)
for msg, actual, pred, score in per_message:
    print(f"Message: '{msg}'\nPredicted: {pred} (Score: {score:.4f}), Actual: {actual}\n")



Confusion Matrix:
[[4 0]
 [0 3]]

Detailed Classification Report:
Message: 'how are you doing today'
Predicted: ham (Score: 0.0072), Actual: ham

Message: 'sale today! to stop texts call 98912460324'
Predicted: spam (Score: 0.8850), Actual: spam

Message: 'i dont want to go. can we try it a different day? available sat'
Predicted: ham (Score: 0.0021), Actual: ham

Message: 'our new mobile video service is live. just install on your phone to start watching.'
Predicted: spam (Score: 0.9950), Actual: spam

Message: 'you have won £1000 cash! call to claim your prize.'
Predicted: spam (Score: 0.9971), Actual: spam

Message: 'i'll bring it tomorrow. don't forget the milk.'
Predicted: ham (Score: 0.0012), Actual: ham

Message: 'wow, is your arm alright. that happened to me one time too'
Predicted: ham (Score: 0.0043), Actual: ham

