In [25]:
# import libraries
import pandas as pd
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense, Conv1D, GlobalMaxPool1D
from tensorflow.keras.callbacks import EarlyStopping



In [26]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

--2024-01-20 16:36:21--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv.1’


2024-01-20 16:36:22 (8.11 MB/s) - ‘train-data.tsv.1’ saved [358233/358233]

--2024-01-20 16:36:22--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv.1’


2024-01-20 16:36:22 (7.62 MB/s) - ‘valid-data.tsv.1’ saved [118774/118774]



In [27]:
# Local file names of the training and validation TSV files.
train_file_path, test_file_path = "train-data.tsv", "valid-data.tsv"

In [None]:
# Column headers assigned when the TSV files are read: label first, text second.
names = [
    "class",
    "message",
]

In [44]:
# Read the tab-separated training file into a DataFrame with the columns in `names`.
train_X = pd.read_csv(
    train_file_path,
    names=names,
    sep='\t',
)

In [43]:
# Read the tab-separated validation file into a DataFrame with the columns in `names`.
test_X = pd.read_csv(
    test_file_path,
    names=names,
    sep='\t',
)

In [45]:
# Model inputs: the message column of each split as a plain Python list.
X_train = train_X["message"].tolist()
X_test = test_X["message"].tolist()

# Binary targets: "ham" becomes 0, every other class value becomes 1.
y_train = np.array([int(label != "ham") for label in train_X["class"].tolist()])
y_test = np.array([int(label != "ham") for label in test_X["class"].tolist()])

In [46]:
# Count how often each whitespace-separated token occurs in the training messages.
vocabulary_dict = {}

for text in X_train:
  for token in text.split():
    vocabulary_dict[token] = vocabulary_dict.get(token, 0) + 1

In [47]:
# Number of distinct whitespace-separated tokens counted in the training messages.
VOCAB_SIZE = len(vocabulary_dict.keys())
# Token count of the longest training message; used as the padded sequence length.
MAX_LENGTH = max(len(message.split()) for message in X_train)

In [48]:
def _encode_and_pad(messages):
  """Hash each message into integer word indices and pad to MAX_LENGTH.

  Returns a pair: the list of per-message index lists, and the array produced
  by pad_sequences with padding applied at the end of each sequence.
  """
  hashed = [one_hot(message, VOCAB_SIZE) for message in messages]
  return hashed, pad_sequences(hashed, maxlen=MAX_LENGTH, padding='post')

encoded_train_X, padded_train_X = _encode_and_pad(X_train)
encoded_test_X, padded_test_X = _encode_and_pad(X_test)

In [49]:
# Width of the vector the Embedding layer produces for each word index.
embedding_dim = 50

# Embedding -> 1D convolution (128 filters, window of 5) -> global max pooling
# -> small dense layer -> single sigmoid unit giving the spam score.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=embedding_dim, input_length=MAX_LENGTH),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPool1D(),
    Dense(units=10, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy matches the 0/1 targets; accuracy is reported per epoch.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 171, 50)           566500    
                                                                 
 conv1d_4 (Conv1D)           (None, 167, 128)          32128     
                                                                 
 global_max_pooling1d_4 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_9 (Dense)             (None, 10)                1290      
                                                                 
 dense_10 (Dense)            (None, 1)                 11        
                                                                 
Total params: 599929 (2.29 MB)
Trainable params: 599929 (2.29 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [50]:
# Train for 10 epochs in mini-batches of 10, evaluating on the validation
# split after every epoch; the returned History object is kept in `history`.
history = model.fit(
    x=padded_train_X,
    y=y_train,
    batch_size=10,
    epochs=10,
    verbose=True,
    validation_data=(padded_test_X, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [51]:
# function to predict messages based on model
def predict_message(pred_text):
  """Classify a single message as "ham" or "spam" using the trained model.

  Args:
    pred_text: The message text to classify.

  Returns:
    A two-item list ``[score, label]``: ``score`` is the model's sigmoid
    output for the message, and ``label`` is "spam" when the score rounds
    to 1 and "ham" when it rounds to 0.
  """
  class_dict = { 0 : "ham", 1 : "spam"}

  # Encode and pad the text the same way the training messages were prepared.
  encoded = [one_hot(pred_text, VOCAB_SIZE)]
  padded = pad_sequences(encoded, maxlen=MAX_LENGTH, padding='post')

  # Run inference once and reuse the score for both the value and the label
  # (the model was previously invoked twice for every message).
  score = model.predict(padded)[0][0]
  # Cast to a plain int so the lookup does not depend on a numpy float key.
  label = class_dict[int(np.round(score))]

  return [score, label]

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[5.649426e-06, 'ham']


In [52]:
def test_predictions():
  """Run the model on seven fixed messages and print whether all labels match."""
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  test_messages = [
      "how are you doing today",
      "sale today! to stop texts call 98912460324",
      "i dont want to go. can we try it a different day? available sat",
      "our new mobile video service is live. just install on your phone to start watching.",
      "you have won £1000 cash! call to claim your prize.",
      "i'll bring it tomorrow. don't forget the milk.",
      "wow, is your arm alright. that happened to me one time too",
  ]

  # Classify every message up front (no early exit), then compare the labels.
  predicted_labels = [predict_message(msg)[1] for msg in test_messages]
  passed = all(got == want for got, want in zip(predicted_labels, test_answers))

  if not passed:
    print("You haven't passed yet. Keep trying.")
  else:
    print("You passed the challenge. Great job!")

test_predictions()


You passed the challenge. Great job!
