<a href="https://colab.research.google.com/github/samadpls/TensorFlow-Model-Exploration/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping


print(tf.__version__)

Collecting tf-nightly
  Downloading tf_nightly-2.15.0.dev20230910-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.2/493.2 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting wrapt<1.15,>=1.11.0 (from tf-nightly)
  Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting tb-nightly~=2.15.0.a (from tf-nightly)
  Downloading tb_nightly-2.15.0a20230909-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tf-estimator-nightly~=2.14.0.dev (from tf-nightly)
  Downloading tf_estimator_nightly-2.14.0.dev2023080308-py2.py3-none-any.whl (440 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2023-09-10 10:28:18--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2023-09-10 10:28:18 (10.5 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2023-09-10 10:28:18--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2023-09-10 10:28:18 (5.16 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [3]:
names = ["class", "message"]

In [4]:
train_dataset = pd.read_csv(train_file_path, sep='\t', names=names)
test_dataset = pd.read_csv(test_file_path, sep='\t', names=names)

In [5]:
train_dataset.head()

Unnamed: 0,class,message
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


In [6]:
test_dataset.head()

Unnamed: 0,class,message
0,ham,i am in hospital da. . i will return home in e...
1,ham,"not much, just some textin'. how bout you?"
2,ham,i probably won't eat at all today. i think i'm...
3,ham,don‘t give a flying monkeys wot they think and...
4,ham,who are you seeing?


In [7]:
train_message = train_dataset["message"].tolist()
test_message = test_dataset["message"].tolist()

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_label = label_encoder.fit_transform(train_dataset["class"])
test_label = label_encoder.transform(test_dataset["class"])

In [8]:
from collections import defaultdict

vocabulary_dict = defaultdict(int)

for message in train_message:
    words = message.split()
    for word in words:
        vocabulary_dict[word] += 1

vocabulary_dict = dict(vocabulary_dict)

In [9]:
VOCAB_SIZE = len(vocabulary_dict)
MAX_LENGTH = len(max(train_message, key=lambda p: len(p.split())).split())

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_message)
padded_train_message = pad_sequences(
    tokenizer.texts_to_sequences(train_message),
    maxlen=MAX_LENGTH,
    padding='post'
)

padded_test_message = pad_sequences(
    tokenizer.texts_to_sequences(test_message),
    maxlen=MAX_LENGTH,
    padding='post'
)

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense

In [12]:
model = Sequential()
embedding_layer = Embedding(VOCAB_SIZE, 100, input_length=MAX_LENGTH)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop',metrics=['acc']) #binary bcz we are dealing with two cases 1 and 0
monitor = EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=25, verbose=1, mode='max', restore_best_weights=True)
model.fit(padded_train_message, train_label, validation_data=(padded_test_message, test_label), callbacks=[monitor])



<keras.src.callbacks.History at 0x7f4420f71c30>

In [13]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text,threshold=0.05):
    class_dict = {
        0: "ham",
        1: "spam",
    }
    encoded_message = tokenizer.texts_to_sequences([pred_text])
    padded_message = pad_sequences(encoded_message, maxlen=MAX_LENGTH, padding='post')
    prediction_score = model.predict(padded_message)[0][0]
    print(prediction_score >= threshold)
    predicted_class = class_dict[1 if prediction_score >= threshold else 0]
    print(f"Prediction Score: {prediction_score}")
    return  [prediction_score, predicted_class]



pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

False
Prediction Score: 0.004576655104756355
[0.004576655, 'ham']


In [14]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      print(prediction,'-->',msg)
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


False
Prediction Score: 0.004576655104756355
False
Prediction Score: 0.01587953418493271
True
Prediction Score: 0.18528364598751068
True
Prediction Score: 0.09512603282928467
False
Prediction Score: 0.0076266322284936905
False
Prediction Score: 0.015368335880339146
You passed the challenge. Great job!
