<a href="https://colab.research.google.com/github/srividya-sundaravadivelu/SMS_Text_Classification_Using_Keras/blob/main/SMS_Text_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Load the training split: tab-separated, no header row (columns get integer labels).
df_train = pd.read_csv(
    train_file_path,
    header=None,
    sep="\t",
)

In [None]:
# Replace the positional column labels 0 and 1 with descriptive names.
df_train = df_train.rename(
    columns={
        0: 'HamOrSpam',
        1: 'Text',
    }
)
df_train

In [None]:
# Load the validation split with the same settings as the training split.
df_test = pd.read_csv(
    test_file_path,
    header=None,
    sep="\t",
)
df_test

In [None]:
# Replace the positional column labels 0 and 1 with descriptive names.
df_test = df_test.rename(
    columns={
        0: 'HamOrSpam',
        1: 'Text',
    }
)
df_test

In [None]:
# Convert the categorical labels to integers.
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
# Learn the label -> integer mapping from the training split only ...
df_train["HamOrSpam"] = labelEncoder.fit_transform(df_train["HamOrSpam"])
# ... and apply that same mapping to the validation split. Re-fitting on the
# validation labels could silently assign different integers if its set of
# labels differs from training; transform() raises on an unseen label instead.
df_test["HamOrSpam"] = labelEncoder.transform(df_test["HamOrSpam"])

In [None]:
# Preview the first five training rows after label encoding.
df_train.head(n=5)

In [None]:
# Preview the first five validation rows after label encoding.
df_test.head(n=5)

In [None]:
# Column dtypes, non-null counts and memory usage for the training frame.
df_train.info()

In [None]:
# Column dtypes, non-null counts and memory usage for the validation frame.
df_test.info()

In [None]:
# Class balance of the training labels, as fractions of the total (normalize=True).
df_train["HamOrSpam"].value_counts(normalize=True)

In [None]:
# Split each frame into model inputs (message text) and targets (encoded label).
X_train, y_train = df_train["Text"], df_train["HamOrSpam"]
X_test, y_test = df_test["Text"], df_test["HamOrSpam"]

In [None]:
# Preview the first rows of every split, in the order X_train, X_test, y_train, y_test.
print(
    X_train.head(),
    X_test.head(),
    y_train.head(),
    y_test.head(),
    sep="\n",
)

In [None]:
# Convert the training text Series to a NumPy array; it is passed to
# vectorize_layer.adapt() and model.fit() further down.
X_train = np.array(X_train)

In [None]:
# Convert the validation text Series to a NumPy array; it is passed to
# model.fit() as validation data further down.
X_test = np.array(X_test)

In [None]:
# Text -> integer-token layer with default settings.
vectorize_layer = keras.layers.TextVectorization()
# Build the vocabulary from the training messages only.
vectorize_layer.adapt(X_train)

In [None]:
# Peek at the first few vocabulary entries instead of dumping the entire list,
# which would flood the cell output.
vectorize_layer.get_vocabulary()[:20]

In [None]:
# Number of entries in the adapted vocabulary (special tokens included);
# used as the Embedding layer's input_dim.
vocab_size = vectorize_layer.vocabulary_size()

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation (and, as far as the
# backend allows, the training run that follows) is repeatable on Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Binary classifier: raw string -> token ids -> 32-d embeddings -> LSTM -> dense head.
model = tf.keras.Sequential([
    # Turns each raw message string into a sequence of integer token ids.
    vectorize_layer,
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=32,
        # Treat token id 0 as padding so the LSTM skips those timesteps.
        mask_zero=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    # Single sigmoid unit: one score in [0, 1] per message.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)


In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Early stopping: halt once val_loss has gone 5 epochs without improving,
# then roll the model back to the weights from its best epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode="min",
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Train for at most 15 epochs; the validation split supplies the val_loss
# that the early-stopping callback monitors.
mdl_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    callbacks=[callback],
    epochs=15,
)

In [None]:
# Sample message used to try the trained model by hand in the next cells.
pred_text = "sale today! to stop texts call 98912460324"

In [None]:
# Show the one-element array form in which the message is passed to model.predict().
np.array([pred_text])

In [None]:
# Score the sample message; the input is a batch containing just that one string.
result = model.predict(np.array([pred_text]))

In [None]:
# The sigmoid score for the single message in the batch.
result[0, 0]

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text, classifier=None, threshold=0.5):
  """Classify a single SMS message as 'ham' or 'spam'.

  Args:
    pred_text: The message text to classify.
    classifier: Optional object exposing a Keras-style ``predict`` method.
      Defaults to the notebook-level ``model``.
    threshold: Scores greater than or equal to this value are labelled 'spam'.

  Returns:
    A two-element list ``[score, label]``: ``score`` is a plain Python float
    and ``label`` is either 'ham' or 'spam'.
  """
  if classifier is None:
    classifier = model
  # predict() expects a batch, so wrap the one message in a 1-element array;
  # verbose=0 keeps repeated calls from printing a progress bar each time.
  result = classifier.predict(np.array([pred_text]), verbose=0)
  # Convert the NumPy scalar to a built-in float so the returned list matches
  # the documented example format.
  score = float(result[0][0])
  hamOrSpam = 'spam' if score >= threshold else 'ham'
  return [score, hamOrSpam]

# Quick check of predict_message() on a single everyday message.
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
# Prints the list returned by predict_message().
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message() against seven fixed messages and print the verdict.

  Each message is passed to predict_message() and the label at index 1 of the
  result is compared with the expected 'ham'/'spam' answer. A success message
  is printed only if every label matches; nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   # NOTE(review): "£" looks like a mis-decoded "£" - confirm against the upstream notebook.
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label (index 1) is checked; the score itself is ignored.
    if prediction[1] != ans:  
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()