In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

from collections import Counter
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import tensorflow as tf
import tensorflow_datasets as tfds
import re
import string


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the spam text-message CSV from the Kaggle input directory and preview it.
csv_path = "/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"
df = pd.read_csv(csv_path)
df.head(2)

## EDA

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Class balance: bar chart of the number of messages per category.
sns.countplot(x=df["Category"])
plt.show()

# Share of rows labelled "ham", as a percentage of all rows.
ham_rows = df[df.Category == 'ham']
ham_share = len(ham_rows) / len(df) * 100
print(f"{ham_share:.1f}% of messages are ham.")

In [None]:
# Print the first message of each category, separated by a blank line.
for position, (heading, label) in enumerate((("Ham message: ", "ham"), ("Spam message: ", "spam"))):
    if position:
        print("")
    print(heading)
    print(df.loc[df.Category == label, "Message"].iloc[0])

In [None]:
# Store len() of every Message value in a new "Length" column.
df["Length"] = df.Message.map(len)

In [None]:
# Overlay the message-length distributions of the two categories on one plot.
for label, colour in (("ham", "red"), ("spam", "blue")):
    lengths = df.loc[df.Category == label, "Length"]
    sns.histplot(lengths, kde=True, color=colour, label=label)
plt.legend()
plt.show()

## Training

In [None]:
# Built once, when this cell runs. The original rebuilt the stop-word list and
# created a new stemmer for every single token, which dominated the runtime.
_STOP_WORDS = frozenset(stopwords.words("english"))
_STEMMER = PorterStemmer()


def preprocessing(s):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, replace every character outside ``[0-9a-z]``
    with a space, drop English stop words, and Porter-stem what is left.

    Parameters
    ----------
    s : str
        Raw message text.

    Returns
    -------
    str
        Stemmed, stop-word-free tokens joined by single spaces
        (an empty string if no token survives).
    """
    s = s.lower()
    # Punctuation is turned into spaces here, so no separate punctuation filter
    # is needed. The original `string.punctuation and stopwords.words(...)`
    # expression evaluated to the stop-word list alone, so it never filtered
    # punctuation anyway.
    s = re.sub(r"[^0-9a-z]", " ", s)

    # str.split() without arguments already collapses runs of whitespace.
    return " ".join(_STEMMER.stem(w) for w in s.split() if w not in _STOP_WORDS)

In [None]:
# Sanity check of the cleaning function on a sample string with punctuation, mixed case and repeated spaces.
preprocessing("Hello, World! This is a SPAM      message!")

In [None]:
# Turn each cleaned message into a TF-IDF vector and the category labels into integers.
# NOTE(review): the vectorizer is fitted on every message before the train/test
# split that follows, so its vocabulary/IDF statistics also see the test rows.
vectorizer = TfidfVectorizer()
le = LabelEncoder()

cleaned_messages = df.Message.apply(preprocessing)
X = vectorizer.fit_transform(cleaned_messages)
y = le.fit_transform(df.Category)

In [None]:
# Show which integer each category maps to. transform() is used instead of
# fit_transform() so that this display-only cell does not refit the shared encoder.
le.transform(["ham", "spam"])

In [None]:
# Hold out part of the data for evaluation; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print the classification report for Naive Bayes.
preds = naive_bayes.predict(X_test)

print(classification_report(y_test, preds))

Low recall for spam messages: 23% of spam messages are missed!

### Random Forest

In [None]:
# Fit a random forest on the training split; random_state makes the fit repeatable.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print the classification report for the random forest.
# Note: this overwrites the `preds` produced by the Naive Bayes cell.
preds = rf.predict(X_test)

print(classification_report(y_test, preds))

### LSTM

In [None]:
# Shell out to nvidia-smi to show the GPU(s) visible to this session.
!nvidia-smi

In [None]:
# Integer-encode the category labels into a new column.
# Note: this rebinds `le` to a fresh encoder.
le = LabelEncoder()
df["Category_enc"] = le.fit_transform(df.Category)

In [None]:
# Preview the frame, now including the added columns.
df.head()

In [None]:
# Build a tf.data dataset of (raw message, encoded label) pairs.
# Note: X and y are rebound here and no longer refer to the TF-IDF matrix / label array used above.
X = df.Message
y = df.Category_enc

dataset = tf.data.Dataset.from_tensor_slices((X, y))

In [None]:
# Number of examples held out for evaluation.
TEST_SIZE = 1000

# Shuffle once with a fixed seed and no reshuffling between iterations, so the
# take()/skip() slices below are disjoint and identical from run to run.
# The buffer covers the whole frame so the shuffle is a full permutation.
dataset = dataset.shuffle(len(df), seed=1, reshuffle_each_iteration=False)
ds_test = dataset.take(TEST_SIZE)
# Every example after the test slice is used for training; the previous
# hard-coded take(4500) silently discarded any rows beyond the first 5500.
ds_train = dataset.skip(TEST_SIZE)

In [None]:
# The text encoders were moved from `tfds.features.text` to `tfds.deprecated.text`
# in newer TensorFlow Datasets releases; fall back to the old location on older installs.
try:
    text_api = tfds.deprecated.text
except AttributeError:
    text_api = tfds.features.text

# Count tokens over the training split only, so the vocabulary is built
# without looking at the test examples.
tokenizer = text_api.Tokenizer()
token_counts = Counter()
for example in ds_train:
    # example[0] is the message tensor; .numpy() unwraps it for the tokenizer.
    tokens = tokenizer.tokenize(example[0].numpy())
    token_counts.update(tokens)

# Maps each known token to an integer id.
encoder = text_api.TokenTextEncoder(token_counts)

In [None]:
# Encode a sample sentence to inspect the resulting token ids.
example_str = encoder.encode("This is a spam")
example_str

In [None]:
def encode(text_tensor, label):
    """Convert one (text, label) example into (token ids, label).

    The text tensor is unwrapped with ``.numpy()`` and passed through the
    module-level ``encoder``; the label is returned untouched.
    """
    token_ids = encoder.encode(text_tensor.numpy())
    return token_ids, label

def encode_map_fn(text, label):
    """Wrap ``encode`` in ``tf.py_function`` so it can be passed to ``Dataset.map``.

    Both outputs (token ids and label) are declared as int64.
    """
    output_types = (tf.int64, tf.int64)
    return tf.py_function(encode, inp=[text, label], Tout=output_types)

# Replace the raw text with token ids in both splits.
ds_train = ds_train.map(encode_map_fn)
ds_test = ds_test.map(encode_map_fn)

In [None]:
# Peek at a few shuffled training examples to see how the encoded sequence lengths vary.
for encoded_text, _label in ds_train.shuffle(4500).take(5):
    print(f"Sequence length: {encoded_text.shape}")

In [None]:
# Batch 32 examples at a time; the variable-length id sequences are padded
# within each batch, while the labels are scalars and need no padding.
train_data = ds_train.padded_batch(32, padded_shapes=([-1], []))
# Fix: the evaluation batches must come from ds_test. They were previously built
# from ds_train, so "validation" metrics were just training metrics.
test_data = ds_test.padded_batch(32, padded_shapes=([-1], []))

In [None]:
embedding_dim = 20
# NOTE(review): the +2 presumably accounts for the padding and out-of-vocabulary
# ids reserved by the encoder; confirm against encoder.vocab_size.
vocab_size = len(token_counts) + 2

# Seed TensorFlow so weight initialisation is repeatable.
tf.random.set_seed(1)

lstm = tf.keras.Sequential()
lstm.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding_layer"))
# There is one label per message, so only the final LSTM output is passed on.
# With return_sequences=True the head produced one prediction per timestep,
# which does not match the per-message labels.
lstm.add(tf.keras.layers.LSTM(units=64, name="lstm_layer"))
lstm.add(tf.keras.layers.Dense(64, activation="relu"))
# Single sigmoid unit: one output per message in [0, 1].
lstm.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Print the model's layer summary.
lstm.summary()

In [None]:
# Binary classification setup: sigmoid outputs scored with binary cross-entropy.
lstm.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
# Train for 20 epochs, evaluating on test_data after each one.
lstm.fit(train_data, validation_data=test_data, epochs=20)

### GRU

In [None]:
embedding_dim = 20
# NOTE(review): the +2 presumably accounts for the padding and out-of-vocabulary
# ids reserved by the encoder; confirm against encoder.vocab_size.
vocab_size = len(token_counts) + 2

# Seed TensorFlow so weight initialisation is repeatable.
tf.random.set_seed(1)

gru = tf.keras.Sequential()
gru.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding_layer"))
# There is one label per message, so only the final GRU output is passed on.
# With return_sequences=True the head produced one prediction per timestep,
# which does not match the per-message labels.
gru.add(tf.keras.layers.GRU(units=64, name="gru_layer"))
gru.add(tf.keras.layers.Dense(64, activation="relu"))
# Single sigmoid unit: one output per message in [0, 1].
gru.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Print the model's layer summary.
gru.summary()

In [None]:
# Binary classification setup: sigmoid outputs scored with binary cross-entropy.
gru.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
# Train for 20 epochs, evaluating on test_data after each one.
gru.fit(train_data, validation_data=test_data, epochs=20)