## Spam SMS Detection using LSTM

In [None]:
!pip install contractions

#### Importing the required libraries

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import pandas as pd
import numpy as np
import re
import collections
import contractions
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import warnings
warnings.simplefilter(action='ignore', category=Warning)
import keras
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#### Importing the spam sms dataset

In [None]:
# Read the SMS spam CSV; the file is decoded as latin-1.
DATA_PATH = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
df = pd.read_csv(DATA_PATH, encoding='latin-1')
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

#### There are some unwanted columns in our dataset, so we remove them

In [None]:
# Drop the unwanted "Unnamed" columns. Reassigning instead of using inplace=True,
# together with errors="ignore", keeps this cell safe to re-run after the columns
# are already gone.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [None]:
# Rename the columns positionally to "Spam or Ham" and "Tweet".
# The frame must have exactly two columns here, otherwise pandas raises a length-mismatch error.
df.columns = ["Spam or Ham","Tweet"]
df.head()

#### Plotting the value counts

In [None]:
# Class balance of the labels. The column is passed by keyword because the first
# positional parameter of countplot is `data`, not `x`, in recent seaborn releases.
sns.countplot(data=df, x="Spam or Ham");

In [None]:
# Exact number of messages per label value.
df["Spam or Ham"].value_counts()

#### Creating a function for visualizing the count of words in the sms

In [None]:
def word_count_plot(data):
    """Draw a horizontal bar chart of the 30 most frequent words in `data`.

    Parameters
    ----------
    data : iterable of str
        Text messages; each one is split on whitespace to obtain its words.
    """
    counts = collections.Counter()
    for sentence in data:
        counts.update(sentence.split())

    top_words = pd.DataFrame(counts.most_common(30), columns=["Word", "Count"])
    # Ascending order, so the most frequent word ends up as the last bar plotted.
    top_words = top_words.sort_values(by="Count")
    top_words.plot.barh(x="Word", y="Count", color="green", figsize=(10, 15))
# Most frequent words in the "Tweet" column as loaded.
word_count_plot(df["Tweet"])

#### Performing data preprocessing techniques

In [None]:
lem = WordNetLemmatizer()
# `\S+` consumes the rest of the URL up to the next whitespace; the dot after
# "www" is escaped so that it only matches a literal ".".
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_NON_LETTER_PATTERN = re.compile("[^a-z ]")
_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def preprocessing(data):
    """Clean a single SMS text and return it as a space-joined string of lemmas.

    Steps, in order: expand contractions, lower-case, remove URLs, remove every
    character that is not a-z or a space, drop English stop words and lemmatize
    the remaining words.

    Parameters
    ----------
    data : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; empty if no word survives the filtering.
    """
    # Looked up once per call instead of once per word.
    stop_words = _english_stop_words()

    sms = contractions.fix(data)
    sms = sms.lower()
    sms = _URL_PATTERN.sub("", sms).strip()  # removing url
    sms = _NON_LETTER_PATTERN.sub("", sms)  # removing symbols and numbers
    words = [lem.lemmatize(word) for word in sms.split() if word not in stop_words]
    return " ".join(words)
# Run preprocessing on every value of the "Tweet" column; X holds one cleaned string per row.
X = df["Tweet"].apply(preprocessing)

#### Word count plotting after preprocessing techniques

In [None]:
# Same chart, this time on the preprocessed text in X.
word_count_plot(X)

#### Encoding the output variables

In [None]:
from sklearn.preprocessing import LabelEncoder
# Convert the string labels to integer class ids; LabelEncoder numbers the classes in sorted order.
lb_enc = LabelEncoder()
y = lb_enc.fit_transform(df["Spam or Ham"])

#### Tokenizing the input text using keras tokenizer

In [None]:
# Build the vocabulary from X, then encode every message as a list of integer word indices.
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(X)
text_to_sequence = tokenizer.texts_to_sequences(X) 

#### Padding the tokenized text sequences so that they all have the same length

In [None]:
# The longest tokenized message defines the common sequence length.
max_length_sequence = max(len(tokens) for tokens in text_to_sequence)

# Shorter sequences are padded at the front ("pre") up to that length.
padded_sequence = pad_sequences(
    text_to_sequence,
    maxlen=max_length_sequence,
    padding="pre",
)
padded_sequence

#### Creating the LSTM Model

In [None]:
# One more than the number of words in the tokenizer's word_index.
VOC_SIZE = len(tokenizer.word_index) + 1


def create_model():
    """Build the (uncompiled) LSTM classifier.

    Architecture: Embedding(VOC_SIZE, 32) -> LSTM(100) -> Dropout(0.4)
    -> Dense(20, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

    Returns
    -------
    Sequential
        The stacked model, ready to be compiled by the caller.
    """
    layers = [
        Embedding(VOC_SIZE, 32, input_length=max_length_sequence),
        LSTM(100),
        Dropout(0.4),
        Dense(20, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),
    ]
    return Sequential(layers)
# Instantiate the network, compile it with Adam and binary cross-entropy
# (tracking accuracy), and print the layer summary.
lstm_model = create_model()
lstm_model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
lstm_model.summary()

#### Training the model

In [None]:
# Train for 5 epochs. validation_split=0.2 holds out the last 20% of the rows
# (selected before any shuffling) as validation data.
lstm_model.fit(padded_sequence, y, epochs = 5, batch_size=16, validation_split=0.2)


#### Both the training and validation accuracy are good.
#### Also refer to my article about SMS spam detection using LSTM - [here](https://www.analyticsvidhya.com/blog/2021/05/sms-spam-detection-using-lstm-a-hands-on-guide/)
#### Do upvote if you like this notebook