In [None]:
import pandas as pd
import numpy as np
# Read the SMS spam CSV from the Kaggle input directory, decoding it as latin-1.
df = pd.read_csv(
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)
df

In [None]:
# Only the first two columns are kept: column 0 becomes 'labels', column 1 becomes 'text'.
first_two = df.iloc[:, [0, 1]]
first_two.columns = ['labels', 'text']
df = first_two
df.head(10)

In [None]:
# Print the column dtypes and non-null counts of the two-column frame.
df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of how many messages carry each label.
# The series is passed as `x=` because recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=df.labels)
plt.title('Frequency of Both labels ')

In [None]:
# Tally identical (labels, text) rows among the messages labelled 'ham'.
# NOTE(review): this counts duplicate rows, not the total number of ham messages.
ham_rows = df.loc[df['labels'] == 'ham']
ham_rows.value_counts()

In [None]:
# Tally identical (labels, text) rows among the messages labelled 'spam'.
# NOTE(review): this counts duplicate rows, not the total number of spam messages.
spam_rows = df.loc[df['labels'] == 'spam']
spam_rows.value_counts()

In [None]:
# Features are the message text column; targets are the label column.
X, Y = df['text'], df['labels']
X, Y

In [None]:
# Encode the string labels as integers.
# The target has only two classes (binary classification, not multi-class),
# so a plain LabelEncoder is sufficient.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)
Y

In [None]:
# Reshape the labels into a single column; -1 lets NumPy infer the row count.
Y = np.reshape(Y, (-1, 1))
Y

In [None]:
from nltk.corpus import stopwords   #remove stopwords
from nltk.stem.porter import PorterStemmer   #stemming
import re
#Different models for converting text to vector
from sklearn.feature_extraction.text import CountVectorizer #BOW
from sklearn.feature_extraction.text import TfidfVectorizer    # Tf-idf
from gensim.models import Word2Vec   # word2vec method


In [None]:
# Clean every message: keep letters only, lowercase, drop English stopwords,
# and reduce the remaining words to their Porter stems.
port_stem = PorterStemmer()
# Build the stopword set once. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every token and did a linear list lookup.
english_stopwords = set(stopwords.words('english'))
corpus = []
for message in df['text']:
    letters_only = re.sub('[^a-zA-Z]', " ", message)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [None]:
# Sanity check: the cleaned corpus and the label array should have the same length.
len(corpus),len(Y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the cleaned messages for validation; the fixed seed keeps the split repeatable.
xtrain, xval, ytrain, yval = train_test_split(
    corpus,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Sizes of the training/validation texts and their label arrays.
len(xtrain),len(ytrain),len(xval),len(yval)

In [None]:
%%time
# Split each training message on whitespace so every document is a list of tokens.
documents = list(map(str.split, xtrain))

In [None]:
# Number of tokenised training documents.
len(documents)

In [None]:
import gensim

# gensim 4.0 renamed the `size` keyword to `vector_size`; pick whichever name the
# installed release accepts so the cell runs on both 3.x and 4.x.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
w2v_model = gensim.models.Word2Vec(window=3,
                                   min_count=5,
                                   workers=8,
                                   **{_dim_kwarg: 300})
# min_count (int, optional) – Ignores all words with total frequency lower than this.
# workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
# window (int, optional) – Maximum distance between the current and predicted word within a sentence.
# vector_size (int, optional; called `size` before gensim 4) – Dimensionality of the word vectors.

In [None]:
# Display the Word2Vec model object (vocabulary building and training happen in later cells).
w2v_model

In [None]:
# Build the model's vocabulary by scanning the tokenised training documents.
w2v_model.build_vocab(documents)

In [None]:
# Collect the words held in the Word2Vec vocabulary and report how many there are.
# gensim 4.0 replaced `wv.vocab` with `wv.key_to_index`; support both layouts.
if hasattr(w2v_model.wv, 'key_to_index'):
    words = w2v_model.wv.key_to_index.keys()
else:
    words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
%%time
# Train the embeddings on the tokenised training documents for 32 epochs.
w2v_model.train(
    documents,
    total_examples=len(documents),
    epochs=32,
)

In [None]:
# Nearest neighbours of "answer" in the learned embedding space.
# The lookup is made on the KeyedVectors object (`.wv`): calling it on the model
# itself was deprecated in gensim 3 and removed in gensim 4.
w2v_model.wv.most_similar("answer")

In [None]:
from keras.preprocessing.text import Tokenizer
# Tokenizer created with default settings; it is fitted on the training text in a later cell.
tokenizer=Tokenizer()

In [None]:
# Learn the word -> integer index mapping from the training messages only.
tokenizer.fit_on_texts(xtrain)
# One slot more than the number of distinct words, so that every index in
# word_index is a valid row of a (vocab_size, ...) matrix.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

In [None]:
# Display the fitted tokenizer object.
tokenizer

In [None]:
%%time
from keras.preprocessing.sequence import pad_sequences
# Map each message to its sequence of word indices, then pad/truncate every
# sequence to a fixed length of 300.
train_sequences = tokenizer.texts_to_sequences(xtrain)
val_sequences = tokenizer.texts_to_sequences(xval)
x_train = pad_sequences(train_sequences, maxlen=300)
x_test = pad_sequences(val_sequences, maxlen=300)

In [None]:
# Display the padded training sequences.
x_train

In [None]:
# Confirm the padded inputs and their labels have matching lengths for both splits.
len(x_train),len(ytrain),len(x_test),len(yval)

In [None]:
# Build Embedding Layer
# Row i holds the Word2Vec vector of the word whose tokenizer index is i;
# words missing from the Word2Vec model keep their all-zero row.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
print(embedding_matrix)
for token, row in tokenizer.word_index.items():
    if token not in w2v_model.wv:
        continue
    embedding_matrix[row] = w2v_model.wv[token]
print(embedding_matrix.shape)

In [None]:
# Display the embedding matrix after the Word2Vec vectors have been copied in.
embedding_matrix

In [None]:
from keras.layers import Activation, Dense, Dropout, Embedding
# Non-trainable embedding layer initialised from the Word2Vec-derived matrix.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=300,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
# Build The model
# Frozen embeddings -> dropout -> single LSTM -> one sigmoid unit for the binary label.
model = Sequential([
    embedding_layer,
    Dropout(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is the tracked metric.
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
# ReduceLROnPlateau: lower the learning rate when val_loss has not improved for 5 epochs.
# EarlyStopping: stop when validation accuracy has not improved by 1e-4 for 5 epochs.
# The monitored key must match what Keras logs for metrics=['accuracy'], i.e.
# 'val_accuracy' (the same key this notebook reads from history.history when
# plotting). 'val_acc' is never logged, so early stopping could never trigger.
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [None]:
%%time
# Fit on the padded training sequences, holding back 10% of them for validation.
fit_options = dict(
    batch_size=32,
    epochs=8,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks,
)
history = model.fit(x_train, ytrain, **fit_options)

In [None]:
%%time
# Evaluate on the held-out split; score[0] is the loss and score[1] the accuracy metric.
score = model.evaluate(x_test, yval, batch_size=32)
test_loss, test_accuracy = score[0], score[1]
print()
print("ACCURACY:", test_accuracy)
print("LOSS:", test_loss)

In [None]:
# Pull the per-epoch curves out of the training history.
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(len(acc))

# Accuracy curves on the current figure.
plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Loss curves on a fresh figure.
plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()