In [None]:

import numpy as np 
import pandas as pd
import os
# Print the full path of every file found under /kaggle/input so the
# dataset location can be read off the cell output.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)



In [None]:
# Decode explicitly as Latin-1: it maps every byte to a character, so the
# load cannot fail with UnicodeDecodeError the way pandas' default UTF-8
# codec does as soon as the file contains a non-UTF-8 byte.
df = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    encoding="latin-1",
)

In [None]:
df.head()

In [None]:
# Keep only the label (v1) and text (v2) columns.
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
df.head()

In [None]:
df['v2'][0]

In [None]:
df['v2'][2]

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

In [None]:
# Bar chart of how many rows carry each label in v1.
# The column is passed as x= because current seaborn releases accept only
# `data` positionally; a positional 'v1' collides with data=df there.
sns.countplot(x='v1', data=df)
plt.title("No. of ham vs spam emails")

In [None]:
import re
import string


def clean(text):
    """Normalise one message to lowercase text with noise removed.

    Steps, in order: lowercase; drop ``[...]`` spans; drop ``http(s)://`` and
    ``www.`` URLs; drop ``<...>`` tags; drop ASCII punctuation; drop newline
    characters; drop every token that contains a digit.

    Args:
        text: Value to clean. It is passed through ``str()`` first, so
            non-string input (e.g. ``None`` or NaN) is accepted.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans can
        leave consecutive spaces, and newlines are deleted without inserting
        a space.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid string escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
import re
import string

In [None]:
df['v2'] = df['v2'].apply(clean)
df.head()

In [None]:
from nltk.corpus import stopwords


In [None]:
stopword=set(stopwords.words('english'))


In [None]:
stopword

In [None]:
def remove_stopwords(text):
    """Drop every space-separated token of ``text`` that is in ``stopword``.

    The text is split on single spaces, so empty tokens produced by
    consecutive spaces are kept unless the empty string is itself in the set.
    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for token in text.split(' '):
        if token in stopword:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
df['v2'] = df['v2'].apply(remove_stopwords)
df.head()

In [None]:
import nltk

In [None]:
stemmer = nltk.SnowballStemmer("english")

In [None]:
stemmer

In [None]:
def stemming(text):
    """Stem each space-separated token of ``text`` with ``stemmer``.

    The text is split on single spaces, so empty tokens from consecutive
    spaces are passed to the stemmer too. Returns the stemmed tokens joined
    by single spaces.
    """
    stemmed_tokens = map(stemmer.stem, text.split(' '))
    return " ".join(stemmed_tokens)

In [None]:
df['v2'] = df['v2'].apply(stemming)
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['v1']=le.fit_transform(df['v1'])
df.head()

In [None]:
le.classes_

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
df[df['v1']==0]['v2']

In [None]:
x=df['v2']
y=df['v1']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set with a fixed seed, then print the size of each
# features/labels pair as a quick sanity check that they line up.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(*map(len, (x_train, y_train)))
print(*map(len, (x_test, y_test)))

In [None]:
x_test

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english', ngram_range=(1,3))
x_train_vectorizer=count.fit_transform(x_train)

In [None]:
x_test_vectorizer=count.transform(x_test)

In [None]:
# Preview the first rows only: converting the whole sparse 1-3-gram count
# matrix to a dense array allocates n_documents x vocabulary_size cells just
# to display it.
x_train_vectorizer[:5].toarray()

In [None]:
count.vocabulary_

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer


In [None]:
# Re-weight the n-gram counts with TF-IDF. The transformer is fitted on the
# training matrix only.
tfidf = TfidfTransformer()
x_train_tfidf = tfidf.fit_transform(x_train_vectorizer)

# Preview the first rows only: densifying the full sparse matrix allocates
# n_documents x vocabulary_size floats just to display it.
x_train_tfidf[:5].toarray()

In [None]:
x_test_tfidf = tfidf.transform(x_test_vectorizer)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [None]:
model_vectorizer= MultinomialNB().fit(x_train_vectorizer, y_train)
prediction_vectorizer=model_vectorizer.predict(x_test_vectorizer)
print(confusion_matrix(y_test,prediction_vectorizer))
print (classification_report(y_test, prediction_vectorizer))


In [None]:
model_tfidf= MultinomialNB().fit(x_train_tfidf, y_train)
prediction_tfidf=model_tfidf.predict(x_test_tfidf)
print (classification_report(y_test, prediction_tfidf))
print(confusion_matrix(y_test,prediction_tfidf))

In [None]:
import xgboost as xgb


In [None]:
xgb_model=xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc' )

In [None]:
# fit() hands back the estimator it was called on, so fitting `xgb_model`
# directly would make `xgb_model_vectorizer` an alias of it, and the alias
# would be silently retrained whenever `xgb_model` is fitted on other
# features. Fit an unfitted copy with the same hyper-parameters instead.
from sklearn.base import clone

xgb_model_vectorizer = clone(xgb_model).fit(x_train_vectorizer, y_train)
xgb_predictions_vectorizer = xgb_model_vectorizer.predict(x_test_vectorizer)
print(confusion_matrix(y_test, xgb_predictions_vectorizer))
print(classification_report(y_test, xgb_predictions_vectorizer))

In [None]:
xgb_model = xgb_model.fit(x_train_tfidf, y_train)
xgb_predictions=xgb_model.predict(x_test_tfidf)
print(confusion_matrix(y_test,xgb_predictions))
print (classification_report(y_test, xgb_predictions))

In [None]:

from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding,SpatialDropout1D
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping


In [None]:
max_words = 1000
max_len = 500
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [None]:
sequences_matrix

In [None]:
# First network, built with the functional API:
# token ids -> 50-d embedding -> LSTM(64) -> Dense(256) + ReLU
# -> Dropout(0.5) -> one sigmoid unit.
# `layer` is rebound at every step, so after this cell it refers to the final
# sigmoid output tensor rather than to any intermediate layer.
inputs = Input(name='inputs',shape=[max_len])
# Vocabulary size and sequence length come from max_words / max_len.
layer = Embedding(max_words,50,input_length=max_len)(inputs)
layer = LSTM(64)(layer)
layer = Dense(256,name='FC1')(layer)
layer = Activation('relu')(layer)
layer = Dropout(0.5)(layer)
# Single output unit; the sigmoid below turns it into a score in (0, 1).
layer = Dense(1,name='out_layer')(layer)
layer = Activation('sigmoid')(layer)
model = Model(inputs=inputs,outputs=layer)

In [None]:
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [None]:
# Train with 20% of the training rows held out for validation.
# patience must be smaller than `epochs` for early stopping to be able to
# fire: the first epoch always counts as an improvement, so a patience equal
# to the epoch count can never be exhausted within the run.
model.fit(sequences_matrix, y_train, batch_size=128, epochs=10,
          validation_split=0.2,
          callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=4)])

In [None]:
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [None]:
accr = model.evaluate(test_sequences_matrix,y_test)

In [None]:
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
lstm_prediction=model.predict(test_sequences_matrix)

In [None]:
lstm_prediction

In [None]:
# Spot-check a handful of raw sigmoid scores. Printing one line per test
# message floods the cell output without adding information.
for prediction in lstm_prediction[:10]:
    print(prediction[0])

In [None]:
# Turn each sigmoid score into a hard label: 0 when the score is below 0.5,
# otherwise 1.
res = []
for prediction in lstm_prediction:
    res.append(int(not prediction[0] < 0.5))

In [None]:
print(confusion_matrix(y_test,res))

In [None]:
# Second tokenizer configuration: a larger vocabulary cap and longer padded
# sequences than the first one.
# NOTE(review): this rebinds max_words, max_len, tokenizer, sequences and
# sequences_matrix, which the first model's cells also read. Re-running those
# earlier cells after this one will pick up these values instead.
max_words = 50000
max_len = 1000
tokenizer = Tokenizer(num_words=max_words)
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [None]:
from keras.models import Sequential


In [None]:
# Second network, built with the Sequential API:
# token ids -> 100-d embedding -> SpatialDropout1D(0.2)
# -> LSTM(100) with input and recurrent dropout -> one sigmoid unit.
model2 = Sequential()
# Vocabulary size and sequence length come from max_words / max_len.
model2.add(Embedding(max_words, 100, input_length=max_len))
model2.add(SpatialDropout1D(0.2))
model2.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model2.add(Dense(1, activation='sigmoid'))
model2.summary()
# Same loss, optimizer and metric as the first network.
model2.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [None]:
history2=model2.fit(sequences_matrix,y_train,batch_size=256,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001,patience=4)])

In [None]:
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [None]:
accr = model2.evaluate(test_sequences_matrix,y_test)

In [None]:
lstm2_prediction=model2.predict(test_sequences_matrix)

In [None]:
# Turn model2's sigmoid scores into hard labels: 0 when the score is below
# 0.5, otherwise 1.
res2 = []
for prediction in lstm2_prediction:
    res2.append(0 if prediction[0] < 0.5 else 1)

In [None]:
print(confusion_matrix(y_test,res2))

In [None]:
model2.save("Spam_Classifier_model.h5")

In [None]:
import keras

In [None]:
load_model=keras.models.load_model("./Spam_Classifier_model.h5")

In [None]:
test = 'Get free ringtone like never before get free '
def clean_text(text):
    """Apply the notebook's text preprocessing to one raw message.

    Delegates to ``clean``, ``remove_stopwords`` and ``stemming`` in that
    order. These are the same functions applied to the ``v2`` column, so text
    prepared for inference goes through exactly the same steps rather than
    through a second, hand-copied set of regexes.

    Args:
        text: Raw message; ``clean`` converts it with ``str()``.

    Returns:
        The cleaned, stop-word-free, stemmed string.
    """
    return stemming(remove_stopwords(clean(text)))
# Preprocess the sample message and wrap it in a list, since the tokenizer
# works on a batch of texts.
test = [clean_text(test)]
print(test)
seq = tokenizer.texts_to_sequences(test)
# Pad to max_len, the same length used for the training sequences, instead
# of repeating the number as a literal that can drift out of sync.
padded = sequence.pad_sequences(seq, maxlen=max_len)
pred = load_model.predict(padded)
print("pred", pred)
# predict() returns one row per input text; take the single score as a plain
# float so the comparison is on a scalar rather than on an array.
spam_probability = float(pred[0][0])
if spam_probability < 0.5:
    print("ham")
else:
    print("spam")