# Task 3

In [115]:
import pandas as pd
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

# Data collected from kaggle

In [116]:
msgs = pd.read_csv(r"D:\spam1.csv", encoding='cp1252')
msgs = msgs.iloc[:, [0,1]]
msgs.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data pre-processing

In [117]:
msgs.rename(columns={'v1':'label', 'v2':'message'}, inplace = True)

In [118]:
msgs.replace({"label":{'ham':1,'spam':0}}, inplace = True)

In [119]:
msgs.message = msgs.message.str.lower()

In [120]:
from nltk.corpus import stopwords

In [121]:
abcd = stopwords.words('english')
abcd

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [122]:
words_to_remove = ['not',"wouldn't","don't","didn't","aren't","doesn't","hasn't","won't","haven't","hadn't","mightn't","wasn't","isn't","needn't","shouldn't","weren't","couldn't"]
abcd = [word for word in abcd if word not in words_to_remove]
abcd

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [123]:
import string

In [124]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [125]:
msgs.message = msgs.message.astype(str)

In [126]:
def text_process(mess):
    """
    1. remove the punctuation
    2. remove the stopwords
    3. return the clean text words
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = "".join(nopunc)
    return[word for word in nopunc.split() if word not in abcd]

# COUNT VECTORIZER AND TF-IDF

In [127]:
from sklearn.feature_extraction.text import CountVectorizer

In [128]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(msgs['message'])

In [129]:
len(bow_transformer.vocabulary_)
# only clean words

9423

In [130]:
tdm = bow_transformer.transform(msgs['message'])

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer()

In [132]:
tf_idf = TfidfVectorizer()

In [133]:
tf_idf_metrics = tf_idf.fit_transform(msgs.message)

# Train - Test split

In [134]:
msgs_x = msgs.iloc[:,1]
msgs_y = msgs.iloc[:,0]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(tdm, msgs_y, test_size=.2)
x1_train, x1_test, y1_train, y1_test = train_test_split(tf_idf_metrics, msgs_y, test_size=.2)

In [135]:
from sklearn.linear_model import LogisticRegression
log_tdm = LogisticRegression()
log_tfidf = LogisticRegression()

In [136]:
log_tdm.fit(x_train,y_train)

In [137]:
log_tfidf.fit(x1_train, y1_train)

In [138]:
from sklearn.metrics import *

In [139]:
pred_log_tfidf = log_tfidf.predict(x1_test)

In [140]:
pred_log_tdm = log_tdm.predict(x_test)

In [141]:
tab_tfidf = confusion_matrix(y1_test, pred_log_tfidf)
tab_tfidf

array([[ 143,   39],
       [   2, 1172]], dtype=int64)

In [142]:
tab_tdm = confusion_matrix(y_test, pred_log_tdm)
tab_tdm

array([[ 145,   22],
       [   1, 1188]], dtype=int64)

In [143]:
print(classification_report(y1_test, pred_log_tfidf))

              precision    recall  f1-score   support

           0       0.99      0.79      0.87       182
           1       0.97      1.00      0.98      1174

    accuracy                           0.97      1356
   macro avg       0.98      0.89      0.93      1356
weighted avg       0.97      0.97      0.97      1356



In [144]:
print(classification_report(y_test, pred_log_tdm))

              precision    recall  f1-score   support

           0       0.99      0.87      0.93       167
           1       0.98      1.00      0.99      1189

    accuracy                           0.98      1356
   macro avg       0.99      0.93      0.96      1356
weighted avg       0.98      0.98      0.98      1356



In [145]:
from sklearn.tree import DecisionTreeClassifier
dt_tfidf = DecisionTreeClassifier(criterion='entropy')
dt_tdm = DecisionTreeClassifier(criterion='entropy')

In [146]:
dt_tfidf.fit(x1_train, y1_train)

In [147]:
dt_tdm.fit(x_train, y_train)

In [148]:
pred_tfidf_dt = dt_tfidf.predict(x1_test)
pred_tdm_dt = dt_tdm.predict(x_test)

In [149]:
confusion_matrix(y1_test, pred_tfidf_dt)

array([[ 157,   25],
       [  14, 1160]], dtype=int64)

In [150]:
print(classification_report(y1_test, pred_tfidf_dt))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       182
           1       0.98      0.99      0.98      1174

    accuracy                           0.97      1356
   macro avg       0.95      0.93      0.94      1356
weighted avg       0.97      0.97      0.97      1356



In [152]:
confusion_matrix(y_test, pred_tdm_dt)

array([[ 140,   27],
       [   9, 1180]], dtype=int64)

In [153]:
print(classification_report(y_test, pred_tdm_dt))

              precision    recall  f1-score   support

           0       0.94      0.84      0.89       167
           1       0.98      0.99      0.98      1189

    accuracy                           0.97      1356
   macro avg       0.96      0.92      0.94      1356
weighted avg       0.97      0.97      0.97      1356



In [154]:
from sklearn.svm import SVC
svm_tfidf = SVC()
svm_tdm = SVC()

In [155]:
svm_tfidf.fit(x1_train, y1_train)

In [156]:
svm_tdm.fit(x_train, y_train)

In [157]:
pred_svm_tfidf = svm_tfidf.predict(x1_test)
pred_svm_tdm = svm_tdm.predict(x_test)

In [158]:
confusion_matrix(y1_test, pred_svm_tfidf)

array([[ 159,   23],
       [   0, 1174]], dtype=int64)

In [159]:
confusion_matrix(y_test, pred_log_tdm)

array([[ 145,   22],
       [   1, 1188]], dtype=int64)

In [160]:
print(classification_report(y1_test, pred_svm_tfidf))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93       182
           1       0.98      1.00      0.99      1174

    accuracy                           0.98      1356
   macro avg       0.99      0.94      0.96      1356
weighted avg       0.98      0.98      0.98      1356



In [161]:
print(classification_report(y_test, pred_svm_tdm))

              precision    recall  f1-score   support

           0       1.00      0.84      0.91       167
           1       0.98      1.00      0.99      1189

    accuracy                           0.98      1356
   macro avg       0.99      0.92      0.95      1356
weighted avg       0.98      0.98      0.98      1356



# LSTMS

In [162]:
msgs = pd.read_csv(r"D:\spam1.csv", encoding='cp1252')
msgs = msgs.iloc[:, [0,1]]
msgs.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [163]:
msgs.rename(columns={'v1':'label', 'v2':'message'}, inplace = True)

In [164]:
msgs.replace({"label":{'ham':1,'spam':0}}, inplace = True)

In [165]:
msgs_x = msgs.iloc[:,1]
msgs_y = msgs.iloc[:,0]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(msgs_x, msgs_y, test_size=.2)

In [166]:
y_train = to_categorical(y_train)

In [167]:
max_num_words = 10000      # max words to be taken from entire corpus
seq_len = 50              # 50 words each from each document
embedding_size = 100      # embedding size of each word

In [168]:
from keras.preprocessing.text import Tokenizer         # is used for tokenization
from keras.preprocessing.sequence import pad_sequences # used to bring all samples in same  lengthy -- extra zeroes

In [169]:
# embedding done on entire data
# embedding is being generated here
# seq len is selecting only those words which are present in your 10k
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(msgs.message)
x_train = tokenizer.texts_to_sequences(x_train)  # will convert the text into sequence ids
x_train = pad_sequences(x_train, maxlen= seq_len)

x_test = tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=seq_len)

model = Sequential()
model.add(Embedding(input_dim = max_num_words,
                    input_length=seq_len,
                    output_dim = embedding_size))

In [170]:
model.add(LSTM(5))
model.add(Dense(2, activation='softmax')) # 2 becoz 2 class in the target variablee
from tensorflow.keras.optimizers import Adam
adam = Adam(lr = .001)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [171]:
model.fit(x_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x20a333abc70>

In [172]:
pred= model.predict(x_test)



In [173]:
pred = pred.argmax(axis = 1)

In [174]:
from sklearn.metrics import *

In [175]:
tab1 = confusion_matrix(y_test, pred)
tab1

array([[ 174,    9],
       [   2, 1171]], dtype=int64)

In [176]:
accuracy_score(y_test, pred)

0.9918879056047197

# Evaluation

In [177]:
evaluation = pd.DataFrame()
evaluation['tdm_log'] = [accuracy_score(y_test, pred_log_tdm)]
evaluation['tfidf_log'] = [accuracy_score(y1_test, pred_log_tfidf)]
evaluation['tdm_dt'] = [accuracy_score(y_test, pred_tdm_dt)]
evaluation['tfidf_dt'] = [accuracy_score(y1_test, pred_tfidf_dt)]
evaluation['tdm_svm'] = [accuracy_score(y_test, pred_svm_tdm)]
evaluation['tfidf_svm'] = [accuracy_score(y1_test, pred_svm_tfidf)]
evaluation

Unnamed: 0,tdm_log,tfidf_log,tdm_dt,tfidf_dt,tdm_svm,tfidf_svm
0,0.789823,0.969764,0.789086,0.971239,0.794248,0.983038


I have used countvectorizer, Tf-Idf, tokenizer and lstms for building the model. I have used machine learning algorithms like svm, logistic regression and Decision tree, where I have got best model accuracy in logistic with term document metrics with 98.45% and in svm with term documentg metrics again using 98.08%. 

While later on i have also used LSTMS and i got the final accuracy 99.14% after doing some hyper-parameter tuning.