In [1]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import pandas as pd
import numpy as np
# Label encoding: 0 = not spam, 1 = spam

In [2]:
# Load the labelled emails; later cells read the `text` and `spam` columns of this frame.
df=pd.read_csv('emails2.csv')

In [3]:
# List the columns that contain at least one missing value (an empty Index means none do).
df.columns[df.isna().any()]

Index([], dtype='object')

In [4]:
# Row count per label, to check class balance (the recorded output shows 4360 vs 1368: imbalanced).
df.groupby('spam').count()

Unnamed: 0_level_0,text
spam,Unnamed: 1_level_1
0,4360
1,1368


In [5]:
# Balance the dataset by truncating both classes to the size of the smaller one.
# The size is derived from the data rather than hard-coded, so the cell keeps
# working if the CSV changes. Note that slicing keeps the FIRST rows in file
# order; it is not a random sample.
df0 = df[df.spam == 0]
df1 = df[df.spam == 1]
minority_size = min(len(df0), len(df1))
df = pd.concat([df0[:minority_size], df1[:minority_size]])

In [6]:
# Inputs come from the `text` column, targets from the `spam` column (0 = not spam, 1 = spam).
x, y = df['text'], df['spam']

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

In [8]:
# Bag-of-words features: the vectorizer is fitted on the training emails only,
# and the test emails are encoded with that same fitted vectorizer.
# NOTE: X_train / X_test are rebound here from raw text to the vectorized
# matrices, so re-running this cell without first re-running the split would
# pass the matrices back into the vectorizer.
v = CountVectorizer()
X_train = v.fit_transform(X_train)
X_test = v.transform(X_test)

In [9]:
# One default-parameter instance of each classical classifier to compare.
lrmodel, svmmodel, nbmodel = LogisticRegression(), SVC(), MultinomialNB()

In [10]:
def evaluvate(model, sample_emails=('click this link and win cash', 'can you call me')):
    """Fit ``model`` on the training split, print test metrics, and classify sample emails.

    Relies on the notebook-level names ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` (the vectorized split) and ``v`` (the fitted vectorizer).

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        sample_emails: Raw email strings to classify after fitting. Each is
            printed as ``"<email> - Spam"`` or ``"<email> - Not Spam"``.

    Returns:
        None. Accuracy, confusion matrix, classification report and the
        per-sample verdicts are printed.
    """
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)

    # Score from the predictions already computed: for a classifier,
    # model.score(X_test, y_test) would run predict() a second time to
    # produce this same accuracy.
    print(accuracy_score(y_test, y_predicted))
    print(confusion_matrix(y_test, y_predicted))
    print(classification_report(y_test, y_predicted))

    for email in sample_emails:
        # The vectorizer expects an iterable of documents, hence the one-item list.
        is_spam = model.predict(v.transform([email]))[0]
        verdict = 'Spam' if is_spam else 'Not Spam'
        print(f'{email} - {verdict}')

    

In [11]:
# Logistic regression with default parameters.
evaluvate(lrmodel)

0.9854014598540146
[[264   7]
 [  1 276]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99       271
           1       0.98      1.00      0.99       277

    accuracy                           0.99       548
   macro avg       0.99      0.99      0.99       548
weighted avg       0.99      0.99      0.99       548

click this link and win cash - Spam
can you call me - Not Spam


In [12]:
# Support-vector classifier with default parameters.
evaluvate(svmmodel)

0.9343065693430657
[[239  32]
 [  4 273]]
              precision    recall  f1-score   support

           0       0.98      0.88      0.93       271
           1       0.90      0.99      0.94       277

    accuracy                           0.93       548
   macro avg       0.94      0.93      0.93       548
weighted avg       0.94      0.93      0.93       548

click this link and win cash - Spam
can you call me - Spam


In [13]:
# Multinomial naive Bayes with default parameters.
evaluvate(nbmodel)

0.9872262773722628
[[266   5]
 [  2 275]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       271
           1       0.98      0.99      0.99       277

    accuracy                           0.99       548
   macro avg       0.99      0.99      0.99       548
weighted avg       0.99      0.99      0.99       548

click this link and win cash - Spam
can you call me - Not Spam


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras import Sequential, layers
import tensorflow as tf

In [15]:
# Small feed-forward classifier over the vectorized emails.
# The input width is the number of columns in the vectorized training matrix.
input_shape = X_train.shape[1]
model = Sequential([
  # Declare the input with an explicit Input layer; passing `input_shape=` to
  # the first Dense layer makes Keras emit a warning for Sequential models.
  layers.Input(shape=(input_shape,)),
  layers.Dense(128, activation='relu'),
  layers.Dense(64, activation='relu'),
  # Two softmax outputs: index 0 = not spam, index 1 = spam.
  layers.Dense(2, activation='softmax')
])

model.compile(optimizer='adam',
              # Sparse variant: targets are integer class ids, not one-hot vectors.
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Convert the vectorized training matrix to a dense NumPy array before handing it to Keras.
model.fit(X_train.toarray(), y_train, epochs=5, batch_size=10)

Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.9216 - loss: 0.2487
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.9951 - loss: 0.0180
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 1.0000 - loss: 0.0025
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 1.0000 - loss: 8.4792e-04
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 1.0000 - loss: 3.2913e-04


<keras.src.callbacks.history.History at 0x1aa970d1410>

In [17]:
# Classify the same two hand-written samples with the neural network.
# Each sample is a one-item list because the vectorizer expects an iterable of documents.
for ownemail in (['click this link and win cash'], ['can you call me']):
    testemail = v.transform(ownemail)
    prediction = model.predict(testemail.toarray())[0]
    # Output index 1 is taken to be the spam class (labels are 0 = not spam, 1 = spam).
    print('Spam' if prediction.argmax() == 1 else 'Not Spam')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Not Spam


In [18]:
# Predict class probabilities for the test set, then keep the most probable class per row.
y_pred = model.predict(X_test.toarray()).argmax(axis=1)

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [19]:
def accuracy(y_pred, y):
    """Return the fraction of predictions that equal their true labels.

    Args:
        y_pred: Iterable of predicted labels.
        y: Iterable of true labels, in the same order as ``y_pred``.

    Returns:
        A float in [0, 1]: matching positions divided by the number of labels.

    Raises:
        ValueError: If the two inputs differ in length, or if they are empty.
    """
    # Materialise both inputs so their lengths can be compared; this also
    # lets callers pass generators or other one-shot iterables.
    predicted = list(y_pred)
    actual = list(y)

    # A bare zip() would silently drop the unmatched tail while the division
    # still used len(y), producing a wrong score instead of an error.
    if len(predicted) != len(actual):
        raise ValueError(
            f"length mismatch: {len(predicted)} predictions vs {len(actual)} labels"
        )
    if not actual:
        raise ValueError("cannot compute accuracy of empty inputs")

    # Count with `1 for ... if` so the total stays a built-in int even when
    # the elements are NumPy scalars.
    noofcorrect = sum(1 for p, t in zip(predicted, actual) if p == t)
    return noofcorrect / len(actual)
    

In [20]:
# Test-set accuracy of the neural network, computed with the hand-written helper above.
accuracy(y_pred,y_test)

0.9908759124087592

In [21]:
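# Conclusion: on this test split the neural network scores highest (accuracy ~0.991),
# followed by naive Bayes (~0.987) and logistic regression (~0.985); the default SVC
# trails at ~0.934. The top three differ by only a few of the 548 test emails.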