## Spam Filteration

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv('spam.csv', encoding = 'latin-1')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5575 entries, 0 to 5574
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5575 non-null   object
 1   v2          5575 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.9+ KB


In [5]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,
5572,ham,"Virat Kohli, AB de Villiers set to auction the...",,,
5573,spam,Aarohi your lucky mobile no won a prize contac...,,,


In [8]:
df = data[['v2', 'v1']].rename(columns = {'v2': 'email', 'v1': 'label'})
df.head()

Unnamed: 0,email,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [None]:
#we want to build a model to detect a spam message

In [9]:
df.label.replace({'ham': 0, 'spam': 1}, inplace = True)
df

Unnamed: 0,email,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5570,The guy did some bitching but I acted like i'd...,0
5571,Rofl. Its true to its name,0
5572,"Virat Kohli, AB de Villiers set to auction the...",0
5573,Aarohi your lucky mobile no won a prize contac...,1


In [10]:
df['label'].value_counts()

label
0    4826
1     749
Name: count, dtype: int64

In [11]:
df['label'].value_counts(normalize = True)

label
0    0.86565
1    0.13435
Name: proportion, dtype: float64

In [12]:
X = df['email']
Y = df['label']

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, 
                                            random_state = 10, stratify = Y)

In [14]:
Y_train.value_counts(normalize = True)

label
0    0.865695
1    0.134305
Name: proportion, dtype: float64

In [15]:
X_train.head()

3077               Okay but i thought you were the expert
3594     Huh but i cant go 2 ur house empty handed right?
4707    Did you say bold, then torch later. Or one tor...
3762                   K.i will send in  &lt;#&gt;  min:)
4880                          When/where do I pick you up
Name: email, dtype: object

In [16]:
#Creation of bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vector = TfidfVectorizer()

In [17]:
new_X_train = vector.fit(X_train)
new_X_train

In [33]:
X_train_vector = vector.transform(X_train)

In [34]:
X_test_vector = vector.transform(X_test)

In [35]:
X_train_vector.shape

(4460, 7623)

In [36]:
X_test_vector.shape

(1115, 7623)

In [37]:
# to see the bag of words
count_array = X_train_vector.toarray()
count_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
#to display the bag of words in a dataframe
df100 = pd.DataFrame(data = count_array, columns = vector.get_feature_names_out())
df100.head()

Unnamed: 0,00,000,000pes,0089,01223585236,01223585334,0125698789,02,0207,02072069400,...,ó_,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
#let's train the model
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [68]:
mnb = MultinomialNB()
mnb.fit(X_train_vector, Y_train)

In [60]:
prediction = mnb.predict(X_test_vector)

In [66]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(Y_test, prediction)

0.9614349775784753

In [67]:
print(confusion_matrix(Y_test, prediction))
print(classification_report(Y_test, prediction))

[[965   0]
 [ 43 107]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.71      0.83       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [46]:
bnb = BernoulliNB()
bnb.fit(X_train_vector, Y_train)
prediction = bnb.predict(X_test_vector)

In [47]:
accuracy_score(Y_test, prediction)

0.9775784753363229

In [48]:
print(confusion_matrix(Y_test, prediction))
print(classification_report(Y_test, prediction))

[[964   1]
 [ 24 126]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [74]:
#create a function, will send
def enter_text(message):
    if mnb.predict(vector.transform([message])) == 1:
        print('Spam message')
    else:
        print('Not Spam')

In [75]:
enter_text('Hello, how are you doing my friend?')

Not Spam


In [76]:
enter_text('Winner, you have a good chance to win exciting prize money')

Spam message


In [77]:
enter_text('password, you have a good chance to win exciting prize money')

Not Spam


## TRYING THIS WITH CountVectorizer

In [79]:
cvec = CountVectorizer()
X_train_vector = cvec.fit_transform(X_train)
X_test_vector = cvec.transform(X_test)

In [80]:
X_train_vector.shape

(4460, 7623)

In [81]:
X_test_vector.shape

(1115, 7623)

In [82]:
df100 = pd.DataFrame(data = X_train_vector.toarray(), columns = cvec.get_feature_names_out())
df100.head()

Unnamed: 0,00,000,000pes,0089,01223585236,01223585334,0125698789,02,0207,02072069400,...,ó_,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
mnb = MultinomialNB()
mnb.fit(X_train_vector, Y_train)
prediction = mnb.predict(X_test_vector)

In [84]:
accuracy_score(Y_test, prediction)

0.9919282511210762

In [85]:
print(confusion_matrix(Y_test, prediction))
print(classification_report(Y_test, prediction))

[[963   2]
 [  7 143]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       965
           1       0.99      0.95      0.97       150

    accuracy                           0.99      1115
   macro avg       0.99      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [86]:
#create a function, will send
def enter_text(message):
    if mnb.predict(vector.transform([message])) == 1:
        print('Spam message')
    else:
        print('Not Spam')

In [87]:
enter_text('Hello, how are you doing my friend?')

Not Spam


In [88]:
enter_text('Winner, you have a good chance to win exciting prize money')

Spam message


In [109]:
enter_text('password? you have a good chance to win exciting prize money')

Spam message


In [93]:
enter_text('congratulations. You have won the prize money')

Spam message


In [98]:
enter_text('You have won a free mobile phone')

Spam message


In [107]:
enter_text('number 1235433 , you win the lottery prize')

Spam message


In [None]:
# CountVectorizer works better here!!!