  # SPAM DETECTION                                                           

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [6]:
df.shape

(5572, 3)

In [7]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size = 0.2)

In [9]:
X_train.shape

(4457,)

In [10]:
X_test.shape

(1115,)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer 

v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv


<4457x7781 sparse matrix of type '<class 'numpy.int64'>'
	with 59559 stored elements in Compressed Sparse Row format>

In [12]:
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
X_train_cv.shape

(4457, 7781)

In [14]:
v.get_feature_names_out()

array(['00', '000', '008704050406', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [16]:
v.vocabulary_

{'dont': 2422,
 'put': 5539,
 'your': 7745,
 'phone': 5218,
 'on': 4963,
 'silent': 6182,
 'mode': 4582,
 'ok': 4946,
 'pls': 5287,
 'help': 3425,
 'me': 4435,
 'tell': 6768,
 'ashley': 1089,
 'that': 6827,
 'cant': 1648,
 'find': 2877,
 'her': 3437,
 'number': 4885,
 'oh': 4941,
 'have': 3380,
 'you': 7741,
 'emigrated': 2594,
 'or': 5003,
 'something': 6320,
 'maybe': 4428,
 '30': 417,
 'was': 7421,
 'bit': 1372,
 'hopeful': 3518,
 'sorry': 6346,
 'missed': 4542,
 'babe': 1182,
 'up': 7214,
 'late': 4056,
 'and': 956,
 'slept': 6242,
 'in': 3665,
 'hope': 3516,
 'enjoy': 2615,
 'driving': 2475,
 'lesson': 4114,
 'boytoy': 1481,
 'miss': 4541,
 'too': 6996,
 'teasing': 6755,
 'kiss': 3975,
 'any': 990,
 'special': 6377,
 'today': 6961,
 'cuz': 2140,
 'ibored': 3606,
 'don': 2418,
 'wanna': 7407,
 'study': 6551,
 'can': 1637,
 'let': 4116,
 'know': 3988,
 'details': 2282,
 'of': 4921,
 'fri': 3021,
 'when': 7511,
 'out': 5041,
 'cos': 2025,
 'not': 4858,
 'tom': 6976,
 'mentionned': 44

In [17]:
v.get_feature_names_out()[4963]

'on'

In [21]:
X_train[:4]

1278                Dont put your phone on silent mode ok
4847    Pls help me tell Ashley that i cant find her n...
3095    Have you emigrated or something? Ok maybe 5.30...
4512    Sorry I missed you babe. I was up late and sle...
Name: Message, dtype: object

In [27]:
X_train_np = X_train_cv.toarray()
np.where(X_train_np[0]!=1)

(array([   0,    1,    2, ..., 7778, 7779, 7780], dtype=int64),)

In [26]:
X_train_np[0][2422]

1

In [28]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [29]:
X_test_cv = v.transform(X_test)

In [32]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [38]:
emails = [ ' Hey Jason can we get together to watch cricket? ', 'Upto 20% discount on purchasing!!']
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)