In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [29]:
# Read the spam dataset (expected next to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="spam.csv")

In [30]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
# Count how many messages carry each Category label.
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [32]:
# Express each label's count as a percentage of all rows.
df["Category"].value_counts().div(len(df)).mul(100)

ham     86.593683
spam    13.406317
Name: Category, dtype: float64

In [33]:
# Binary target: 1 where Category is 'spam', 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the dtype the same on every platform.
df['spam'] = (df['Category'] == 'spam').astype('int64')

In [34]:
# Preview the first five rows, now including the numeric `spam` column.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [35]:
# Load a second, independent copy of the same CSV to show an alternative
# way of encoding the label.
new_df = pd.read_csv(filepath_or_buffer="spam.csv")

In [36]:
# Encode the label in place of the text: ham -> 0, spam -> 1.
# The result is assigned back to the column instead of calling an
# `inplace=True` method on `new_df['Category']`: that chained form operates
# on a temporary object and leaves `new_df` unchanged under pandas
# Copy-on-Write.
# NOTE(review): `.map` turns any label other than 'ham'/'spam' into NaN;
# this assumes the column holds only those two values - confirm.
new_df['Category'] = new_df['Category'].map({'ham': 0, 'spam': 1})

In [37]:
# Preview the first five rows of the re-encoded copy.
new_df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
# Show `df` again for comparison: it keeps the text label plus the separate
# numeric `spam` column.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [39]:
# (rows, columns) of the labelled frame.
df.shape

(5572, 3)

#### **Train Test Split**

In [44]:
# Hold out 20% of the messages for evaluation.
# `random_state` pins the shuffle so the split, and everything derived from
# it (vocabulary indices, metrics), is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Number of training messages.
X_train.shape

(4457,)

In [46]:
# Number of held-out test messages.
X_test.shape

(1115,)

In [47]:
# First four training messages, selected by position.
X_train.iloc[:4]

224     U say leh... Of course nothing happen lar. Not...
940     Better. Made up for Friday and stuffed myself ...
1504                      Ill be there on  &lt;#&gt;  ok.
1946    Can ü call me at 10:10 to make sure dat i've w...
Name: Message, dtype: object

In [48]:
# Labels for the same first four training rows, selected by position.
y_train.iloc[:4]

224     0
940     0
1504    0
1946    0
Name: spam, dtype: int64

#### **Create bag of words representation using CountVectorizer**

In [69]:
# Learn the vocabulary from the training messages only, then encode both
# splits as token-count matrices using that same vocabulary.
v = CountVectorizer()

train_messages = X_train.values
X_train_cv = v.fit_transform(train_messages)
X_test_cv = v.transform(X_test)

In [54]:
# Display the repr of the vectorized training matrix.
X_train_cv

<4457x7749 sparse matrix of type '<class 'numpy.int64'>'
	with 59463 stored elements in Compressed Sparse Row format>

In [55]:
# Dense view of the first training row.
# Slice the sparse matrix first and densify only that one row, rather than
# converting the whole matrix to a dense array just to read row 0.
X_train_cv[:1].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [56]:
# (training messages, vocabulary size) of the count matrix.
X_train_cv.shape

(4457, 7749)

In [59]:
# Look up the vocabulary term stored at column index 1771.
# NOTE(review): 1771 looks like an arbitrary index picked for illustration;
# the term found there depends on the vocabulary fitted from the training split.
v.get_feature_names_out()[1771]

'cheesy'

In [60]:
# Show the fitted mapping from each term to its column index in the count matrix.
v.vocabulary_

{'say': 5922,
 'leh': 4079,
 'of': 4899,
 'course': 2052,
 'nothing': 4836,
 'happen': 3322,
 'lar': 4023,
 'not': 4832,
 'romantic': 5811,
 'jus': 3875,
 'bit': 1366,
 'only': 4954,
 'lor': 4214,
 'thk': 6844,
 'nite': 4793,
 'scenery': 5933,
 'so': 6278,
 'nice': 4776,
 'better': 1330,
 'made': 4309,
 'up': 7175,
 'for': 2925,
 'friday': 2988,
 'and': 959,
 'stuffed': 6541,
 'myself': 4670,
 'like': 4118,
 'pig': 5218,
 'yesterday': 7695,
 'now': 4845,
 'feel': 2795,
 'bleh': 1384,
 'but': 1580,
 'at': 1118,
 'least': 4065,
 'its': 3767,
 'writhing': 7619,
 'pain': 5062,
 'kind': 3943,
 'ill': 3614,
 'be': 1272,
 'there': 6822,
 'on': 4943,
 'lt': 4254,
 'gt': 3255,
 'ok': 4924,
 'can': 1634,
 'call': 1611,
 'me': 4412,
 '10': 252,
 'to': 6923,
 'make': 4331,
 'sure': 6622,
 'dat': 2178,
 've': 7254,
 'woken': 7568,
 'that': 6802,
 'was': 7386,
 'random': 5565,
 'saw': 5921,
 'my': 4666,
 'old': 4936,
 'roomate': 5814,
 'campus': 1633,
 'he': 3361,
 'graduated': 3210,
 'hot': 3506,
 

In [62]:
# Materialise the sparse count matrix as a dense NumPy array and display
# its first row.
X_train_np = X_train_cv.toarray()
X_train_np[0, :]

array([0, 0, 0, ..., 0, 0, 0])

In [63]:
# Column indices of the terms that occur in the first training message.
np.nonzero(X_train_np[0])

(array([1366, 2052, 3322, 3875, 4023, 4079, 4214, 4776, 4793, 4832, 4836,
        4899, 4954, 5811, 5922, 5933, 6278, 6844]),)

#### **Naive Bayes Classifier**

In [68]:
# Fit a multinomial Naive Bayes classifier on the training token counts.
model = MultinomialNB()
model.fit(X=X_train_cv, y=y_train)

In [70]:
# Predict labels for the held-out messages.
y_pred = model.predict(X=X_test_cv)

In [73]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       969
           1       0.96      0.93      0.94       146

    accuracy                           0.99      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



#### **Test on a random datapoint**

In [102]:
# Messages to classify, as a list rather than a set: a list keeps the input
# order, so each prediction can be matched to its message if more are added.
message = ["Upto 20% off on parking, exclusing offer just for you"]

In [103]:
# Encode the new message with the already-fitted vocabulary, then classify it.
message_cnt = v.transform(raw_documents=message)
model.predict(X=message_cnt)

array([1])