In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x : 1 if x == 'spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

4190    Well the general price is  &lt;#&gt; /oz, let ...
4774       Ok then u tell me wat time u coming later lor.
4640    Of course. I guess god's just got me on hold r...
1808      Do have a nice day today. I love you so dearly.
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:4]

4190    0
4774    0
4640    0
1808    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

# Create bag of words representation using CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7709 sparse matrix of type '<class 'numpy.int64'>'
	with 59449 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
X_train_cv.shape

(4457, 7709)

In [18]:
v.get_feature_names_out()[1771]

'chef'

In [19]:
v.vocabulary_

{'well': 7419,
 'the': 6790,
 'general': 3089,
 'price': 5388,
 'is': 3721,
 'lt': 4229,
 'gt': 3244,
 'oz': 5028,
 'let': 4069,
 'me': 4384,
 'know': 3936,
 'if': 3577,
 'when': 7446,
 'how': 3504,
 'much': 4610,
 'you': 7668,
 'want': 7343,
 'ok': 4895,
 'then': 6803,
 'tell': 6728,
 'wat': 7363,
 'time': 6880,
 'coming': 1918,
 'later': 4009,
 'lor': 4185,
 'of': 4871,
 'course': 2043,
 'guess': 3253,
 'god': 3152,
 'just': 3849,
 'got': 3188,
 'on': 4915,
 'hold': 3449,
 'right': 5760,
 'now': 4820,
 'do': 2360,
 'have': 3341,
 'nice': 4746,
 'day': 2173,
 'today': 6918,
 'love': 4207,
 'so': 6270,
 'dearly': 2187,
 'nah': 4655,
 'man': 4308,
 'my': 4640,
 'car': 1650,
 'meant': 4392,
 'to': 6911,
 'be': 1278,
 'crammed': 2055,
 'full': 3025,
 'people': 5130,
 'was': 7356,
 'actually': 818,
 'about': 771,
 'send': 5991,
 'reminder': 5675,
 'wonderful': 7542,
 'weekend': 7405,
 'double': 2406,
 'your': 7673,
 'mins': 4472,
 'txts': 7076,
 'orange': 4959,
 'or': 4955,
 'linerental': 

In [20]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
np.where(X_train_np[0] != 0)

(array([3089, 3244, 3504, 3577, 3721, 3936, 4069, 4229, 4384, 4610, 5028,
        5388, 6790, 7343, 7419, 7446, 7668], dtype=int64),)

In [23]:
X_train_np[0][1771]

0

In [24]:
#train the bayes model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [25]:
X_test_cv = v.transform(X_test)

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       981
           1       0.97      0.94      0.95       134

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [27]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

# Train the model using sklearn pipeline and reduce number of lines of code

In [28]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [29]:
clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       981
           1       0.97      0.94      0.95       134

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

