In [1]:
from imageio import imopen
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("Sample CSV/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['spam'], test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(type(X_train))
print(type(y_train))

(4457,)
(1115,)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [17]:
X_train.head

<bound method NDFrame.head of 2051                Hi where you. You in home or calicut?
1049    I walked an hour 2 c u! doesnt that show I ca...
4550    Haha, my friend tyler literally just asked if ...
480     When're you guys getting back? G said you were...
4497    In case you wake up wondering where I am, I fo...
                              ...                        
2225    I prefer my free days... Tues, wed, fri oso ca...
3201    Just curious because my cuz asked what I was u...
3146           I.ll get there tomorrow and send it to you
1040    They just talking thats it de. They wont any o...
1387                          All e best 4 ur exam later.
Name: Message, Length: 4457, dtype: object>

In [7]:
y_train.head

<bound method NDFrame.head of 2051    0
1049    0
4550    0
480     0
4497    0
       ..
2225    0
3201    0
3146    0
1040    0
1387    0
Name: spam, Length: 4457, dtype: int64>

In [8]:
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
print(X_train_cv.shape)
print(type(X_train_cv))

(4457, 7783)
<class 'scipy.sparse._csr.csr_matrix'>


In [9]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
X_train_cv.shape

(4457, 7783)

In [11]:
v.get_feature_names_out()[978]

'anot'

In [12]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
np.where(X_train_np[0] != 0)

(array([1611, 3457, 3508, 3659, 4985, 7522, 7746], dtype=int64),)

In [15]:
X_train[0:4][1049]

'I walked an hour 2 c u! doesn\x92t that show I care y wont u believe im serious?'

In [None]:
X_train_np[0][978]

1

### Train the naive bayes model


In [None]:
model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [None]:
X_test_cv = v.transform(X_test)

### Evaluate Performance


In [None]:
y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       978
           1       0.98      0.96      0.97       137

    accuracy                           0.99      1115
   macro avg       0.99      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [None]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

### Train the model using sklearn pipeline and reduce number of lines of code


In [None]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       978
           1       0.98      0.96      0.97       137

    accuracy                           0.99      1115
   macro avg       0.99      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115

