In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
import io
import requests

# Raw CSV of the labelled messages (the preview below shows Category / Message columns).
url = "https://raw.githubusercontent.com/codebasics/nlp-tutorials/main/9_bag_of_words/spam.csv"

# Bound the request with a timeout so the cell cannot hang forever, and fail
# loudly on an HTTP error instead of parsing an error page as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

# `s` keeps the raw response bytes, which are decoded as UTF-8 before parsing.
s = response.content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance: how many messages carry each Category label.
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [None]:
# Encode the label as a binary target: 1 where Category is 'spam', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the same dtype the lambda-based version produced.
df['spam'] = (df['Category'] == 'spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. A fixed random_state makes the
# split (and therefore every metric computed below) reproducible across
# Restart & Run All; without it each run shuffles differently.
# NOTE(review): the classes are imbalanced (see the value counts above);
# consider also passing stratify=df.spam if the class ratio should be preserved.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Size of the training split (a 1-D Series of message texts).
X_train.shape

(4457,)

In [None]:
# Size of the held-out test split (a 1-D Series of message texts).
X_test.shape

(1115,)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder. The vocabulary is learned from the training messages
# only; the test split is encoded later with this same fitted vectorizer.
v = CountVectorizer()
train_messages = X_train.to_numpy()
X_train_cv = v.fit_transform(train_messages)

# (number of training messages, vocabulary size)
X_train_cv.shape

(4457, 7770)

In [None]:
# Peek at a 50-term slice of the vocabulary learned by the vectorizer.
feature_names = v.get_feature_names_out()
feature_names[1000:1050]

array(['anti', 'antibiotic', 'any', 'anybody', 'anyhow', 'anymore',
       'anyone', 'anyones', 'anyplaces', 'anythiing', 'anythin',
       'anything', 'anythingtomorrow', 'anytime', 'anyway', 'anyways',
       'anywhere', 'aom', 'apart', 'apartment', 'apes', 'aphex', 'apnt',
       'apo', 'apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appeal', 'appear', 'appendix', 'applausestore',
       'applebees', 'apples', 'application', 'apply', 'appointment',
       'appointments', 'appreciate', 'appreciated', 'approaches',
       'approaching', 'appropriate', 'approx', 'apps', 'appt', 'appy',
       'april'], dtype=object)

In [None]:
# vocabulary_ maps each term to its column index in the count matrix. It holds
# one entry per vocabulary term (thousands of items), so display only a small
# sample instead of dumping the whole dict into the notebook output.
dict(list(v.vocabulary_.items())[:20])

{'darren': 2175,
 'was': 7415,
 'saying': 5947,
 'dat': 2179,
 'if': 3612,
 'meeting': 4439,
 'da': 2149,
 'ge': 3115,
 'den': 2250,
 'we': 7439,
 'dun': 2502,
 'meet': 4437,
 'dinner': 2340,
 'cos': 2023,
 'later': 4042,
 'leave': 4075,
 'xy': 7687,
 'will': 7541,
 'feel': 2814,
 'awkward': 1181,
 'him': 3449,
 'lunch': 4269,
 'lor': 4217,
 'that': 6836,
 'is': 3753,
 'wondar': 7599,
 'full': 3051,
 'flim': 2910,
 'sorry': 6338,
 'can': 1638,
 'help': 3416,
 'you': 7730,
 'on': 4945,
 'this': 6876,
 'eatin': 2534,
 'my': 4675,
 'only': 4956,
 'haf': 3312,
 'msn': 4631,
 'it': 3764,
 'yijue': 7723,
 'hotmail': 3521,
 'com': 1909,
 'or': 4982,
 'go': 3175,
 'home': 3485,
 'first': 2885,
 'lar': 4029,
 'wait': 7376,
 'me': 4418,
 'put': 5524,
 'down': 2442,
 'stuff': 6558,
 'the': 6840,
 '2nd': 404,
 'time': 6920,
 'have': 3375,
 'tried': 7057,
 'to': 6952,
 'contact': 1988,
 'won': 7598,
 '1450': 302,
 'prize': 5439,
 'claim': 1829,
 'just': 3879,
 'call': 1613,
 '09053750005': 174,
 'b

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the word-count matrix.
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_cv, y_train)
model

In [None]:
# Encode the test messages with the vectorizer fitted on the training split.
# transform (not fit_transform) is used so the vocabulary is not re-learned.
X_test_cv = v.transform(X_test)

In [None]:
from sklearn.metrics import classification_report

# Score the held-out split and print the per-class precision / recall / f1 table.
y_pred = model.predict(X_test_cv)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.97      0.91      0.94       151

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Two hand-written messages used as a quick sanity check of the trained model.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

# Encode with the already-fitted vectorizer `v`, then classify
# (labels follow the `spam` column: 1 = spam, 0 = not spam).
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [None]:
from sklearn.pipeline import Pipeline

# Bundle vectorization and classification into a single estimator so raw
# message text can be passed straight to fit / predict.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the whole pipeline on the raw training messages; the 'vectorizer' step
# turns the text into counts before the 'nb' step is trained.
clf.fit(X_train,y_train)

In [None]:
# Evaluate the pipeline directly on the raw test messages.
# Note: this rebinds y_pred, replacing the predictions from the manual workflow.
y_pred = clf.predict(X_test)
pipeline_report = classification_report(y_test, y_pred)
print(pipeline_report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       964
           1       0.97      0.91      0.94       151

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

