<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

pandas.core.series.Series

In [11]:
X_train[:4]

5403             So gd got free ice cream... I oso wan...
1766    Hi this is yijue... It's regarding the 3230 te...
724              Ya even those cookies have jelly on them
3966                         Love you aathi..love u lot..
Name: Message, dtype: object

In [12]:
type(y_train)

pandas.core.series.Series

In [13]:
y_train[:4]

5403    0
1766    0
724     0
3966    0
Name: spam, dtype: int64

In [14]:
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7695 sparse matrix of type '<class 'numpy.int64'>'
	with 58931 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
X_train_cv.shape

(4457, 7695)

In [18]:
v.get_feature_names_out()[1771]

'chik'

In [19]:
v.vocabulary_

{'so': 6234,
 'gd': 3084,
 'got': 3190,
 'free': 2973,
 'ice': 3568,
 'cream': 2053,
 'oso': 4968,
 'wan': 7322,
 'hi': 3411,
 'this': 6806,
 'is': 3725,
 'yijue': 7647,
 'it': 3736,
 'regarding': 5628,
 'the': 6766,
 '3230': 443,
 'textbook': 6742,
 'intro': 3697,
 'to': 6886,
 'algorithms': 907,
 'second': 5928,
 'edition': 2509,
 'selling': 5959,
 'for': 2926,
 '50': 535,
 'ya': 7615,
 'even': 2642,
 'those': 6814,
 'cookies': 1991,
 'have': 3342,
 'jelly': 3788,
 'on': 4901,
 'them': 6775,
 'love': 4199,
 'you': 7654,
 'aathi': 745,
 'lot': 4186,
 'yup': 7681,
 'no': 4757,
 'need': 4688,
 'll': 4133,
 'jus': 3848,
 'wait': 7302,
 'rain': 5510,
 'stop': 6444,
 'ugh': 7065,
 'fuck': 3019,
 'resubbing': 5711,
 'eve': 2640,
 'we': 7362,
 'can': 1625,
 'go': 3149,
 'normal': 4783,
 'pilates': 5179,
 'after': 852,
 'our': 4977,
 'let': 4058,
 'me': 4377,
 'know': 3931,
 'if': 3586,
 'anything': 999,
 'else': 2544,
 'salad': 5835,
 'or': 4943,
 'desert': 2245,
 'something': 6255,
 'how': 

In [20]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
np.where(X_train_np[0]!=0)

(array([2053, 2973, 3084, 3190, 3568, 4968, 6234, 7322], dtype=int64),)

In [26]:
X_train[:4][1579]

KeyError: 1579

In [27]:
X_train_np[0][1771]

0

<h3>Train the naive bayes model</h3>

In [28]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [29]:
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [30]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       967
           1       0.95      0.91      0.93       148

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [31]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using sklearn pipeline and reduce number of lines of code</h3>

In [32]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [33]:
clf.fit(X_train, y_train)

In [34]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       967
           1       0.95      0.91      0.93       148

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

