NLP Tutorial: Text Representation - Bag Of Words (BOW)

In [4]:
import pandas as pd
import numpy as np



In [3]:
df= pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [6]:
# encoding spam to be 1 and ham to  be 0
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [7]:
df.shape

(5572, 3)

In [8]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Train test split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [10]:
X_train.shape

(4457,)

In [11]:
X_test.shape

(1115,)

Create bag of words representation using CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7704 sparse matrix of type '<class 'numpy.int64'>'
	with 59287 stored elements in Compressed Sparse Row format>

In [17]:
#converting sparse matrix to numpy array 2D
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
X_train_cv.shape

(4457, 7704)

In [19]:
v.get_feature_names_out()[1771]

'charlie'

In [20]:
v.vocabulary_

{'double': 2450,
 'mins': 4478,
 'txt': 7048,
 'price': 5368,
 'linerental': 4116,
 'on': 4908,
 'latest': 4017,
 'orange': 4952,
 'bluetooth': 1430,
 'mobiles': 4528,
 'call': 1638,
 'mobileupd8': 4531,
 'for': 2973,
 'the': 6765,
 'very': 7231,
 'offers': 4875,
 '08000839402': 52,
 'or': 4948,
 'call2optout': 1640,
 'lf56': 4077,
 'santa': 5867,
 'calling': 1651,
 'would': 7569,
 'your': 7668,
 'little': 4140,
 'ones': 4914,
 'like': 4099,
 'from': 3048,
 'xmas': 7610,
 'eve': 2686,
 '09058094583': 185,
 'to': 6887,
 'book': 1450,
 'time': 6853,
 'hey': 3443,
 'they': 6790,
 'not': 4805,
 'watching': 7360,
 'movie': 4587,
 'tonight': 6923,
 'so': 6250,
 'll': 4149,
 'prob': 5388,
 'home': 3497,
 'early': 2536,
 'dont': 2436,
 'kick': 3924,
 'coco': 1903,
 'when': 7434,
 'he': 3392,
 'down': 2457,
 'are': 1080,
 'you': 7663,
 'comingdown': 1939,
 'later': 4016,
 'in': 3651,
 'that': 6761,
 'case': 1708,
 'guess': 3297,
 'see': 5954,
 'at': 1145,
 'campus': 1661,
 'lodge': 4165,
 'tddn

In [21]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
np.where(X_train_np[0]!=0)

(array([  52, 1430, 1638, 1640, 2450, 2973, 4017, 4077, 4116, 4478, 4528,
        4531, 4875, 4908, 4948, 4952, 5368, 6765, 7048, 7231], dtype=int64),)

Train the naive bayes model

In [23]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [24]:
X_test_cv = v.transform(X_test)

Evaluate Performance

In [25]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       969
           1       0.98      0.88      0.93       146

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

