# Text Representation - Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


##  Train test split


In [5]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(df.Message,df.spam,test_size=0.2)

In [6]:
X_train.shape

(4457,)

In [7]:
X_test.shape

(1115,)

In [8]:
type(X_train)

pandas.core.series.Series

In [9]:
X_train[:4]

2346    Hi this is yijue, can i meet u at 11 tmr?
1202                         I know she called me
2333                     We are both fine. Thanks
1932          What pa tell me.. I went to bath:-)
Name: Message, dtype: object

In [10]:
type(y_train)

pandas.core.series.Series

In [11]:
y_train[:4]

2346    0
1202    0
2333    0
1932    0
Name: spam, dtype: int64

## Create bag of words representation using CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7716 sparse matrix of type '<class 'numpy.int64'>'
	with 58949 stored elements in Compressed Sparse Row format>

In [13]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
X_train_cv.shape

(4457, 7716)

In [15]:
v.get_feature_names_out()

array(['00', '000', '000pes', ..., 'zyada', 'ú1', '〨ud'], dtype=object)

In [16]:
v.get_feature_names_out()[1775]

'chg'

In [17]:
#v.vocabulary_

In [18]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
np.where(X_train_np[0]!=0)

(array([ 259, 1113, 1631, 3419, 3736, 4416, 6829, 6898, 7672]),)

In [20]:
X_train[:4]

2346    Hi this is yijue, can i meet u at 11 tmr?
1202                         I know she called me
2333                     We are both fine. Thanks
1932          What pa tell me.. I went to bath:-)
Name: Message, dtype: object

In [22]:
X_train[:4][1202]

'I know she called me'

In [23]:
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
X_train_np[0][933]

0

## Train the naive bayes model

In [25]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train_cv,y_train)

In [26]:
X_test_cv = v.transform(X_test)

## Evaluate Performance

In [27]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       952
           1       0.95      0.94      0.95       163

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count=v.transform(emails)
model.predict(emails_count)

array([0, 1])

## Train the model using sklearn pipeline and reduce number of lines of code


In [29]:
from sklearn.pipeline import Pipeline

clf=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])



In [30]:
clf.fit(X_train,y_train)

In [31]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       952
           1       0.95      0.94      0.95       163

    accuracy                           0.98      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115

