# Text Representation using Bag Of Words (BOW)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [10]:
# def get_spam_number(x):
#     if x == 'spam':
#         return 1
#     return 0

In [11]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [12]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Train test split

In [15]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [16]:
x_train.shape

(4457,)

In [17]:
x_test.shape

(1115,)

In [18]:
type(x_train)

pandas.core.series.Series

In [19]:
x_train[:4]

2557    Fuck babe ... What happened to you ? How come ...
3536                             I'm at home. Please call
5078    Guy, no flash me now. If you go call me, call ...
2869             Aight, tomorrow around  &lt;#&gt;  it is
Name: Message, dtype: object

In [20]:
type(y_train)

pandas.core.series.Series

In [21]:
y_train[:4]

2557    0
3536    0
5078    0
2869    0
Name: spam, dtype: int64

In [22]:
x_train.values

array(['Fuck babe ... What happened to you ? How come you never came back?',
       "I'm at home. Please call",
       'Guy, no flash me now. If you go call me, call me. How madam. Take care oh.',
       ..., 'R we going with the  &lt;#&gt;  bus?',
       'Heart is empty without love.. Mind is empty without wisdom.. Eyes r empty without dreams &amp; Life is empty without frnds.. So Alwys Be In Touch. Good night &amp; sweet dreams',
       "I'm in solihull, | do you want anything?"], dtype=object)

## Create bag of words representation using CountVectorizer

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

x_train_cv = v.fit_transform(x_train.values)
x_train_cv

<4457x7725 sparse matrix of type '<class 'numpy.int64'>'
	with 59265 stored elements in Compressed Sparse Row format>

In [25]:
x_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [26]:
x_train_cv.shape

(4457, 7725)

In [27]:
v.get_feature_names_out()[1000:1050]

array(['anythingtomorrow', 'anytime', 'anyway', 'anyways', 'anywhere',
       'aom', 'apart', 'apartment', 'apeshit', 'aphex', 'apnt', 'apo',
       'apologetic', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appear', 'appendix', 'applausestore', 'applebees',
       'apples', 'application', 'apply', 'applyed', 'appointment',
       'appointments', 'appreciate', 'approaches', 'approaching',
       'approve', 'approx', 'apps', 'appt', 'appy', 'april', 'aproach',
       'apt', 'aptitude', 'aquarius', 'ar', 'arcade', 'archive', 'ard',
       'are', 'area', 'aren', 'arent', 'aretaking'], dtype=object)

In [28]:
v.get_feature_names_out()

array(['00', '000', '000pes', ..., 'zyada', 'ú1', '〨ud'], dtype=object)

In [29]:
v.get_feature_names_out().shape

(7725,)

In [30]:
dir(v)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'bui

In [32]:
v.vocabulary_

{'fuck': 3011,
 'babe': 1177,
 'what': 7452,
 'happened': 3318,
 'to': 6900,
 'you': 7685,
 'how': 3515,
 'come': 1905,
 'never': 4724,
 'came': 1616,
 'back': 1186,
 'at': 1107,
 'home': 3467,
 'please': 5211,
 'call': 1597,
 'guy': 3274,
 'no': 4765,
 'flash': 2875,
 'me': 4381,
 'now': 4809,
 'if': 3596,
 'go': 3147,
 'madam': 4283,
 'take': 6654,
 'care': 1645,
 'oh': 4879,
 'aight': 877,
 'tomorrow': 6926,
 'around': 1065,
 'lt': 4233,
 'gt': 3253,
 'it': 3754,
 'is': 3743,
 'its': 3760,
 'too': 6939,
 'late': 4016,
 'but': 1566,
 'wish': 7516,
 'the': 6780,
 'same': 5856,
 'hi': 3417,
 'dear': 2186,
 'we': 7390,
 'saw': 5889,
 'both': 1434,
 'are': 1045,
 'happy': 3326,
 'where': 7462,
 'my': 4635,
 'battery': 1239,
 'low': 4222,
 'otherwise': 4966,
 'had': 3284,
 'part': 5053,
 'time': 6869,
 'job': 3821,
 'na': 4645,
 'tuition': 7041,
 'need': 4697,
 'be': 1259,
 'in': 3640,
 'strong': 6497,
 'arms': 1061,
 'get': 3102,
 'your': 7691,
 'garden': 3062,
 'ready': 5561,
 'for': 29

In [35]:
v.get_feature_names_out()[1905]

'come'

In [36]:
x_train_np = x_train_cv.toarray()
x_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
np.where(x_train_np[0]!=0)
# np.where(x_train_np[2]!=0)

(array([1177, 1186, 1616, 1905, 3011, 3318, 3515, 4724, 6900, 7452, 7685],
       dtype=int64),)

In [47]:
x_train[:4][2869]

'Aight, tomorrow around  &lt;#&gt;  it is'

In [49]:
x_train_np[0][1177]

1

## Train the naive bayes model

In [50]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train_cv, y_train)

In [52]:
x_test_cv = v.transform(x_test)
x_test_cv

<1115x7725 sparse matrix of type '<class 'numpy.int64'>'
	with 13812 stored elements in Compressed Sparse Row format>

## Evaluate the Performance

In [55]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       962
           1       0.97      0.94      0.95       153

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [56]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

## Train the model using sklearn pipeline and reduce number of lines of code

In [57]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [58]:
clf.fit(x_train, y_train)

In [60]:
y_predict = clf.predict(x_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       962
           1       0.97      0.94      0.95       153

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

