<h2 align='center'>NLP Tutorial: Text Representation - Bag Of Words (BOW)</h2>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the spam dataset from the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv("spam.csv")
# Preview the first rows: a text label (Category) and the raw message text (Message).
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance: number of messages carrying each Category label.
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
# Numeric target column: 1 where Category equals 'spam', 0 for any other label.
df['spam'] = df['Category'].eq('spam').astype('int64')

In [5]:
# (rows, columns) of the frame, now including the added 'spam' column.
df.shape

(5572, 3)

In [6]:
# Check that the new 0/1 'spam' column lines up with the Category labels.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


<h3>Train test split</h3>

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# The split is random, so the seed is pinned: without random_state every
# Restart & Run All would produce a different partition, and with it a
# different vocabulary, different column indices and different metrics.
# NOTE: outputs stored in this notebook came from an unseeded run and will
# differ once the cells are re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.2, random_state=42
)



In [8]:
# Number of training messages (the 80% side of the split).
X_train.shape

(4457,)

In [9]:
# Number of held-out test messages (the 20% side of the split).
X_test.shape

(1115,)

In [10]:
# The split was given pandas Series, so the training texts come back as a Series too.
type(X_train)

pandas.core.series.Series

In [11]:
# First four training messages; the index shows their original row numbers in df.
X_train.head(4)

5143    Hi darlin im on helens fone im gonna b up the ...
3949    I like to think there's always the possibility...
3411    Joy's father is John. Then John is the ____ of...
4537    Never blame a day in ur life. Good days give u...
Name: Message, dtype: object

In [12]:
# The labels are likewise returned as a pandas Series.
type(y_train)

pandas.core.series.Series

In [13]:
# Labels of the first four training messages (same index as the X_train preview).
y_train.head(4)

5143    0
3949    0
3411    0
4537    0
Name: spam, dtype: int64

In [14]:
# .values exposes the Series' data as a NumPy array, the form handed to the vectorizer.
type(X_train.values)

numpy.ndarray

<h3>Create bag of words representation using CountVectorizer</h3>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings.
v = CountVectorizer()

# Learn the vocabulary from the training messages and encode them in one step.
# The result is a sparse count matrix: one row per message, one column per token.
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7759 sparse matrix of type '<class 'numpy.int64'>'
	with 59302 stored elements in Compressed Sparse Row format>

In [16]:
# Dense count vector of the first training message.
# Slice the row while the matrix is still sparse so only that one row is
# densified, instead of materialising the full documents-by-vocabulary array
# (and taking a redundant [:2] slice) just to display a single row.
X_train_cv[0].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
# (number of training messages, vocabulary size).
X_train_cv.shape

(4457, 7759)

In [25]:
# Token that owns column 1771 of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement and returns an array of str.
v.get_feature_names_out()[1771]

'cheesy'

In [26]:
# Learned vocabulary: maps each token to its column index in the count matrix.
v.vocabulary_

{'hi': 3455,
 'darlin': 2184,
 'im': 3643,
 'on': 4946,
 'helens': 3422,
 'fone': 2954,
 'gonna': 3212,
 'up': 7181,
 'the': 6815,
 'princes': 5419,
 'nite': 4800,
 'please': 5260,
 'come': 1912,
 'tb': 6726,
 'love': 4253,
 'kate': 3928,
 'like': 4138,
 'to': 6932,
 'think': 6845,
 'there': 6833,
 'always': 937,
 'possibility': 5335,
 'of': 4908,
 'being': 1318,
 'in': 3668,
 'pub': 5496,
 'later': 4054,
 'joy': 3885,
 'father': 2808,
 'is': 3774,
 'john': 3866,
 'then': 6827,
 '____': 750,
 'if': 3629,
 'ans': 988,
 'ths': 6880,
 'you': 7718,
 'hav': 3379,
 'lt': 4275,
 'gt': 3290,
 'iq': 3765,
 'tis': 6911,
 'ias': 3603,
 'question': 5537,
 'try': 7054,
 'answer': 990,
 'never': 4765,
 'blame': 1389,
 'day': 2199,
 'ur': 7200,
 'life': 4127,
 'good': 3213,
 'days': 2200,
 'give': 3170,
 'happiness': 3360,
 'bad': 1201,
 'experience': 2736,
 'both': 1455,
 'are': 1050,
 'essential': 2670,
 'all': 915,
 'gods': 3195,
 'blessings': 1399,
 'morning': 4599,
 'house': 3544,
 'maid': 4338,

In [23]:
# Densify the whole sparse matrix into a NumPy array so rows can be inspected directly.
X_train_np = X_train_cv.toarray()
# Count vector of the first training message.
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [27]:
# Column indices of the tokens that actually occur in the first training message.
np.nonzero(X_train_np[0])

(array([1912, 2184, 2954, 3212, 3422, 3455, 3643, 3928, 4253, 4800, 4946,
        5260, 5419, 6726, 6815, 7181], dtype=int64),)

In [29]:
# Re-display the first four training messages to compare with the non-zero columns.
X_train.head(4)

5143    Hi darlin im on helens fone im gonna b up the ...
3949    I like to think there's always the possibility...
3411    Joy's father is John. Then John is the ____ of...
4537    Never blame a day in ur life. Good days give u...
Name: Message, dtype: object

In [30]:
# Count stored for vocabulary column 1771 in the first training message.
X_train_np[0, 1771]

0

<h3>Train the Naive Bayes model</h3>

In [31]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyperparameters,
# fitted on the sparse training counts and their 0/1 spam labels.
model = MultinomialNB()
model.fit(X_train_cv, y_train)

MultinomialNB()

In [32]:
# Encode the test messages with the already-fitted vectorizer
# (transform only; the vocabulary is not refitted on test data).
X_test_cv = v.transform(X_test)

<h3>Evaluate Performance</h3>

In [33]:
from sklearn.metrics import classification_report

# Predict labels for the held-out messages.
y_pred = model.predict(X_test_cv)

# Per-class precision, recall and F1 (class 1 = spam), plus overall accuracy.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.96      0.89      0.92       149

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [34]:
# Two hand-written messages for a sanity check: one conversational, one promotional.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

# Encode with the fitted vectorizer, then classify (0 = not spam, 1 = spam).
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

<h3>Train the model using an sklearn Pipeline to reduce the number of lines of code</h3>

In [35]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification into one estimator so raw text
# can be passed straight to fit/predict.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [36]:
# Fit both pipeline steps on the raw training text and its labels.
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [37]:
# The pipeline takes the raw test text directly.
# Note: this rebinds y_pred, replacing the predictions from the manual model.
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.96      0.89      0.92       149

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

