## Movie Review Sentiment Classifier: Bernoulli Naive Bayes

The notebook is divided into the following sections:
1. Importing and preprocessing data
2. Building the model: Bernoulli Naive Bayes
    - Model building 
    - Model evaluation

### 1. Importing and Preprocessing Data

In [1]:
import pandas as pd

# reading the training and test data (both CSVs are expected in the working directory),
# then displaying the training frame
train = pd.read_csv("movie_review_train.csv")
test = pd.read_csv("movie_review_test.csv")
train

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...
...,...,...
1595,Neg,if anything stigmata should be taken as...
1596,Neg,john boorman s zardoz is a goofy cinemati...
1597,Neg,the kids in the hall are an acquired taste ...
1598,Neg,there was a time when john carpenter was a gr...


In [2]:
# display the test frame (pandas truncates the middle rows in the rendered output)
test

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...
...,...,...
395,Neg,one of the first films of 1999 is this mtv pi...
396,Neg,susan granger s review of america s sweethe...
397,Neg,susan granger s review of jeepers creepers ...
398,Neg,this independent film written and directed by...


In [3]:
# number of movie reviews (documents) in the training set
len(train.text)

1600

In [4]:
# counting positive ('Pos') and negative ('Neg') reviews in the training set
# NOTE(review): the name `ham_spam` looks like a holdover from a spam-classification
# template; it holds the per-class review counts here
ham_spam = train["class"].value_counts()
ham_spam

Pos    800
Neg    800
Name: class, dtype: int64

In [5]:
# encode the sentiment classes as integers: 'Pos' -> 1, 'Neg' -> 0
class_to_label = {'Pos':1, 'Neg':0}
for frame in (train, test):
    # adds the 'label' column to each frame in place
    frame['label'] = frame["class"].map(class_to_label)

In [6]:
# check that the new 'label' column lines up with 'class'
train.head()

Unnamed: 0,class,text,label
0,Pos,a common complaint amongst film critics is ...,1
1,Pos,whew this film oozes energy the kind of b...,1
2,Pos,steven spielberg s amistad which is bas...,1
3,Pos,he has spent his entire life in an awful litt...,1
4,Pos,being that it is a foreign language film with...,1


In [7]:
# the string column 'class' is now redundant with the numeric 'label', so drop it
# from the training frame. errors='ignore' keeps this cell safe to re-run: without
# it, a second execution raises KeyError because 'class' has already been removed.
train = train.drop(columns='class', errors='ignore')
train.head()

Unnamed: 0,text,label
0,a common complaint amongst film critics is ...,1
1,whew this film oozes energy the kind of b...,1
2,steven spielberg s amistad which is bas...,1
3,he has spent his entire life in an awful litt...,1
4,being that it is a foreign language film with...,1


In [8]:
# likewise drop the redundant string column 'class' from the test frame.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because 'class' has already been removed.
test = test.drop(columns='class', errors='ignore')
test.head()

Unnamed: 0,text,label
0,films adapted from comic books have had plent...,1
1,every now and then a movie comes along from a...,1
2,you ve got mail works alot better than it des...,1
3,jaws is a rare film that grabs your atte...,1
4,moviemaking is a lot like being the general m...,1


In [9]:
# split the training frame into the raw review text (X) and the 0/1 label (y),
# then report the shape of each on its own line
X_train, y_train = train.text, train.label
print(X_train.shape, y_train.shape, sep="\n")

(1600,)
(1600,)


In [10]:
# split the test frame into the raw review text (X) and the 0/1 label (y),
# then report the shape of each on its own line
X_test, y_test = test.text, test.label
print(X_test.shape, y_test.shape, sep="\n")

(400,)
(400,)


In [11]:
# peek at the first few raw training texts
X_train.head()

0     a common complaint amongst film critics is   ...
1     whew   this film oozes energy   the kind of b...
2     steven spielberg s   amistad     which is bas...
3     he has spent his entire life in an awful litt...
4     being that it is a foreign language film with...
Name: text, dtype: object

In [12]:
# peek at the corresponding integer labels
y_train.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [13]:
# vectorizing the sentences into bag-of-words token counts;
# stop_words='english' removes scikit-learn's built-in English stop words
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')

In [14]:
# learn the vocabulary from the training text only
vect.fit(X_train)

CountVectorizer(stop_words='english')

In [15]:
# printing the vocabulary: a dict mapping each token to its column index
# in the document-term matrix
vect.vocabulary_

{'common': 6284,
 'complaint': 6360,
 'film': 11832,
 'critics': 7378,
 'aren': 1810,
 'literate': 18603,
 'scripts': 27918,
 'available': 2319,
 'quiz': 25250,
 'gives': 13262,
 'signs': 28809,
 'hope': 15074,
 'art': 1928,
 'writing': 35521,
 'isn': 16771,
 'dead': 7904,
 'hollywood': 14963,
 'need': 21281,
 'look': 18770,
 'independent': 15988,
 'films': 11851,
 'thoughtful': 32060,
 'content': 6771,
 'paul': 23045,
 'attanasio': 2173,
 'script': 27912,
 'takes': 31486,
 'tepid': 31839,
 'thriller': 32090,
 'scandals': 27629,
 'late': 18017,
 '50s': 313,
 'delivers': 8215,
 'telling': 31765,
 'parable': 22815,
 'emptiness': 10340,
 'post': 24178,
 'war': 34707,
 'american': 1334,
 'dream': 9554,
 'golden': 13449,
 'bubble': 4285,
 'surrounds': 31156,
 'protects': 24849,
 'tv': 33056,
 'networks': 21378,
 'sponsors': 29948,
 'riddled': 26743,
 'symbols': 31370,
 '58': 328,
 'chrysler': 5614,
 'radio': 25314,
 'announcement': 1534,
 'sputnik': 30053,
 'heavy': 14532,
 'handed': 14185,

In [16]:
# vocabulary size = number of distinct tokens kept by the vectorizer
len(vect.vocabulary_)

35858

In [17]:
# rebuild the vectorizer with document-frequency pruning to shrink the vocabulary:
# float values are proportions of documents, so min_df=.03 drops terms found in
# fewer than 3% of the documents and max_df=.8 drops terms found in more than 80%
vect = CountVectorizer(stop_words='english', min_df=.03, max_df=.8)

In [18]:
# refit on the training text so the pruned vocabulary replaces the earlier one
vect.fit(X_train)

CountVectorizer(max_df=0.8, min_df=0.03, stop_words='english')

In [19]:
# vocabulary size after document-frequency pruning
len(vect.vocabulary_)

1643

In [20]:
# transforming the train and test datasets into document-term count matrices;
# both are encoded with the vocabulary learned from the training text
X_train_transformed = vect.transform(X_train)
X_test_transformed = vect.transform(X_test)

In [21]:
# note that the transformed data is a SciPy sparse matrix; printing it lists
# only the non-zero entries as "(row, column)  count"
print(type(X_train_transformed))
print(X_train_transformed)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 4)	1
  (0, 59)	2
  (0, 72)	1
  (0, 78)	1
  (0, 81)	1
  (0, 99)	3
  (0, 101)	1
  (0, 192)	1
  (0, 211)	1
  (0, 213)	1
  (0, 220)	1
  (0, 264)	2
  (0, 287)	1
  (0, 316)	1
  (0, 323)	1
  (0, 328)	1
  (0, 340)	1
  (0, 342)	1
  (0, 357)	1
  (0, 386)	1
  (0, 413)	2
  (0, 445)	2
  (0, 464)	1
  (0, 503)	1
  (0, 506)	1
  :	:
  (1599, 1247)	1
  (1599, 1258)	2
  (1599, 1267)	1
  (1599, 1271)	1
  (1599, 1331)	1
  (1599, 1335)	1
  (1599, 1339)	1
  (1599, 1366)	3
  (1599, 1371)	1
  (1599, 1375)	1
  (1599, 1379)	1
  (1599, 1413)	1
  (1599, 1421)	1
  (1599, 1429)	2
  (1599, 1447)	1
  (1599, 1501)	1
  (1599, 1533)	1
  (1599, 1550)	1
  (1599, 1555)	1
  (1599, 1556)	1
  (1599, 1570)	1
  (1599, 1579)	2
  (1599, 1589)	1
  (1599, 1609)	1
  (1599, 1616)	1


In [22]:
# non-zero "(row, column)  count" entries of the test document-term matrix
print(X_test_transformed)

  (0, 11)	1
  (0, 18)	2
  (0, 23)	1
  (0, 29)	1
  (0, 31)	1
  (0, 53)	1
  (0, 73)	1
  (0, 94)	1
  (0, 110)	1
  (0, 118)	1
  (0, 133)	1
  (0, 134)	1
  (0, 140)	1
  (0, 141)	1
  (0, 151)	3
  (0, 152)	1
  (0, 173)	1
  (0, 176)	1
  (0, 177)	1
  (0, 188)	2
  (0, 189)	1
  (0, 195)	1
  (0, 197)	1
  (0, 203)	1
  (0, 206)	1
  :	:
  (399, 1314)	2
  (399, 1325)	1
  (399, 1354)	1
  (399, 1371)	1
  (399, 1375)	2
  (399, 1382)	1
  (399, 1399)	2
  (399, 1411)	1
  (399, 1416)	1
  (399, 1418)	1
  (399, 1430)	1
  (399, 1463)	2
  (399, 1471)	1
  (399, 1475)	1
  (399, 1514)	2
  (399, 1516)	1
  (399, 1542)	1
  (399, 1546)	2
  (399, 1549)	1
  (399, 1609)	3
  (399, 1632)	1
  (399, 1633)	3
  (399, 1637)	1
  (399, 1638)	1
  (399, 1641)	2


In [23]:
# summary repr: shape, dtype and number of stored (non-zero) elements
X_test_transformed

<400x1643 sparse matrix of type '<class 'numpy.int64'>'
	with 51663 stored elements in Compressed Sparse Row format>

### 2. Building and Evaluating the Model

In [25]:
from sklearn.naive_bayes import BernoulliNB

# instantiate bernoulli NB object
# (with the default binarize=0.0 the token counts are reduced to
# present/absent indicators before fitting)
bnb = BernoulliNB()

# fit on the training document-term matrix and the 0/1 labels
bnb.fit(X_train_transformed,y_train)

BernoulliNB()

In [27]:
from sklearn import metrics

# hard 0/1 predictions and per-class probabilities for the test documents
y_pred_class = bnb.predict(X_test_transformed)
y_pred_proba = bnb.predict_proba(X_test_transformed)

# fraction of test documents classified correctly
metrics.accuracy_score(y_test, y_pred_class)

0.79

In [None]:
# display the fitted estimator; note that alpha=1 (Laplace smoothing) is used by
# default. The classifier in this notebook is named `bnb` -- the previous
# reference to `mnb` was undefined and raised NameError.
bnb

### Model Evaluation

In [28]:
# printing the overall accuracy (same computation as in the prediction cell above;
# `metrics` is re-imported here so the cell also works when run on its own)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.79

In [29]:
# confusion matrix: rows are the true labels (0 = Neg, 1 = Pos),
# columns are the predicted labels in the same order
metrics.confusion_matrix(y_test, y_pred_class)

array([[177,  23],
       [ 61, 139]])

In [30]:
# compute the 2x2 confusion matrix and name its four cells.
# Row 0 is the true-negative class, row 1 the true-positive class;
# within each row the first entry is "predicted 0", the second "predicted 1".
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)
(TN, FP), (FN, TP) = confusion

[[177  23]
 [ 61 139]]


In [None]:
# sensitivity (recall): share of actual positives that were predicted positive
sensitivity = TP / (TP + FN)
print("sensitivity", sensitivity)

In [None]:
# specificity: share of actual negatives that were predicted negative
specificity = TN / (TN + FP)
print("specificity", specificity)

In [None]:
# precision: share of predicted positives that are actually positive,
# computed by hand and cross-checked against scikit-learn
precision = TP / (TP + FP)
print("precision", precision)
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
# summary: hand-computed precision followed by scikit-learn's precision, recall and F1
print("precision", precision)
for heading, scorer in (
    ("PRECISION SCORE :", metrics.precision_score),
    ("RECALL SCORE :", metrics.recall_score),
    ("F1 SCORE :", metrics.f1_score),
):
    print(heading, scorer(y_test, y_pred_class))

In [None]:
# predicted 0/1 label for every test document
y_pred_class

In [None]:
# predicted class probabilities, one row per test document and one column per class
y_pred_proba

In [None]:
# creating an ROC curve
# NOTE(review): the `sk_confusion_matrix` alias is not used anywhere in the visible cells
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# column 1 of predict_proba is the probability of label 1 (the positive class),
# which is the score roc_curve sweeps thresholds over
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred_proba[:,1])
# area under the (FPR, TPR) curve
roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# report the area under the ROC curve computed above
print(roc_auc)

In [None]:
# one row per decision threshold with the true/false positive rates it yields
pd.DataFrame(dict(
    Threshold=thresholds,
    TPR=true_positive_rate,
    FPR=false_positive_rate,
))

In [None]:
# plotting the ROC curve
%matplotlib inline  
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC')
plt.plot(false_positive_rate, true_positive_rate)