In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('spam.csv', encoding="latin-1")

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# cleaning up the dataframe
df['label'] = df['v1'].map({'ham': 0, 'spam': 1})
df['message'] = df['v2']

df.drop(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [5]:
df.head() # label - 0 = ham, 1 = spam

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# create variables to hold the features and labels
X = df['message']
y = df['label']

In [7]:
# fit the feature data properly
cv = CountVectorizer()
X = cv.fit_transform(X) # Fit the Data

In [8]:
# split the training and testing data properly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [9]:
# Binomial Naive Bayes Classifier
clfB = BernoulliNB()

# Multinomial Naive Bayes Classifier
clfM = MultinomialNB()

In [10]:
# fit the training data
clfB.fit(X_train, y_train)
clfM.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# accuracy score after testing on the test data
clfB.score(X_test, y_test)
clfM.score(X_test, y_test)

0.9802867383512545

In [12]:
# perform classification on an array of test vectors
y_predB = clfB.predict(X_test)
y_predM = clfM.predict(X_test)

In [13]:
# Output: The Classification Reports
print("\n=======================================================\n")
print("Classification Report for Binomial Naive Bayes: \n\n\n", classification_report(y_test, y_predB))
print("\n The Accuracy is: ", clfB.score(X_test, y_test))
print("\n=======================================================\n")
print("Classification Report for Multinomial Naive Bayes: \n\n\n", classification_report(y_test, y_predM))
print("\n The Accuracy is: ", clfM.score(X_test, y_test))
print("\n=======================================================")



Classification Report for Binomial Naive Bayes: 


               precision    recall  f1-score   support

           0       0.99      1.00      0.99       491
           1       0.98      0.90      0.94        67

    accuracy                           0.99       558
   macro avg       0.98      0.95      0.96       558
weighted avg       0.99      0.99      0.99       558


 The Accuracy is:  0.985663082437276


Classification Report for Multinomial Naive Bayes: 


               precision    recall  f1-score   support

           0       1.00      0.98      0.99       491
           1       0.88      0.97      0.92        67

    accuracy                           0.98       558
   macro avg       0.94      0.98      0.96       558
weighted avg       0.98      0.98      0.98       558


 The Accuracy is:  0.9802867383512545



In [14]:
# After training the model, it is desirable to have a way to persist the model for future use without having to retrain. To achieve this, we add the following lines to save our model as a .pkl file for the later use.
from sklearn.externals import joblib

joblib.dump(clfB, 'NB_spam_model.pkl')

# can be used later as
### NB_spam_model = open('NB_spam_model.pkl','rb')
### clf = joblib.load(NB_spam_model)



['NB_spam_model.pkl']