## Spam Detection using Naive Bayes Classifier

Classify messages as spam or ham. Dataset download from [here](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection#)


In [4]:
# Load libraries

import pandas as pd

In [5]:
# Load dataset
dataPath = 'C:/Users/siddh/Desktop/GitHub_Projects/Udacity ML Nanodegree/SpamDetection_NaiveBayes/smsspamcollection/SMSSpamCollection'
data = pd.read_table(dataPath,delimiter='\t',header=None,names=['Label','Message'])
data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Change Label to 0,1 with the mapping *spam = 1* and *ham = 0*

In [6]:
mapping = {'spam':1,'ham':0}
data.Label = data.Label.map(mapping)
data.head()

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Split data into Train and Test sets

In [15]:
from sklearn.cross_validation import train_test_split

Xtrain, Xtest, Ytrain, Ytest = train_test_split(data.Message, data.Label, random_state=1)

### Extract features from training set corpus using Bag Of Words approach

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()
training_data = count_vector.fit_transform(Xtrain)
testing_data = count_vector.transform(Xtest)

### Build a Naive Bayes Classifier to identify spam messages

In [19]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(training_data,Ytrain)

predictions = nb.predict(testing_data)

### Measure the model performance

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy = ' + str(accuracy_score(Ytest, predictions)))
print('Precision = ' + str(precision_score(Ytest,predictions)))
print('Recall = ' + str(recall_score(Ytest,predictions)))
print('F1 Score = ' + str(f1_score(Ytest,predictions)))

Accuracy = 0.988513998564
Precision = 0.972067039106
Recall = 0.940540540541
F1 Score = 0.956043956044
