In [57]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load Dataset
data_frame = pd.read_table('dataset/SMSSpamCollection', sep='\t', header=None, names = ['label', 'sms_message'])

# Print dataset
data_frame.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [58]:
# Change categorical label to numerical label
data_frame['label'] = data_frame.label.map({'ham': 0, 'spam': 1})
print(data_frame.shape)
data_frame.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [59]:
# Splitting dataset into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(data_frame['sms_message'], 
                                                    data_frame['label'], 
                                                    random_state=1)
print('Number of rows in the total set: {}'.format(data_frame.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [60]:
#Count each word into binary matric
count_vector = CountVectorizer()

training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [61]:
# Creating classification based on naive bayes
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

# Do prediction
predictions = naive_bayes.predict(testing_data)

In [62]:
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
