In [26]:
import pandas as pd

# Load the tab-separated SMS file. It has no header row, so the two column
# names are supplied explicitly.
# NOTE(review): the parser's default quoting treats double quotes as quote
# characters; if raw messages contain unbalanced quotes, rows could be merged.
# Consider quoting=csv.QUOTE_NONE -- TODO confirm against the raw line count.
sms_columns = ['label', 'message']
df = pd.read_csv('SMS', sep='\t', header=None, names=sms_columns)

# Preview the first few rows.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Data preprocessing: encode the text labels as integers (ham -> 0, spam -> 1).
label_codes = {'ham': 0, 'spam': 1}

# Only map while the column still holds the raw text labels. Without this
# guard, re-running the cell would push the already-encoded 0/1 values through
# the dict again and turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_codes)

# Series.map yields NaN for any value missing from the dict; fail loudly here
# rather than passing NaN labels on to the train/test split and the classifier.
assert df['label'].notna().all(), "unexpected label value (expected 'ham' or 'spam')"

print(df.shape)  # (rows, columns)
df.head()

(5572, 2)


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
# Split the messages and their labels into training and testing sets.
# train_test_split is imported from sklearn.model_selection to avoid a
# deprecation warning.
from sklearn.model_selection import train_test_split

messages, labels = df['message'], df['label']

# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(messages, labels, random_state=1)

# Report how many rows ended up in each partition.
print(f'Total number of rows : {len(df)}')
print(f'Training set count of rows: {len(X_train)}')
print(f'Test set count of rows: {len(X_test)}')

Total number of rows : 5572
Training set count of rows: 4179
Test set count of rows: 1393


In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer created with its default settings.
count = CountVectorizer()

# The vectorizer is fitted on the training messages only and returns their
# matrix; the test messages are then transformed with that same fitted
# vectorizer, never fitted on.
training = count.fit_transform(X_train)
testing = count.transform(X_test)

In [30]:

# Train a multinomial Naive Bayes classifier (constructed with default
# parameters) on the vectorized training messages and their labels.
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
# fit() is left as the cell's final expression, so the notebook echoes the
# fitted estimator's repr as the cell output.
naive_bayes.fit(training, y_train)

MultinomialNB()

In [31]:
# Predict a label for every row of the vectorized test matrix using the
# classifier fitted on the training data.
predictions = naive_bayes.predict(testing)

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each scorer compares the held-out true labels with the model's predictions.
scorers = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)

for metric_name, scorer in scorers:
    # The label and the value are passed as two print() arguments, so print's
    # default separator adds a second space after the label's trailing one.
    print(f'{metric_name} score: ', scorer(y_test, predictions))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
