In [29]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
os.listdir('dataset/')

['readme', 'SMSSpamCollection']

In [3]:
df = pd.read_table('dataset/SMSSpamCollection', sep='\t', names=['label', 'email'])

In [4]:
df.head()

Unnamed: 0,label,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
transformation = {'ham': 0, 'spam': 1}
df['label'] = df.label.map(transformation)

In [21]:
df.head()

Unnamed: 0,label,email
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
X_train, X_test, y_train, y_test = train_test_split(df.email, df.label, random_state=1)

In [23]:
print('Dataset size is {}.'.format(df.shape[0]))
print('Train size is {}.'.format(len(X_train)))
print('Test size is {}.'.format(len(X_test)))

Dataset size is 5572.
Train size is 4179.
Test size is 1393.


In [39]:
count_vector = CountVectorizer(stop_words='english')
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [26]:
model = MultinomialNB()

In [27]:
model.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
y_pred = model.predict(testing_data)

In [33]:
print('Accuracy score is {}.'.format(accuracy_score(y_pred, y_test)))
print('Precission score is {}.'.format(precision_score(y_pred, y_test)))
print('Recall score is {}.'.format(recall_score(y_pred, y_test)))
print('F1 score is {}.'.format(f1_score(y_pred, y_test)))

Accuracy score is 0.9885139985642498.
Precission score is 0.9405405405405406.
Recall score is 0.9720670391061452.
F1 score is 0.9560439560439562.
