In [9]:
import pandas as pd

# Read the tab-separated SMS file into a DataFrame. header=None tells pandas
# the file has no header row, so the two column names are supplied via `names`.
# NOTE(review): with pandas' default quoting, a '"' inside a message is treated
# as a quote character and can merge adjacent rows -- confirm that the row
# count matches the source file (consider quoting=csv.QUOTE_NONE if it doesn't).
df = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep="\t",
    header=None,
    names=['label', 'sms_message'],
)

# Preview the first five rows
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Dimensions of the loaded frame as a (n_rows, n_columns) tuple
df.shape

(5572, 2)

In [4]:
# Count missing values per column (isna is the same method as isnull)
df.isna().sum()

label          0
sms_message    0
dtype: int64

In [11]:
# Show the first five rows again, with the labels still as raw strings
df.head(5)

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Encode the categorical label column as integers: 'spam' -> 0, 'ham' -> 1.
# Values that are not keys of the mapping (for example labels that were already
# encoded by an earlier run of this cell) are passed through unchanged, so
# re-running the cell no longer turns the whole column into NaN.
label_codes = {'spam': 0, 'ham': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))
df.head()  # preview the first five rows with the encoded labels

Unnamed: 0,label,sms_message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Hold out part of the data for evaluation. No test_size is passed, so
# scikit-learn's default split applies; random_state=1 makes the shuffle
# reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'],
    df['label'],
    random_state=1,
)

# Report how many rows ended up in each partition
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [15]:
# Bag-of-words features: create a CountVectorizer with its default settings
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

# Learn the vocabulary from the training messages and turn them into a
# document-term count matrix in a single step
training_data = count_vector.fit_transform(X_train)

# Reuse the vocabulary learned above for the test messages. Only transform()
# is called here -- the vectorizer is deliberately not re-fitted on test data.
testing_data = count_vector.transform(X_test)

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the word-count matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes  # echo the fitted estimator as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Predict a label (0 = spam, 1 = ham) for every held-out message
predictions = naive_bayes.predict(testing_data)

In [19]:
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Headline scores for the multinomial model on the test split.
# NOTE: precision/recall/F1 are called with their default positive class
# (label 1). With the encoding used in this notebook (spam=0, ham=1) these three
# numbers therefore describe how well *ham* is recognised; the per-class report
# printed last covers both classes.
for metric_label, metric_fn in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(metric_label, format(metric_fn(y_test, predictions)))

print(classification_report(y_test, predictions))

Accuracy score:  0.9885139985642498
Precision score:  0.9909390444810544
Recall score:  0.9958609271523179
F1 score:  0.9933938893476465
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       185
           1       0.99      1.00      0.99      1208

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [22]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the same features for comparison.
# The sparse count matrix is converted to a dense array with .toarray()
# before being handed to fit().
gaussian_naive_bayes = GaussianNB().fit(training_data.toarray(), y_train)
gaussian_naive_bayes  # echo the fitted estimator as the cell output

GaussianNB(priors=None, var_smoothing=1e-09)

In [24]:
# Predict labels for the held-out messages with the Gaussian model
# (test features are densified the same way as the training features)
predictions_gaussian = gaussian_naive_bayes.predict(testing_data.toarray())

In [25]:
# Same headline scores as for the multinomial model, now for the Gaussian one.
# NOTE: precision/recall/F1 use their default positive class (label 1), which
# is 'ham' under this notebook's encoding (spam=0, ham=1).
for metric_label, metric_fn in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(metric_label, format(metric_fn(y_test, predictions_gaussian)))

print(classification_report(y_test, predictions_gaussian))

Accuracy score:  0.8994974874371859
Precision score:  0.9899082568807339
Recall score:  0.8932119205298014
F1 score:  0.9390774586597042
              precision    recall  f1-score   support

           0       0.57      0.94      0.71       185
           1       0.99      0.89      0.94      1208

    accuracy                           0.90      1393
   macro avg       0.78      0.92      0.83      1393
weighted avg       0.93      0.90      0.91      1393

