In [84]:
# Import the libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [53]:
# Loading the dataset
dataset = pd.read_table("./smsspamcollection/SMSSpamCollection", 
                        sep = "\t",
                        header = None,
                        names = ["label", "sms_message"])
dataset.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [54]:
# binarize the labels
dataset['label'] = dataset['label'].map({"ham": 0, "spam": 1})
dataset.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [55]:
# Splitting the dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(dataset['sms_message'], 
                                                    dataset['label'], 
                                                    random_state = 1,
                                                    test_size = 0.25,
                                                    shuffle = True)

In [74]:
# Create an instance of CountVectorizer (lowercasing, deleting punctuations, deleting stopwords, etc.)
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [75]:
# Fit the training data and return the matrix
training_data = count_vector.fit_transform(X_train)
# Transform testing data and return the matrix.
test_data = count_vector.transform(X_test)

In [77]:
# =============== *Optional - Getting the frequency matric ===============
# Create an instance of CountVectorizer
count_vector_2 = CountVectorizer()
# Getting the feature names which makes our vocabulary
count_vector_2.fit(X_train)
print("Feature names: ", count_vector_2.get_feature_names())
# Create a matrix with the rows being each of the 4 documents, and the columns being each word. 
doc_array = count_vector_2.transform(X_train).toarray()
print("Count matrix: ", doc_array)
# Convert'doc_array' into a dataframe and set the column names to the word names
frequency_matrix = pd.DataFrame(data = doc_array, columns = count_vector_2.get_feature_names())
print("Frequency matrix: ", frequency_matrix)
# ========================================



Count matrix:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Frequency matrix:        00  000  008704050406  0121  01223585236  01223585334  0125698789  02  \
0      0    0             0     0            0            0           0   0   
1      0    0             0     0            0            0           0   0   
2      0    0             0     0            0            0           0   0   
3      0    0             0     0            0            1           0   0   
4      0    0             0     0            0            0           0   0   
5      0    0             0     0            0            0           0   0   
6      0    0             0     0            0            0           0   0   
7      0    0             0     0            0            0           0   0   
8      0    0             0     0            0            0           0   0   
9      0    0             0     0            0          

In [81]:
# Fit the Naive Bayes to our dataset
naive_bayes = MultinomialNB()
naive_bayes.fit(X = training_data, y = y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [83]:
# Predict the test data
predictions = naive_bayes.predict(X = test_data)
print(predictions)

[0 0 0 ... 0 1 0]


In [85]:
# Evaluating the model
accuracy_ = accuracy_score(y_test, predictions)
recall_ = recall_score(y_test, predictions)
precision_ = precision_score(y_test, predictions)
f1_score_ = f1_score(y_test, predictions)