In [7]:
import pandas as pd
# Dataset available using filepath 'SMSSpamCollection.txt'
df = pd.read_table(filepath_or_buffer = 'SMSSpamCollection.txt',sep = '\t', names = ['label', 'sms_message'] )

# Output printing out first 5 rows
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df['label'] = df.label.map({'ham':0, 'spam':1})
print("\n The shape of our dataframe is:", df.shape)


 The shape of our dataframe is: (5572, 2)


In [13]:
# Explaining the BoW concept
# Step 1: Convert all strings to lower case

documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

lower_case_documents = []
for i in documents:
  lower_case_documents.append(i.lower())

print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [14]:
# Step 2: Remove all punctuations

sans_punctuation_documents = []
import string

punctuations = string.punctuation

for text in lower_case_documents:
  new_text = ""

  for j in text:
    if j not in punctuations: # if each sentence's letters is not in punctuation dictionary
      new_text = new_text + j

  sans_punctuation_documents.append(new_text)

print(sans_punctuation_documents)


['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [17]:
# Step 3: Tokenization
"""
Tokenizing a sentence in a document set means splitting up a sentence into individual words using a delimiter.
The delimiter specifies what character we will use to identify the beginning and the end of a word
(for example we could use a single space as the delimiter for identifying words in our document set.)

"""

pre_processed_documents = []

for i in sans_punctuation_documents:
  splitList = i.split(" ") # Split words by identifying the blank spaces between the words
  pre_processed_documents.append(splitList)

print(pre_processed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [18]:
# Step 4: Count frequencies using Counter from Python library (frequency of each word in a dict format)

freq_list = []
import pprint
from collections import Counter

for i in pre_processed_documents:
  frequency_counts = Counter(i)
  freq_list.append(frequency_counts)

print(freq_list)


[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}), Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}), Counter({'call': 1, 'me': 1, 'now': 1}), Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [19]:
'''
Here we will look to create a frequency matrix on a smaller document set to make sure we understand how the
document-term matrix generation happens. We have created a sample document set 'documents'.
'''
documents = ['Hello, how are you!',
                'Win money, win from home.',
                'Call me now.',
                'Hello, Call hello you tomorrow?']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

# To see all the parameters
print(count_vector.get_params())

{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.int64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'preprocessor': None, 'stop_words': None, 'strip_accents': None, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'vocabulary': None}


In [25]:
count_vector.fit(documents)
count_vector.get_feature_names_out() # The get_feature_names() method returns our feature names for this dataset, which is the set of words that make up our vocabulary for 'documents'.

array(['are', 'call', 'from', 'hello', 'home', 'how', 'me', 'money',
       'now', 'tomorrow', 'win', 'you'], dtype=object)

In [28]:
documents

['Hello, how are you!',
 'Win money, win from home.',
 'Call me now.',
 'Hello, Call hello you tomorrow?']

In [27]:
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]])

In [29]:
frequency_matrix = pd.DataFrame(doc_array, columns = count_vector.get_feature_names_out())
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [33]:
# Again showing the main Data

import pandas as pd
# Dataset available using filepath 'SMSSpamCollection.txt'
df = pd.read_table(filepath_or_buffer = 'SMSSpamCollection.txt',sep = '\t', names = ['label', 'sms_message'] )

# Output printing out first 5 rows
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
# Training testing split

from sklearn.model_selection import train_test_split

x = df['sms_message']
y = df['label']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, test_size=0.2)

print("Number of total rows in the set: {}".format(df.shape[0]))
print("Number of rows in the training set: {}".format(x_train.shape[0]))
print("Number of rows in the test set: {}".format(x_test.shape[0]))


Number of total rows in the set: 5572
Number of rows in the training set: 4457
Number of rows in the test set: 1115


In [38]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(x_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(x_test)

In [39]:
"""
What does the term 'Naive' in 'Naive Bayes' mean ?

The term 'Naive' in Naive Bayes comes from the fact that the algorithm considers the features that it is using to make the predictions to be independent of each other, which may not always be the case.
So in our Diabetes example, we are considering only one feature, that is the test result.
Say we added another feature, 'exercise'.
Let's say this feature has a binary value of 0 and 1, where the former signifies that the individual exercises less than or equal to 2 days a week and the latter signifies that the individual exercises greater than or equal to 3 days a week.
If we had to use both of these features, namely the test result and the value of the 'exercise' feature, to compute our final probabilities, Bayes' theorem would fail.
Naive Bayes' is an extension of Bayes' theorem that assumes that all the features are independent of each other.

"""

"\nWhat does the term 'Naive' in 'Naive Bayes' mean ?\n\nThe term 'Naive' in Naive Bayes comes from the fact that the algorithm considers the features that it is using to make the predictions to be independent of each other, which may not always be the case.\nSo in our Diabetes example, we are considering only one feature, that is the test result. \nSay we added another feature, 'exercise'. \nLet's say this feature has a binary value of 0 and 1, where the former signifies that the individual exercises less than or equal to 2 days a week and the latter signifies that the individual exercises greater than or equal to 3 days a week. \nIf we had to use both of these features, namely the test result and the value of the 'exercise' feature, to compute our final probabilities, Bayes' theorem would fail. \nNaive Bayes' is an extension of Bayes' theorem that assumes that all the features are independent of each other.\n\n"

In [40]:
# Step 5: Naive Bayes implementation using scikit-learn
"""
Specifically, we will be using the multinomial Naive Bayes implementation.
This particular classifier is suitable for classification with discrete features (such as in our case, word counts for text classification).
It takes in integer word counts as its input.
On the other hand Gaussian Naive Bayes is better suited for continuous data as it assumes that the input data has a Gaussian(normal) distribution.

"""
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

In [45]:
y_test

Unnamed: 0,label
3245,ham
944,ham
1044,ham
2484,ham
812,ham
...,...
4264,ham
2439,ham
5556,ham
4205,ham


In [44]:
predictions = naive_bayes.predict(testing_data)
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [48]:
# Making predictions

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions,pos_label='ham')))
print('Recall score: ', format(recall_score(y_test, predictions,pos_label='ham')))
print('F1 score: ', format(f1_score(y_test, predictions,pos_label='ham')))

Accuracy score:  0.9919282511210762
Precision score:  0.9907692307692307
Recall score:  1.0
F1 score:  0.9953632148377125
