In [1]:
# read data
import pandas as pd

In [2]:
# slight issue with the encoding of the csv file
import chardet

# Pull the raw bytes first so the file handle is released before the
# (comparatively slow) character-set detection runs.
with open('datasets/bag_of_words_dataset/spam.csv', 'rb') as f:
    raw_bytes = f.read()

# chardet returns a dict with the guessed encoding, a confidence score and a language.
result = chardet.detect(raw_bytes)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7269493857068697, 'language': ''}


In [3]:
# Load the raw spam dataset. The file is not UTF-8, so the encoding that
# chardet reported for it is passed explicitly.
messages = pd.read_csv(
    "datasets/bag_of_words_dataset/spam.csv",
    sep=',',
    encoding='Windows-1252',
)
print(messages.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [4]:
# Remove the unnamed (all-NaN in the preview) columns and give the two
# remaining columns descriptive names.
#
# - `columns=` already identifies the axis, so no separate `axis=1` is passed.
# - `errors="ignore"` keeps the cell re-runnable: `messages` is reassigned here,
#   so on a second execution the unnamed columns are already gone and a plain
#   drop would raise KeyError.
# - The steps are chained instead of using `inplace=True`, so the frame is
#   rebuilt in a single expression.
messages = (
    messages
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "message"})
)
print(messages)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [5]:
# import the regular-expression module used for the text cleansing below
import re

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [7]:
# Single lemmatizer instance, created once and reused for every word in the
# message-cleaning loop.
wordnet_lemmatizer = WordNetLemmatizer()

In [8]:
# Further clean the data.
# Pipeline per message:
#   replace every non-letter with a space -> lowercase -> split into words
#   -> drop English stopwords -> lemmatize -> re-join into one string.
# The cleaned strings are collected, in row order, in `corpus_list`.

# Build the stopword set once. The previous version rebuilt it for every word
# of every message, which dominated the runtime of this cell.
english_stopwords = set(stopwords.words("english"))

corpus_list = []

# Iterate over the column values directly instead of `messages['message'][i]`:
# that form is a label lookup and only works while the frame still has its
# default 0..n-1 index.
for raw_message in messages['message']:
    ind_review = re.sub(pattern="[^a-zA-Z]", repl=" ", string=raw_message)
    ind_review = ind_review.lower().split()
    ind_review = [
        wordnet_lemmatizer.lemmatize(word)
        for word in ind_review
        if word not in english_stopwords
    ]
    corpus_list.append(" ".join(ind_review))

# lets look at the final data
print(type(corpus_list))

<class 'list'>


In [9]:
# we will be using sklearn BOW approach
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Build the bag-of-words matrix.
# CountVectorizer has several options worth knowing about:
# stop_words, binary, lowercase and ngram_range.
bow_generator = CountVectorizer(
    max_features=3000,  # cap the vocabulary at the 3000 most frequent terms
)

# Learn the vocabulary from the cleaned corpus, count term occurrences per
# message, and convert the sparse result into a dense array.
X_train = (
    bow_generator
    .fit_transform(corpus_list)
    .toarray()
)

In [15]:
# Quick sanity check of the feature matrix: container type, dimensions,
# then the (abbreviated) contents, each on its own line.
print(type(X_train), X_train.shape, X_train, sep="\n")

<class 'numpy.ndarray'>
(5572, 3000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
