In [1]:
# I will follow the same problem statement as in lectures 6 and 7.
# The earlier approaches (BOW and one-hot encoding) didn't take the importance of words in a sentence into account.
# The TF-IDF approach, however, addresses this problem.

$$\text{Term Freq (TF)} = \frac{\text{No. of repetitions of the word in a sentence}}{\text{No. of words in the sentence}}$$

and 

$$\text{Inverse Document Freq (IDF)} = \log_e \left( \frac{\text{No. of sentences}}{\text{No. of sentences containing the word}} \right)$$

Finally, we get the TF-IDF matrix by multiplying each word's TF in a sentence by that word's IDF: one row per sentence, one column per word.

Clearly, we have a matrix that assigns importance to individual words in a sentence. Words that appear in every sentence get no importance, because their IDF is $\log_e(1) = 0$, whereas the previous approaches (BOW, one-hot encoding) still assigned them some value.

In [2]:
# read data
import pandas as pd

In [3]:
# slight issue with the encoding of the csv file
import chardet

# Read the CSV as raw bytes first; the handle is closed as soon as the bytes are in memory.
with open('datasets/bag_of_words_dataset/spam.csv', 'rb') as csv_file:
    raw_bytes = csv_file.read()

# Let chardet guess the encoding from the raw bytes and print its report for inspection.
result = chardet.detect(raw_bytes)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7269493857068697, 'language': ''}


In [4]:
# Load the spam dataset CSV, decoding it with the encoding chardet reported (Windows-1252).
messages = pd.read_csv(
    "datasets/bag_of_words_dataset/spam.csv",
    sep=',',
    encoding='Windows-1252',
)

# Peek at the first rows to confirm the file parsed and to see which columns it contains.
print(messages.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [5]:
# remove unnamed columns and clean the data
# errors="ignore" keeps this cell re-runnable: on a second run the "Unnamed" columns
# are already gone and a plain drop() would raise KeyError. rename() already ignores
# missing labels, so the whole chain is safe to execute repeatedly.
messages = (
    messages
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "message"})
)
print(messages)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [6]:
# import regular expression and perform cleansing
import re

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [8]:
# Single WordNet lemmatizer instance, reused for every word in the cleaning loop below.
wordnet_lemmatizer = WordNetLemmatizer()

In [9]:
# further clean the data
# messages -> replace all the special characters -> lemmatization -> store the corpus as a list

# Build the stop-word set and compile the regex once, outside the loop. The original
# rebuilt set(stopwords.words("english")) for every single word of every message,
# which is loop-invariant work.
english_stopwords = set(stopwords.words("english"))
non_letter_pattern = re.compile("[^a-zA-Z]")

corpus_list = []

# Iterate over the column values directly instead of messages['message'][i], so the
# loop does not depend on the DataFrame having a default 0..n-1 index.
for message in messages['message']:
    # Replace everything that is not an ASCII letter with a space, lowercase, tokenize.
    tokens = non_letter_pattern.sub(" ", message).lower().split()
    # Drop stop words and lemmatize what remains.
    lemmas = [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    # Store each cleaned message as one space-joined string.
    corpus_list.append(" ".join(lemmas))

# lets look at the final data
print(type(corpus_list))

<class 'list'>


In [10]:
# we will be using sklearn's TF-IDF approach (TfidfVectorizer)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Build the TF-IDF matrix with the default ngram_range of (1, 1), i.e. unigrams only.
# Other important TfidfVectorizer options: stop_words, lowercase, ngram_range.
# max_features=100 limits the vocabulary to the top 100 terms by corpus frequency.
tfid_generator = TfidfVectorizer(max_features=100)
tfidf_sparse = tfid_generator.fit_transform(corpus_list)
# Densify so the matrix can be printed and inspected as a plain numpy array.
X_train = tfidf_sparse.toarray()

In [12]:
# Show the container type, the (n_messages, n_features) shape, and the matrix itself,
# one item per line.
print(type(X_train), X_train.shape, X_train, sep="\n")

<class 'numpy.ndarray'>
(5572, 100)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [14]:
# vocabulary_ maps each retained term to its column index in the TF-IDF matrix.
print(tfid_generator.vocabulary_)
# NOTICE: 'go': np.int64(21), here 21 refers to the column index of 'go' in X_train

{'go': np.int64(21), 'great': np.int64(25), 'got': np.int64(24), 'wat': np.int64(89), 'ok': np.int64(58), 'free': np.int64(17), 'win': np.int64(93), 'text': np.int64(77), 'txt': np.int64(85), 'say': np.int64(68), 'already': np.int64(0), 'think': np.int64(80), 'life': np.int64(37), 'hey': np.int64(28), 'week': np.int64(91), 'back': np.int64(5), 'like': np.int64(38), 'still': np.int64(73), 'send': np.int64(70), 'friend': np.int64(18), 'prize': np.int64(63), 'claim': np.int64(8), 'call': np.int64(6), 'mobile': np.int64(48), 'co': np.int64(9), 'home': np.int64(30), 'want': np.int64(88), 'today': np.int64(82), 'cash': np.int64(7), 'day': np.int64(13), 'reply': np.int64(65), 'www': np.int64(96), 'right': np.int64(66), 'take': np.int64(75), 'time': np.int64(81), 'message': np.int64(45), 'com': np.int64(10), 'oh': np.int64(57), 'yes': np.int64(99), 'make': np.int64(43), 'way': np.int64(90), 'dont': np.int64(15), 'miss': np.int64(47), 'ur': np.int64(87), 'going': np.int64(22), 'da': np.int64(12

In [15]:
# lets try to change the n_gram range
tfid_generator = TfidfVectorizer(
    max_features=100,
    ngram_range=(1, 2),  # combination of unigram and bigram
)
# Refit on the same corpus; this overwrites the unigram-only X_train from the earlier cell.
X_train = tfid_generator.fit_transform(corpus_list).toarray()
print(X_train.shape)

(5572, 100)


In [16]:
# now, lets look at the vocabulary
print(tfid_generator.vocabulary_)
# NOTICE: 'please call': np.int64(322)

{'go': np.int64(21), 'great': np.int64(25), 'got': np.int64(24), 'wat': np.int64(89), 'ok': np.int64(58), 'free': np.int64(17), 'win': np.int64(93), 'text': np.int64(77), 'txt': np.int64(85), 'say': np.int64(68), 'already': np.int64(0), 'think': np.int64(80), 'life': np.int64(37), 'hey': np.int64(28), 'week': np.int64(91), 'back': np.int64(5), 'like': np.int64(38), 'still': np.int64(73), 'send': np.int64(70), 'friend': np.int64(18), 'prize': np.int64(63), 'claim': np.int64(8), 'call': np.int64(6), 'mobile': np.int64(48), 'co': np.int64(9), 'home': np.int64(30), 'want': np.int64(88), 'today': np.int64(82), 'cash': np.int64(7), 'day': np.int64(13), 'reply': np.int64(65), 'www': np.int64(96), 'right': np.int64(66), 'take': np.int64(75), 'time': np.int64(81), 'message': np.int64(45), 'com': np.int64(10), 'oh': np.int64(57), 'yes': np.int64(99), 'make': np.int64(43), 'way': np.int64(90), 'dont': np.int64(15), 'miss': np.int64(47), 'ur': np.int64(87), 'going': np.int64(22), 'da': np.int64(12

In [17]:
# setting a configuration where i can see the matrix clearly
import numpy as np


def _format_float(value):
    """Render a float with at most three significant digits (0.0 prints as "0")."""
    return "%.3g" % value


np.set_printoptions(
    edgeitems=30,       # show 30 items at each end of a dimension before summarizing
    linewidth=100000,   # very wide lines so each matrix row stays on a single line
    formatter={"float": _format_float},
)

In [19]:
# Show the TF-IDF rows of the first 10 messages, using the compact print options set above.
print(X_train[:10])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.435 0 0 0.461 0.544 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.55 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.456 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.473 0 0 0 0 0 0 0 0.492 0 0 0 0 0 0 0 0.572 0 0 0 0 0 0]
 [0.464 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.886 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.486 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.659 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [None]:
# one should also change the hyperparameters like ngram_range or max_features to increase the accuracy!