In [1]:
%load_ext autoreload
%autoreload 2

### Import Dependencies

In [2]:
import pandas as pd
import numpy as np

### Import Raw Dataset

In [3]:
# Load the raw SMS corpus from its relative path under data/raw.
raw_dataset_path = '../data/raw/raw_dataset.csv'
raw_dataset = pd.read_csv(raw_dataset_path)

In [4]:
# Preview the first five rows of the freshly loaded data.
raw_dataset.head(5)

Unnamed: 0,SMS_id,SMS
0,1,"\tGo until jurong point, crazy.. Available on..."
1,2,\tOk lar... Joking wif u oni...\n
2,3,\tFree entry in 2 a wkly comp to win FA Cup f...
3,4,\tU dun say so early hor... U c already then ...
4,5,"\tNah I don't think he goes to usf, he lives ..."


In [5]:
# Summarise the columns: dtypes, non-null counts and memory usage.
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
SMS_id    5574 non-null int64
SMS       5574 non-null object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [6]:
# Shorten the column names: SMS_id -> id, SMS -> sms.
column_renames = {"SMS_id": "id", "SMS": "sms"}
raw_dataset = raw_dataset.rename(columns=column_renames)

In [7]:
# Check that the columns now read `id` and `sms`.
raw_dataset.head(5)

Unnamed: 0,id,sms
0,1,"\tGo until jurong point, crazy.. Available on..."
1,2,\tOk lar... Joking wif u oni...\n
2,3,\tFree entry in 2 a wkly comp to win FA Cup f...
3,4,\tU dun say so early hor... U c already then ...
4,5,"\tNah I don't think he goes to usf, he lives ..."


In [8]:
# Drop the `id` column; the cells below only read the `sms` text column.
# errors='ignore' keeps this cell re-runnable: `raw_dataset` is rebound here, so a
# second run would otherwise raise KeyError because `id` is already gone.
raw_dataset = raw_dataset.drop(columns=['id'], errors='ignore')
raw_dataset.head(5)

Unnamed: 0,sms
0,"\tGo until jurong point, crazy.. Available on..."
1,\tOk lar... Joking wif u oni...\n
2,\tFree entry in 2 a wkly comp to win FA Cup f...
3,\tU dun say so early hor... U c already then ...
4,"\tNah I don't think he goes to usf, he lives ..."


Now we have a dataframe of the raw dataset that we can use for preprocessing.
* We will use a sample from this dataframe to test the data preprocessing and feature extraction, and then use the full dataframe for the final results.

### Create a Sample Dataframe

In [9]:
# Small sample (the first 20 messages) used to try out each step before the full run.
sample_size = 20
test_df = raw_dataset.iloc[:sample_size]
print(test_df['sms'])

0      \tGo until jurong point, crazy.. Available on...
1                     \tOk lar... Joking wif u oni...\n
2      \tFree entry in 2 a wkly comp to win FA Cup f...
3      \tU dun say so early hor... U c already then ...
4      \tNah I don't think he goes to usf, he lives ...
5      \tFreeMsg Hey there darling it's been 3 week'...
6      \tEven my brother is not like to speak with m...
7      \tAs per your request 'Melle Melle (Oru Minna...
8      \tWINNER!! As a valued network customer you h...
9      \tHad your mobile 11 months or more? U R enti...
10     \tI'm gonna be home soon and i don't want to ...
11     \tSIX chances to win CASH! From 100 to 20,000...
12     \tURGENT! You have won a 1 week FREE membersh...
13     \tI've been searching for the right words to ...
14              \tI HAVE A DATE ON SUNDAY WITH WILL!!\n
15     \tXXXMobileMovieClub: To use your credit, cli...
16                       \tOh k...i'm watching here:)\n
17     \tEh u remember how 2 spell his name... Y

### Data Preprocessing

* Removing unnecessary punctuation, tags
* Removing stop words — frequent words such as “the”, “is”, etc. that carry little semantic meaning
* Tokenization — convert sentences to words
* Stemming — words are reduced to a root by removing inflection through dropping unnecessary characters, usually a suffix.
* Lemmatization — Another approach to remove inflection by determining the part of speech and utilizing detailed database of the language.

#### Clean/Normalize words

#### Test preprocessing

In [10]:
from src.functions import preprocessor
# Run the project's cleaning step on every sample message and keep the result as a
# one-column DataFrame.
# NOTE(review): the original note here said preprocess returns tokenized text after
# cleaning — confirm its actual return type against src/functions/preprocessor.
test_cleaned_dataset = test_df['sms'].apply(preprocessor.preprocess).to_frame()

[nltk_data] Downloading package punkt to /Users/emmanuvel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emmanuvel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emmanuvel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Tokenize every sample message with get_tokenized_text; the printed frame below
# shows one list of tokens per message.
test_tokenized_dataset = test_df['sms'].apply(preprocessor.get_tokenized_text).to_frame()
print(test_tokenized_dataset)

                                                  sms
0   [jurong, point, crazy.., Available, bugis, gre...
1                   [lar, ..., Joking, wif, oni, ...]
2   [Free, entry, wkly, comp, win, Cup, final, tkt...
3      [dun, say, early, hor, ..., already, say, ...]
4   [Nah, n't, think, goes, usf, lives, around, th...
5   [FreeMsg, Hey, darling, week, word, back, like...
6   [Even, brother, like, speak, They, treat, like...
7   [per, request, 'Melle, Melle, Oru, Minnaminung...
8   [WINNER, valued, network, customer, selected, ...
9   [Had, mobile, months, entitled, Update, latest...
10  [gon, home, soon, n't, want, talk, stuff, anym...
11  [SIX, chances, win, CASH, From, 100, 20,000, p...
12  [URGENT, You, week, FREE, membership, 100,000,...
13  ['ve, searching, right, words, thank, breather...
14                   [HAVE, DATE, SUNDAY, WITH, WILL]
15  [XXXMobileMovieClub, use, credit, click, WAP, ...
16                                    [..., watching]
17  [remember, spell, name, 

#### Final Preprocessing

In [12]:
# Apply both steps to the full dataset (same calls as on the 20-message sample):
# cleaned_dataset holds the output of preprocess, tokenized_dataset the token lists.
cleaned_dataset = raw_dataset['sms'].apply(preprocessor.preprocess).to_frame()
tokenized_dataset = raw_dataset['sms'].apply(preprocessor.get_tokenized_text).to_frame()

### Feature Extraction

* The mapping of textual data to real valued vectors is called feature extraction.
* One of the simplest techniques to numerically represent text is BAG OF WORDS (BOW).
* We make a list of the unique words in the text corpus, called the vocabulary. Then we can represent each sentence or document as a vector over the vocabulary, with 1 for each word that is present and 0 for each word that is absent.

In [21]:
# Generate Vocabulary
from src.functions import feature_extractor
# get_vocabulary() returns two values (see the printed outputs of the next cells):
#   test_vocabulary      - dict mapping each word to an integer index
#   test_vectorized_list - one list of vocabulary indices per message
test_vocabulary, test_vectorized_list = feature_extractor.get_vocabulary(test_tokenized_dataset['sms'])

In [22]:
# Inspect the word -> index mapping built from the sample.
print(test_vocabulary)

{'jurong': 0, 'point': 1, 'crazy..': 2, 'Available': 3, 'bugis': 4, 'great': 5, 'world': 6, 'buffet': 7, '...': 8, 'Cine': 9, 'got': 10, 'amore': 11, 'wat': 12, 'lar': 13, 'Joking': 14, 'wif': 15, 'oni': 16, 'Free': 17, 'entry': 18, 'wkly': 19, 'comp': 20, 'win': 21, 'Cup': 22, 'final': 23, 'tkts': 24, '21st': 25, 'May': 26, '2005': 27, 'Text': 28, '87121': 29, 'receive': 30, 'question': 31, 'std': 32, 'txt': 33, 'rate': 34, 'apply': 35, '08452810075over18': 36, 'dun': 37, 'say': 38, 'early': 39, 'hor': 40, 'already': 41, 'Nah': 42, "n't": 43, 'think': 44, 'goes': 45, 'usf': 46, 'lives': 47, 'around': 48, 'though': 49, 'FreeMsg': 50, 'Hey': 51, 'darling': 52, 'week': 53, 'word': 54, 'back': 55, 'like': 56, 'fun': 57, 'still': 58, 'XxX': 59, 'chgs': 60, 'send': 61, '1.50': 62, 'rcv': 63, 'Even': 64, 'brother': 65, 'speak': 66, 'They': 67, 'treat': 68, 'aids': 69, 'patent': 70, 'per': 71, 'request': 72, "'Melle": 73, 'Melle': 74, 'Oru': 75, 'Minnaminunginte': 76, 'Nurungu': 77, 'Vettam':

In [37]:
# Inspect the index-encoded messages (one list of vocabulary indices per message).
print(test_vectorized_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 8], [13, 8, 14, 15, 16, 8], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 18, 31, 32, 33, 34, 35, 36], [37, 38, 39, 40, 8, 41, 38, 8], [42, 43, 44, 45, 46, 47, 48, 49], [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 32, 60, 61, 62, 63], [64, 65, 56, 66, 67, 68, 56, 69, 70], [71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85], [86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102], [103, 104, 105, 106, 107, 108, 109, 110, 111, 17, 112, 113, 114, 107, 115, 116], [117, 118, 119, 43, 120, 121, 122, 123, 124, 125, 126, 127, 128], [129, 130, 21, 131, 132, 133, 134, 135, 33, 136, 61, 137, 138, 139, 140, 141, 142, 35, 143, 144], [145, 146, 53, 115, 147, 148, 149, 150, 151, 54, 152, 153, 154, 155, 156, 157], [125, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 163, 146, 169, 170, 171], [172, 173, 174, 175, 176], [177, 178, 179, 180, 181, 182, 183, 33, 184, 180, 185, 186, 187, 188], [8, 189], [190, 191, 192, 8, 19

In [38]:
# Get frequency of each word in the vocabulary for feature engineering.
# The result maps vocabulary index -> number of occurrences (see the dict printed in
# the next cell); the helper also prints progress messages.
test_frequency_distribution = feature_extractor.get_word_frequency(test_vectorized_list)

Calculate frequencies of each word in the vocabulary...
Finished calculating frequency distribution...


In [39]:
# Inspect the vocabulary index -> count mapping.
print(test_frequency_distribution)

{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 8, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 2, 18: 2, 19: 1, 20: 1, 21: 2, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 2, 33: 3, 34: 1, 35: 2, 36: 1, 37: 1, 38: 2, 39: 1, 40: 1, 41: 1, 42: 1, 43: 2, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 50: 1, 51: 1, 52: 1, 53: 2, 54: 2, 55: 1, 56: 3, 57: 1, 58: 1, 59: 1, 60: 1, 61: 2, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 1, 83: 1, 84: 1, 85: 1, 86: 1, 87: 1, 88: 1, 89: 1, 90: 1, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 96: 1, 97: 1, 98: 1, 99: 1, 100: 1, 101: 1, 102: 1, 103: 1, 104: 1, 105: 1, 106: 1, 107: 2, 108: 1, 109: 1, 110: 1, 111: 1, 112: 1, 113: 1, 114: 1, 115: 2, 116: 1, 117: 1, 118: 1, 119: 1, 120: 1, 121: 1, 122: 1, 123: 1, 124: 1, 125: 2, 126: 1, 127: 1, 128: 1, 129: 1, 130: 1, 131: 1, 132: 1, 133: 1, 134: 1, 135: 1, 136: 1, 137: 1, 138: 

In [42]:
# Retrieve top words in the vocabulary. threshold_value is a frequency cut-off used to
# remove low-frequency words; with 0 the (truncated) output below shows the indices
# passing through unchanged.
# NOTE(review): the exact comparison (> vs >=) lives in get_top_words — confirm there.
test_threshold_value = 0
# get_top_words returns a dict of top words: key -> new index, value -> index of the word in the vocabulary
test_top_words=feature_extractor.get_top_words(test_frequency_distribution, test_threshold_value)
print(test_top_words)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 99: 99, 100: 100, 101: 101, 102: 102, 103: 103, 104: 104, 105: 105, 106: 106, 107: 107, 108: 108, 109: 109, 110: 110, 111: 111, 112: 112, 113: 113, 114: 114, 115: 115, 116: 116, 117: 117, 118: 118, 119: 119, 120: 120, 121: 121,

In [43]:
# Map each top word (looked up from its vocabulary index) to its frequency, so the
# result can be visualised with a word cloud.
test_word_freq = {
    feature_extractor.get_key(test_vocabulary, word_index): test_frequency_distribution[word_index]
    for word_index in test_top_words.values()
}
print(test_word_freq)

{'jurong': 1, 'point': 1, 'crazy..': 1, 'Available': 1, 'bugis': 1, 'great': 1, 'world': 1, 'buffet': 1, '...': 8, 'Cine': 1, 'got': 1, 'amore': 1, 'wat': 1, 'lar': 1, 'Joking': 1, 'wif': 1, 'oni': 1, 'Free': 2, 'entry': 2, 'wkly': 1, 'comp': 1, 'win': 2, 'Cup': 1, 'final': 1, 'tkts': 1, '21st': 1, 'May': 1, '2005': 1, 'Text': 1, '87121': 1, 'receive': 1, 'question': 1, 'std': 2, 'txt': 3, 'rate': 1, 'apply': 2, '08452810075over18': 1, 'dun': 1, 'say': 2, 'early': 1, 'hor': 1, 'already': 1, 'Nah': 1, "n't": 2, 'think': 1, 'goes': 1, 'usf': 1, 'lives': 1, 'around': 1, 'though': 1, 'FreeMsg': 1, 'Hey': 1, 'darling': 1, 'week': 2, 'word': 2, 'back': 1, 'like': 3, 'fun': 1, 'still': 1, 'XxX': 1, 'chgs': 1, 'send': 2, '1.50': 1, 'rcv': 1, 'Even': 1, 'brother': 1, 'speak': 1, 'They': 1, 'treat': 1, 'aids': 1, 'patent': 1, 'per': 1, 'request': 1, "'Melle": 1, 'Melle': 1, 'Oru': 1, 'Minnaminunginte': 1, 'Nurungu': 1, 'Vettam': 1, 'set': 1, 'callertune': 1, 'Callers': 1, 'Press': 1, 'copy': 1, 

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# `word_freq` was never built in this notebook (only `test_word_freq`, for the
# 20-message sample), so on a fresh kernel this cell failed with a NameError.
# Build the full-dataset equivalents with the same steps used for the sample above.
vocabulary, vectorized_list = feature_extractor.get_vocabulary(tokenized_dataset['sms'])
frequency_distribution = feature_extractor.get_word_frequency(vectorized_list)
# Same setting as test_threshold_value. TODO(review): tune the cut-off for the full corpus.
threshold_value = 0
top_words = feature_extractor.get_top_words(frequency_distribution, threshold_value)
# word -> frequency for every retained word (values of top_words are vocabulary indices).
word_freq = {
    feature_extractor.get_key(vocabulary, word_index): frequency_distribution[word_index]
    for word_index in top_words.values()
}

# Render the word cloud: more frequent words are drawn larger.
wordcloud = WordCloud(background_color="white").generate_from_frequencies(word_freq)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Save file to reports
from pathlib import Path

figure_path = Path("../reports/figures/word_count.png")
# Create reports/figures if it does not exist yet; writing the image fails otherwise.
figure_path.parent.mkdir(parents=True, exist_ok=True)
wordcloud.to_file(str(figure_path))

In [None]:
# Top 10 words
# Rank the (word, count) pairs from most to least frequent and keep the first ten.
ranked_words = sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)
dict(ranked_words[:10])

In [None]:
# Word/frequency pairs in ascending order of frequency, for analysis
sorted_d = list(word_freq.items())
sorted_d.sort(key=lambda pair: pair[1])
print(sorted_d)

In [None]:
# Now we have a vocabulary of all words and the top words in the entire dataset.
# Next we need to obtain feature vectors by encoding these words as numerical vectors.

* Techniques for Encoding - All the popular techniques that are used for encoding.
    * Bag of Words
    * Binary Bag of Words
    * Bigram, Ngram
    * TF-IDF( Term Frequency - Inverse Document Frequency)
    * Word2Vec
    * Avg-Word2Vec
    * TF-IDF Word2Vec

* Bag of Words Model — 
    * Find the unique words i.e., vocabulary from the list of documents. 
    * Parse each document word with the vocabulary, if present ‘1’ else ‘0’. 
    * This makes each document vector maintain the same length that of vocabulary length.
    * We use this vocabulary for the new document vectorization.

#### Bag of Words Model


In [None]:
# Build the custom binary bag-of-words representation for the full tokenized dataset.
# Requires `vocabulary` (presumably the full-dataset counterpart of `test_vocabulary`)
# to have been defined by an earlier cell.
from src.functions import feature_vector
# Debug aid: peek at the first 500 vocabulary entries.
#print({k: vocabulary[k] for k in list(vocabulary)[:500]})
binary_bow = feature_vector.custom_binary_bow_vector(vocabulary,tokenized_dataset['sms'])

#### Word Counts with CountVectorizer(scikit-learn)


##### BINARY BAG OF WORDS

In binary BoW we don't count the frequency of a word; we just place 1 if the word appears in the review, or else 0. CountVectorizer has a parameter `binary`; setting `binary=True` turns our BoW into a binary BoW.

In [None]:
# Binary bag-of-words through the project helper (presumably a CountVectorizer wrapper).
# NOTE(review): cleaned_dataset['sms'] is already the output of preprocessor.preprocess,
# and preprocess is passed in again here — confirm the helper does not clean the text twice.
binary_count_vectorizer_vector = feature_vector.binary_count_vectorizer(preprocessor.preprocess,cleaned_dataset['sms'])

In [None]:
# Show the value returned by the binary CountVectorizer helper.
print(binary_count_vectorizer_vector)

* Drawbacks of BoW/ Binary BoW

Our main objective in doing these text-to-vector encodings is that texts with similar meanings should have vectors close to each other, but in some cases this may not be possible with BoW.

For example, consider two reviews, “This pasta is very tasty” and “This pasta is not tasty”. After stop-word removal both sentences are converted to “pasta tasty”, so both appear to have exactly the same meaning.

The main problem is that we are not considering the words that come before and after each word; this is where the Bigram and Ngram techniques come in.

##### BI-GRAM BOW

Using pairs of consecutive words to build the dictionary is called Bi-Gram; Tri-Gram uses three consecutive words, and so on for N-Gram.

CountVectorizer has a parameter `ngram_range`; if it is set to (1,2), both single words and word pairs are used, which gives a Bi-Gram BoW.

But this massively increases our dictionary size

In [None]:
# Bi-gram bag-of-words through the project helper.
# NOTE(review): the input column is already the output of preprocessor.preprocess and
# preprocess is passed again — confirm the text is not cleaned twice inside the helper.
bigram_count_vectorizer_vector = feature_vector.bigram_count_vectorizer(preprocessor.preprocess,cleaned_dataset['sms'])

#### Word Frequencies with TfidfVectorizer (scikit-learn) 

TF-IDF

Term Frequency - Inverse Document Frequency makes sure that less importance is given to the most frequent words, and it also takes less frequent words into account.

Term Frequency is the number of times a particular word (W) occurs in a review divided by the total number of words (Wr) in the review. The term frequency value ranges from 0 to 1.

Inverse Document Frequency is calculated as log(Total Number of Docs (N) / Number of Docs which contain the particular word (n)). Here, Docs refers to the reviews.

TF-IDF is TF * IDF, that is, $\dfrac{W}{W_r} \cdot \log\dfrac{N}{n}$

Using scikit-learn's tfidfVectorizer we can get the TF-IDF.

In [None]:
# TF-IDF representation through the project helper.
# NOTE(review): the input column is already the output of preprocessor.preprocess and
# preprocess is passed again — confirm the text is not cleaned twice inside the helper.
tfidf_vector = feature_vector.tfidf_vectorizer(preprocessor.preprocess,cleaned_dataset['sms'])

Tf-idf is the best vectorization method among these three because it prioritises the words in each document. The IDF value for the word “this” is low since it is present in both documents. So, unlike word counts, which give higher values to stop words like “in” and “this”, TF-IDF lowers the value of a word if it is present in a larger number of documents, because stop words are repeated in almost every document.

* Limitations of TF-IDF:

Even here we get a TF-IDF value for every word, and in some cases reviews with different meanings may be considered similar after stop-word removal. To overcome this we can use Bi-Gram or N-Gram.

To actually make reviews with similar meanings end up close to each other, we have Word2Vec.

#### Word2Vec 

Word2Vec captures the semantic meaning of words and their relationships with other words. It learns the internal relationships between the words and represents each word as a dense vector.

Gensim's library provides Word2Vec, which takes parameters such as min_count = 5 (only words that occur at least 5 times in the entire data are considered), size = 50 (each word vector has length 50) and workers (the number of worker threads, i.e. cores, used for training).

##### Average Word2Vec

Compute the Word2Vec vector of each word in the sentence, add these vectors together and divide the sum by the number of words in the sentence, i.e. simply average the Word2Vec vectors of all the words.

In [None]:
# Word2Vec-based vectors through the project helper, computed from the tokenized messages.
# NOTE(review): the result is indexed by position in the next cells — confirm what
# each entry holds (per-message average vector vs. something else).
word2vec_vector = feature_vector.word2vec(preprocessor.preprocess,tokenized_dataset['sms'])

In [None]:
# Inspect the entry at index 1 of the Word2Vec result.
print(word2vec_vector[1])

##### TF-IDF WORD2VEC

In TF-IDF Word2Vec, the Word2Vec vector of each word is multiplied by the tf-idf value of that word; these products are summed and then divided by the sum of the tf-idf values of the sentence.

$$V = \frac{t(W_1)\,\mathrm{w2v}(W_1) + t(W_2)\,\mathrm{w2v}(W_2) + \dots + t(W_n)\,\mathrm{w2v}(W_n)}{t(W_1) + t(W_2) + \dots + t(W_n)}$$

In [None]:
# TF-IDF-weighted Word2Vec through the project helper, fed with the Word2Vec result and
# the cleaned messages.
# NOTE(review): despite its name, this variable holds the helper's return value (it is
# indexed in the next cell), not a vectorizer object — consider renaming.
word2vec_tfidf_vectorizer = feature_vector.word2vec_tfidf_vectorizer(word2vec_vector,cleaned_dataset['sms'])

In [None]:
# Inspect the entry at index 5 of the TF-IDF-weighted Word2Vec result.
print(word2vec_tfidf_vectorizer[5])

#### Using CountVectorizer for Feature Extraction 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# `sms_df` was never defined in this notebook, so this cell raised NameError on a
# fresh kernel. Use the raw messages: preprocessor.preprocess is plugged in as the
# analyzer, so the vectorizer runs it on every message itself.
# NOTE(review): as an analyzer, preprocess must return an iterable of tokens — confirm.
sms_df = raw_dataset['sms']
bow_transformer = CountVectorizer(analyzer=preprocessor.preprocess)
bow_transformer.fit(sms_df)

In [None]:
# Now we have a vocabulary of all words and the top words in the entire dataset.
# Next we need to obtain feature vectors by encoding these words as numerical vectors.

* Techniques for Encoding - All the popular techniques that are used for encoding.
    * Bag of Words
    * Binary Bag of Words
    * Bigram, Ngram
    * TF-IDF( Term Frequency - Inverse Document Frequency)
    * Word2Vec
    * Avg-Word2Vec
    * TF-IDF Word2Vec

In [None]:
# Vocabulary size. `vocabulary_` has one entry per feature and, unlike
# get_feature_names() (removed in scikit-learn 1.2), exists in every release.
len(bow_transformer.vocabulary_)

In [None]:
# Pick the entry stored under key 3 of `sms_df` as a worked example and show it.
message4 = sms_df[3]
print(message4)

In [None]:
# Vectorize the single example message with the fitted vectorizer and print the result.
bow4 = bow_transformer.transform([message4])
print(bow4)

In [None]:
# Look up the words behind three feature indices.
# get_feature_names() was removed in scikit-learn 1.2: prefer get_feature_names_out()
# when it exists and fall back to the old name on older releases. The name list is
# also fetched once instead of being rebuilt for every lookup.
# NOTE(review): the indices are hard-coded and only meaningful for one particular
# fitted vocabulary — presumably taken from the printed `bow4` output.
_get_names = getattr(bow_transformer, 'get_feature_names_out', None) or bow_transformer.get_feature_names
feature_names = _get_names()
for feature_index in (6043, 2016, 2035):
    print(feature_names[feature_index])

In [None]:
# Transform all of `sms_df` into a bag-of-words matrix with the fitted vectorizer.
messages_bow = bow_transformer.transform(sms_df)

In [None]:
# Report the size of the bag-of-words matrix and how full it is.
n_rows, n_cols = messages_bow.shape
# nnz / (rows * cols) is the share of NON-zero cells, i.e. the density. The original
# cell printed this number under the label "sparsity"; sparsity is its complement.
density = 100.0 * messages_bow.nnz / (n_rows * n_cols)
print ('Shape of Sparse Matrix: ', messages_bow.shape)
print ('Amount of Non-Zero occurences: ', messages_bow.nnz)
print ('density: %.2f%%' % density)
print ('sparsity: %.2f%%' % (100.0 - density))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the bag-of-words counts and re-weight them in one step;
# `tfidf_transformer` stays available as the fitted transformer.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

In [None]:
# Shape of the TF-IDF matrix.
print (messages_tfidf.shape)

In [None]:
from sklearn.cluster import KMeans

# Cluster the TF-IDF vectors into two groups; random_state is fixed so the run is
# reproducible.
num_clusters = 2
km = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    n_init=14,
    max_iter=100,
    tol=0.00001,
    copy_x=True,
    random_state=99,
).fit(messages_tfidf)

# One cluster label per message, as a plain Python list.
clusters = km.labels_.tolist()
print("Results of Clustering:", clusters, sep="\n")