In [1]:
# -*- coding: utf-8 -*-
# Indentation: Jupyter Notebook

'''
Various form of Vectorization
'''

__version__ = 1.0
__author__ = "Sourav Raj"
__author_email__ = "souravraj.iitbbs@gmail.com"


In [2]:
import pandas as pd
import nltk
import re
import string
pd.set_option('display.max_colwidth', 100)

In [3]:
stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()

In [5]:
data = pd.read_csv('../../data/SMSSpamCollection.tsv', sep='\t')
data.columns=['label', 'body_text']

Cleaning (punctuation, tokenize, remove stopwords, stem)

In [26]:
def clean_text(text):
    text=''.join([word.lower() for word in text if word not in string.punctuation])
    tokens=re.split('\W', text)
    text=[ps.stem(word) for word in tokens if word not in stopwords]
    return text

No need to run this function separately on give data as countvectorizer itself have analyzer parameter to do this thing

# Count Vectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect=CountVectorizer(analyzer=clean_text)
X_counts=count_vect.fit_transform(data['body_text'])
print(X_counts.shape)

(5567, 8098)


To print out all the word in all the document


In [8]:
print(count_vect.get_feature_names())

['', '0', '008704050406', u'0089mi', '0121', '01223585236', '01223585334', '0125698789', '02', '020603', '0207', '02070836089', '02072069400', '02073162414', '02085076972', '020903', '021', '050703', '0578', '06', '060505', '061104', '07008009200', '07046744435', '07090201529', '07090298926', '07099833605', '071104', '07123456789', '0721072', '07732584351', '07734396839', '07742676969', '07753741225', '0776xxxxxxx', '07786200117', '077xxx', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '0784987', '0789xxxxxxx', '0794674629107880867867', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906', '08002988890', '08006344447', '0808', '08081263000', '08081560665', '0825', '0844', '08448350055', '08448714184', '0845', '08450542832', '08452810071', '08452810073', u'08452810075over18', '0870', '08700621170150p', '08701213

In [9]:
data_sample=data[0:20]
count_vect_sample=CountVectorizer(analyzer=clean_text)
X_counts_sample=count_vect_sample.fit_transform(data_sample['body_text'])
X_counts_sample.shape

(20, 193)

In [11]:
print(count_vect_sample.get_feature_names())

['', '08002986030', u'08452810075over18', '09061701461', '1', '100', '100000', '11', '12', '120', '150pday', '16', '2', '20000', '2005', '21st', '3', '4', '4403ldnw1a7rw18', '4txt', u'6day', '81010', '87077', '87121', '87575', '9', '900', 'aft', u'aid', u'alreadi', 'alright', u'anymor', u'appli', 'ard', 'around', 'b', 'brother', 'call', u'caller', u'callertun', 'camera', 'cash', u'chanc', 'claim', 'click', 'co', 'code', 'colour', 'comin', 'comp', u'copi', 'cost', 'credit', u'cri', 'csh11', 'cup', u'custom', 'da', 'date', 'dont', 'eg', 'eh', 'england', 'enough', u'entitl', u'entri', 'even', 'fa', 'feel', 'ffffffffff', 'final', 'fine', 'finish', 'first', 'free', u'friend', u'go', 'goalsteam', u'goe', 'gonna', 'gota', 'ha', 'hl', 'home', u'hour', 'httpwap', 'im', 'info', 'ive', 'jackpot', u'joke', 'k', 'kim', 'kl341', 'lar', 'latest', 'lccltd', 'like', 'link', u'live', 'lor', 'lunch', 'macedonia', 'make', 'may', 'meet', u'mell', 'membership', u'messag', u'minnaminungint', u'miss', u'mobil

In [12]:
X_counts_sample

<20x193 sparse matrix of type '<type 'numpy.int64'>'
	with 224 stored elements in Compressed Sparse Row format>

In [14]:
X_count_df=pd.DataFrame(X_counts_sample.toarray())
X_count_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,183,184,185,186,187,188,189,190,191,192
0,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,0,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
6,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,2,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,1,1,0,0,0


In [15]:
X_count_df.columns=count_vect_sample.get_feature_names()

In [16]:
X_count_df.tail()

Unnamed: 0,Unnamed: 1,08002986030,08452810075over18,09061701461,1,100,100000,11,12,120,...,week,wet,win,winner,wkli,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# N-Grams

As N-grams takes entire sentence not individual workd, we have to modify clean_text function

In [17]:
def clean_text(text):
    text=''.join([word.lower() for word in text if word not in string.punctuation])
    tokens=re.split('\W', text)
    text=' '.join([ps.stem(word) for word in tokens if word not in stopwords])
    return text

In [18]:
data['cleaned_text']=data['body_text'].apply(lambda x:clean_text(x))
data.head()

Unnamed: 0,label,body_text,cleaned_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd...
1,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think goe usf live around though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aid patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend...


N-Grams is built with CountVectorizer itself. just we have to pass the ngram-range value = (start, end) in countvectorizer object

ngram-range =(1,1) : select only unigram

ngram-range=(1,2) : select unigram & bigram 

to optimize range we have to try with different value

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
ngram_vect=CountVectorizer(ngram_range=(2,2))

In [20]:
X_counts=ngram_vect.fit_transform(data['cleaned_text'])
X_counts.shape

(5567, 31257)

In [23]:
ngram_vect.get_feature_names()

[u'008704050406 sp',
 u'0089mi last',
 u'0121 2025050',
 u'01223585236 xx',
 u'01223585334 cum',
 u'0125698789 ring',
 u'02 user',
 u'020603 2nd',
 u'0207 153',
 u'02072069400 bx',
 u'02073162414 cost',
 u'02085076972 repli',
 u'020903 2nd',
 u'021 3680',
 u'021 3680offer',
 u'050703 tcsbcm4235wc1n3xx',
 u'06 good',
 u'07046744435 arrang',
 u'07090298926 reschedul',
 u'07099833605 reschedul',
 u'07123456789 87077',
 u'0721072 find',
 u'07732584351 rodger',
 u'07734396839 ibh',
 u'07742676969 show',
 u'07753741225 show',
 u'0776xxxxxxx uve',
 u'077xxx 2000',
 u'07801543489 guarante',
 u'07808 xxxxxx',
 u'07808247860 show',
 u'07808726822 award',
 u'07815296484 show',
 u'0784987 show',
 u'0789xxxxxxx today',
 u'0796xxxxxx today',
 u'07973788240 show',
 u'07xxxxxxxxx 2000',
 u'07xxxxxxxxx show',
 u'0800 0721072',
 u'0800 169',
 u'0800 18',
 u'0800 195',
 u'0800 1956669',
 u'0800 505060',
 u'0800 542',
 u'08000407165 18',
 u'08000776320 repli',
 u'08000839402 2stoptx',
 u'08000839402 2stop

In [None]:
def clean_text(text):
    text=''.join([word.lower() for word in text if word not in string.punctuation])
    tokens=re.split('\W', text)
    text=[ps.stem(word) for word in tokens if word not in stopwords]
    return text

# TFIDF

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(data['body_text'])
X_tfidf.shape

(5567, 8098)

In [31]:
print(tfidf_vect.get_feature_names())

['', '0', '008704050406', u'0089mi', '0121', '01223585236', '01223585334', '0125698789', '02', '020603', '0207', '02070836089', '02072069400', '02073162414', '02085076972', '020903', '021', '050703', '0578', '06', '060505', '061104', '07008009200', '07046744435', '07090201529', '07090298926', '07099833605', '071104', '07123456789', '0721072', '07732584351', '07734396839', '07742676969', '07753741225', '0776xxxxxxx', '07786200117', '077xxx', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '0784987', '0789xxxxxxx', '0794674629107880867867', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906', '08002988890', '08006344447', '0808', '08081263000', '08081560665', '0825', '0844', '08448350055', '08448714184', '0845', '08450542832', '08452810071', '08452810073', u'08452810075over18', '0870', '08700621170150p', '08701213