## Bag-of-words

In [1]:
import pandas as pd

messages = pd.read_csv('SMSSpamCollection', sep='\t', names=['labels','message'])

In [2]:
messages

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
import re
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords

In [4]:
corpus = []
for i in range(len(messages)):
    review = re.sub('[^a-zA-z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    

In [5]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he goes to usf he lives around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aids patent',
 'as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press to copy your friends callertune',
 'winner as a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hours only',
 'had your mobile months or more u r entitled to update to the latest colour mobiles with camera for free call 

In [6]:
## create the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# for binary BOW enable binary=True
cv = CountVectorizer(max_features=100, binary=True)

In [7]:
X=cv.fit_transform(corpus).toarray()

In [8]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
X.shape

(5572, 100)

In [10]:
cv.vocabulary_

{'go': 26,
 'only': 62,
 'in': 38,
 'there': 82,
 'got': 29,
 'ok': 59,
 'free': 23,
 'to': 86,
 'text': 78,
 'txt': 88,
 'so': 71,
 'then': 81,
 'don': 20,
 'think': 83,
 'he': 32,
 'here': 33,
 'it': 40,
 'been': 10,
 'now': 57,
 'and': 3,
 'no': 55,
 'back': 8,
 'like': 45,
 'some': 72,
 'you': 98,
 'up': 89,
 'for': 22,
 'still': 74,
 'send': 70,
 'my': 52,
 'is': 39,
 'not': 56,
 'with': 97,
 'me': 50,
 'as': 6,
 'your': 99,
 'all': 1,
 'have': 31,
 'call': 13,
 'mobile': 51,
 'or': 63,
 'the': 80,
 'on': 60,
 'be': 9,
 'home': 35,
 'want': 91,
 'about': 0,
 'this': 84,
 'today': 87,
 'from': 24,
 'day': 17,
 'reply': 68,
 'our': 64,
 'take': 76,
 'will': 96,
 'at': 7,
 'how': 36,
 'did': 18,
 'if': 37,
 'that': 79,
 'its': 41,
 'dont': 21,
 'ur': 90,
 'going': 27,
 'when': 95,
 'da': 16,
 'lor': 47,
 'can': 14,
 'just': 42,
 'are': 5,
 'do': 19,
 'love': 48,
 'we': 93,
 'll': 46,
 'know': 43,
 'what': 94,
 'was': 92,
 'out': 65,
 'but': 11,
 'tell': 77,
 'of': 58,
 'please': 67,


## N-gram

In [11]:
# ngram_range=(1,2)

from sklearn.feature_extraction.text import CountVectorizer
# for binary BOW enable binary=True
cv = CountVectorizer(max_features=100, binary=True, ngram_range=(1,1))
X = cv.fit_transform(corpus).toarray()

In [12]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
cv.vocabulary_

{'go': 26,
 'only': 62,
 'in': 38,
 'there': 82,
 'got': 29,
 'ok': 59,
 'free': 23,
 'to': 86,
 'text': 78,
 'txt': 88,
 'so': 71,
 'then': 81,
 'don': 20,
 'think': 83,
 'he': 32,
 'here': 33,
 'it': 40,
 'been': 10,
 'now': 57,
 'and': 3,
 'no': 55,
 'back': 8,
 'like': 45,
 'some': 72,
 'you': 98,
 'up': 89,
 'for': 22,
 'still': 74,
 'send': 70,
 'my': 52,
 'is': 39,
 'not': 56,
 'with': 97,
 'me': 50,
 'as': 6,
 'your': 99,
 'all': 1,
 'have': 31,
 'call': 13,
 'mobile': 51,
 'or': 63,
 'the': 80,
 'on': 60,
 'be': 9,
 'home': 35,
 'want': 91,
 'about': 0,
 'this': 84,
 'today': 87,
 'from': 24,
 'day': 17,
 'reply': 68,
 'our': 64,
 'take': 76,
 'will': 96,
 'at': 7,
 'how': 36,
 'did': 18,
 'if': 37,
 'that': 79,
 'its': 41,
 'dont': 21,
 'ur': 90,
 'going': 27,
 'when': 95,
 'da': 16,
 'lor': 47,
 'can': 14,
 'just': 42,
 'are': 5,
 'do': 19,
 'love': 48,
 'we': 93,
 'll': 46,
 'know': 43,
 'what': 94,
 'was': 92,
 'out': 65,
 'but': 11,
 'tell': 77,
 'of': 58,
 'please': 67,


In [14]:
# ngram_range=(1,2)

from sklearn.feature_extraction.text import CountVectorizer
# for binary BOW enable binary=True
cv = CountVectorizer(max_features=500, binary=True, ngram_range=(1,2))
X = cv.fit_transform(corpus).toarray()

In [15]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
cv.vocabulary_

{'go': 136,
 'only': 290,
 'in': 188,
 'great': 145,
 'world': 476,
 'there': 391,
 'got': 144,
 'wat': 446,
 'ok': 284,
 'lar': 204,
 'free': 127,
 'to': 401,
 'win': 467,
 'st': 364,
 'may': 238,
 'text': 377,
 'receive': 329,
 'txt': 428,
 'rate': 323,
 'apply': 20,
 'over': 298,
 'dun': 103,
 'say': 338,
 'so': 355,
 'early': 105,
 'already': 9,
 'then': 390,
 'don': 98,
 'think': 395,
 'he': 163,
 'around': 23,
 'here': 169,
 'hey': 170,
 'it': 195,
 'been': 36,
 'week': 453,
 'now': 276,
 'and': 15,
 'no': 272,
 'word': 474,
 'back': 31,
 'like': 215,
 'some': 356,
 'fun': 131,
 'you': 487,
 'up': 430,
 'for': 121,
 'still': 366,
 'xxx': 480,
 'send': 342,
 'even': 111,
 'my': 259,
 'is': 192,
 'not': 274,
 'speak': 362,
 'with': 469,
 'me': 240,
 'they': 392,
 'is not': 193,
 'as': 24,
 'per': 302,
 'your': 497,
 'has': 155,
 'all': 8,
 'friends': 129,
 'has been': 156,
 'network': 265,
 'customer': 83,
 'have': 157,
 'selected': 341,
 'prize': 318,
 'claim': 68,
 'call': 51,
 '

In [17]:
# ngram_range=(1,3)

from sklearn.feature_extraction.text import CountVectorizer
# for binary BOW enable binary=True
cv = CountVectorizer(max_features=500, binary=True, ngram_range=(2,3))
X = cv.fit_transform(corpus).toarray()

In [18]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
cv.vocabulary_

{'free entry': 97,
 'to win': 388,
 'to to': 386,
 'to receive': 376,
 'you up': 489,
 'up for': 402,
 'to send': 379,
 'is not': 173,
 'like to': 205,
 'with me': 450,
 'has been': 128,
 'you have': 472,
 'have been': 130,
 'selected to': 289,
 'to claim': 352,
 'claim call': 60,
 'claim code': 61,
 'you have been': 473,
 'to claim call': 353,
 'had your': 124,
 'your mobile': 498,
 'to the': 385,
 'the latest': 328,
 'for free': 89,
 'free call': 96,
 'free on': 98,
 'gonna be': 113,
 'and don': 9,
 'want to': 415,
 'to talk': 383,
 'and send': 13,
 'have won': 135,
 'in our': 164,
 'the word': 338,
 'to no': 371,
 'you have won': 474,
 've been': 411,
 'for the': 93,
 'thank you': 319,
 'at all': 25,
 'in the': 165,
 'the next': 331,
 'that the': 323,
 'the way': 337,
 'is that': 174,
 'going to': 112,
 'with you': 453,
 'did you': 72,
 'are you': 22,
 'do you': 75,
 'my love': 229,
 'we re': 425,
 'let you': 201,
 'you know': 477,
 'know if': 190,
 'not sure': 243,
 'to live': 366,

In [20]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [21]:
X.shape

(5572, 500)