## Bag of Words (BoW) 

In [1]:
import nltk
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer # to prepare BoW and n_grams
from sklearn.feature_extraction.text import TfidfVectorizer # to measure TF-IDF value 

In [7]:
data='''annotatorId": 2, "ratingOverall": 5, "ratingRoom": 5, "author": "travellerseattle", 
"hotelId": "277882", "reviewId": "277882:54", "ratingLocation": 4, "ratingCleanliness": 5, 
"ratingService": 5, "ratingBusiness": 3, "ratingValue": 5, "segmentLabels": [{"OTHER": "p"}, 
{"NOTRELATED": "x"}, {"OTHER": "p"}, {"SERVICE": "p"}, {"ROOMS": "p"}, {"CLEANLINESS": "p", 
"ROOMS": "p"}, {"FOOD": "p"}, {"FOOD": "p"}, {"LOCATION": "ip"}, {"OTHER": "p"}, {"OTHER": "p"}, 
{"OTHER": "p"}], "date": "May 15, 2007", "ratingCheckin": 5, 
"segments": ["LOVED THE HAMPTON INN SEAPORT!!!!!!!!!!!!!!!!!!", 
"Just returned from a 3 night stay.", "This is a FABULOUS hotel.", 
"The front desk staff, the doormen, the breakfast staff, 
EVERYONE is incredibly friendly and helpful and warm and welcoming.", 
"The rooms was fabulous too.", "Really comfy beds, great decorating, and super super clean.", 
"The breakfasts are great - fresh fruit, bagels, muffins, hot eggs and sausage, etc.", 
"Good coffee.", "Just around the corner from the hotel is a FABULOUS little Italian restaurant - Bon Amici.", 
"Highly recommend it.", "I will stay at this hotel everytime I come to New York.", 
"Can't say enough great things about it!!!!!
annotatorId": 2, "ratingOverall": 4, "ratingRoom": 4, "author": "culturevulture2", 
"hotelId": "590144", "reviewId": "590144:48", "ratingLocation": 5, "ratingCleanliness": 4, 
"ratingService": 4, "ratingBusiness": -1, "ratingValue": 4, 
"segmentLabels": [{"OTHER": "p"}, {"NOTRELATED": "x"}, {"LOCATION": "p"}, 
{"ROOMS": "p"}, {"ROOMS": "x"}, {"ROOMS": "x"}, {"ROOMS": "x"}, {"CHECKIN": "n"}, 
{"CLEANLINESS": "n"}, {"VALUE": "p"}], "date": "Dec 22, 2007", "ratingCheckin": 3, 
"segments": ["We Will Stay Here Again", 
"This is the second time that we are staying at Lanson Place, Hong Kong.", 
"We loved its location in Causeway Bay-a stone's throw from the Causeway Bay MTR, 
great food outlets at Jardine's Bazaar, good shopping at Times Square.", 
"The room is big for HK standards and there is a small kitchenette if you need to heat up food in the microwave or cook a meal.", 
"This time, our room was in an L-shaped configuration.", 
"The bedroom was a little tight but the living and study space was slightly bigger.", 
"The furnishings and fittings are still in good condition.", 
"When we checked in for our stay, we were a little annoyed that we had to wait for our room to be ready even though it was 21/2 hours past the check-in time.", 
"I was a little taken aback too when one of the bath towels that I wanted to use had 5 holes on it.", "Nevertheless,
the housekeeping team was prompt with providing us with good quality towels when we requested for it.We will stay here again when we next visit HK.'''

In [8]:
data1=re.sub('[^A-Za-z0-9\' ]+', '', data) # caret means removal elements except in bracket, removal of punctuations
data1

"annotatorId 2 ratingOverall 5 ratingRoom 5 author travellerseattle hotelId 277882 reviewId 27788254 ratingLocation 4 ratingCleanliness 5 ratingService 5 ratingBusiness 3 ratingValue 5 segmentLabels OTHER p NOTRELATED x OTHER p SERVICE p ROOMS p CLEANLINESS p ROOMS p FOOD p FOOD p LOCATION ip OTHER p OTHER p OTHER p date May 15 2007 ratingCheckin 5 segments LOVED THE HAMPTON INN SEAPORT Just returned from a 3 night stay This is a FABULOUS hotel The front desk staff the doormen the breakfast staff EVERYONE is incredibly friendly and helpful and warm and welcoming The rooms was fabulous too Really comfy beds great decorating and super super clean The breakfasts are great  fresh fruit bagels muffins hot eggs and sausage etc Good coffee Just around the corner from the hotel is a FABULOUS little Italian restaurant  Bon Amici Highly recommend it I will stay at this hotel everytime I come to New York Can't say enough great things about itannotatorId 2 ratingOverall 4 ratingRoom 4 author cultu

In [9]:
sent_tokens=nltk.sent_tokenize(data1)
sent_tokens

["annotatorId 2 ratingOverall 5 ratingRoom 5 author travellerseattle hotelId 277882 reviewId 27788254 ratingLocation 4 ratingCleanliness 5 ratingService 5 ratingBusiness 3 ratingValue 5 segmentLabels OTHER p NOTRELATED x OTHER p SERVICE p ROOMS p CLEANLINESS p ROOMS p FOOD p FOOD p LOCATION ip OTHER p OTHER p OTHER p date May 15 2007 ratingCheckin 5 segments LOVED THE HAMPTON INN SEAPORT Just returned from a 3 night stay This is a FABULOUS hotel The front desk staff the doormen the breakfast staff EVERYONE is incredibly friendly and helpful and warm and welcoming The rooms was fabulous too Really comfy beds great decorating and super super clean The breakfasts are great  fresh fruit bagels muffins hot eggs and sausage etc Good coffee Just around the corner from the hotel is a FABULOUS little Italian restaurant  Bon Amici Highly recommend it I will stay at this hotel everytime I come to New York Can't say enough great things about itannotatorId 2 ratingOverall 4 ratingRoom 4 author cult

In [10]:
word_tokens=nltk.word_tokenize(data1) # prepearing word tags
print (word_tokens)
len(word_tokens)

['annotatorId', '2', 'ratingOverall', '5', 'ratingRoom', '5', 'author', 'travellerseattle', 'hotelId', '277882', 'reviewId', '27788254', 'ratingLocation', '4', 'ratingCleanliness', '5', 'ratingService', '5', 'ratingBusiness', '3', 'ratingValue', '5', 'segmentLabels', 'OTHER', 'p', 'NOTRELATED', 'x', 'OTHER', 'p', 'SERVICE', 'p', 'ROOMS', 'p', 'CLEANLINESS', 'p', 'ROOMS', 'p', 'FOOD', 'p', 'FOOD', 'p', 'LOCATION', 'ip', 'OTHER', 'p', 'OTHER', 'p', 'OTHER', 'p', 'date', 'May', '15', '2007', 'ratingCheckin', '5', 'segments', 'LOVED', 'THE', 'HAMPTON', 'INN', 'SEAPORT', 'Just', 'returned', 'from', 'a', '3', 'night', 'stay', 'This', 'is', 'a', 'FABULOUS', 'hotel', 'The', 'front', 'desk', 'staff', 'the', 'doormen', 'the', 'breakfast', 'staff', 'EVERYONE', 'is', 'incredibly', 'friendly', 'and', 'helpful', 'and', 'warm', 'and', 'welcoming', 'The', 'rooms', 'was', 'fabulous', 'too', 'Really', 'comfy', 'beds', 'great', 'decorating', 'and', 'super', 'super', 'clean', 'The', 'breakfasts', 'are', '

396

# BoW

In [11]:
stopwords1=(stopwords.words('english'))
cv= CountVectorizer(stop_words=stopwords1) 
print (cv)
cv.fit(sent_tokens) #training algorithm
print (cv.vocabulary_)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
{'annotatorid': 10, 'ratingoverall': 109, 'ratingroom': 110, 'author': 13, 'travellerseattle': 151, 'hotelid': 69, '277882': 4, 'reviewid': 119, '27788254': 5, 'ratinglocation': 108, 'ratingcleanliness': 107, 'ratingservice': 111

In [13]:
cvt=cv.fit_transform(sent_tokens)
print (cvt)

  (0, 155)	1
  (0, 95)	1
  (0, 77)	1
  (0, 116)	1
  (0, 104)	1
  (0, 152)	1
  (0, 103)	1
  (0, 102)	1
  (0, 143)	1
  (0, 71)	1
  (0, 93)	1
  (0, 65)	1
  (0, 153)	1
  (0, 157)	1
  (0, 150)	2
  (0, 15)	1
  (0, 98)	1
  (0, 8)	1
  (0, 142)	1
  (0, 100)	1
  (0, 70)	1
  (0, 2)	1
  (0, 145)	1
  (0, 47)	1
  (0, 113)	1
  :	:
  (0, 0)	1
  (0, 87)	1
  (0, 39)	2
  (0, 74)	1
  (0, 84)	3
  (0, 52)	4
  (0, 30)	2
  (0, 121)	7
  (0, 128)	1
  (0, 97)	2
  (0, 126)	2
  (0, 112)	2
  (0, 105)	2
  (0, 111)	2
  (0, 107)	2
  (0, 108)	2
  (0, 5)	1
  (0, 119)	2
  (0, 4)	1
  (0, 69)	2
  (0, 151)	1
  (0, 13)	2
  (0, 110)	2
  (0, 109)	2
  (0, 10)	1


In [14]:
cvt.toarray() # converting sparse vector to numpy array

array([[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 4, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 2, 1,
        1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 3, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
        2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 3, 7, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
        1, 1, 2, 1, 5, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [10]:
type(cvt)

scipy.sparse.csr.csr_matrix

In [54]:
cvt.get_shape()


(1, 161)

In [55]:
cvt.toarray() # converting sparse vector to numpy array

array([[1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 4, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 2, 1,
        1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 3, 2, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
        2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 3, 7, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
        1, 1, 2, 1, 5, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1]], dtype=int64)

 121 nr of word (rooms) is appearing 7 times and 52 nr of word(food) is appearing 4 times   

In [17]:
positive_words=['good', 'very']
negative_words=['not', 'bad']
dist_positive= nltk.FreqDist(positive_words)
dist_negative=nltk.FreqDist(negative_words)
dist_positive
dist_negative

FreqDist({'not': 1, 'bad': 1})

In [15]:
cv= CountVectorizer(ngram_range=(1, 3)) #means unigram and bi gram , if 1,4 means - unigram, bigram, trigram and tetra gram
cvt_ngram_counts=cv.fit_transform(sent_tokens)
print (cv.vocabulary_)
print (len(cv.vocabulary_))
print (cvt_ngram_counts)

{'annotatorid': 58, 'ratingoverall': 523, 'ratingroom': 526, 'author': 83, 'travellerseattle': 751, 'hotelid': 319, '277882': 12, 'reviewid': 554, '27788254': 15, 'ratinglocation': 520, 'ratingcleanliness': 517, 'ratingservice': 530, 'ratingbusiness': 510, 'ratingvalue': 533, 'segmentlabels': 590, 'other': 475, 'notrelated': 458, 'service': 598, 'rooms': 566, 'cleanliness': 149, 'food': 223, 'location': 413, 'ip': 350, 'date': 178, 'may': 428, '15': 0, '2007': 3, 'ratingcheckin': 513, 'segments': 593, 'loved': 420, 'the': 662, 'hampton': 284, 'inn': 347, 'seaport': 584, 'just': 387, 'returned': 551, 'from': 246, 'night': 455, 'stay': 624, 'this': 703, 'is': 353, 'fabulous': 213, 'hotel': 312, 'front': 252, 'desk': 189, 'staff': 616, 'doormen': 192, 'breakfast': 121, 'everyone': 207, 'incredibly': 344, 'friendly': 243, 'and': 41, 'helpful': 290, 'warm': 774, 'welcoming': 808, 'was': 777, 'too': 741, 'really': 539, 'comfy': 160, 'beds': 109, 'great': 270, 'decorating': 186, 'super': 645,

In [49]:
print (len(cv.vocabulary_))
print ((cvt_ngram_counts.get_shape()))

510
(1, 510)


In [53]:
tf_idf=TfidfVectorizer(ngram_range=(1,2))
final_tf_idf=tf_idf.fit_transform(sent_tokens)

print (final_tf_idf)

  (0, 36)	0.025424641809046068
  (0, 324)	0.050849283618092135
  (0, 326)	0.050849283618092135
  (0, 51)	0.050849283618092135
  (0, 460)	0.025424641809046068
  (0, 199)	0.050849283618092135
  (0, 8)	0.025424641809046068
  (0, 344)	0.050849283618092135
  (0, 10)	0.025424641809046068
  (0, 322)	0.050849283618092135
  (0, 320)	0.050849283618092135
  (0, 328)	0.050849283618092135
  (0, 316)	0.050849283618092135
  (0, 330)	0.050849283618092135
  (0, 365)	0.050849283618092135
  (0, 296)	0.1525478508542764
  (0, 285)	0.050849283618092135
  (0, 370)	0.025424641809046068
  (0, 351)	0.1779724926633225
  (0, 94)	0.050849283618092135
  (0, 142)	0.10169856723618427
  (0, 256)	0.0762739254271382
  (0, 218)	0.025424641809046068
  (0, 113)	0.050849283618092135
  (0, 265)	0.025424641809046068
  :	:
  (0, 467)	0.025424641809046068
  (0, 175)	0.025424641809046068
  (0, 190)	0.025424641809046068
  (0, 291)	0.025424641809046068
  (0, 227)	0.025424641809046068
  (0, 278)	0.025424641809046068
  (0, 205)	0.02