In [1]:
from __future__ import division
from pymongo import MongoClient
from datetime import datetime
from collections import Counter
import pandas as pd
import tqdm
import json
from textblob import TextBlob
from utils.data_manager import get_business_reviews, get_business
import preprocess_reviews

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

%load_ext autoreload
%autoreload 2

from IPython.core.display import HTML
import urllib2
HTML(urllib2.urlopen('http://seanwade.com/jupyter.css').read())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
client = MongoClient()
db = client.yelp

In [3]:
review_list = get_business_reviews('GzWiVvE6JDdBNowNVshItA')
review_list = [x['text'] for x in review_list]

In [4]:
print review_list[0]

We set out to find Bavarian Point restaurant and stumbled into Zur-Kate by accident.  They are next door to each other, although I had never heard of Zur-Kate before.  What a stroke of luck.

The atmosphere was very cozy with lots of German flair.  The place was very crowded at 8pm, and everyone seemed to be having a very god time.  The first thing we noticed about the menu was the far lower prices than the ones on menus we had seen on line for other Valley German restaurants.  Many different kinds of wurst, schitzel and chops are available to choose from.  Each entree comes with a choice of German potato salad, spatzel, potato cake, home fries or french fries.  You also receive your choice of sauerkraut or red cabbage.  A salad bar is included, and basket of rye bread delivered to the table.  There is not an entree on the menu over $12, and you can add a wurst to any entree for $2.50.  Beers are in the $2-$4 range - where does that happen?  These prices are already amazing, but then y

In [50]:
blob = TextBlob(review_list[1])

The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

In [52]:
for sent in blob.sentences:
    print sent.sentiment

Sentiment(polarity=0.39999999999999997, subjectivity=0.6166666666666667)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.845, subjectivity=0.8900000000000001)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.7, subjectivity=0.6000000000000001)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=1.0, subjectivity=1.0)
Sentiment(polarity=0.3, subjectivity=0.5)
Sentiment(polarity=0.3499999999999999, subjectivity=0.6666666666666666)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.0)


In [53]:
blob.sentiment

Sentiment(polarity=0.56275, subjectivity=0.6896666666666668)

In [49]:
blob.ngrams()

[WordList([u'We', u'set', u'out']),
 WordList([u'set', u'out', u'to']),
 WordList([u'out', u'to', u'find']),
 WordList([u'to', u'find', u'Bavarian']),
 WordList([u'find', u'Bavarian', u'Point']),
 WordList([u'Bavarian', u'Point', u'restaurant']),
 WordList([u'Point', u'restaurant', u'and']),
 WordList([u'restaurant', u'and', u'stumbled']),
 WordList([u'and', u'stumbled', u'into']),
 WordList([u'stumbled', u'into', u'Zur-Kate']),
 WordList([u'into', u'Zur-Kate', u'by']),
 WordList([u'Zur-Kate', u'by', u'accident']),
 WordList([u'by', u'accident', u'They']),
 WordList([u'accident', u'They', u'are']),
 WordList([u'They', u'are', u'next']),
 WordList([u'are', u'next', u'door']),
 WordList([u'next', u'door', u'to']),
 WordList([u'door', u'to', u'each']),
 WordList([u'to', u'each', u'other']),
 WordList([u'each', u'other', u'although']),
 WordList([u'other', u'although', u'I']),
 WordList([u'although', u'I', u'had']),
 WordList([u'I', u'had', u'never']),
 WordList([u'had', u'never', u'heard'

In [58]:
pos_cursor = db.reviews.find({'stars':{'$gt':3}})
neg_cursor = db.reviews.find({'stars':{'$lt':3}})

pos_list = []
neg_list = []

for i in range(10):
    pos_review = pos_cursor.next()
    neg_review = neg_cursor.next()
    
    pos_list.append((pos_review['text'], 'pos'))
    neg_list.append((neg_review['text'], 'neg'))

In [62]:
preprocess_reviews.prepare_classify(5)

([(u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.',
   'pos'),
  (u"I like this place a lot. It's a good toasted hoagie.\n\nI actually don't like a my bun exploding with meat, but as a previous poster mentioned if you do maybe you wouldn't like this place.\n\nThe inside badly needs updated though. \n\nThe staff is friendly.",
   'pos'),
  (u'Cold cheap beer. Good bar food. Good service. \n\nLooking for a great Pittsburgh style fish sandwich, this is the place to go. The breading is light, fish is more than plentiful and a good side of home cut fries. \n\nGood grilled chicken salads or

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
cv = CountVectorizer()

In [20]:
a = cv.fit_transform(review_list)

In [22]:
a.toarray().shape

(115, 1946)

In [49]:
def word_feats(words):
    return dict([(word, True) for word in words])
 
negfeats = [(word_feats(x), 'neg') for x in negative_reviews]
posfeats = [(word_feats(x), 'pos') for x in positive_reviews]
 
negcutoff = int(len(negfeats)*3/4)
poscutoff = int(len(posfeats)*3/4)
 
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
 
classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features()

train on 150000 instances, test on 50000 instances
accuracy: 0.84008
Most Informative Features
          unprofessional = True              neg : pos    =    150.4 : 1.0
                    2013 = True              neg : pos    =    107.0 : 1.0
                Horrible = True              neg : pos    =    102.1 : 1.0
                   rude! = True              neg : pos    =     92.3 : 1.0
                   Worst = True              neg : pos    =     86.5 : 1.0
                apology, = True              neg : pos    =     85.0 : 1.0
                   Worse = True              neg : pos    =     80.3 : 1.0
               horrible! = True              neg : pos    =     77.0 : 1.0
                    Rude = True              neg : pos    =     69.0 : 1.0
               Terrible. = True              neg : pos    =     66.3 : 1.0


In [3]:
pipe = [{'$group': {'_id':'$business_id', 'count':{'$sum':1}}}, {'$sort': {'count':-1}}]
db.reviews.aggregate(pipe)['result']

TypeError: 'CommandCursor' object has no attribute '__getitem__'