In [30]:
# Yelp API client required: !pip install yelp
# An API key from Yelp's developer site is also needed.

from yelp.client import Client
from yelp.oauth1_authenticator import Oauth1Authenticator

# secrets.txt holds one whitespace-separated "name value" pair per line; the
# names are passed straight through as Oauth1Authenticator keyword arguments.
secrets = dict()
with open('secrets.txt', 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at the end of the
            # file) instead of failing on the tuple unpacking below.
            continue
        k, v = fields
        secrets[k] = v

auth = Oauth1Authenticator(**secrets)
client = Client(auth)

params = {
    'term': 'restaurants',
    'lang': 'en'
}

# Search for restaurants in Oakland, then fetch the full record of the first hit.
response = client.search('Oakland', **params)
first_business = response.businesses[0]
a_business = client.get_business(business_id=first_business.id)

# Only a single review comes back per business, so index 0 is the only
# valid entry (index 1 would raise IndexError).
a_business.business.reviews[0]

I was going to use the API to pull a set of reviews for analysis, but unfortunately Yelp only allows a single review per business to be returned via the API. I'd rather not write a web scraper, since that would pretty clearly violate what Yelp allows, but maybe there's another way.

Kaggle.com offers a lot of datasets - fortunately, they host an aggregated Yelp dataset at https://www.kaggle.com/c/yelp-recsys-2013/data. We'll download and unzip this data into the ./data/yelp_training_set directory.

In [174]:
dataset = []

import json
import string

labels, reviews = [], []

# Normally, I would use pandas.read_json functionality to quickly load the data,
# but this particular dataset is causing errors - since we only care about
# the stars and review text at this point, doing it directly is easy enough.

# Each line of the file is parsed as its own JSON object. The encoding is given
# explicitly so decoding does not depend on the platform default, which can
# mangle or reject non-ASCII review text.
with open('./data/yelp_training_set/yelp_training_set_review.json', 'r',
          encoding='utf-8') as f:
    data = [line.strip() for line in f]

for line in data:
    if not line:
        # A blank line (e.g. at the end of the file) is not a JSON document;
        # skip it rather than letting json.loads raise.
        continue
    review = json.loads(line)
    labels.append(review['stars'])
    reviews.append(review['text'])

In [178]:
# Quick sanity check of what was loaded: the number of labels, the first
# label, and the first 30 characters of the second review, with a blank
# line between each item.
print(len(labels), end='\n\n')
print(labels[0], end='\n\n')
preview = reviews[1][:30]
print(preview + '...')

229907

1

I have no idea why some people...


Looking good - we now have 230K reviews, each with a star rating and the review text. Next, we'll split this data into a training set and a test set to make sure we're evaluating our performance fairly, then set up a pipeline that performs a term-frequency/inverse-document-frequency (TF-IDF) transformation on the text and fits an SGD classifier on the transformed training data and the training labels.

In [161]:
# train_test_split lives in sklearn.model_selection from scikit-learn 0.18 on;
# the older sklearn.cross_validation module was removed in 0.20. Try the
# current location first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Hold out part of the data for evaluation. The seed is fixed so that the
# split (and therefore the reported scores) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(reviews, labels,
                                                    random_state=42)

# Raw text -> token counts -> TF-IDF weights -> linear classifier trained
# with stochastic gradient descent (seeded for reproducibility).
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(random_state=42))
                     ])

text_clf = text_clf.fit(X_train, y_train)

The model is now trained, so let's try to predict the labels of the held-out test data and see how performance looks:

In [163]:
from sklearn.metrics import classification_report

# Predict labels for the held-out reviews and print per-class
# precision / recall / F1 against the true test labels.
predicted = text_clf.predict(X_test)
report = classification_report(y_test, predicted)
print(report)

             precision    recall  f1-score   support

          1       0.54      0.76      0.63      4402
          2       0.50      0.15      0.23      5097
          3       0.50      0.15      0.23      8882
          4       0.52      0.55      0.54     20030
          5       0.59      0.81      0.68     19066

avg / total       0.54      0.56      0.52     57477



Not that bad! >50% classification accuracy with 5 labels suggests that we're at least detecting some signal. The model does seem to be better at the extremes of the rating scale (1 or 5) and is a bit mushier in the middle: it badly under-predicts 2- and 3-star reviews (recall of only 0.15 for each), and recovers only a little over half of the 4-star reviews.

But, maybe we don't need that level of detail - what if we just wanted to know if the review was positive (4 or 5) vs. neutral to bad (1-3)?

In [177]:
from sklearn.metrics import classification_report

# Collapse the star ratings into a binary target: 0 for ratings below 4,
# 1 otherwise. The result gets its own name instead of overwriting `labels`,
# so the original ratings stay available and re-running this cell gives the
# same answer (re-binarizing already-binary labels would turn them all into 0).
binary_labels = [0 if stars < 4 else 1 for stars in labels]

# Same pipeline as for the 5-class model: token counts -> TF-IDF -> SGD
# classifier, seeded for reproducibility.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(random_state=42))
                     ])

# Fresh, seeded train/test split against the binary target.
X_train, X_test, y_train, y_test = train_test_split(reviews, binary_labels,
                                                    random_state=42)
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.85      0.68      0.76     18505
          1       0.86      0.94      0.90     38972

avg / total       0.86      0.86      0.85     57477



Much better accuracy! The model does quite a good job of finding positive reviews, although it still under-classifies negative reviews. We might be able to adjust this by balancing the class weights or otherwise trying to balance the data, but as-is the classifier is doing a pretty good job.