In [22]:
import json
import numpy as np
import pandas as pd
import random
import scipy.stats as stats

from sklearn.metrics import zero_one_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import StratifiedKFold
from sklearn.decomposition import LatentDirichletAllocation

# The review file is consumed line by line; the lines are stitched into one
# JSON array string so pandas can parse the whole file in a single call.
with open('yelp_academic_dataset_review_mini.json', 'rb') as f:
    reviews = f.readlines()
# Skip whitespace-only lines (e.g. a blank line at the end of the file):
# joining an empty entry would leave a dangling comma and the resulting
# array would no longer be valid JSON.
data_json_sep = [line.rstrip() for line in reviews if line.strip()]
data_json = "[" + ','.join(data_json_sep) + "]"
data = pd.read_json(data_json)

def _star_bucket(rating):
    """Collapse a star rating into three classes: 1 (< 3), 2 (== 3), 3 (otherwise)."""
    if rating < 3:
        return 1
    if rating == 3:
        return 2
    return 3


# Three-class target derived from the 'stars' column, plus the review text.
stars = data['stars'].apply(_star_bucket)
text = data['text']

In [29]:
def RMSE(pre, act):
    """Return the root-mean-square error between predictions and actual values.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain lists are accepted and pandas objects are compared position by
    position instead of being aligned on their index labels.

    pre -- predicted values (array-like)
    act -- observed values (array-like)
    """
    residual = np.asarray(pre, dtype=float) - np.asarray(act, dtype=float)
    return np.sqrt((residual ** 2).mean())

def rSquared(pre, act):
    """Return the coefficient of determination (R^2) of predictions vs. actuals.

    Computed as 1 - SS_res / SS_tot, where SS_res is the sum of squared
    residuals and SS_tot is the sum of squared deviations of ``act`` from its
    overall mean. Inputs are converted to float NumPy arrays first, so plain
    lists are accepted and pandas objects are compared by position.

    pre -- predicted values (array-like)
    act -- observed values (array-like)

    Sums run along the first axis, so 1-D inputs yield a scalar. If ``act``
    is constant, SS_tot is zero and the result is not finite.
    """
    pre = np.asarray(pre, dtype=float)
    act = np.asarray(act, dtype=float)
    ss_res = np.sum((pre - act) ** 2, axis=0)
    ss_tot = np.sum((act - act.mean()) ** 2, axis=0)
    return 1 - ss_res / ss_tot

In [11]:
import nltk, re
# Matches every character that is neither an ASCII letter nor whitespace.
# Written as a raw string so "\s" reaches the regex engine instead of being
# read as an (invalid) Python string escape; compiled once for reuse.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")


def tokenize(text):
    """Strip non-letter characters from ``text`` and split it into word tokens.

    Every character other than an ASCII letter or whitespace is deleted
    (not replaced by a space), then the remaining text is passed to
    ``nltk.word_tokenize``.

    text -- the string to tokenize
    Returns whatever ``nltk.word_tokenize`` returns for the cleaned text.
    """
    letters_only = _NON_LETTER_RE.sub("", text)
    return nltk.word_tokenize(letters_only)

# Bag-of-words term counts over single-word tokens produced by `tokenize`,
# with the built-in English stop-word list, document-frequency bounds
# (min_df=2, max_df=0.7) and a vocabulary capped at 500 features.
basic_vectorizer = CountVectorizer(
    min_df=2,
    max_df=0.7,
    ngram_range=(1, 1),
    tokenizer=tokenize,
    stop_words='english',
    max_features=500,
)
# Converted to a dense array; later cells index it with integer index arrays
# and pass it straight to the classifiers.
vector = basic_vectorizer.fit_transform(text).toarray()
# Call form of print: same output for a single argument, and consistent with
# the print(...) calls used elsewhere in this notebook.
print(vector.shape)

(5000, 500)


In [65]:
# Fit a 10-topic LDA model on the term-count matrix (online variational
# updates, fixed random_state so the fit is repeatable).
lda = LatentDirichletAllocation(n_topics=10, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(vector)
feature_names = basic_vectorizer.get_feature_names()

# Number of highest-weighted terms listed for each topic.
n_top_words = 20

for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so reverse it to rank terms by descending weight.
    top_term_ids = topic.argsort()[::-1][:n_top_words]
    print("Topic #%d:" % topic_idx)
    print(" ".join(feature_names[i] for i in top_term_ids))
    # Blank separator line between topics.
    print("")

Topic #0:
pizza cheese sauce good cream ordered like chicken bread delicious ice just really cooked crust dish pasta cake order tasted

Topic #1:
park nice just price like good free hot shop size extra work offer new know red dont real quiet meat

Topic #2:
food people like place dont want good staff know review friendly great wont make youre priced think wish service lot

Topic #3:
place best pittsburgh hot ive dog food great favorite just dogs fries time good restaurant amazing dont order pancakes years

Topic #4:
love great pittsburgh really place foods store selection museum lot prices im parking like art items steelers nice good staff

Topic #5:
good food great beer place service bar pretty nice menu selection atmosphere really like night delicious just definitely friendly dinner

Topic #6:
ordered came got table minutes time restaurant food did didnt order took waitress service went meal server dinner just wasnt

Topic #7:
car service just time store work shop great customer im n

In [23]:
# `features` is another name for `vector` (same object, no copy); the
# classifier cells that follow read their inputs from `features`.
features = vector

In [25]:
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import StratifiedKFold

# Estimate the Gaussian Naive Bayes misclassification rate with stratified
# k-fold cross-validation over the three-class star labels.
gnb = GaussianNB()
k = 2
skf = StratifiedKFold(stars, n_folds=k)
fold_errors = []
for train_index, test_index in skf:
    xtrain, xtest = features[train_index], features[test_index]
    # The fold indices select rows of `features` by position, so pick the
    # matching labels by position too rather than by index label.
    ytrain, ytest = stars.iloc[train_index], stars.iloc[test_index]
    gnb.fit(xtrain, ytrain)
    pred = gnb.predict(xtest)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    err = zero_one_loss(ytest, pred)
    fold_errors.append(err)
# Mean error across folds, printed as a percentage.
averageError = sum(fold_errors) / k
print(100 * averageError)

50.3021760483


In [26]:
size = features.shape[0]

# Hold out a random tenth of the rows for testing. A dedicated, seeded
# generator keeps the split identical across re-runs without touching the
# global `random` state. `//` keeps the sample size an integer, and
# dtype=int keeps the index array usable even when the sample is empty.
test_idx = np.array(random.Random(0).sample(range(0, size), size // 10),
                    dtype=int)

# Mark the held-out rows in a boolean mask and take the complement once,
# instead of scanning `test_idx` for every row (and doing so twice).
is_test = np.zeros(size, dtype=bool)
is_test[test_idx] = True
train_idx = np.flatnonzero(~is_test)  # ascending row positions

xtest, xtrain = features[test_idx], features[train_idx]
# Labels are picked by position so they always line up with the feature rows.
ytest, ytrain = stars.iloc[test_idx], stars.iloc[train_idx]

In [30]:
# Refit Gaussian Naive Bayes on the training split and score it on the
# held-out rows; the cell displays the misclassification rate.
gnb.fit(xtrain, ytrain)
pred = gnb.predict(xtest)
# scikit-learn metrics take (y_true, y_pred) in that order.
err = zero_one_loss(ytest, pred)
err

0.34599999999999997

In [31]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

# Grid-search an SVM over kernel type and penalty C with GridSearchCV's
# default cross-validation and scoring, then score it on the held-out rows.
# NOTE: `svm` is the fitted GridSearchCV wrapper, not a bare SVC.
mod = SVC()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svm = GridSearchCV(mod, parameters)
svm.fit(xtrain, ytrain)

pred = svm.predict(xtest)
# scikit-learn metrics take (y_true, y_pred) in that order.
err = zero_one_loss(ytest, pred)
err

0.24399999999999999