In [14]:
import json
import numpy as np
import pandas as pd
import random
import scipy.stats as stats

from sklearn.metrics import zero_one_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import StratifiedKFold

# The review dump is read as newline-delimited JSON: each line is wrapped into
# one JSON array so pandas can parse the whole file in a single call.
with open('yelp_academic_dataset_review_mini.json', 'rb') as f:
    reviews = f.readlines()
# Decode explicitly so the comma-join operates on text under both Python 2 and
# Python 3 (assumes the file is UTF-8 -- TODO confirm), and skip blank lines,
# which would otherwise leave a stray comma and make the array invalid JSON.
data_json_sep = [line.decode('utf-8').rstrip() for line in reviews if line.strip()]
data_json = "[" + ','.join(data_json_sep) + "]"
data = pd.read_json(data_json)

# Columns used by the cells below: star ratings (the labels) and review text.
stars = data['stars']
text = data['text']

                 business_id       date               review_id  stars  \
0     5UmKMjUEUNdYWqANhGckJw 2012-08-01  Ya85v4eqdd6k9Od8HbQjyA      4   
1     5UmKMjUEUNdYWqANhGckJw 2014-02-13  KPvLNJ21_4wbYNctrOwWdQ      5   
2     5UmKMjUEUNdYWqANhGckJw 2015-10-31  fFSoGV46Yxuwbr3fHNuZig      5   
3     UsFtqoBl7naz8AVUBZMjQQ 2013-11-08  Di3exaUCFNw1V4kSNW5pgA      5   
4     UsFtqoBl7naz8AVUBZMjQQ 2014-03-29  0Lua2-PbqEQMjD9r89-asw      3   
5     UsFtqoBl7naz8AVUBZMjQQ 2014-10-29  7N9j5YbBHBW6qguE5DAeyA      1   
6     UsFtqoBl7naz8AVUBZMjQQ 2014-11-28  mjCJR33jvUNt41iJCxDU_g      4   
7     3eu6MEFlq2Dg7bQh8QbdOg 2014-02-27  Ieh3kfZ-5J9pLju4JiQDvQ      5   
8     3eu6MEFlq2Dg7bQh8QbdOg 2015-06-16  PU28OoBSHpZLkYGCmNxlmg      5   
9     cE27W9VPgO88Qxe4ol6y_g 2012-08-19  XsA6AojkWjOHA4FmuAb8XQ      3   
10    cE27W9VPgO88Qxe4ol6y_g 2013-04-18  rkD7UDbQ9VM3Va6bI-eBHQ      1   
11    cE27W9VPgO88Qxe4ol6y_g 2013-07-14  WExNE-f93SL4D1q8s9QWKg      1   
12    cE27W9VPgO88Qxe4ol6y_g 2013-08-1

In [7]:
def RMSE(pre, act):
    """Return the root-mean-squared error between predictions and targets.

    Parameters
    ----------
    pre : array-like
        Predicted values.
    act : array-like
        Actual values; must broadcast against ``pre``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((pre - act) ** 2))`` taken over all elements.

    Notes
    -----
    Both inputs are converted to float arrays before subtracting, so plain
    lists are accepted, unsigned-integer inputs cannot wrap around, and
    pandas objects are compared by position instead of being aligned on
    their index.
    """
    pre = np.asarray(pre, dtype=float)
    act = np.asarray(act, dtype=float)
    return np.sqrt(((pre - act) ** 2).mean())

def rSquared(pre, act):
    """Return the coefficient of determination (R^2) of ``pre`` against ``act``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    residuals ``(pre - act) ** 2`` and ``SS_tot`` is the sum of squared
    deviations of ``act`` from its overall mean.

    Parameters
    ----------
    pre : array-like
        Predicted values.
    act : array-like
        Actual values; must broadcast against ``pre``.

    Returns
    -------
    numpy.float64
        The R^2 score for 1-D inputs. Sums run along the first axis, as the
        built-in ``sum`` did in the previous implementation.

    Notes
    -----
    Inputs are converted to float arrays first, so plain lists work and
    pandas objects are compared by position. If every value in ``act`` is
    identical, ``SS_tot`` is zero and the result follows NumPy's
    divide-by-zero rules (``inf``/``nan`` with a warning).
    """
    pre = np.asarray(pre, dtype=float)
    act = np.asarray(act, dtype=float)
    # Vectorised reductions instead of Python-level iteration via built-in sum.
    ss_res = ((pre - act) ** 2).sum(axis=0)
    ss_tot = ((act - act.mean()) ** 2).sum(axis=0)
    return 1 - ss_res / ss_tot

In [2]:
import nltk, re
def tokenize(text):
    """Split ``text`` into word tokens after blanking out non-letters.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, then
    the result is handed to ``nltk.word_tokenize``; its token list is
    returned unchanged.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return nltk.word_tokenize(letters_only)

# Bag-of-words features: unigram and bigram counts over the review text,
# tokenised with `tokenize`, English stop words removed, capped at 85 features.
basic_vectorizer = CountVectorizer(
    tokenizer=tokenize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
    max_features=85,
)
# Dense array of the fitted counts; the cells below index it by row.
vector = basic_vectorizer.fit_transform(text).toarray()
vector.shape

(5000, 85)

In [3]:
# Hold out a random tenth of the rows as a test set; the rest is training data.
SPLIT_SEED = 0  # fixed seed so the hold-out split is the same on every run
size = vector.shape[0]

# `//` keeps the sample count an integer (random.sample rejects a float), and a
# private Random instance leaves the global `random` state untouched.
# dtype=int keeps an empty sample usable as an index array.
test_idx = np.array(random.Random(SPLIT_SEED).sample(range(0, size), size // 10),
                    dtype=int)

# One boolean mask replaces the per-row `i not in test_idx` scan that was
# previously evaluated twice.
train_mask = np.ones(size, dtype=bool)
train_mask[test_idx] = False

xtest = vector[test_idx]
xtrain = vector[train_mask]
# .iloc selects labels by position, matching how the rows of `vector` are
# selected, regardless of what index `stars` carries.
ytest = stars.iloc[test_idx]
ytrain = stars.iloc[train_mask]

In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import StratifiedKFold

# Estimate the Gaussian naive Bayes misclassification rate with stratified
# k-fold cross-validation and print it as a percentage.
gnb = GaussianNB()
k = 2  # number of folds
# Legacy sklearn.cross_validation API: the labels are passed to the
# constructor and iterating the object yields (train, test) index arrays.
skf = StratifiedKFold(stars, n_folds=k)
averageError = 0.0
for train_index, test_index in skf:
    # NOTE: xtrain/xtest/ytrain/ytest (and the fitted gnb) keep the values of
    # the last fold once the loop finishes.
    xtrain, xtest = vector[train_index], vector[test_index]
    # .iloc: select labels by position so they stay aligned with `vector`.
    ytrain, ytest = stars.iloc[train_index], stars.iloc[test_index]
    gnb.fit(xtrain, ytrain)
    pred = gnb.predict(xtest)
    # Arguments in sklearn's documented (y_true, y_pred) order.
    err = zero_one_loss(ytest, pred)
    # Each fold contributes 1/k of the mean error.
    averageError += (1. / k) * err
print(100 * averageError)

59.9599743744


In [13]:
# Refit the naive Bayes model on the current xtrain/ytrain, predict xtest, and
# show predictions, true labels, RMSE and zero-one loss (in that order).
gnb.fit(xtrain, ytrain)
pred = gnb.predict(xtest)

pred_values = np.array(pred)
true_values = np.array(ytest)
print(pred_values)
print(true_values)

err = zero_one_loss(pred, ytest)
rmse = RMSE(pred_values, true_values)
print(rmse)
print(err)

[2 2 3 ..., 4 4 4]
[2 2 2 ..., 4 4 5]
1.55200740384
0.599279423539


In [6]:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

# Grid-search an SVM classifier over kernel type and penalty C, then report
# the zero-one loss of the fitted search on xtest.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10],
}
mod = SVC()
svm = GridSearchCV(mod, parameters)
svm.fit(xtrain, ytrain)

pred = svm.predict(xtest)
err = zero_one_loss(pred, ytest)
# Bare expression so the notebook displays the loss as the cell output.
err

0.58086469175340272