In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import json

In [2]:
path = 'yelp_training_set_review.json'
results = [json.loads(line) for line in open(path,'r')]

In [3]:
for result in results:
    votes = result['votes']
    for x in votes:
        result[x] = votes[x]

In [4]:
df_review = pd.DataFrame(results)

In [5]:
df_review.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,type,useful,user_id,votes
0,9yKzy9PApeiPPOUJEtnvkg,2,2011-01-26,0,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,5,rLtl8ZkDX5vH5nAx9C3q5Q,"{'useful': 5, 'cool': 2, 'funny': 0}"
1,ZRJwVLyzEJq1VAihDhYiow,0,2011-07-27,0,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0,0a2KyEL0d3Yb1V6aivbIuQ,"{'useful': 0, 'cool': 0, 'funny': 0}"
2,6oRAC4uyJCsJl1X0WZpVSA,0,2012-06-14,0,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,1,0hT2KtfLiobPvh6cDC8JQg,"{'useful': 1, 'cool': 0, 'funny': 0}"
3,_1QQZuf4zZOyFCvXc0o6Vg,1,2010-05-27,0,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,2,uZetl9T0NcROGOyFfughhg,"{'useful': 2, 'cool': 1, 'funny': 0}"
4,6ozycU1RpktNG2-1BroVtw,0,2012-01-05,0,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,0,vYmM4KTsC8ZfQBg-j5MWkw,"{'useful': 0, 'cool': 0, 'funny': 0}"


In [6]:
feature = ['user_id', 'text', 'useful']
df_review = df_review[feature]
df_review.to_json('df_review.json')

In [7]:
df_review = pd.read_json('df_review.json')

In [8]:
df_review.head()

Unnamed: 0,text,useful,user_id
0,My wife took me here on my birthday for breakf...,5,rLtl8ZkDX5vH5nAx9C3q5Q
1,I have no idea why some people give bad review...,0,0a2KyEL0d3Yb1V6aivbIuQ
10,The oldish man who owns the store is as sweet ...,3,-OMlS6yWkYjVldNhC31wYg
100,I have to admit that I find myself thinking th...,1,bZFRqP7s0Vszxeu8_IwYow
1000,Great atmosphere with interesting lights (look...,0,3ltazFFclBfchSYlctX6iA


In [9]:
df_review.shape

(229907, 3)

In [10]:
n_sample = 5000
df_review_sample = df_review.sample(n_sample, random_state=0)

In [11]:
X = df_review_sample['text']
y = df_review_sample['useful']

In [12]:
def map_y(x):
    if x > 1:
        return 1
    else:
        return 0
y = y.map(map_y)

# Baseline Accuracy

In [13]:
y.value_counts()[0]/y.shape[0]*100

69.379999999999995

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

def preprocessor(s):
    return s.lower()
    
def tokenizer_porter(s):
    porter = PorterStemmer()
    stop = stopwords.words('english')
    words = [porter.stem(word) for word in s.split(' ') if word not in stop and word != '']
    return words

# Offline algorithm

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

tfidf = TfidfVectorizer(preprocessor=preprocessor,
        tokenizer=tokenizer_porter)

X_train_tfidf = tfidf.fit_transform(X_train).toarray()

In [18]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
X_test_tfidf = tfidf.transform(X_test).toarray()
y_pred = clf.predict(X_test_tfidf)

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.716
[[1019   45]
 [ 381   55]]
             precision    recall  f1-score   support

          0       0.73      0.96      0.83      1064
          1       0.55      0.13      0.21       436

avg / total       0.68      0.72      0.65      1500



# Online algorithm

In [21]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression, SGDRegressor, SGDClassifier

In [22]:
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21, 
                         preprocessor=preprocessor, 
                         tokenizer=tokenizer_porter)
clf = LogisticRegression()

In [29]:
X_train_vec = vect.transform(X_train)
X_test_vec = vect.transform(X_test)

clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
y_pred_train = clf.predict(X_train_vec)

In [30]:
print('train accuracy: %.5f' %(accuracy_score(y_train, y_pred_train)))
print('test accuracy: %.5f' %(accuracy_score(y_test, y_pred)))

train accuracy: 0.78886
test accuracy: 0.70733
