### Review prediction for Yelp data

In [1]:
from gensim.models import Doc2Vec
import numpy as np
import json
import pandas as pd
import re
from gensim.parsing import strip_non_alphanum, strip_multiple_whitespaces
# change python encoding
import sys
reload(sys)  
sys.setdefaultencoding('utf-8')

In [2]:
d = "/home/ruoxu/workspace/data/public/yelp_business_review/yelp_training_set/yelp_training_set_review.json"
data = []
with open(d) as f:
    for line in f.readlines():
        data.append(json.loads(line))

In [3]:
df = pd.DataFrame.from_dict(data)

In [4]:
contractions = re.compile(r"'|-")
symbols = re.compile(r'(\W+)', re.U)
numeric = re.compile(r'(?<=\s)(\d+|\w\d+|\d+\w)(?=\s)', re.I)
swrd = re.compile(r'(?<=\s)(,|"|\(|\)|to|a|as|the|an|and|or|for|are|is)(?=\s)', re.I)
suffix = re.compile(r'(?<=\w)(s|ings*|ly|(?<=e)[sd]+)(?=\s)')
seps = re.compile(r'\s+')

# cleaner (order matters)
def clean(text): 
    text = u' ' +  text.lower() + u' '
    text = contractions.sub('', text)
    text = symbols.sub(r' \1 ', text)
    text = numeric.sub('000', text)
    text = swrd.sub(' ', text)
    text = seps.sub(' ', text)
    text = strip_multiple_whitespaces(strip_non_alphanum(text).strip())
    #return text.split(' ')
    return text


# import re
# alteos = re.compile(r'([!\?])')

# def revsplit(l):
#     l = alteos.sub(r' \1 . ', l).rstrip("( \. )*\n")
#     return [s.split() for s in l.split(" . ")]

In [5]:
df.columns

Index([u'business_id',        u'date',   u'review_id',       u'stars',
              u'text',        u'type',     u'user_id',       u'votes'],
      dtype='object')

### Get features for users and businesses using Doc2Vec model

In [6]:
group_user = df.groupby(by='user_id',squeeze=True).groups
group_business = df.groupby(by='business_id',squeeze=True).groups

In [7]:
# save documents to file

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
user_dict = {}
doc_users = []
j = 0

with open("data/yelp_train_users.txt","w") as f:
    for u,ids in group_user.items():
        tag = 'SENT_' + str(j)
        doc = clean(' '.join(df.iloc[ids].text))
        doc = doc.encode('utf-8',errors='ignore').decode('utf-8')
        #doc_users.append(LabeledSentence(words=doc,tags=tag))
        user_dict[u] = tag
        j+=1
        f.write(doc + '\n')

In [8]:
# load documents
from gensim.models.doc2vec import TaggedLineDocument
%time doc_users = TaggedLineDocument('data/yelp_train_users.txt')

In [9]:
# train doc2vec model
%time model_user = Doc2Vec(documents=doc_users,size=100)

In [10]:
# save documents to file
business_dict = {}
doc_business = []
j = 0

with open("data/yelp_train_business.txt","w") as f:
    for u,ids in group_business.items():
        tag = 'SENT_' + str(j)
        doc = clean(' '.join(df.iloc[ids].text))
        doc = doc.encode('utf-8',errors='ignore').decode('utf-8')
        business_dict[u] = tag
        j+=1
        f.write(doc + '\n')

In [11]:
#load documents
from gensim.models.doc2vec import TaggedLineDocument
%time doc_business = TaggedLineDocument('data/yelp_train_business.txt')

In [12]:
#train doc2vec model
%time model_business = Doc2Vec(documents=doc_business,size=100)

In [13]:
## save models for users and business
import cPickle
with open('data/doc2vec_yelp_user.pkl','w') as f:
    cPickle.dump(model_user,f)
with open('data/doc2vec_yelp_business.pkl','w') as f:
    cPickle.dump(model_business,f)
with open('data/dict_yelp_user.pkl','w') as f:
    cPickle.dump(user_dict,f)
with open('data/dic_yelp_business.pkl','w') as f:
    cPickle.dump(business_dict,f)

#### Recommendation using Yelp data

In [14]:
## formulate user - business rating
n_business = df.business_id.unique().shape[0]
n_users = df.user_id.unique().shape[0]
row_names = {n:i for i,n in enumerate(df.user_id.unique())} # user_id for rows
col_names = {n:i for i,n in enumerate(df.business_id.unique())} # business_id for columns
X_train = np.zeros((n_users,n_business))

for ib,iu,star in df[['business_id','user_id','stars']].values:
    i = row_names[iu]
    j = col_names[ib]
    X_train[i,j] = star

In [15]:
with open('data/col_names.pkl','w') as f:
    cPickle.dump(col_names,f)
with open('data/row_names.pkl','w') as f:
    cPickle.dump(row_names,f)

In [16]:
## NMF
from sklearn.decomposition import NMF
nmf = NMF(n_components=100)
%time W =  nmf.fit_transform(X_train)
H = nmf.components_

In [17]:
with open('data/yelp_mf.pkl','w') as f:
    cPickle.dump(nmf,f)

In [18]:
W.shape, H.shape

((45981, 100), (100, 11537))

In [19]:
# predict residual
residual_nmf = X_train - np.dot(W,H)

In [20]:
mean_u = residual_nmf.mean() ### coefficient miu

In [21]:
residual_nmf = residual_nmf - mean_u ## updated residual

In [22]:
import h5py
f = h5py.File('data/residual.h5','w')
f.create_dataset('data',data=residual_nmf)
f.close()

#### reload values to do regression

In [23]:
import h5py
import numpy as np
f = h5py.File('data/residual.h5','r')
residual = f['data'].value
f.close()

In [24]:
import cPickle
with open('data/doc2vec_yelp_user.pkl','r') as f:
    %time model_user = cPickle.load(f)
with open('data/doc2vec_yelp_business.pkl','r') as f:
    %time model_business = cPickle.load(f)
with open('data/dict_yelp_user.pkl','r') as f:
    %time user_dict = cPickle.load(f)
with open('data/dic_yelp_business.pkl','r') as f:
    %time business_dict = cPickle.load(f)
with open('data/col_names.pkl','r') as f:
    %time col_names = cPickle.load(f)
with open('data/row_names.pkl','r') as f:
    %time row_names = cPickle.load(f)

In [25]:
# replication of method in the report
n_users =  len(user_dict)
n_business = len(business_dict)
#y = residual.ravel()
residual.shape

(45981, 11537)

In [26]:
print n_users, n_business

In [27]:
X = []
y = []
for b_id,u_id in df[['business_id','user_id']].values:
    i = row_names[u_id]
    j = col_names[b_id]
    ii = int(user_dict[u_id].strip('SENT_'))
    jj = int(business_dict[b_id].strip('SENT_'))
    X.append(np.concatenate((model_user.docvecs[ii],model_business.docvecs[jj])))
    y.append(residual[i,j])

In [28]:
X = np.array(X)
y = np.array(y)
X.shape,y.shape

((229907, 200), (229907,))

In [29]:
from sklearn import linear_model
regressor = linear_model.Ridge(fit_intercept=False)
%time regressor.fit(X,y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

### Test

In [80]:
d = "/home/ruoxu/workspace/data/public/yelp_business_review/final_test_set/final_test_set_review.json"
data = []
with open(d) as f:
    for line in f.readlines():
        data.append(json.loads(line))
df_test = pd.DataFrame.from_dict(data)

In [81]:
d = "/home/ruoxu/workspace/data/public/yelp_business_review/IdLookupTable.csv"
df_lookup = pd.read_csv(d)
d = "/home/ruoxu/workspace/data/public/yelp_business_review/sampleSubmissionFinal.csv"
df_final = pd.read_csv(d)

In [82]:
df_test

Unnamed: 0,business_id,review_id,type,user_id
0,-sC66z4SO3tR7nFCjfQwuQ,Wv-4SQr9UUztIBnjzHu9-g,review,dqeFcKq2L2wiOg9LFT9-UA
1,n3CpBxY6turmqrihCjNbAQ,6CQgD-mC2hZkHLiRSXc3wQ,review,XZxDoqyUJu2qMOY8HiEDsg
2,wtGArpfNEvvYDSn6UVKibQ,CK_jCsfvzX7B4I48gC5fXQ,review,QMlaWJt4zVVWzv_KQ2scew
3,OtdMMd4taGagxkU88wmEFA,QEJ2TAmUmtcz6rVz2mhihg,review,weYT-xJwz8o7mLNWIhD6HA
4,tqDwpyCB53TiEIv915Tuww,1u6rTznnVSDgVeaxpGQzKQ,review,gQm2p1gmBPQ4f1sHxgRlbw
5,Wa9nN637fEBbcAafbn2c8A,HW4MFsVIeAf8KfcLDfbqXA,review,PO1H81tZBIjRK84sWpCsVQ
6,5KG0A3WlC7K3DAXtrIFFjg,wgZptlK7ArG7kRA78Xy26w,review,979X2L-2jz8fwOVkSkMpFw
7,h6jfMpTZpNduLG0wE2tbaw,CfZAWA8uPdyHqC43PPV5nA,review,R69oePjuaeq9jwlI7cqCPQ
8,ntJAKLAq0cXzzdFgPoo0Iw,-2l-Gcro_paZuc5PV_Rg8A,review,BlivVzHUCuCIzMngqlauWg
9,83RLRT-MsvXQJrmIanqVYA,-KkgInOPKWxjvZwVvV0zPg,review,XVa0c9vRTnL3z-uvd3idvA


In [93]:
from collections import Counter
Counter(df.stars)

Counter({1: 17516, 2: 20957, 3: 35363, 4: 79878, 5: 76193})

In [92]:
np.round(1.70)

2.0

In [100]:
stars_mean = df.groupby(by='business_id')['stars'].mean()
prediction = []
for b_id,u_id,ind in df_test[['business_id','user_id','review_id']].values:
    try:
        i = row_names[u_id]
        j = col_names[b_id]
        ii = int(user_dict[u_id].strip('SENT_'))
        jj = int(business_dict[b_id].strip('SENT_'))
        feature_doc2vec = np.concatenate((model_user.docvecs[ii],model_business.docvecs[jj]))
        w = W[i,:]
        h = H[:,j]
        score = mean_u + np.dot(w,h) + regressor.predict(feature_doc2vec)[0]
        score = int(np.round(score))
        if score > 5:
            score = 5
        elif score < 0:
            score = 0
        prediction.append([ind,score])
    except:
        try:
            predictiona.append([ind,int(np.round(stars_mean[b_id]))])
        except:
            prediction.append([ind,int(4)])

In [101]:
prediction

[[u'Wv-4SQr9UUztIBnjzHu9-g', 2],
 [u'6CQgD-mC2hZkHLiRSXc3wQ', 4],
 [u'CK_jCsfvzX7B4I48gC5fXQ', 4],
 [u'QEJ2TAmUmtcz6rVz2mhihg', 1],
 [u'1u6rTznnVSDgVeaxpGQzKQ', 3],
 [u'HW4MFsVIeAf8KfcLDfbqXA', 4],
 [u'wgZptlK7ArG7kRA78Xy26w', 4],
 [u'CfZAWA8uPdyHqC43PPV5nA', 4],
 [u'-2l-Gcro_paZuc5PV_Rg8A', 4],
 [u'-KkgInOPKWxjvZwVvV0zPg', 4],
 [u'Np3RBYyE8mHC6di0w2MDUQ', 4],
 [u'jkiwMKLQUqSik0qnnzYfxg', 4],
 [u'TZAa_61ODaRTDzEWwPY9yg', 4],
 [u'4Q5srbbkdjZiZloSFsGxFw', 4],
 [u'XnRbUSFZB5DieSHrBkV2ng', 4],
 [u'kEYEqY1Qs-W3Ise4XDiIdQ', 2],
 [u'O4zfvsFXaXnISEiL7DuELg', 4],
 [u'OPud6_T25yny_veVFgwnpg', 4],
 [u'6o48ylH1Pql0j1kN8ZZt0g', 5],
 [u'RhcChIyZ5s3gey-Mj73QBg', 4],
 [u'nnYjybfVAGcT5UVsdyaOTQ', 0],
 [u'a7dhJn5OGvIl3Kyu0f7h0g', 0],
 [u'Ci1l0cBIW-xJ4mPoHot_tQ', 4],
 [u'GwkfBs3ryGV4qochfkGCJw', 4],
 [u'fUJFwZ3EwLGmWbYWjbDHGw', 4],
 [u'OlzwNYEfzv6ALBmcjuI7Nw', 5],
 [u'pDEPW6gEYsWl1E8ZNkSJeQ', 4],
 [u'0hcPk_NBDwWhPVMLIWs-0A', 4],
 [u'r-PpDp7XxuvtD3vhCAcn9A', 4],
 [u'6rK3nDr6cSYNdlaBXX8VFg', 4],
 [u'jRDmnB

In [102]:
results = pd.DataFrame(data=prediction,columns=['review_id','stars'])

In [103]:
results.to_csv('data/submission.csv',index=False)