### Review prediction for Yelp data

In [9]:
from gensim.models import Doc2Vec
import numpy as np
import json
import pandas as pd
import re
from gensim.parsing import strip_non_alphanum, strip_multiple_whitespaces
# change python encoding
import sys
reload(sys)  
sys.setdefaultencoding('utf-8')

In [10]:
d = "data/yelp_business_rating/yelp_training_set/yelp_training_set_review.json"
data = []
with open(d) as f:
    for line in f.readlines():
        data.append(json.loads(line))

In [11]:
df = pd.DataFrame.from_dict(data)

In [14]:
contractions = re.compile(r"'|-")
symbols = re.compile(r'(\W+)', re.U)
numeric = re.compile(r'(?<=\s)(\d+|\w\d+|\d+\w)(?=\s)', re.I)
swrd = re.compile(r'(?<=\s)(,|"|\(|\)|to|a|as|the|an|and|or|for|are|is)(?=\s)', re.I)
suffix = re.compile(r'(?<=\w)(s|ings*|ly|(?<=e)[sd]+)(?=\s)')
seps = re.compile(r'\s+')

# cleaner (order matters)
def clean(text): 
    text = u' ' +  text.lower() + u' '
    text = contractions.sub('', text)
    text = symbols.sub(r' \1 ', text)
    text = numeric.sub('000', text)
    text = swrd.sub(' ', text)
    text = seps.sub(' ', text)
    text = strip_multiple_whitespaces(strip_non_alphanum(text).strip())
    #return text.split(' ')
    return text


# import re
# alteos = re.compile(r'([!\?])')

# def revsplit(l):
#     l = alteos.sub(r' \1 . ', l).rstrip("( \. )*\n")
#     return [s.split() for s in l.split(" . ")]

In [15]:
df.columns

Index([u'business_id',        u'date',   u'review_id',       u'stars',
              u'text',        u'type',     u'user_id',       u'votes'],
      dtype='object')

### Get features for users and businesses using Doc2Vec model

In [16]:
group_user = df.groupby(by='user_id',squeeze=True).groups
group_business = df.groupby(by='business_id',squeeze=True).groups

In [17]:
# save documents to file

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
user_dict = {}
doc_users = []
j = 0

with open("data/yelp_train_users.txt","w") as f:
    for u,ids in group_user.items():
        tag = 'SENT_' + str(j)
        doc = clean(' '.join(df.iloc[ids].text))
        doc = doc.encode('utf-8',errors='ignore').decode('utf-8')
        #doc_users.append(LabeledSentence(words=doc,tags=tag))
        user_dict[u] = tag
        j+=1
        f.write(doc + '\n')

In [18]:
# load documents
from gensim.models.doc2vec import TaggedLineDocument
%time doc_users = TaggedLineDocument('data/yelp_train_users.txt')

In [10]:
# train doc2vec model
%time model_user = Doc2Vec(documents=doc_users,size=100)

In [19]:
# save documents to file
business_dict = {}
doc_business = []
j = 0

with open("data/yelp_train_business.txt","w") as f:
    for u,ids in group_business.items():
        tag = 'SENT_' + str(j)
        doc = clean(' '.join(df.iloc[ids].text))
        doc = doc.encode('utf-8',errors='ignore').decode('utf-8')
        business_dict[u] = tag
        j+=1
        f.write(doc + '\n')

In [20]:
#load documents
from gensim.models.doc2vec import TaggedLineDocument
%time doc_business = TaggedLineDocument('data/yelp_train_business.txt')

In [30]:
#train doc2vec model
%time model_business = Doc2Vec(documents=doc_business,size=100)

In [31]:
## save models for users and business
import cPickle
with open('data/doc2vec_yelp_user.pkl','w') as f:
    cPickle.dump(model_user,f)
with open('data/doc2vec_yelp_business.pkl','w') as f:
    cPickle.dump(model_business,f)
with open('data/dict_yelp_user.pkl','w') as f:
    cPickle.dump(user_dict,f)
with open('data/dic_yelp_business.pkl','w') as f:
    cPickle.dump(business_dict,f)

#### Recommendation using Yelp data

In [21]:
## formulate user - business rating
n_business = df.business_id.unique().shape[0]
n_users = df.user_id.unique().shape[0]
row_names = {n:i for i,n in enumerate(df.user_id.unique())} # user_id for rows
col_names = {n:i for i,n in enumerate(df.business_id.unique())} # business_id for columns
X_train = np.zeros((n_users,n_business))

for ib,iu,star in df[['business_id','user_id','stars']].values:
    i = row_names[iu]
    j = col_names[ib]
    X_train[i,j] = star

In [22]:
with open('data/col_names.pkl','w') as f:
    cPickle.dump(col_names,f)
with open('data/row_names.pkl','w') as f:
    cPickle.dump(row_names,f)

In [100]:
## NMF
from sklearn.decomposition import NMF
nmf = NMF(n_components=100)
%time W =  nmf.fit_transform(X_train)
H = nmf.components_

In [104]:
with open('data/yelp_mf.pkl','w') as f:
    cPickle.dump(nmf,f)

In [110]:
W.shape, H.shape

((45981, 100), (100, 11537))

In [112]:
# predict residual
residual_nmf = X_train - np.dot(W,H)

In [114]:
mean_u = residual_nmf.mean() ### coefficient miu

In [117]:
residual_nmf = residual_nmf - mean_u ## updated residual

In [120]:
import h5py
f = h5py.File('data/residual.h5','w')
f.create_dataset('data',data=residual_nmf)
f.close()

#### reload values to do regression

In [1]:
import h5py
import numpy as np
f = h5py.File('data/residual.h5','r')
residual = f['data'].value
f.close()

In [2]:
import cPickle
with open('data/doc2vec_yelp_user.pkl','r') as f:
    %time model_user = cPickle.load(f)
with open('data/doc2vec_yelp_business.pkl','r') as f:
    %time model_business = cPickle.load(f)
with open('data/dict_yelp_user.pkl','r') as f:
    %time user_dict = cPickle.load(f)
with open('data/dic_yelp_business.pkl','r') as f:
    %time business_dict = cPickle.load(f)
with open('data/col_names.pkl','r') as f:
    %time col_names = cPickle.load(f)
with open('data/row_names.pkl','r') as f:
    %time row_names = cPickle.load(f)

Couldn't import dot_parser, loading of dot files will not be possible.
CPU times: user 9.11 s, sys: 96 ms, total: 9.21 s
Wall time: 9.21 s
CPU times: user 5.44 s, sys: 24 ms, total: 5.46 s
Wall time: 5.46 s
CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 114 ms
CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 29 ms
CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 19.8 ms
CPU times: user 80 ms, sys: 4 ms, total: 84 ms
Wall time: 84.3 ms


In [3]:
row_names = {i:n for n,i in row_names.items()}

In [4]:
col_names = {i:n for n,i in col_names.items()}

In [5]:
# replication of method in the report
n_users =  len(user_dict)
n_business = len(business_dict)
y = residual.ravel()

In [7]:
X = np.zeros((n_users,n_business,200))
for i in range(n_users):
    for j in range(n_business):
        iname = row_names[i]
        jname = col_names[j]
        ii = int(user_dict[iname].strip('SENT_'))
        jj = int(business_dict[jname].strip('SENT_'))
        X[i,j,:] = np.concatenate((model_user.docvecs[ii],model_business.docvecs[jj]))

MemoryError: 