In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hashlib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [3]:
%%time
reviews_chunk = pd.read_json("../dataset/jsons/yelp_academic_dataset_review.json", lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':'int8',
                             'date':str,'text':str,'useful':'int8',
                             'funny':'int8','cool':'int8'},
                      chunksize=10000)

reviews_data = [review for review in reviews_chunk]
reviews = pd.concat(reviews_data)

Wall time: 4min 20s


In [5]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   review_id    object
 1   user_id      object
 2   business_id  object
 3   stars        int8  
 4   useful       int8  
 5   funny        int8  
 6   cool         int8  
 7   text         object
 8   date         object
dtypes: int8(4), object(5)
memory usage: 293.3+ MB


In [6]:
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [7]:
del reviews_chunk, reviews_data

In [9]:
reviews = reviews[['user_id','business_id','stars']]

In [10]:
reviews

Unnamed: 0,user_id,business_id,stars
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4
...,...,...,...
6990275,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5
6990276,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5
6990277,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4
6990278,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5


In [29]:
%%time
train_data, test_data = train_test_split(reviews, test_size=0.2)

Wall time: 4.24 s


In [30]:
print(train_data.shape)
print(test_data.shape)

(5592224, 3)
(1398056, 3)


In [31]:
train_data

Unnamed: 0,user_id,business_id,stars
5107771,fv7E1jVntEkFdppH5R-pmA,-K0LoSCfh8i5U_y53Krepg,5
1744252,mluWVKaIJY1Ior8_i3EYPw,YPLfRswyryjjW3VJa-SdaA,5
5827131,4ze88V1brgv5slnpZ1Q4gw,JvFbsyBllt7rEZ6nR9nwBQ,4
2900325,OvpTIjhGpg2y2kklHa47NQ,YLYLVY1HuQG1IvYjXyHzww,3
3024890,ycd1cetEw0T5GoJI4PvQzw,VVH6k9-ycttH3TV_lk5WfQ,5
...,...,...,...
4989566,Jq6acVd3KpYvgrBb3Ptqfw,nk96iwJV1_p2HECYQW1ysA,5
1491010,vDBLHZKOaitt51nvCIMP7A,49D0UOluimrA15_5K1hHVw,1
6871782,u4Dzi_282Zw6hKqZJOhsSw,n_024s43f-usTmmzPn8vIQ,5
3432666,Qp1HQD4lFbaumVZH94qCAg,J4CrtqHDH4LAyDd001cD9g,5


In [17]:
%%time
user_encoder = LabelEncoder()
business_encoder = LabelEncoder()

train_data['user_id_encoded'] = user_encoder.fit_transform(train_data['user_id'])
train_data['business_id_encoded'] = business_encoder.fit_transform(train_data['business_id'])

Wall time: 17.4 s


In [18]:
test_data['user_id_encoded'] = user_encoder.transform(test_data['user_id'])
test_data['business_id_encoded'] = business_encoder.ttransform(test_data['business_id'])

ValueError: y contains previously unseen labels: 'YxpJDf6Idn7MA9E003B0Zw'

In [23]:
train_data

Unnamed: 0,user_id,business_id,stars
4763105,V_46L2RMM2GrSorJMpOCkw,OJEiB_a2I_gxC-ZeayI94A,3
3276572,JGeVD5-0bKKnXnZxaTZUZQ,bRTWD05drzNfzVP0IadX7A,1
3769481,VRKuCyuCYZGzCpMankWH1Q,232qg1k9QV0pNt90jyTatA,4
1362927,ohQzhkPJtKUmdAhHP6xZdw,Y5RUhntIPDfe2eKriQFIZg,5
6976103,HVQKFDi-vZbawtmLMq3ZOw,0RuvlgTnKFbX3IK0ZOOocA,5
...,...,...,...
1692743,DYJ2s7mvC2xHWVqzmraNlQ,I7SkoqN88fpKagzKA059Fw,5
6550634,O6uORP9Q_AYHomJBPWxIMA,hvp1huVLVB7reZZKeXvhpg,4
6423388,z1yqatRh9eSo1Xl99w3SjQ,-1B9pP_CrRBJYPICE5WbRA,3
6962611,rpTVjnG1qz403EYYsAlm2Q,VbzoVWEPSfzaIGBq4KlVwQ,4


In [20]:
test_data

Unnamed: 0,user_id,business_id,stars
1295256,56gL9KEJNHiSDUoyjk2o3Q,8yR12PNSMo6FBYx1u5KPlw,2
3297618,bAt9OUFX9ZRgGLCXG22UmA,pBNucviUkNsiqhJv5IFpjg,5
1217795,NRHPcLq2vGWqgqwVugSgnQ,8sf9kv6O4GgEb0j1o22N1g,5
3730348,PAxc0qpqt5c2kA0rjDFFAg,XwepyB7KjJ-XGJf0vKc6Vg,4
1826590,BqPR1Dp5Rb_QYs9_fz9RiA,prm5wvpp0OHJBlrvTj9uOg,5
...,...,...,...
5175215,NGdUjjaZN0G4ElHcljlwEA,peQnrEY1S6sE4ifK2SFUSA,5
625093,iHilXutf7Qo2-x4hpYZH9A,f4vbnGoGo3eWorVekctVGQ,3
1337225,g__MeBwN_HDrlLTTbK4JRw,hA03QM1dEu5DLU0TB9rx4g,3
1236785,ugN7k16qsJ7Q-cwdY7VmBw,SVf23pjKERkedqCdWl6ECA,5
