# Create Las Vegas Reviews Dataset

### Imports

In [82]:
import numpy as np
import pandas as pd

from langid.langid import LanguageIdentifier, model

### Read JSON Data

In [11]:
# Stream the line-delimited review file in 5,000-record chunks, then stitch
# the pieces together into a single DataFrame.
review_chunks = pd.read_json(
    'data/yelp_reviews.json',
    orient='records',
    lines=True,
    chunksize=5000,
)
review_list = list(review_chunks)
reviews = pd.concat(review_list)

In [12]:
# Preview the first rows of the combined reviews frame.
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,xQY8N_XvtGbearJ5X4QryQ,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2,5,0,0,"As someone who has worked with many museums, I...",2015-04-15 05:21:16
1,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1,1,1,0,I am actually horrified this place is still in...,2013-12-07 03:16:52
2,LG2ZaYiOgpr2DK_90pYjNw,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11
3,i6g_oA9Yf9Y31qt0wibXpw,ofKDkJKXSKZXu5xJNGiiBQ,5JxlZaqCnk1MnbgRirs40Q,1,0,0,0,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",2011-05-27 05:30:52
4,6TdNDKywdbjoTkizeMce8A,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,4,0,0,0,"Oh happy day, finally have a Canes near my cas...",2017-01-14 21:56:57


In [13]:
# Stream the line-delimited business file in 5,000-record chunks, then stitch
# the pieces together into a single DataFrame.
business_chunks = pd.read_json(
    'data/yelp_business.json',
    orient='records',
    lines=True,
    chunksize=5000,
)
business_list = list(business_chunks)
business = pd.concat(business_list)

In [14]:
# Preview the first rows of the combined business frame.
business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'..."
1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': '...","Health & Medical, Fitness & Instruction, Yoga,...",
2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Hardware Stores, Home Services, Building Suppl...","{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', ..."
4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '..."


In [20]:
# Businesses without category information get the placeholder 'NA' so the
# substring filters on this column never see missing values.
# The result is assigned back instead of calling fillna(inplace=True) on
# business.categories: the in-place form acts on an intermediate Series and
# does not update `business` when pandas Copy-on-Write is active.
business['categories'] = business['categories'].fillna('NA')

### Find Las Vegas Restaurant Reviews

In [34]:
# Keep businesses whose category string mentions 'Restaurant'.
# na=False treats a missing category as "no match", so the mask is always
# boolean even if the fill-NA cell has not been run on this kernel.
is_restaurant = business['categories'].str.contains('Restaurant', na=False)
restaurants = business[is_restaurant]

In [36]:
# Restrict the restaurants to cities whose name contains 'Las Vegas'
# (a substring match, so e.g. 'North Las Vegas' is included as well).
in_las_vegas = restaurants['city'].str.contains('Las Vegas')
lv = restaurants[in_las_vegas]

In [38]:
# (rows, columns) of the Las Vegas restaurant subset.
lv.shape

(7310, 14)

In [39]:
# Broader alternative to the restaurant filter: match either 'Restaurant' or
# 'Food' in the category string (the pattern is a regex alternation).
# na=False treats a missing category as "no match", so the mask is always
# boolean even if the fill-NA cell has not been run on this kernel.
is_food = business['categories'].str.contains('Restaurant|Food', na=False)
food = business[is_food]

In [41]:
# Size of the Las Vegas subset under the broader Restaurant-or-Food filter,
# for comparison with the restaurant-only subset.
food_in_las_vegas = food['city'].str.contains('Las Vegas')
food.loc[food_in_las_vegas].shape

(9477, 14)

In [44]:
# Index the reviews by business so they can be looked up by business_id; the
# result keeps business_id as its index.
reviews_by_business = reviews.set_index('business_id')

# .loc needs a flat list of labels (wrapping the Series in another list, as in
# [lv.business_id], produces a nested list-like) and raises KeyError if any
# requested label is absent. Restrict the lookup to the Las Vegas restaurant
# ids that actually have at least one review.
lv_business_ids = lv.business_id[lv.business_id.isin(reviews_by_business.index)]
lv_reviews = reviews_by_business.loc[lv_business_ids.tolist()]

In [45]:
# Preview the first rows of the Las Vegas restaurant reviews.
lv_reviews.head()

Unnamed: 0_level_0,review_id,user_id,stars,useful,funny,cool,text,date
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fnZrZlqW1Z8iWgTVDfv_MA,WUAt5-krh075Ie44np8Hew,YDkC5VVT8s9NlIZtAr8NUA,5,0,0,0,So why would I be giving a Fast Food chain loc...,2010-11-27 09:33:34
fnZrZlqW1Z8iWgTVDfv_MA,S9vIs-je49jqbhpWSQ8yHg,mxtalrN7VHVQwGjv8ln5yg,2,0,0,0,I come here bout 3x's a mo. & I just can't do ...,2016-03-19 09:03:09
fnZrZlqW1Z8iWgTVDfv_MA,i2Soid_X8XsSTyIrPOst6g,xammeA3ftpFoAN13v5Blyg,1,2,0,0,WORST experience EVER!!!!! never have i ate an...,2015-04-02 06:14:51
fnZrZlqW1Z8iWgTVDfv_MA,uDYGaAOUgo7SLuH5nUZs3w,gQhFacMGI41mfVXqw1E7bQ,4,0,0,0,Hot fresh food usually. Staff seems to turn o...,2014-08-14 23:00:34
fnZrZlqW1Z8iWgTVDfv_MA,Xg7HuBvuZHTBog1G_2Wk0w,0PXhJMztE3ijzQPjiwPERg,3,0,0,0,Beer battered cod fish seems really rubbery. I...,2015-03-23 02:10:52


In [56]:
# Move business_id out of the index and back into a regular column.
lv_reviews = lv_reviews.reset_index()

In [50]:
# Share of reviews at each star rating: per-rating counts divided by the total.
star_counts = lv_reviews['stars'].value_counts()
star_counts / star_counts.sum()

5    0.444518
4    0.228404
1    0.124015
3    0.118725
2    0.084337
Name: stars, dtype: float64

### Select Only English Reviews

In [85]:
# Keep only reviews whose text is longer than 10 characters.
# The vectorised .str.len() replaces the per-row lambda; a missing text yields
# NaN, which compares False here, so such rows are dropped instead of raising.
has_enough_text = lv_reviews['text'].str.len() > 10
lv_reviews = lv_reviews.loc[has_enough_text]

In [86]:
# Renumber the rows 0..n-1 after filtering, discarding the old index.
lv_reviews = lv_reviews.reset_index(drop=True)

In [87]:
# Holds the predicted language code for each review.
langlst= []

In [88]:
# Build the langid classifier from the bundled model. Only the predicted
# language code (element 0 of each classify() result) is kept below.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Classify the filtered Las Vegas reviews, not the full `reviews` frame, so
# there is exactly one language code per row of `lv_reviews`. Rebuilding the
# list here, rather than appending to an existing one, keeps the cell safe to
# re-run.
langlst = [identifier.classify(text)[0] for text in lv_reviews.text]


In [90]:
# Attach the predicted language codes as a new `language` column; the codes
# are matched to rows by position.
lv_reviews = lv_reviews.assign(language=np.array(langlst))

In [91]:
# Keep only the reviews classified as English ('en').
is_english = lv_reviews['language'] == 'en'
eng_reviews = lv_reviews.loc[is_english]

### Save New Dataset

In [92]:
# Save the Las Vegas reviews in every language, with their `language` column.
lv_reviews.to_pickle('data/las_vegas_reviews.pkl')

In [93]:
# Save the English-only subset of the Las Vegas reviews.
eng_reviews.to_pickle('data/english_reviews.pkl')