In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [2]:
%%time
business = pd.read_json("dataset/jsons/yelp_academic_dataset_business.json", lines=True,
                       dtype={'name':str,'address':str, 'city':str, 'state':str, 'postal_code':str,
                             'business_id':str,'stars':'float8',
                             'latitude':'float32','longitude':'float32','review_count':'int8',
                             'is_open':'int2','attributes':str, 'categories':str, 'hours':str})

Wall time: 4.65 s


In [3]:
# Summarise the loaded frame: per-column dtype, non-null count and (shallow) memory use.
# Use this to confirm that the dtype mapping requested at load time was actually applied.
business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float32
 7   longitude     150346 non-null  float32
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int8   
 10  is_open       150346 non-null  int64  
 11  attributes    150346 non-null  object 
 12  categories    150346 non-null  object 
 13  hours         150346 non-null  object 
dtypes: float32(2), float64(1), int64(1), int8(1), object(9)
memory usage: 13.9+ MB


In [4]:
%%time
reviews_chunk = pd.read_json("dataset/jsons/yelp_academic_dataset_review.json", lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':'int8',
                             'date':str,'text':str,'useful':'int8',
                             'funny':'int8','cool':'int8'},
                      chunksize=10000)

reviews_data = [review for review in reviews_chunk]
reviews = pd.concat(reviews_data)

Wall time: 4min 26s


In [5]:
# Summarise the concatenated review frame: row count, per-column dtype and (shallow) memory use.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   review_id    object
 1   user_id      object
 2   business_id  object
 3   stars        int8  
 4   useful       int8  
 5   funny        int8  
 6   cool         int8  
 7   text         object
 8   date         object
dtypes: int8(4), object(5)
memory usage: 293.3+ MB


In [6]:
# Preview the first five reviews as a sanity check on the load.
reviews.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [7]:
%%time
x = pd.merge(reviews, business, on='business_id')
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6990280 entries, 0 to 6990279
Data columns (total 22 columns):
 #   Column        Dtype  
---  ------        -----  
 0   review_id     object 
 1   user_id       object 
 2   business_id   object 
 3   stars_x       int8   
 4   useful        int8   
 5   funny         int8   
 6   cool          int8   
 7   text          object 
 8   date          object 
 9   name          object 
 10  address       object 
 11  city          object 
 12  state         object 
 13  postal_code   object 
 14  latitude      float32
 15  longitude     float32
 16  stars_y       float64
 17  review_count  int8   
 18  is_open       int64  
 19  attributes    object 
 20  categories    object 
 21  hours         object 
dtypes: float32(2), float64(1), int64(1), int8(5), object(13)
memory usage: 940.0+ MB
Wall time: 2min 46s


In [8]:
# Preview the first five merged rows (review columns followed by business columns).
x.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,name,address,city,state,postal_code,latitude,longitude,stars_y,review_count,is_open,attributes,categories,hours
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210197,-75.22364,3.0,-87,1,"{'NoiseLevel': ""u'average'"", 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2,0,0,0,This is the second time we tried turning point...,2017-05-13 17:06:55,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210197,-75.22364,3.0,-87,1,"{'NoiseLevel': ""u'average'"", 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4,2,0,1,The place is cute and the staff was very frien...,2017-08-08 00:58:18,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210197,-75.22364,3.0,-87,1,"{'NoiseLevel': ""u'average'"", 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,We came on a Saturday morning after waiting a ...,2017-11-19 02:20:23,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210197,-75.22364,3.0,-87,1,"{'NoiseLevel': ""u'average'"", 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2,0,0,0,"Mediocre at best. The decor is very nice, and ...",2017-09-09 17:49:47,Turning Point of North Wales,1460 Bethlehem Pike,North Wales,PA,19454,40.210197,-75.22364,3.0,-87,1,"{'NoiseLevel': ""u'average'"", 'HasTV': 'False',...","Restaurants, Breakfast & Brunch, Food, Juice B...","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'..."
