## Importing modules

In [None]:
import pandas as pd
import numpy as np

# Lift the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)

# Data Processing

### Business

In [None]:
%%time
# Stream the line-delimited business file in 10,000-line chunks, then stitch
# the chunks back together into a single DataFrame.
business_chunk = pd.read_json(
    "../dataset/jsons/yelp_academic_dataset_business.json",
    chunksize=10000,
    lines=True,
)

business_data = list(business_chunk)
business = pd.concat(business_data)

CPU times: user 2.9 s, sys: 481 ms, total: 3.38 s
Wall time: 3.38 s


In [None]:
# Preview the first rows of the business table.
business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:
# Number of businesses per city, most frequent first.
business['city'].value_counts()

Philadelphia      14569
Tucson             9250
Tampa              9050
Indianapolis       7540
Nashville          6971
                  ...  
Gentilly              1
pennsauken            1
Hamiltion             1
Newtown square        1
Apollo beach          1
Name: city, Length: 1416, dtype: int64

In [None]:
def check_categories(categories, target='Restaurants'):
    """Return True when ``target`` is one of the listed categories.

    Parameters
    ----------
    categories : str or any
        Category labels joined by ``', '``. Any non-string value (for example
        a missing entry) is treated as having no categories.
    target : str, default 'Restaurants'
        Category label to look for. It must equal a whole label exactly;
        a label that merely contains it does not count.

    Returns
    -------
    bool
        True if ``target`` is among the labels, False otherwise.
    """
    # Explicit type check instead of a blanket ``except Exception`` so that
    # genuine errors are not silently converted into ``False``.
    if not isinstance(categories, str):
        return False
    return target in categories.split(', ')

# Keep businesses that are tagged as restaurants, have more than 20 reviews,
# and are located in Philadelphia.
# NOTE(review): the city comparison is an exact string match, while the city
# counts include differently-cased spellings (e.g. "pennsauken") — confirm no
# Philadelphia variants are being dropped.
is_restaurant = business['categories'].apply(check_categories)
has_enough_reviews = business['review_count'] > 20
in_philadelphia = business['city'] == "Philadelphia"
filtered_business = business[is_restaurant & has_enough_reviews & in_philadelphia]

In [None]:
# Compare (rows, columns) of the full and the filtered business tables.
business.shape, filtered_business.shape

((150346, 14), (3829, 14))

In [None]:
# Persist the filtered businesses; index=False keeps the row index out of the file.
filtered_business.to_csv('../dataset/processed/business.csv', index=False)

### Users

In [None]:
%%time
# Stream the line-delimited user file in 10,000-line chunks, combine the
# chunks into one DataFrame, and show its (rows, columns).
users_chunk = pd.read_json(
    "../dataset/jsons/yelp_academic_dataset_user.json",
    chunksize=10000,
    lines=True,
)

user_data = list(users_chunk)
users = pd.concat(user_data)
users.shape

CPU times: user 46.3 s, sys: 7.93 s, total: 54.2 s
Wall time: 54.2 s


(1987897, 22)

In [None]:
# Preview the first rows of the user table.
users.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,3.91,250,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,3.74,1145,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,3.32,89,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,4.27,24,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,3.54,1,1,0,0,0,1,1,0,0,0,0


In [None]:
# Smallest and largest per-user review counts.
users['review_count'].min(), users['review_count'].max()

(0, 17473)

In [None]:
# Keep only users with more than 20 reviews, then report the new size.
has_enough_reviews = users['review_count'] > 20
users = users.loc[has_enough_reviews]
users.shape

(419347, 22)

In [None]:
# Persist the filtered users; index=False keeps the row index out of the file.
users.to_csv('../dataset/processed/users.csv', index=False)

### Reviews

In [None]:
%%time
# Stream the line-delimited review file in 10,000-line chunks and combine the
# chunks into one DataFrame.
#
# The vote counters (useful / funny / cool) are read as int32 rather than int8:
# int8 tops out at 127, and an integer-to-int8 cast wraps larger values
# silently instead of raising. ``stars`` stays int8 — presumably a 1-5 rating;
# verify against the dataset documentation.
reviews_chunk = pd.read_json(
    "../dataset/jsons/yelp_academic_dataset_review.json",
    lines=True,
    dtype={
        'review_id': str,
        'user_id': str,
        'business_id': str,
        'stars': 'int8',
        'date': str,
        'text': str,
        'useful': 'int32',
        'funny': 'int32',
        'cool': 'int32',
    },
    chunksize=10000,
)

reviews_data = list(reviews_chunk)
reviews = pd.concat(reviews_data)

CPU times: user 1min 29s, sys: 12.8 s, total: 1min 41s
Wall time: 1min 42s


In [None]:
# Summarise the column dtypes and memory footprint of the reviews frame.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   review_id    object
 1   user_id      object
 2   business_id  object
 3   stars        int8  
 4   useful       int8  
 5   funny        int8  
 6   cool         int8  
 7   text         object
 8   date         object
dtypes: int8(4), object(5)
memory usage: 293.3+ MB


In [None]:
# Inner-join reviews to the retained users and then to the retained
# businesses, so only reviews whose author and business both survived the
# earlier filters remain.
filtered_reviews = (
    reviews
    .merge(users, on='user_id', how='inner')
    .merge(filtered_business, on='business_id', how='inner')
)

In [None]:
# Keep only the (user, business, rating) triple and give the rating its plain
# name. ``stars_x`` is the left-hand (review) column produced by the merge with
# the business table, which carries its own ``stars`` column.
# rename() returns a new frame here; renaming a column subset in place is what
# raised SettingWithCopyWarning.
filtered_reviews = (
    filtered_reviews[['user_id', 'business_id', 'stars_x']]
    .rename(columns={'stars_x': 'stars'}, errors='raise')
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_reviews.rename(columns={'stars_x': 'stars'},inplace=True, errors='raise')


In [None]:
# Compare (rows, columns) of the full and the filtered review tables.
reviews.shape, filtered_reviews.shape

((6990280, 9), (419344, 3))

In [None]:
# Persist the filtered reviews; index=False keeps the row index out of the file.
filtered_reviews.to_csv('../dataset/processed/reviews.csv', index=False)