In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Yelp business table. lines=True: the file holds one JSON record per line.
# NOTE: absolute, machine-specific path -- point it at your local copy of the dataset.
business_json_path = "/Users/priya/Documents/Final Capstone/yelp_dataset/business.json"
biz = pd.read_json(business_json_path, lines=True)

In [3]:
# (rows, columns) of the business table as loaded, before any filtering.
biz.shape

(209393, 14)

In [4]:
# Keep only businesses still flagged as open (is_open == 1).
still_open = biz['is_open'].eq(1)
biz = biz.loc[still_open]

In [5]:
# Keep only businesses whose `state` is one of the 50 US state codes or DC.
us_state_codes = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL",
    "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",
    "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",
    "WY",
]
in_us = biz['state'].isin(us_state_codes)
biz = biz[in_us]

In [6]:
# Remove columns that are not used further on:
# 'is_open' has already been filtered on, 'hours' is never read.
drop_columns = ['hours', 'is_open']
biz = biz.drop(columns=drop_columns)

In [7]:
# Keep rows whose `categories` text mentions "Restaurants" (case-insensitive).
# na=False: rows with a missing `categories` value count as non-matches and are dropped.
is_restaurant = biz['categories'].str.contains('Restaurants', case=False, na=False)
biz = biz[is_restaurant]

In [8]:
# Tag each restaurant with one of six hand-picked cuisines in a new `category`
# column; restaurants matching none of them keep NaN.

# Work on an explicit copy so the assignments below never write into a view of the
# previously filtered frame. (Setting `biz.is_copy = False` no longer does this:
# the attribute was removed from pandas.)
biz = biz.copy()

# Index-aligned, object-dtype placeholder. A bare `pd.Series()` has no dtype and no
# index, which raises a DeprecationWarning and only works through index alignment.
biz['category'] = pd.Series(np.nan, index=biz.index, dtype=object)

# Order matters: a restaurant listing several of these cuisines keeps the LAST match.
# Matching is case-sensitive, plain-substring (regex=False) on the `categories` text.
cuisines = ['Italian', 'Korean', 'French', 'Greek', 'Indian', 'Japanese']
for cuisine in cuisines:
    biz.loc[biz['categories'].str.contains(cuisine, regex=False), 'category'] = cuisine

# Quick look at the first 20 labels.
biz['category'].head(20)

  This is separate from the ipykernel package so we can avoid doing imports until


8          NaN
33         NaN
41         NaN
49         NaN
54         NaN
59         NaN
63         NaN
83         NaN
86         NaN
89         NaN
101        NaN
107        NaN
108        NaN
115        NaN
120        NaN
126    Italian
129        NaN
136        NaN
151        NaN
155        NaN
Name: category, dtype: object

In [9]:
# Keep only restaurants that received a cuisine label, remove the original
# `categories` column, and renumber the rows from 0.
biz = (
    biz.dropna(subset=['category'])
       .drop(columns='categories')
       .reset_index(drop=True)
)

In [10]:
# Preview the first rows of the filtered, cuisine-tagged restaurant table.
biz.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,category
0,0y6alZmSLnPzmG5_kP5Quw,J J's Pizza,20542 Lorain Rd,Fairview Park,OH,44126,41.448341,-81.847644,4.5,21,"{'NoiseLevel': 'u'quiet'', 'WiFi': ''no'', 'Bu...",Italian
1,AN0bWhisCf6LN9eHZ7DQ3w,Los Olivos Ristorante,3759 E Desert Inn Rd,Las Vegas,NV,89121,36.129178,-115.092483,5.0,222,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...",Italian
2,AtD6B83S4Mbmq0t7iDnUVA,Veggie House,"5115 Spring Mountain Rd, Ste 203",Las Vegas,NV,89146,36.125569,-115.210911,4.5,1142,"{'RestaurantsPriceRange2': '2', 'BikeParking':...",Japanese
3,UITPqkoDytnHT4kxaAyDeA,Hibachi Express,"7945 N Tryon St, Ste 100",Charlotte,NC,28262,35.2971,-80.755434,3.0,23,"{'Ambience': '{'romantic': False, 'intimate': ...",Japanese
4,_Kp1IPTi17wBywYOd30raA,Pizza Hut,8609 University Blvd,Coraopolis,PA,15108,40.507268,-80.222632,2.0,14,"{'NoiseLevel': 'u'quiet'', 'RestaurantsGoodFor...",Italian


In [11]:
# Location of the Yelp review file that the next cell reads in chunks.
# NOTE(review): absolute, user-specific path -- needs adjusting on any other machine.
review_json_path = '/Users/priya/Documents/Final Capstone/yelp_dataset/review.json'

In [12]:
# Open the review file as a chunked reader: with `chunksize` set, read_json yields
# DataFrames of up to `size` rows each instead of one big frame.
size = 100000  # rows per chunk

# Explicit column dtypes for the review records.
review_dtypes = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'stars': int,
    'date': str,
    'text': str,
    'useful': int,
    'funny': int,
    'cool': int,
}

review = pd.read_json(
    review_json_path,
    lines=True,
    dtype=review_dtypes,
    chunksize=size,
)


In [13]:
# Walk through the review chunks and keep only reviews that belong to a business
# still present in `biz`, attaching the business columns to each review row.
chunk_list = []
for chunk_review in review:
    chunk_review = (
        chunk_review
        # columns not needed for the analysis
        .drop(columns=['review_id', 'useful', 'funny', 'cool'])
        # avoid a name clash with the business-level `stars` column
        .rename(columns={'stars': 'review_stars'})
    )
    # inner join: reviews of businesses that were filtered out are discarded here
    chunk_merged = biz.merge(chunk_review, on='business_id', how='inner')
    # Show feedback on progress
    #print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    chunk_list.append(chunk_merged)

# Stack the per-chunk results into a single frame with a fresh 0..n-1 index.
df = pd.concat(chunk_list, axis=0, join='outer', ignore_index=True)

In [14]:
# (rows, columns) of the merged business + review table.
df.shape

(766882, 16)

In [15]:
# Preview the merged table: business columns followed by the per-review columns.
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,category,user_id,review_stars,text,date
0,0y6alZmSLnPzmG5_kP5Quw,J J's Pizza,20542 Lorain Rd,Fairview Park,OH,44126,41.448341,-81.847644,4.5,21,"{'NoiseLevel': 'u'quiet'', 'WiFi': ''no'', 'Bu...",Italian,6hVKlwEqIx8sge7Q-Dwctw,4,I live in Cleveland but this store is close to...,2016-12-04 09:53:04
1,0y6alZmSLnPzmG5_kP5Quw,J J's Pizza,20542 Lorain Rd,Fairview Park,OH,44126,41.448341,-81.847644,4.5,21,"{'NoiseLevel': 'u'quiet'', 'WiFi': ''no'', 'Bu...",Italian,GgnVgDYZAptY1Q8QdSHfLg,4,First time trying it. We ordered a variety of ...,2017-05-01 00:43:37
2,AN0bWhisCf6LN9eHZ7DQ3w,Los Olivos Ristorante,3759 E Desert Inn Rd,Las Vegas,NV,89121,36.129178,-115.092483,5.0,222,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...",Italian,GgCjStvmclW9uedJa_tTlA,5,"Very good restaurant, they have many choices a...",2018-09-03 02:54:29
3,AN0bWhisCf6LN9eHZ7DQ3w,Los Olivos Ristorante,3759 E Desert Inn Rd,Las Vegas,NV,89121,36.129178,-115.092483,5.0,222,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...",Italian,4CR7rQLHuXZpfLzDvqlaIA,5,Awsome little Italian place. Never would have ...,2018-06-19 17:20:53
4,AN0bWhisCf6LN9eHZ7DQ3w,Los Olivos Ristorante,3759 E Desert Inn Rd,Las Vegas,NV,89121,36.129178,-115.092483,5.0,222,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...",Italian,UkBp300T1dfvMK8BLq08qQ,5,We moved back to Vegas about a year ago and he...,2018-08-05 03:13:21


In [16]:
# Add a column holding the number of words in each review.
#
# Punctuation is removed first so stand-alone symbols ("-", "&", ...) do not count
# as words. `regex=True` is passed explicitly: on pandas >= 2.0, str.replace treats
# the pattern as a literal string by default, which would leave all punctuation in.
# The pattern is a raw string with the same text as before, so "\(" is no longer an
# invalid string escape.
#
# Newlines are left alone on purpose: str.split() already treats them as whitespace,
# whereas deleting them glues the last word of one line onto the first word of the
# next and undercounts.
punctuation_pattern = r'[!"#$%&\()*+,-./:;<=>?@[\]^_`{|}~]'
df['num_words_review'] = (
    df['text']
    .str.replace(punctuation_pattern, '', regex=True)
    .str.split()
    .str.len()
)

In [17]:
# Bucket every review by its star rating:
#   >= 4 -> 'positive', == 3 -> neutral, < 3 -> 'negative'; anything else stays ''.
# NOTE: the middle label is spelled 'neural' (sic). The following cell drops rows by
# that exact string, so the two must be changed together.
stars = df['review_stars']
df['labels'] = np.select(
    [stars >= 4, stars == 3, stars < 3],
    ['positive', 'neural', 'negative'],
    default='',
)

In [18]:
# Remove the 3-star (neutral) reviews so only positive / negative ones remain,
# then renumber the rows. Both steps modify `df` in place.
# The label string 'neural' (sic) matches what the labelling cell writes.
neutral_rows = df.index[df['labels'] == 'neural']
df.drop(index=neutral_rows, inplace=True)
df.reset_index(drop=True, inplace=True)