In [2]:
import spacy
import pandas as pd
import numpy as np
import textblob
from textblob import TextBlob
import math 
import matplotlib.pyplot as plt

sp = spacy.load('en_core_web_md')

#### Process businesses dataset:

In [None]:
#### import datasets 
# read JSON of businesses

import json 

data_file = open('yelp_academic_dataset_business.json')
data = []
for line in data_file: 
    data.append(json.loads(line))
business_df = pd.DataFrame(data)
data_file.close()
business_df.to_csv('yelp_businesses.csv')

In [None]:
business_df = pd.read_csv('yelp_businesses.csv')

# turn lists back into arrays
business_df['og_categories'] = business_df['categories']
business_df['category_tokens'] = business_df['categories'].str.split(',')

In [None]:
#### Processing
# drop businesses with no categories
business_df = business_df.dropna(subset=['category_tokens']) # 103 businesses 
first_category_df = ([row[0] fr row in business_df['category_tokens']])
# get first category
business_df['first_category'] = first_category_df

#### Process categories dataset: 

In [None]:
import json 

with open("yelp_categories.json", "r", encoding="utf-8") as f: 
    data = json.load(f)

categories = data

cat_parents = {item['alias']: item.get('parents', []) for item in categories}

# recursively find highest parent
def find_highest_parents(alias): 
    seen = set()
    while alias in cat_parents and cat_parents[alias]:
        alias = cat_parents[alias][0]
        if alias in seen: 
            break
        seen.add(alias)
    return alias 

# process data
cat_data = []
for item in categories: 
    alias = item.get("alias", "")
    title = item.get("title", "")
    highest_parent = find_highest_parents(alias)

    cat_data.append([alias, title, highest_parent])

categories_df = pd.DataFrame(cat_data, columns=['alias', 'title', 'parent'])
categories_df.to_csv('yelp_categories.csv')

In [None]:
categories_df = pd.read_csv('yelp_categories.csv')
categories_df.head()

In [None]:
# process and simplify parent categories 
from collections import Counter 

categories_df.head()
categories_df['parent'].unique()

def simplify_categories(df):
    # combine categories to simplify data
    df.loc[df['parent'] == 'restaurants', 'parent'] = 'food'
    df.loc[df['parent'].isin(['shopping', 'bicycles']), 'parent'] = 'retail'
    df.loc[df['parent'] == 'health', 'parent'] = 'health'
    df.loc[df['parent'] == 'beautysvc', 'parent'] = 'beauty'
    df.loc[df['parent'].isin(['nightlife', 'arts', 'active', 'eventservices', 'localflavor', 'massmedia']), 'parent'] = 'entertainment'
    df.loc[df['parent'].isin(['homeservices', 'localservices', 'pets', 'religiousorgs']), 'parent'] = 'homeservices'
    df.loc[df['parent'] == 'auto', 'parent'] = 'auto'
    df.loc[df['parent'].isin(['professional', 'publicservicesgovt', 'financialservices']), 'parent'] = 'professionalservices'
    df.loc[df['parent'] == 'hotelstravel', 'parent'] = 'travel'
    df.loc[df['parent'] == 'education', 'parent'] = 'education'
    return df

categories_df = simplify_categories(categories_df)
counter = Counter(categories_df['parent'])
print(counter)

categories_df.head()

In [None]:
# get a parent category for each business
business_df['parent'] = ''

In [106]:
def get_parent(business):
    print(business['first_category'])

    matched_row = categories_df[categories_df['title'] == business['first_category']]
    print(matched_row)
    if not matched_row.empty: 
        return matched_row['parent'].iloc[0]
    else: 
        return None

In [None]:
business_df['parent'] = business_df.apply(lambda x: get_parent(x), axis=1)

In [None]:
# drop businesses with no parent 
business_df = business_df.dropna(subset=['parent'])

In [None]:
business_df.to_csv('yelp_businesses_processed.csv')

#### Assign a business category to each review:

In [124]:
business_df = pd.read_csv('yelp_businesses_processed.csv')
business_df.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,og_categories,category_tokens,first_category,parent
0,0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,"Doctors, Traditional Chinese Medicine, Naturop...","['Doctors', ' Traditional Chinese Medicine', '...",Doctors,health
1,1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...","Shipping Centers, Local Services, Notaries, Ma...","['Shipping Centers', ' Local Services', ' Nota...",Shipping Centers,homeservices
2,2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","Department Stores, Shopping, Fashion, Home & G...","['Department Stores', ' Shopping', ' Fashion',...",Department Stores,retail
3,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","['Restaurants', ' Food', ' Bubble Tea', ' Coff...",Restaurants,food
4,4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...","Brewpubs, Breweries, Food","['Brewpubs', ' Breweries', ' Food']",Brewpubs,food


In [125]:
reviews_df = pd.read_csv('yelp.csv')
reviews_df['category'] = ''
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,category
0,0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,
1,1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,
2,2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,
3,3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,
4,4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,


In [126]:
# check that each review matches a business in business_ids
no_biz_reviews = reviews_df[~reviews_df['business_id'].isin(business_df['business_id'])]

print("reviews:", reviews_df.shape[0])
print("businesses:", business_df.shape[0])
print("reviews with no business:", len(no_biz_reviews))

no_biz_reviews.head()

reviews: 10000
businesses: 148891
reviews with no business: 114


Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,category
11,11,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,0,0,0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45,
127,127,940tqxFO4Pwg_KMg4Y4Z5g,O6wkgoJqU7KMjleSlCDGaA,EQ-TZ2eeD_E0BHuvoaeG5Q,5.0,2,0,1,Milktooth is the place to go if you want a goo...,2018-01-10 01:48:58,
286,286,91uC4f0aX4ycQTDL5Hq7Mw,huHPQSQgw4kFakc0Vq7TDA,_D7QoWuQKMXk0mEE7r_Ftw,5.0,1,0,1,I have been a fan of Chateau La Vin for a few ...,2014-03-14 13:55:46,
527,527,S-H-Ao17MEYH9cLpvevbnQ,s03ZJhgJki5i89d93-9keQ,EQ-TZ2eeD_E0BHuvoaeG5Q,5.0,1,0,1,"Busy place, but we were offered extra- special...",2016-06-11 21:41:05,
568,568,DkrsmadeZJj_GERNMIBxVg,LqC5hNXcFVY-jcbyxi-tuQ,Q-prSTdggNlxAEFV88BZOw,5.0,0,0,0,Came here for lunch with my fiancee and friend...,2013-12-28 20:23:21,


In [127]:
# use API for reviews with no business
# split up because of the high computational cost for the API
import requests

BASE_URL = "https://api.yelp.com/v3/businesses/"
API_KEY = "kA_gvVBfor51QvtrRpUBlDT4JnOaCTcxXIj8avei_nkR_M1GRDBYwqo7576t8rHcmWYSUusRiK2JV8VKsi06E55sxY505WRu0nla-_yYMiyevNekJMbEVKwUgY3pZ3Yx"

def get_api_category(business_id):     
    url = f'{BASE_URL}{business_id}'
    headers = {"accept": "application/json",
               "authorization": f"Bearer {API_KEY}"}
    
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()
        categories = data.get('categories', [])
        if categories:
            return categories[0].get('title', None)
        else: 
            print(f"id {business_id} has no categories")
    else: 
        print(f"Error on {business_id}, code: {response.status_code}")
        return None 
    
def get_nobiz_categories(business_id): 
    category = get_api_category(business_id)
    if category: 
        return category
    else: 
        print("None found")
        return None

no_biz_reviews['category'] = no_biz_reviews['business_id'].apply(get_nobiz_categories)
print(no_biz_reviews)

id ZERQMWb1PFzCfbfknqq-fA has no categories
None found
      Unnamed: 0               review_id                 user_id  \
11            11  l3Wk_mvAog6XANIuGQ9C7Q  ZbqSHbgCjzVAqaa7NKWn5A   
127          127  940tqxFO4Pwg_KMg4Y4Z5g  O6wkgoJqU7KMjleSlCDGaA   
286          286  91uC4f0aX4ycQTDL5Hq7Mw  huHPQSQgw4kFakc0Vq7TDA   
527          527  S-H-Ao17MEYH9cLpvevbnQ  s03ZJhgJki5i89d93-9keQ   
568          568  DkrsmadeZJj_GERNMIBxVg  LqC5hNXcFVY-jcbyxi-tuQ   
...          ...                     ...                     ...   
9153        9153  IPqjqCUmyAjybkrPrJQr2w  rkefyv6T6ucEdQOeLfqUlg   
9486        9486  2yDOQDdmIix7CTbZcpWlqg  r15S-CXQW6LMqI2MbjDOaA   
9535        9535  tgt0zevXxgvetVhjyDerjg  Udgnme7KH9p99wMoLMwNGA   
9724        9724  aWTKbgX-viOD7bllpIjVWQ  uXsLqy4oqwDHZgsRDH6J-g   
9918        9918  NsSRb2m9xjeJBt3NNRChlw  0Zwtd2T7HNG75UhO_z5LLg   

                 business_id  stars  useful  funny  cool  \
11    EQ-TZ2eeD_E0BHuvoaeG5Q    4.0       0      0     0   
127   EQ

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_biz_reviews['category'] = no_biz_reviews['business_id'].apply(get_nobiz_categories)


In [128]:
print(no_biz_reviews['category'].isnull().sum())
print(no_biz_reviews['category'])

1
11               Breakfast & Brunch
127              Breakfast & Brunch
286            Beer, Wine & Spirits
527              Breakfast & Brunch
568                       Wine Bars
                   ...             
9153                        Grocery
9486                    Sports Bars
9535                    Sports Bars
9724                        Mexican
9918    Used, Vintage & Consignment
Name: category, Length: 114, dtype: object


In [129]:
# drop null row 
no_biz_reviews = no_biz_reviews.dropna(subset=['category'])
print(len(no_biz_reviews))

113


In [130]:
business_df.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,og_categories,category_tokens,first_category,parent
0,0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,"Doctors, Traditional Chinese Medicine, Naturop...","['Doctors', ' Traditional Chinese Medicine', '...",Doctors,health
1,1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...","Shipping Centers, Local Services, Notaries, Ma...","['Shipping Centers', ' Local Services', ' Nota...",Shipping Centers,homeservices
2,2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","Department Stores, Shopping, Fashion, Home & G...","['Department Stores', ' Shopping', ' Fashion',...",Department Stores,retail
3,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","['Restaurants', ' Food', ' Bubble Tea', ' Coff...",Restaurants,food
4,4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...","Brewpubs, Breweries, Food","['Brewpubs', ' Breweries', ' Food']",Brewpubs,food


In [131]:
# assign categories to rows with ids 
def get_csv_category(business_id): 
    match = business_df[business_df['business_id'] == business_id]['parent']
    return match.iloc[0] if not match.empty else None
    
reviews_df['category'] = reviews_df['business_id'].apply(get_csv_category)

In [132]:
print(reviews_df['category'].isnull().sum())
# missing 114 values, which are populated by the API

114


In [133]:
reviews_df.head()


Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,category
0,0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,food
1,1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,entertainment
2,2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,food
3,3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,food
4,4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,food


In [150]:
# get parents for API reviews
no_biz_reviews = no_biz_reviews.merge(categories_df[['title', 'parent']],
                                      left_on='category',
                                      right_on='title',
                                      how='left')
no_biz_reviews.head()

MergeError: Passing 'suffixes' which cause duplicate columns {'parent_x', 'title_x'} is not allowed.

In [140]:
reviews_df.to_csv('TEMP_reviews_df.csv')

no_biz_reviews.to_csv("TEMP_nobizreviews_df.csv")

In [155]:
reviews_df = pd.read_csv("TEMP_reviews_df.csv")
no_biz_reviews = pd.read_csv("TEMP_nobizreviews_df.csv")
no_biz_reviews['category'] = no_biz_reviews['parent']
print(Counter(reviews_df['category']))
print(Counter(no_biz_reviews['category']))

Counter({'food': 6517, 'entertainment': 1682, 'beauty': 374, 'retail': 355, 'travel': 280, 'homeservices': 262, 'auto': 219, 'health': 132, nan: 114, 'professionalservices': 42, 'education': 23})
Counter({'nightlife': 37, 'restaurants': 33, 'food': 18, 'shopping': 15, 'localservices': 6, 'hotelstravel': 2, nan: 1, 'arts': 1})


In [156]:
no_biz_reviews.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,category,title,parent
0,0,11,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,0,0,0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45,restaurants,Breakfast & Brunch,restaurants
1,1,127,940tqxFO4Pwg_KMg4Y4Z5g,O6wkgoJqU7KMjleSlCDGaA,EQ-TZ2eeD_E0BHuvoaeG5Q,5.0,2,0,1,Milktooth is the place to go if you want a goo...,2018-01-10 01:48:58,restaurants,Breakfast & Brunch,restaurants
2,2,286,91uC4f0aX4ycQTDL5Hq7Mw,huHPQSQgw4kFakc0Vq7TDA,_D7QoWuQKMXk0mEE7r_Ftw,5.0,1,0,1,I have been a fan of Chateau La Vin for a few ...,2014-03-14 13:55:46,food,"Beer, Wine & Spirits",food
3,3,527,S-H-Ao17MEYH9cLpvevbnQ,s03ZJhgJki5i89d93-9keQ,EQ-TZ2eeD_E0BHuvoaeG5Q,5.0,1,0,1,"Busy place, but we were offered extra- special...",2016-06-11 21:41:05,restaurants,Breakfast & Brunch,restaurants
4,4,568,DkrsmadeZJj_GERNMIBxVg,LqC5hNXcFVY-jcbyxi-tuQ,Q-prSTdggNlxAEFV88BZOw,5.0,0,0,0,Came here for lunch with my fiancee and friend...,2013-12-28 20:23:21,nightlife,Wine Bars,nightlife


In [158]:
reviews_df = reviews_df.merge(no_biz_reviews[['business_id','category']],
                              on='business_id',
                              how='left',
                              suffixes=('','_new'))

In [162]:
reviews_df['category'] = reviews_df['category'].fillna(reviews_df['category_new'])

In [172]:
reviews_df = reviews_df.dropna(subset=['category'])

In [175]:
# clean up data 
reviews_df = reviews_df.drop(['Unnamed: 0.1', 'Unnamed: 0', 'category_new'], axis=1)

In [176]:
reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,category
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,food
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,entertainment
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,food
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,food
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,food


In [177]:
reviews_df.to_csv("reviews_with_parents.csv")