### Where does the dataset come from? Download from yelp.com

In [None]:
Which packages are needed?

* spaCy
* spacytextblob
* LemmInflect
* spaCy language models (`en_core_web_md`, `en_core_web_lg`)

### Businesses

In [None]:
import ast
import json
import re

import numpy as np
import pandas as pd

In [None]:
# Parse the Yelp business dump into a list of dicts; every line of the file is
# decoded as one JSON document.
data = []

with open('yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:

    for line in f:

        json_dict = json.loads(line)

        # 'attributes' may be empty/null; only touch it when it holds entries.
        attributes = json_dict['attributes']

        if attributes and 'BusinessParking' in attributes:
            # The parking attribute is stored as the string form of a Python
            # literal. ast.literal_eval accepts literals only, whereas eval()
            # would execute any expression found in the data file.
            attributes['BusinessParking'] = ast.literal_eval(attributes['BusinessParking'])

        data.append(json_dict)

In [None]:
# Flatten the nested records into one row per business; nested keys become
# dot-separated column names.
businesses = pd.json_normalize(data)

In [None]:
# Keep only the part of each column name after the last dot.
businesses.columns = [name.split('.')[-1] for name in businesses.columns]

In [None]:
# Show the column names that are currently in the table.
businesses.columns

In [None]:
# Turn the CamelCase attribute names into snake_case and strip the
# 'Restaurants_' / 'Business_' prefixes and the digit that follows 'Range'.
# The rules are applied in this order to every column name.
rename_rules = (
    (r'(\w)([A-Z])', r'\1_\2'),
    (r'Restaurants_', r''),
    (r'Business_', r''),
    (r'(Range)([0-9])', r'\1'),
)

columns = []

for column in businesses.columns:
    for pattern, replacement in rename_rules:
        column = re.sub(pattern, replacement, column)
    columns.append(column.lower())

businesses.columns = columns

In [None]:
# Columns removed from the table before the analysis.
# errors='ignore' keeps the call from failing when a listed column is absent.
unused_columns = [
    'attributes',
    'hair_specializes_in',
    'counter_service',
    'open24_hours',
    'dietary_restrictions',
    'accepts_insurance',
    'ages_allowed',
    'b_yo_bcorkage',
    'corkage',
    'smoking',
    'b_yo_b',
    'good_for_dancing',
    'coat_check',
    'by_appointment_only',
    'best_nights',
    'music',
    'drive_thru',
    'accepts_bitcoin',
    'dogs_allowed',
    'happy_hour',
    'wheelchair_accessible',
    'good_for_meal',
    'ambience',
    'business_parking',
    'address',
    'postal_code',
    'hours',
    'is_open',
    'monday',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday',
]

businesses.drop(columns=unused_columns, errors='ignore', inplace=True)

In [None]:
# Strip the "u'" prefix and the remaining single quotes from these text
# attributes (presumably values look like "u'free'" or "'free'" - verify
# against the raw data). regex=False makes the literal replacement explicit,
# because the default of str.replace differs between pandas 1.x and 2.x.
for column in ['alcohol', 'noise_level', 'attire', 'wi_fi']:
    businesses[column] = (
        businesses[column]
        .str.replace("u'", "", regex=False)
        .str.replace("'", "", regex=False)
    )

In [None]:
def check_parking(row):
    """Summarise the five parking-type flags of one business.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'garage', 'street', 'validated', 'lot' and
        'valet' (looked up in that order).

    Returns
    -------
    'True' if any flag equals True, otherwise 'False' if any flag equals
    False, otherwise NaN (none of the flags compares equal to True or False).
    """
    parking_kinds = ('garage', 'street', 'validated', 'lot', 'valet')

    # Equality (not identity) on purpose: the flags may be NumPy booleans,
    # and missing values must compare unequal to both True and False.
    if any(row[kind] == True for kind in parking_kinds):
        return 'True'

    if any(row[kind] == False for kind in parking_kinds):
        return 'False'

    return np.nan

In [None]:
# Collapse the five parking-type columns into one 'True' / 'False' / NaN value per business.
businesses['parking_available'] = businesses.apply(check_parking, axis=1)

In [None]:
# The individual parking-type columns are summarised in 'parking_available', so remove them.
businesses.drop(['garage', 'street','validated','lot','valet'], axis=1, inplace=True)

In [None]:
# Replace the text values of the attribute columns with numeric codes.

# Columns that must keep their original values.
stop_list = [
    'business_id',
    'name',
    'city',
    'state',
    'latitude',
    'longitude',
    'stars',
    'review_count',
    'categories',
    'monday',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday'
]

columns = [column for column in businesses.columns if column not in stop_list]

# One mapping applied in a single pass per column. No code on the right-hand
# side is also a key, so the result equals replacing the values one by one.
# NOTE(review): the codes are not ordered by intensity ('average' < 'quiet',
# 'formal' < 'dressy') and 'none' becomes missing instead of a level of its
# own; confirm this is intended, because the notebook later computes a
# Spearman rank correlation on these columns.
value_codes = {
    'True': 1,
    'False': 0,
    'nan': np.nan,
    'None': np.nan,
    'none': np.nan,
    'casual': 0,
    'formal': 1,
    'dressy': 2,
    'full_bar': 0,
    'beer_and_wine': 1,
    'average': 0,
    'quiet': 1,
    'loud': 2,
    'very_loud': 3,
    'no': 0,
    'free': 1,
    'paid': 2,
}

# Assign the result back to the frame: an in-place replace on the temporary
# Series returned by businesses[column] is not guaranteed to reach the frame
# (it never does with pandas copy-on-write).
for column in columns:
    businesses[column] = businesses[column].replace(value_codes)

### We have created a table with all businesses and the attributes we want to have a look at

### Businesses in Pennsylvania

In [None]:
# Work on an independent copy of the rows whose state code is 'PA'.
pennsylvania = businesses[businesses['state'] == 'PA'].copy()

### Restaurants in Pennsylvania

In [None]:
# Discard businesses without a category string and renumber the rows from 0.
pennsylvania = pennsylvania.dropna(subset='categories').reset_index(drop=True)

In [None]:
# Keep the businesses whose category string contains at least one of the
# food- and drink-related category names below (plain substring match).

categories =[
    'Coffee & Tea',
    'Bistros',
    'Breakfast & Brunch',
    'Cafes',
    'French',
    'Greek',
    'Italian',
    'Mexican',
    'Tacos',
    'Egyptian',
    'Pizza',
    'Soup',
    'Sushi Bars',
    'Vegetarian',
    'Waffles',
    'Food',
    'Restaurants',
    'Bars'
]

# Build one boolean mask and select all matching rows at once. Growing a
# DataFrame with pd.concat inside a loop copies it on every iteration.
is_restaurant = pennsylvania['categories'].apply(
    lambda item: any(category in item for category in categories)
)

# astype(bool) keeps the mask boolean even when the table is empty.
restaurants = pennsylvania.loc[is_restaurant.astype(bool)].copy()

In [None]:
# Hand the path to pandas instead of a text handle opened without newline='':
# pandas then controls line endings and writes UTF-8, which is what
# pd.read_csv expects by default when the file is loaded again.
restaurants.to_csv('tables/restaurants_pennsylvania.csv', header=True, index=False, encoding='utf-8')

### We have created a table with all restaurants in Pennsylvania and saved it in a .csv for later analysis

### Now we load the table with all reviews and create a .csv which only contains reviews from restaurants in Pennsylvania

In [None]:
# Stream the review dump in chunks and keep only the reviews that belong to
# the selected restaurants.
businesses_ids = restaurants['business_id'].to_list()

column_names = ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']

# newline='' leaves line endings to pandas; UTF-8 matches the pd.read_csv default.
with open('tables/reviews_pennsylvania.csv', 'w', newline='', encoding='utf-8') as f:
    # Let pandas write the header so it is quoted and terminated like the rows.
    pd.DataFrame(columns=column_names).to_csv(f, index=False)

    # The chunked reader holds the input file open; the with-block closes it.
    with pd.read_json('yelp_dataset/yelp_academic_dataset_review.json', lines=True, chunksize=100000) as chunks:

        for chunk in chunks:
            in_selection = chunk['business_id'].isin(businesses_ids)

            # Selecting column_names explicitly keeps the rows aligned with the header.
            chunk.loc[in_selection, column_names].to_csv(f, header=False, index=False)

### Analysis starts here

In [None]:
import pandas as pd

def get_file(filename):
    """Ask the user whether to load a CSV file and return it as a DataFrame.

    The prompt is repeated until the answer is 1 (load) or 2 (do not load).

    Parameters
    ----------
    filename : str
        Path passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table, or ``None`` when the user declines or the file
        does not exist.

    Raises
    ------
    Any error from ``pd.read_csv`` other than ``FileNotFoundError`` (for
    example a parse error) is passed on to the caller.
    """
    while True:
        user_input = input(
            '''
                Do you want to load {0}?
                Press 1 to load the file, 
                press 2 to terminate without loading the file
                '''.format(filename)
        )

        # Guard only the conversion. pandas parse errors are ValueError
        # subclasses and must not be reported as an invalid menu choice.
        try:
            choice = int(user_input)
        except ValueError:
            print('Invalid input. Please enter either 1 or 2')
            continue

        if choice == 1:
            print('Loading file...')

            try:
                table = pd.read_csv(filename)
            except FileNotFoundError:
                print('File does not exist...')
                return None

            print('File loaded...')
            return table

        if choice == 2:
            print('Terminated without loading the file...')
            return None

        print('Invalid input. Please enter either 1 or 2')

In [None]:
# Interactive: get_file prompts before loading and returns None when the load
# is declined or the file is missing.
restaurants = get_file('tables/restaurants_pennsylvania.csv')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of businesses (in percent) per star rating, ordered by rating.
star_counts = restaurants['stars'].value_counts().sort_index()
star_share = star_counts.values*100/star_counts.values.sum()

fig, ax = plt.subplots(1, figsize=(5, 5))

sns.barplot(x=star_counts.index, y=star_share, color='#FF1A1A', ax=ax)

ax.set(xlabel='Stars', ylabel='No. of businesses in %')
ax.set_title('Star ratings');

In [None]:
# Spearman rank correlation between the star rating and the encoded attributes.
correlation_columns = [
    'stars',
    'accepts_credit_cards',
    'bike_parking',
    'price_range',
    'take_out',
    'delivery',
    'caters',
    'wi_fi',
    'outdoor_seating',
    'has_tv',
    'reservations',
    'alcohol',
    'good_for_kids',
    'attire',
    'table_service',
    'good_for_groups',
    'noise_level',
    'parking_available',
]

restaurants[correlation_columns].corr(method='spearman')

In [None]:
# Interactive: get_file prompts before loading and returns None when the load
# is declined or the file is missing.
reviews = get_file('tables/reviews_pennsylvania.csv')

In [None]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

# Medium English pipeline with the TextBlob component appended.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('spacytextblob')


def check_polarity(text):
    """Run `text` through the pipeline and return the polarity of its TextBlob."""
    return nlp(text)._.blob.polarity


# Only the first 10000 reviews are scored; the assignment aligns on the index,
# so every other row receives NaN in 'polarity'.
first_texts = reviews['text'][:10000]
reviews['polarity'] = first_texts.apply(check_polarity)

In [None]:
# Save the first 10000 reviews. The path is handed to pandas instead of a text
# handle opened without newline='': pandas then controls line endings and
# writes UTF-8, which is what pd.read_csv expects by default.
reviews[:10000].to_csv('tables/reviews_polarity.csv', header=True, index=False, encoding='utf-8')

In [None]:
# Reload the reviews saved in tables/reviews_polarity.csv (interactive prompt;
# the result is None when the load is declined or the file is missing).
reviews = get_file('tables/reviews_polarity.csv')

In [None]:
# Reviews with a polarity below zero count as negative; rows without a
# polarity value fail the comparison and are left out.
negative_reviews = reviews[reviews['polarity'] < 0]

### Analyse negative reviews

#### We want to find statements about food

In [72]:
# Load the food descriptions from which the FOOD patterns for the spaCy
# entity ruler are built

import pandas as pd

food = pd.read_csv('food_labels/food.csv')

In [89]:
# Keep descriptions that contain at least one ASCII letter; missing
# descriptions are dropped as well.
descriptions = food['description']
food_labels = descriptions[descriptions.str.contains('[a-zA-Z]', na=False)]

In [90]:
# Keep labels of at most two whitespace-separated words, then remove repeats.
word_counts = food_labels.str.split().str.len()
food_labels = food_labels[word_counts <= 2].drop_duplicates()

In [91]:
# Lower-case every label.
food_labels = food_labels.str.lower()

In [92]:
# Drop labels in which the pattern finds two or more commas.
has_two_commas = food_labels.str.contains('.*,.*,.*', regex=True)
food_labels = food_labels[~has_two_commas]

In [93]:
# Fix: some foods are comma-separated with the order reversed:
# 'muffins, blueberry' becomes 'blueberry muffins'

def _swap_comma_parts(label):
    """Return `label` with the parts around its first comma swapped.

    Labels without a comma are returned unchanged. Whitespace around both
    parts is removed, so 'muffins,blueberry', 'muffins, blueberry' and
    'muffins ,blueberry' all give 'blueberry muffins'.
    """
    head, comma, tail = label.partition(',')

    if not comma:
        return label

    # The outer strip handles an empty part, e.g. 'cheese,' -> 'cheese'.
    return (tail.strip() + ' ' + head.strip()).strip()


# A single pass over all labels; this also works when no label contains a
# comma, where selecting column 1 of an expanded split would raise KeyError.
food_labels = food_labels.map(_swap_comma_parts)

In [94]:
# For every label, generate the other grammatical number of its last word:
# a plural noun (tag NNS) is turned into the singular (NN), anything else
# into the plural (NNS). Labels that tokenize into more than two tokens are skipped.

import spacy
import lemminflect

nlp = spacy.load('en_core_web_lg')

inflected_labels = []

for label in food_labels:

    tokens = nlp(label)

    if len(tokens) not in (1, 2):
        continue

    # The last token carries the number; a leading token is kept as written.
    head = tokens[-1]
    target_tag = 'NN' if head.tag_ == 'NNS' else 'NNS'
    inflected = head._.inflect(target_tag)

    if len(tokens) == 1:
        inflected_labels.append(inflected)
    else:
        inflected_labels.append(tokens[0].text + ' ' + inflected)

In [99]:
# Wrap the generated forms in a Series so they can be combined with the original labels.
food_labels_inflected = pd.Series(inflected_labels)

In [100]:
# Append the inflected forms to the original labels.
# NOTE(review): both Series keep their own index labels, so the combined index
# may contain repeated labels - confirm nothing downstream relies on it.
food_labels = pd.concat([food_labels, food_labels_inflected])

In [101]:
# Remove labels that occur more than once after combining both sets.
food_labels.drop_duplicates(inplace=True)

In [102]:
# Inspect the labels that contain 'bar' anywhere in the text (this also
# matches words such as 'barley' or 'barbecue').
food_labels[food_labels.str.contains('bar', regex=True)]

2645        italian barley
3702        barbecue spice
5431               bar bar
5547            fudge bars
7690         sweet bar-b-q
               ...        
15881          chunky bars
16540           limon bars
16663         hazelnut bar
16778    blueberry golbars
17113             paleobar
Length: 538, dtype: object

In [103]:
# Register every food label as a FOOD pattern on an entity ruler.
# TODO: remove 'bar' from food_labels and add 'product' (not done yet).

nlp = spacy.load('en_core_web_lg')

# The ruler is inserted into the pipeline ahead of the 'ner' component.
ruler = nlp.add_pipe('entity_ruler', before='ner')

patterns = [{'label': 'FOOD', 'pattern': label} for label in food_labels]

ruler.add_patterns(patterns)

In [104]:
# Number of FOOD patterns handed to the ruler.
len(patterns)

30704

In [106]:
# Save the ruler to disk so it can be restored later without rebuilding the label list.
ruler.to_disk('food_labels/food_patterns.jsonl')

In [109]:
# Start from a fresh pipeline and restore the saved ruler from disk.
nlp = spacy.load('en_core_web_lg')

# Same position as when the patterns were created: ahead of the 'ner' component.
ruler = nlp.add_pipe("entity_ruler", before='ner')

# Last expression of the cell, so the restored ruler object is displayed.
ruler.from_disk('food_labels/food_patterns.jsonl')

<spacy.pipeline.entityruler.EntityRuler at 0x12d5e4990>