# Analysis of restaurant reviews

In this notebook we analyze restaurant reviews for a city and a specific date.

If `gbq` is not working, you might need to update the Python API client library:

```bash
sudo pip install --upgrade google-api-python-client
```

In [None]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats

# hide warnings. `gbq.read_gbq()` gives some
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../scrape_save_search')
import load_data

In [None]:
# Load the review comments through the project-local `load_data` module
# (made importable above by adding '../scrape_save_search' to sys.path).
comments = load_data.load_comments()

In [None]:
# Load the restaurants together with three collections of tag ids.
# NOTE(review): `rename_cols=False` presumably keeps the loader's raw column names — confirm in load_data.
restaurants, existing_tag_ids, elastic_tag_ids, image_tag_ids = load_data.load_restaurants(rename_cols=False)

In [None]:
# Parse the review dates once, then derive the calendar helper columns from the parsed values.
review_dates = pd.to_datetime(comments['date'])

comments['date'] = review_dates
comments['yearmonth'] = review_dates.dt.strftime('%Y-%m')
comments['year'] = review_dates.dt.year
comments['month'] = review_dates.dt.month

# Pretty cols

In [None]:
# Strip the 'info_' prefix from column names.
# Only a genuine prefix is removed: a name that merely contains 'info_' somewhere in the
# middle is left untouched instead of losing its first five characters.
restaurants.columns = [
    col[len('info_'):] if col.startswith('info_') else col
    for col in restaurants.columns
]

In [None]:
# Small example table with human-readable headers: raw column name -> display label.
example_columns = {
    'name': 'Name',
    'lat': 'Lat',
    'lon': 'Lon',
    'country': 'Country',
    'city': 'City',
    'postal_code': 'Postal code',
    'street': 'Street',
    'house_number': 'House number',
    'reviews_noise_level': 'Noise level',
    'reviews_waiting_time': 'Waiting time',
    'reviews_nr_ratings': 'Number of ratings',
}
restaurants_head = restaurants[list(example_columns)].head()
restaurants_head.columns = list(example_columns.values())
restaurants_head

## Reviewers

In [None]:
# For each "number of reviews written" (x) show how many reviewers wrote exactly that many (y).
# Only the first 15 entries of the sorted counts are plotted.
ax = (
    comments
    .groupby('reviewer')
    .size()
    .value_counts()
    .sort_index()[:15]
    .plot.bar(figsize=(12, 4), rot=45)
)
ax.set(
    title='Number of reviewers that wrote "X" reviews',
    xlabel='Number of reviews',
    ylabel='Number of reviewers',
);

## Overall review ratings

In [None]:
def plot_rating_bars(s: pd.Series, rating_type=float):
    """Draw a bar chart of how often each rating value occurs in `s`.

    Parameters
    ----------
    s : pd.Series
        The ratings to summarise.
    rating_type : type, default float
        Type the ratings are cast to before counting (e.g. ``int`` to bucket
        ratings into whole numbers).

    Returns
    -------
    None
        The chart is drawn as a side effect.
    """
    rating_counts = s.astype(rating_type).value_counts().sort_index()

    ax = rating_counts.plot.bar(color='b', figsize=(12, 4), rot=45)
    ax.set_title('Distribution of ratings')
    ax.set_xlabel('Rating')
    ax.set_ylabel('Number of reviews')

In [None]:
# Rating distribution over all reviews (ratings cast to the default type, float).
plot_rating_bars(comments.rating)

## Reviewers that use a specific word

In [None]:
# Count how often `word` occurs as a whole word in every review comment and store the
# count in a column named after the word.
# The vectorised `.str.count` replaces a row-wise `apply`; comments that are missing
# count as zero occurrences instead of raising.
word = 'slecht'
comments[word] = (
    comments['comment']
    .str.count(r"\b" + re.escape(word) + r"\b")
    .fillna(0)
    .astype(int)
)

For the regex we use \b to indicate a [word boundary](https://www.regular-expressions.info/wordboundaries.html)

In [None]:
def search_for_word(word, df, minimum_number_of_occurences=1):
    """Select the reviews that use `word` at least a given number of times.

    Side effects: adds (or overwrites) a column named `word` on `df` holding the
    per-review occurrence count, prints a short summary, and plots the rating
    distribution of the selected reviews.

    Parameters
    ----------
    word : str
        Word to look for; matched literally, case-sensitively, on word boundaries.
    df : pd.DataFrame
        Reviews; the code reads its `comment` and `rating` columns.
    minimum_number_of_occurences : int, default 1
        Minimum number of occurrences a review needs in order to be selected.

    Returns
    -------
    pd.DataFrame
        The rows of `df` whose comment contains `word` at least
        `minimum_number_of_occurences` times.
    """
    pattern = r"\b" + re.escape(word) + r"\b"
    # Vectorised count; missing comments count as zero occurrences instead of raising.
    df[word] = df['comment'].str.count(pattern).fillna(0).astype(int)

    df_word = df[df[word] >= minimum_number_of_occurences]
    # The filter is ">=", so the summary says "at least".
    print(f'Total reviews with at least {minimum_number_of_occurences} times the word {word}: {len(df_word)}' +
          f'\nTotal reviews overall: {len(df)}')
    plot_rating_bars(df_word.rating, rating_type=int)
    return df_word

In [None]:
# Reviews that use the word 'slecht' at least once.
slecht_1 = search_for_word('slecht', comments, minimum_number_of_occurences=1)

In [None]:
# Reviews that use the word 'slecht' at least twice.
slecht_2 = search_for_word('slecht', comments, minimum_number_of_occurences=2)

In [None]:
# compare the distributions with chisquare
def compare_ratings(values_a, values_b):
    """Print chi-square goodness-of-fit results comparing two rating distributions.

    Ratings are cast to int and counted per value. Two results are printed:
    first `values_b` tested against the distribution of `values_a`, then
    `values_a` tested against the distribution of `values_b`.

    The test is run on counts (frequencies), not percentages: the expected
    frequencies are the reference proportions scaled to the observed sample
    size, so the statistic and p-value reflect how many reviews there are.
    Both count vectors are aligned on the union of rating values, so samples
    with different sets of ratings are compared category by category.
    If a rating is absent from the reference sample its expected count is
    zero and the statistic for that direction is not finite.

    Parameters
    ----------
    values_a, values_b : pd.DataFrame
        Frames with a `rating` column.

    Returns
    -------
    None
        The two test results are printed.
    """
    counts_a = values_a.rating.astype(int).value_counts()
    counts_b = values_b.rating.astype(int).value_counts()

    # Align both samples on the same, sorted set of rating categories.
    categories = counts_a.index.union(counts_b.index)
    counts_a = counts_a.reindex(categories, fill_value=0)
    counts_b = counts_b.reindex(categories, fill_value=0)

    for observed, reference in ((counts_b, counts_a), (counts_a, counts_b)):
        # Scale the reference proportions to the observed total so both sums agree.
        expected = reference / reference.sum() * observed.sum()
        print(stats.chisquare(observed.values, f_exp=expected.values))

In [None]:
# Compare the ratings of reviews using 'slecht' at least once with those using it at least twice.
# Note: the second sample is a subset of the first.
compare_ratings(slecht_1, slecht_2)

## More use of the word 'lekker' does not increase score

In [None]:
# Reviews that use the word 'lekker' at least once (the default minimum).
lekker_1 = search_for_word('lekker', comments)

In [None]:
# Reviews that use the word 'lekker' at least twice.
lekker_2 = search_for_word('lekker', comments, minimum_number_of_occurences=2)

In [None]:
# Compare the ratings of reviews using 'lekker' at least once with those using it at least twice.
# Note: the second sample is a subset of the first.
compare_ratings(lekker_1, lekker_2)

## Average ratings per reviewer

In [None]:
# Per reviewer: mean rating and number of ratings.
# The result has two-level columns: ('rating', 'mean') and ('rating', 'count').
comments_mean_count_rating = comments.groupby('reviewer').agg({'rating': ['mean', 'count']})

In [None]:
# Reviewers whose average rating is a failing grade (mean below 5.5)
comments_mean_count_rating.loc[
    comments_mean_count_rating[('rating', 'mean')] < 5.5
].shape

In [None]:
# Reviewers whose average rating is a failing grade (mean below 5.5)
# and who gave more than one rating
comments_mean_count_rating.loc[
    (comments_mean_count_rating[('rating', 'mean')] < 5.5)
    & (comments_mean_count_rating[('rating', 'count')] > 1)
].shape

## Reviewer with most reviews

In [None]:
# Non-null value counts per reviewer for every column; the `name` column's count is
# used as the number of reviews. The first reviewer with the highest count wins.
comments_cnt = comments.groupby('reviewer').count()
top_reviewer = comments_cnt['name'].idxmax()
top_reviewer

In [None]:
# Which ratings does this reviewer give?
(
    comments
    .loc[comments.reviewer == top_reviewer, 'rating']
    .astype(int)
    .value_counts()
    .sort_index()
    .plot.bar(color='b')
);

In [None]:
# In which years did this reviewer write their reviews?
(
    comments
    .loc[comments.reviewer == top_reviewer, 'date']
    .dt.year
    .value_counts()
    .sort_index()
    .plot.bar(color='b')
);

# Reviews over time

In [None]:
# Total number of reviews per year
(
    comments['date']
    .dt.year
    .value_counts()
    .sort_index()
    .plot.bar(color='b')
);

In [None]:
# Number of reviews per calendar month, one line per year.
#
# The counts are pivoted into a month-by-year table and reindexed to months 1..12, so
# every line is drawn against the real month number. Plotting each year's (year, month)
# group directly would place its points by position, which shifts any year that lacks
# some months relative to the month tick labels.
fig, ax = plt.subplots(1, 1, figsize=(16, 6))

reviews_per_month = (
    comments
    .groupby(['year', 'month'])
    .size()
    .unstack('year')
    .reindex(range(1, 13))  # months without reviews become gaps instead of shifting the line
)
reviews_per_month.plot(title='Number of reviews per month', ax=ax)

ax.set_xlabel('Month')
ax.set_xticks(range(1, 13))
ax.legend();

# Average rating over time: rating increases over time!

In [None]:
# Average rating per calendar month, one line per year.
#
# The means are pivoted into a month-by-year table and reindexed to months 1..12, so
# every line is drawn against the real month number. Plotting each year's (year, month)
# group directly would place its points by position, which shifts any year that lacks
# some months relative to the month tick labels.
fig, ax = plt.subplots(1, 1, figsize=(16, 6))

rating_per_month = (
    comments
    .groupby(['year', 'month'])
    .agg({'rating': 'mean'})['rating']
    .unstack('year')
    .reindex(range(1, 13))  # months without reviews become gaps instead of shifting the line
)
rating_per_month.plot(title='Average rating per month', ax=ax)

ax.set_xlabel('Month')
ax.set_xticks(range(1, 13))
ax.legend();

## Average price distribution

In [None]:
# Distribution of the average price; rows where avg_price equals -1 are excluded.
sns.distplot(restaurants.loc[restaurants['avg_price'] != -1, 'avg_price']);