# Text analysis of restaurant reviews
## Find hamburgers!

In this notebook we analyze the restaurant reviews for a city and a specific date.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline

# hide warnings. `gbq.read_gbq()` gives some
# NOTE: this filter silences every warning for the rest of the notebook,
# not only the `gbq.read_gbq()` ones.
import warnings
warnings.filterwarnings('ignore')

# Make the project-local `load_data` module importable; the path is relative
# to the directory the notebook is run from.
import sys
sys.path.append('../scrape_save_search')
import load_data

In [None]:
# Load the review comments through the project-local `load_data` helper.
# Later cells read the columns `comment`, `name`, `id` and `rating_food`.
df = load_data.load_comments()

# Elasticsearch

To run Elasticsearch as a service (on macOS, using Homebrew):
+ `brew tap homebrew/services`
+ `brew services start elasticsearch`

You can check that it is up and running by examining the logs:
+ `tail -n 15 /usr/local/var/log/elasticsearch.log`


In [None]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node listening on localhost:9200;
# every later cell talks to the cluster through `es`.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [None]:
# Check if it is running
# (ping() is expected to return True when the node answers -- see the client docs)
es.ping()

In [None]:
# Check its health: print the cluster health table, restricted to the
# columns listed below, with a header row (v=True)
health_columns = ['timestamp', 'cluster', 'status', 'node.total']
print(es.cat.health(v=True, h=health_columns))

In [None]:
# Delete old index if it exists
# (ignore=404: a missing index is not treated as an error, so this cell
# can be re-run safely)
es.indices.delete(index='restaurant_comments', ignore=404)

In [None]:
# Create a new index
# NOTE(review): ignore=400 suppresses every HTTP 400 response, presumably to
# tolerate "index already exists"; other bad-request errors would be hidden
# as well -- confirm this is intended.
es.indices.create(index='restaurant_comments', ignore=400)

In [None]:
# Define pre-processing and analyzer

# Character filter, applied to the raw text before tokenizing:
# hyphens become a space and apostrophes are removed.
# The mapping char filter trims whitespace around both sides of `=>` and does
# not interpret quotes, so a space replacement must be written with the
# escape \u0020 (a quoted ' ' would be inserted literally, quotes included).
hyphens_and_apostrophes_strip = {
    "hyphens_and_apostrophes_strip": {
        "type": "mapping",
        "mappings": [
            "- => \\u0020",
            "' => "
        ]
    }
}

# Token filter that drops Elasticsearch's built-in Dutch stop words.
dutch_stop = {
    "dutch_stop": {
        "type": "stop",
        "stopwords": "_dutch_"
    }
}

# Tokenizer that emits character n-grams of length 3 and 4.
ngram_tokenizer = {
    "ngram_tokenizer": {
        "type": "ngram",
        "min_gram": 3,
        "max_gram": 4
    }
}

# Custom analyzer wiring the pieces together: char filter -> n-gram tokenizer
# -> lowercase, Dutch stop words, ASCII folding.
# NOTE(review): the token filters run on the 3-4 character grams, not on whole
# words, so `dutch_stop` only removes grams that happen to equal a stop word
# -- confirm this is intended.
analyzer = {
    "restaurant_comments_analyzer": {
        "type": "custom",
        "char_filter": ["hyphens_and_apostrophes_strip"],
        "tokenizer": "ngram_tokenizer",
        "filter": ["lowercase", "dutch_stop", "asciifolding"]
    }
}

In [None]:
# Apply these on our index.
# The index is closed while the analysis settings are updated. The `finally`
# reopens it even when the update fails, so a rejected settings body does not
# leave the index closed for the cells that follow; the error still propagates.
es.indices.close(index='restaurant_comments')
try:
    es.indices.put_settings(
        index='restaurant_comments',
        body={
            "analysis": {
                "char_filter": hyphens_and_apostrophes_strip,
                "tokenizer": ngram_tokenizer,
                "filter": dutch_stop,
                "analyzer": analyzer
            }
        }
    )
finally:
    es.indices.open(index='restaurant_comments')

In [None]:
# Define what our data looks like: field types for the `restaurant_review`
# documents. Only `comment` goes through the custom analyzer.
review_properties = {
    "comment": {
        "type": "text",
        "analyzer": "restaurant_comments_analyzer",
    },
    "name": {"type": "keyword"},
    "id": {"type": "text"},
    "rating": {"type": "integer"},
}

es.indices.put_mapping(
    index='restaurant_comments',
    doc_type='restaurant_review',
    update_all_types=True,
    body={"properties": review_properties},
)

In [None]:
# Add all comments to our index
from elasticsearch.helpers import parallel_bulk
def generate_actions(df):
    """Yield one bulk ``index`` action per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``comment``, ``name``, ``id`` and
        ``rating_food``.

    Yields
    ------
    dict
        ``{'_op_type': 'index', '_source': {...}}``. The source holds
        ``comment`` and ``name`` unchanged, ``id`` converted with ``str``
        and ``rating`` taken from ``rating_food`` converted with ``int``.
    """
    for _, review in df.iterrows():
        document = dict(
            comment=review['comment'],
            name=review['name'],
            id=str(review['id']),
            rating=int(review['rating_food']),
        )
        yield dict(_op_type='index', _source=document)

In [None]:
# Stream all comments into the index using 4 worker threads
bulk_results = parallel_bulk(
    client=es,
    actions=generate_actions(df),
    index='restaurant_comments',
    doc_type='restaurant_review',
    thread_count=4,
)

# Report every document whose insertion was flagged as unsuccessful
for success, info in bulk_results:
    if not success:
        print('Document insertion failed', info)

In [None]:
# Search for 'burger' in the analyzed `comment` field, asking for up to
# 10000 hits in a single response
burger_query = {
    "size": 10000,
    "query": {
        "match": {"comment": "burger"}
    },
}

res = es.search(
    index='restaurant_comments',
    doc_type='restaurant_review',
    body=burger_query,
)

In [None]:
# Show top results: the first six hits with their relevance score
top_hits = res['hits']['hits'][:6]
for hit in top_hits:
    print('Score: ', hit['_score'])
    pprint(hit['_source'], indent=2)

In [None]:
# Extract info: one row per hit, holding the stored document fields plus the
# hit's relevance score in a `score` column
hits = res['hits']['hits']
result = pd.DataFrame(
    [dict(hit['_source'], score=hit['_score']) for hit in hits])

In [None]:
# Aggregate per restaurant: keep the highest score among each restaurant's
# matching comments, ordered from lowest to highest
best_score_by_id = result.groupby('id')['score'].max()
restaurant_scores = best_score_by_id.sort_values()

In [None]:
# Plot distribution of the best score per restaurant
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases
# (0.11+) -- confirm the installed version still provides it.
ax = sns.distplot(restaurant_scores)
ax.set_xlabel('Restaurant scores (higher is better)', fontsize=13)
ax.set_ylabel('Number of restaurants', fontsize=13)
ax.set_title('Number of restaurants per score', fontsize=14)
# Hide the y ticks; the trailing `;` suppresses the cell's text output
ax.set_yticks([]);

In [None]:
# Select burger restaurants: ids whose best score is above the cut-off of 20
high_scoring = restaurant_scores[restaurant_scores > 20]
burger_restaurants = high_scoring.reset_index()['id']

In [None]:
# Save the selected restaurant ids to a CSV file, without the index column
burger_restaurants.to_csv('elasticsearch_burger_tags.csv', index=False)