# Google Vision API

This notebook applies the Google Vision API to Iens restaurant pictures to detect what food is shown in them.

In [None]:
import pandas as pd
import pandas_gbq as gbq 
import json
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', 250) # show up to 250 characters per column value before truncating
# render matplotlib figures inline in the notebook
%matplotlib inline

In [None]:
# project specifics
# Path to the Google credentials JSON file (it contains the 'project_id' read below).
PRIVATE_KEY = '../google-credentials/gsdk-credentials.json'
# Read the project id from the credentials file; `with` closes the handle right away.
with open(PRIVATE_KEY) as key_file:
    PROJECT_ID = json.load(key_file)['project_id']
# Read the API key used for the Vision client. Surrounding whitespace is stripped so
# that a trailing newline in the text file does not become part of the key.
with open('../google-credentials/gc-API-key.txt') as api_key_file:
    APIKEY = api_key_file.read().strip()

In [None]:
# dataset specifics: the city and date together select the BigQuery tables
city = 'dongen'
date = '20180124'
# input table; use iens.iens_comments when querying on the comments table
bq_table = 'iens.iens_{}_{}'.format(city, date)
# output table for the image annotations
bq_table_out = 'iens.iens_images_{}_{}'.format(city, date)

In [None]:
# select the restaurant id and name, plus image_urls, for restaurants that have images
query = "SELECT info.id, info.name, image_urls FROM {} WHERE info.nr_images > 0".format(bq_table)

# Pin the legacy SQL dialect explicitly: the rest of the notebook uses the column name
# `info_id`, which is how legacy SQL names the nested field `info.id` in its result.
# Relying on the library default would break on pandas-gbq versions that default to
# standard SQL.
df = gbq.read_gbq(query, project_id=PROJECT_ID, private_key=PRIVATE_KEY, dialect='legacy')

In [None]:
# (rows, columns) of the frame returned by the query
df.shape

## Calling the vision API 

First practice - just run a single request. See if it works!

Useful documentation: 
* https://developers.google.com/api-client-library/python/start/get_started
* https://github.com/GoogleCloudPlatform/cloud-vision/tree/master/python/text

In [None]:
from googleapiclient.discovery import build
# Build a client for version v1 of the 'vision' API, authenticated with the API key.
service = build('vision', 'v1', developerKey=APIKEY)
# The images resource; its `annotate` method is what the request helpers call.
collection = service.images()

In [None]:
def make_request(url, feature_type='LABEL_DETECTION', max_results=10):
    """Build the request body for annotating a single image by URL.

    Parameters
    ----------
    url : str
        URL of the image, placed in the request's `imageUri` field.
    feature_type : str, optional
        Value for the feature's `type` field. Defaults to 'LABEL_DETECTION'.
    max_results : int, optional
        Value for the feature's `maxResults` field. Defaults to 10.

    Returns
    -------
    dict
        A dict with an `image` entry (the image source) and a `features`
        entry (a one-element list describing the requested feature).
    """
    return {
        'image': {
            'source': {
                'imageUri': url
            }
        },
        'features': [{
            'type': feature_type,
            'maxResults': max_results
        }]
    }

def make_batch_request(url_list):
    """Prepare one annotate call that covers every URL in `url_list`.

    Each URL is converted with `make_request`; the resulting list is passed as
    the `requests` entry of the body given to `collection.annotate`. The value
    returned by `collection.annotate` is returned unchanged (the caller is the
    one that invokes `.execute()` on it).
    """
    single_requests = list(map(make_request, url_list))
    payload = {'requests': single_requests}
    return collection.annotate(body=payload)

def execute_batch_request(url_list, batch_size=16):
    """Annotate all images in `url_list` and return one response per image.

    The Vision API limits how many images a single annotate call may contain,
    so the URLs are sent in chunks of at most `batch_size` and the per-chunk
    responses are concatenated in order.

    Parameters
    ----------
    url_list : iterable of str
        Image URLs to annotate.
    batch_size : int, optional
        Maximum number of images per API call. Defaults to 16.

    Returns
    -------
    list
        The concatenated `responses` entries of all calls. An empty
        `url_list` yields an empty list without calling the API.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    # materialise once so any iterable can be sliced into chunks
    urls = list(url_list)
    responses = []
    for start in range(0, len(urls), batch_size):
        chunk = urls[start:start + batch_size]
        responses.extend(make_batch_request(chunk).execute()['responses'])
    return responses

In [None]:
# example image URLs, keyed by a short name, for trying out a single request
examples = dict(
    burger='https://u.tfstatic.com/restaurant_photos/811/352811/169/612/barasti-killer-burger-b42ea.jpg',
    steak='https://u.tfstatic.com/restaurant_photos/811/352811/169/612/barasti-ribstuk-2c5f9.jpg',
)

In [None]:
from IPython.display import Image
# display the 'burger' example image in the notebook
Image(url=examples['burger']) 

In [None]:
# Annotate the 'burger' example and show the labels of its (single) response as a table.
# `.get` is used because a response has no 'labelAnnotations' key when no labels were
# found or the image could not be processed; an empty table is shown in that case.
pd.DataFrame(execute_batch_request([examples['burger']])[0].get('labelAnnotations', []))

## Set up batch request per restaurant

What we want is a dictionary with the following structure, to upload into Google BigQuery:

* restaurant id = integer
* images = list of dicts:
    * image url = string
    * labelAnnotation = list of dicts:
        - description
        - mid
        - score
        - topicality

In [None]:
# convert to Series for batch request per restaurant:
# index = restaurant id, value = list of that restaurant's image urls
restaurant_image_list = df.groupby(['info_id'])['image_urls'].apply(list)

result = []
# `.items()` is used instead of `.iteritems()`, which was removed in pandas 2.0
for restaurant_id, image_urls in restaurant_image_list.items():
    # do batch request; responses are paired with the urls by position below
    responses = execute_batch_request(image_urls)
    # create images object for one restaurant
    images = [
        {
            'image_url' : image_url,
            # A response has no 'labelAnnotations' key when no labels were found or
            # the image could not be processed. Store an empty list in that case
            # instead of aborting the whole run with a KeyError.
            'label_annotations' : response.get('labelAnnotations', [])
        }
        for image_url, response in zip(image_urls, responses)
    ]
    # add results for one restaurant to list
    result.append({'info_id' : restaurant_id, 'images' : images})

# number of restaurants processed
len(result)

#### Write to jsonlines

BigQuery loads newline-delimited JSON, so save the results as JSON Lines (one JSON object per line).

In [None]:
# Write one JSON object per line. `json.dumps` is required here: formatting the dict
# with '%s' writes its Python repr (single-quoted keys and strings), which is not
# valid JSON. The `with` block closes the file even if writing fails part-way.
with open('../iens_scraper/output/' + bq_table_out + '.jsonlines', 'w') as jsonlines_file:
    for item in result:
        jsonlines_file.write(json.dumps(item) + '\n')

#### upload to BigQuery

It would be nicer to do this directly from Python, for example with `gbq.to_gbq` (which, however, works for DataFrames only).

In [None]:
# Shell out to the `bq` command-line tool to load the jsonlines file written above into
# the table named by `bq_table_out` ({bq_table_out} is interpolated from Python).
# Flags as passed: --autodetect, --replace, and newline-delimited JSON as source format.
!bq load --autodetect --replace --source_format=NEWLINE_DELIMITED_JSON \
        {bq_table_out} ../iens_scraper/output/{bq_table_out}.jsonlines

### Query images

For example, getting the 15 labels most often found by the Vision API.

In [None]:
# Count how often each label description occurs in the output table and keep the 15
# most frequent ones.
query = """
SELECT images.label_annotations.description, COUNT(*) AS count 
FROM {} 
GROUP BY images.label_annotations.description 
ORDER BY count DESC
LIMIT 15;
""".format(bq_table_out)

# The query addresses the nested, repeated `images.label_annotations` field directly,
# which relies on legacy SQL; pin the dialect instead of depending on the library default.
gbq.read_gbq(query, project_id=PROJECT_ID, private_key=PRIVATE_KEY, dialect='legacy')

Or, getting the maximum grilling score for each restaurant:

In [None]:
# label descriptions to look for; alternative set: ('hamburger', 'cheeseburger')
keywords = ('grillades', 'grilling')
# Render the keywords as a quoted, comma-separated SQL list. Formatting the tuple
# directly relies on its Python repr, which breaks for a single keyword because
# ('x',) is not a valid SQL list.
keyword_list = ', '.join("'{}'".format(keyword) for keyword in keywords)
# For every restaurant keep the single row with the highest score among the matching
# labels. The score must be the first ORDER BY key; ordering by image_url first would
# pick the row of the last url instead of the maximum score. image_url only breaks ties.
query = """
SELECT
  info_id, images.image_url, images.label_annotations.score
FROM (
  SELECT 
      *,
      ROW_NUMBER() OVER(PARTITION BY info_id ORDER BY images.label_annotations.score DESC, images.image_url DESC) AS highest_score
  FROM {}
  WHERE images.label_annotations.description IN ({})
)
WHERE highest_score = 1
""".format(bq_table_out, keyword_list)

# The query addresses the nested, repeated `images` field directly, which relies on
# legacy SQL; pin the dialect instead of depending on the library default.
gbq.read_gbq(query, project_id=PROJECT_ID, private_key=PRIVATE_KEY, dialect='legacy')