## Cloud Vision API - Claims Processing PoC


#### Notes:
- Current accepted image types include: .jpg
- [Files hosted in the bucket currently made public; to add permissioning in future versions](https://cloud.google.com/storage/docs/access-control/making-data-public)
- [More info on bucket/object relationship](https://googlecloudplatform.github.io/google-cloud-python/latest/storage/buckets.html)

In [1]:
#! python3
#! Philip Mohun © 2018
# This notebook is used to generate .csv files to test Google's Vision API for Claims Processing
# https://console.cloud.google.com/storage/browser/anthem-handwriting-recognition-claims-photos

!pip install --upgrade google-api-python-client
APIKEY = '<INSERT API KEY>'

Requirement already up-to-date: google-api-python-client in /usr/local/envs/py3env/lib/python3.5/site-packages (1.7.4)


In [2]:
# import data from storage
import google.datalab.storage as storage
import pandas as pd
import os
from google.datalab import Context

project = Context.default().project_id
bucket = storage.Bucket('anthem-handwriting-recognition-claims-photos/forms/img')


## TODO Iterate through bucket to get all objects (blobs)
def list_blobs(bucket_name):
    """Lists all the blobs in the bucket."""
    bucket = storage.Bucket(bucket_name)
    blobs = bucket.list_blobs()

    for blob in blobs:
        print(blob.name)
        image =blob.name
        images.append(image)

In [3]:
# gs://anthem-handwriting-recognition-claims-photos/forms/img
images = ['ca_medical_claim_form.jpg',
          'ca_medical_claim_form_typed.jpg',
          'ca_medical_claim_form_handwritten.jpg',
          'ny_downstate_medical_claim_form.jpg',
          'ny_downstate_medical_claim_form_handwritten.jpg',
          'ny_downstate_medical_claim_form_typed.jpg',
          'ny_prescription_drug_claim_form.jpg',
          'ny_prescription_drug_claim_form_handwritten.jpg',
          'ny_prescription_drug_claim_form_typed.jpg']

#images = ['coverage_handwritten.jpg',
#         'coverage_typed.jpg',
#         'last_name_handwritten.jpg',
#         'last_name_typed.jpg',
#         'multiple_checks_handwritten.jpg',
#         'multiple_checks_typed.jpg',
#         'subscriber_handwritten.jpg',
#         'subscriber_typed.jpg',
#         'group_number_handwritten.jpg',
#         'group_number_typed.jpg',
#         'health_insurance_handwritten.jpg',
#         'health_insurance_typed.jpg']

uri_list = []
for image in images:
    image = bucket.object(image)
    uri_list.append(image.uri)

In [4]:
# running vision API
def call_api(uri,APIKEY):
  import base64
  from googleapiclient.discovery import build
  vservice = build('vision', 'v1', developerKey=APIKEY)
  request = vservice.images().annotate(body={
          'requests': [{
                  'image': {
                      'source': {
                          'gcs_image_uri': uri
                      }
                  },
                  'features': [{
                      'type': 'DOCUMENT_TEXT_DETECTION',
                      'maxResults': 3,
                  }],
# comment this out if reading typed version
#                  'imageContext': {
#                      'languageHints': ["en-t-i0-handwrit"]
#                  },
              }],
          })
  responses = request.execute(num_retries=3)
  return responses

In [5]:
# Parse json and return a list with the text from each block

def json_to_txt(responses):
  i = 0 # blocks
  j = 0 # paragraphs
  k = 0 # words
  l = 0 # letters
  text = []
  letter_count = len(responses['responses'][0]['fullTextAnnotation']['pages'][0]['blocks'][i]['paragraphs'][j]['words'][k]['symbols'])
  block_count = len(responses['responses'][0]['fullTextAnnotation']['pages'][0]['blocks'])
  for i in range(0,block_count):
    paragraph_count = len(responses['responses'][0]['fullTextAnnotation']['pages'][0]['blocks'][i]['paragraphs'])
    for j in range(0,paragraph_count):
      word_count = len(responses['responses'][0]['fullTextAnnotation']['pages'][0]['blocks'][i]['paragraphs'][j]['words'])
      words = ""
      for k in range(0,word_count):
        letter_count = len(responses['responses'][0]['fullTextAnnotation']['pages'][0]['blocks'][i]['paragraphs'][j]['words'][k]['symbols'])
        for l in range(0,letter_count):
          letter = responses['responses'][0]['fullTextAnnotation']['pages'][0]['blocks'][i]['paragraphs'][j]['words'][k]['symbols'][l]['text']
          if l == letter_count - 1:
            words += letter + " " # add a space if it's the end of a word
          else: 
            words += letter
          l += 1
        k += 1
      j += 1
    i += 1
    text.append(words) # each row is a block
  return text

In [20]:
# create folder to save csv

cwd = '/content/datalab/'
# create results folder
if os.path.exists(cwd + '/results') == False: 
    cwd = os.makedirs(cwd + '/results'); 
    os.chdir(cwd)
else:
  cwd += 'results'
  os.chdir(cwd)

In [22]:
# run notebook

for uri in uri_list:
    responses = call_api(uri,APIKEY)
    text = json_to_txt(responses)
    output = pd.Series(text)
    outname = uri.split('/')
    output.to_csv('{}.csv'.format((outname[len(outname)-1].replace('.jpg',''))))
    print('Transcribing {} to .csv'.format(uri[len(uri)-1])) 
    print('Done-- get well soon!')

Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
Transcribing g to .csv
Done-- get well soon!
