# Downloading the JSON data to S3

To make it easier to deal with the re-loading of data and indexes we decided that it would be a good idea to download the JSON PUF data and put it in S3. That removes repeated hits on the servers and speeds up the reload process when we need to do so.

Unfortunately there isn't a way to stream data from the URL to S3; it has to be downloaded to the local file system first and then it can be uploaded to S3. The new `boto3` Python module makes large file uploading much easier in the past.

In [1]:
import boto3
import botocore
from boto3.s3.transfer import S3Transfer
import csv
import json
import requests
from urlparse import urlparse
import os

In [2]:
# Download to a local file
def download_file(url):
    local_file = str(hash(url)) + '.tmp'
    r = requests.get(url, stream=True)
    with open(local_file, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*64):
            if chunk:
                f.write(chunk)
    return local_file

In [3]:
# Upload to S3 bucket
def xfer_to_S3(file_name, bucket, key):
    client = boto3.client('s3', 'us-west-1')
    transfer = S3Transfer(client)
    transfer.upload_file(file_name, bucket, key)

In [22]:
# process the PUF file, keeping track of which URLs have been processed
# and writing the URL - hash map to a file in JSON format
def process_machine_readable_puf(csv_filename, s3, bucket_name, prefix):
    url_hashmap = {}
    url_map_fname = 'puf-url-json-map.json'
    
    try:
        with open(url_map_fname, 'r') as map_file:
            url_hashmap = json.load(map_file)
            # print url_hashmap
    except IOError:
        pass
    
    try:
        with open(csv_filename, 'r') as urlfile:
            urls = csv.DictReader(urlfile)
            for row in urls:
                _url = row['URL Submitted']
                url_parseresult = urlparse(_url)

                # minimal check to make sure the url begins with scheme:// and is not empty
                if url_parseresult.scheme:
                    process_puf_url(_url, bucket_name, prefix, url_hashmap)

                    # write the updated dictionary to disk
                    with open(url_map_fname, 'w') as map_file:
                        map_file.write(json.dumps(url_hashmap))
    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
        

In [24]:
def process_puf_url(puf_url, bucket_name, prefix, url_map):
    
    print "Processing {0}...".format(puf_url)
    response = requests.get(puf_url)

    links = json.loads(response.content)
    print ("\nProvider URLS:")
    print ("==================================")
    for provider_url in links['provider_urls']:
        if provider_url not in url_map:
            url_map[provider_url] = process_url(provider_url, bucket_name, prefix)
    
    print ("\nFormulary URLS:")
    print ("==================================")    
    for formulary_url in links['formulary_urls']:
        if provider_url not in url_map:
            url_map[formulary_url] = process_url(formulary_url, bucket_name, prefix)
        
    print ("\nPlan URLS:")
    print ("==================================")        
    for plan_url in links['plan_urls']:
        if provider_url not in url_map:
            url_map[plan_url] = process_url(plan_url, bucket_name, prefix)
        

In [25]:
# download to a local file and then transfer to S3
# using the hashed URL as the S3 key
def process_url(_url, bucket_name, prefix):
    print "Processing {0}".format(_url)
    hashed_url = hash(_url)
    f = download_file(_url)
    xfer_to_S3(f, bucket_name, prefix + str(hashed_url))
    # os.remove(f)  
    return hashed_url

In [26]:
s3 = boto3.resource('s3')
process_machine_readable_puf('machine-readable-url-puf.csv', s3, 'w210', 'json/')


Processing https://www.modahealth.com/cms-data-index.json...

Provider URLS:

Formulary URLS:

Plan URLS:
Processing https://fm.formularynavigator.com/jsonFiles/publish/11/47/cms-data-index.json...

Provider URLS:
Processing http://fm.formularynavigator.com/jsonFiles/publish/11/47/providers.json

Formulary URLS:

Plan URLS:
Processing https://www.modahealth.com/cms-data-index.json...

Provider URLS:

Formulary URLS:

Plan URLS:
Processing http://www.bestlife.com/exchange/cms-data-index.json...

Provider URLS:
Processing https://www.bestlife.com/exchange/providers_wNPI.json
Processing https://www.bestlife.com/exchange/providers_woNPI.json

Formulary URLS:

Plan URLS:
Processing http://www.bestlife.com/exchange/cms-data-index.json...

Provider URLS:

Formulary URLS:

Plan URLS:
Processing https://api.humana.com/v1/cms/index.json...

Provider URLS:
Processing https://api.humana.com/v1/cms/providers-0.json
Processing https://api.humana.com/v1/cms/providers-1.json
Processing https://api.hum

KeyboardInterrupt: 

--------------------------------------------------------------------------------------------

In [12]:
from boto3.s3.transfer import S3Transfer

client = boto3.client('s3', 'us-west-1')
transfer = S3Transfer(client)

In [14]:
transfer.upload_file('machine-readable-url-puf.csv', 'w210', 'test2/machine-readable-url-puf.csv')

In [None]:
from urllib2 import urlopen
import ijson
from contextlib import closing

with closing(urlopen('https://www.bestlife.com/exchange/providers_wNPI.json')) as f:
    count = 0
    for obj in ijson.items(f, 'item'):
        print obj['npi']
        count += 1
        if count > 100:
            break

In [None]:
from urllib2 import urlopen
s3.Object('w210', 'json/bestlife/providers.json')\
    .put(Body=urlopen('https://www.bestlife.com/exchange/providers_wNPI.json'))

In [None]:
upload_obj = s3.Object('w210', 'json/bestlife/providers.json')

In [None]:
upload_id = upload_obj.initiate_multipart_upload()

s3 = boto3.resource('s3')
multipart_upload_part = s3.MultipartUploadPart('bucket_name','object_key','multipart_upload_id','part_number')

In [9]:
my_dict = {'https://www.modahealth.com/cms-data/providers-AK.json': hash('https://www.modahealth.com/cms-data/providers-AK.json'),
        'https://www.modahealth.com/cms-data/providers-OR.json': hash('https://www.modahealth.com/cms-data/providers-OR.json'),
        'https://www.modahealth.com/cms-data/drugs-AK.json': hash('https://www.modahealth.com/cms-data/drugs-AK.json'),
        'https://www.modahealth.com/cms-data/drugs-OR.json': hash('https://www.modahealth.com/cms-data/drugs-OR.json'),
        'https://www.modahealth.com/cms-data/plans-AK.json': hash('https://www.modahealth.com/cms-data/plans-AK.json'),
        'https://www.modahealth.com/cms-data/plans-OR.json': hash('https://www.modahealth.com/cms-data/plans-OR.json')}

In [46]:
with open('puf-url-json-map.json', 'w') as map_file:
    map_file.write(json.dumps(my_dict))

In [10]:
my_dict.keys()

['https://www.modahealth.com/cms-data/plans-AK.json',
 'https://www.modahealth.com/cms-data/plans-OR.json',
 'https://www.modahealth.com/cms-data/drugs-OR.json',
 'https://www.modahealth.com/cms-data/providers-OR.json',
 'https://www.modahealth.com/cms-data/providers-AK.json',
 'https://www.modahealth.com/cms-data/drugs-AK.json']

In [13]:
'https://www.modahealth.com/cms-data/providers-AK.json' in my_dict.keys()

True