# Downloading the JSON data to S3

To make it easier to deal with the re-loading of data and indexes we decided that it would be a good idea to download the JSON PUF data and put it in S3. That removes repeated hits on the servers and speeds up the reload process when we need to do so.

Unfortunately there isn't a way to stream data from the URL to S3; it has to be downloaded to the local file system first and then it can be uploaded to S3. The new `boto3` Python module makes large file uploading much easier in the past.

In [34]:
import boto3
import botocore
from boto3.s3.transfer import S3Transfer
import csv
import json
import requests
from requests.exceptions import SSLError
from urlparse import urlparse
import os
import hashlib

In [35]:
# Download to a local file
def download_file(url):
    h = hashlib.md5(item['url']).hexdigest()
    local_file = h + '.tmp'
    r = requests.get(url, stream=True)
    with open(local_file, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024*64):
            if chunk:
                f.write(chunk)
    return local_file

In [30]:
# Upload to S3 bucket
def xfer_to_S3(file_name, bucket, key):
    client = boto3.client('s3', 'us-west-1')
    transfer = S3Transfer(client)
    transfer.upload_file(file_name, bucket, key)

In [36]:
# download to a local file and then transfer to S3
# using the hashed URL as the S3 key
def process_url(_url, bucket_name, prefix):
    print "Processing {0}".format(_url)
    hashed_url = hashlib.md5(item['url']).hexdigest()
    f = download_file(_url)
    xfer_to_S3(f, bucket_name, prefix + str(hashed_url))
    # os.remove(f)  
    return hashed_url

### Get All URLs from the Machine Readable PUF

This code takes the Machine Readable PUF CSV file and walks it to get every URL pointed to by the PUF file and place it in a list.

In [2]:
csv_filename = 'machine-readable-url-puf.csv'

url_list = []
with open(csv_filename, 'r') as urlfile:
    urls = csv.DictReader(urlfile)
    for row in urls:
        _url = row['URL Submitted']
        
        if urlparse(_url).scheme:
            url_list.append(_url)

### Get the JSON file URLs

Now walk the list of URLs and for each of the Plan, Provider and Formulary sections, get every URL indicated in those sections. Place the retrieved JSON urls in the corresponding list. Errors are tracked in a separate dictionary. Each is written to separate files: provider, plans, formulary and errors.

In [None]:
provider_urls = []
plan_urls = []
formulary_urls = []
errors = []

for _url in url_list:
    try:
        response = requests.get(_url)
        links = json.loads(response.content)
        if 'provider_urls' in links:
            for provider_url in links['provider_urls']:
                provider_urls.append({'url': provider_url, 
                                      'status': 'NEW', 
                                      'parent_url': _url})
        if 'formulary_urls' in links:
            for formulary_url in links['formulary_urls']:
                formulary_urls.append({'url': formulary_url, 
                                       'status': 'NEW', 
                                       'parent_url': _url})
        if 'plan_urls' in links:
            for plan_url in links['plan_urls']:
                plan_urls.append({'url': plan_url, 
                                  'status': 'NEW', 
                                  'parent_url': _url})

    except ValueError as ve:
        print "JSON load failed with this url:"
        print _url
        errors.append({'url': _url, 'error': 'JSON load failed', 'message': str(ve)})
    except SSLError as se:
        print "SSL Error attempting to negotiate:"
        print _url
        errors.append({'url': _url, 'error': 'SSL Error', 'message': str(se)})
    except Exception as inst:
        print(type(inst))    # the exception instance
        print(inst.args)     # arguments stored in .args
        print(inst)          # __str__ allows args to be printed directly,
        print links
        errors.append({'url': _url, 'error': str(type(inst)), 'message': str(inst)})
        
provider_urlfile = 'provider-urls.txt'
plan_urlfile = 'plan-urls.txt'
formulary_urlfile = 'formulary-urls.txt'
error_file = 'error-urls.txt'

In [16]:
with open(provider_urlfile, 'w') as prvfile:
    for _url in provider_urls:
        prvfile.write("{0}\n".format(json.dumps(_url)))
        
with open(plan_urlfile, 'w') as planfile:
    for _url in plan_urls:
        planfile.write("{0}\n".format(json.dumps(_url)))
        
with open(formulary_urlfile, 'w') as formfile:
    for _url in formulary_urls:
        formfile.write("{0}\n".format(json.dumps(_url)))
        
with open(error_file, 'w') as errorfile:
    for err in errors:
        errorfile.write("{0}\n".format(json.dumps(err)))

### Get the JSON file content for all URLs

Get all the data pointed to by the provider, plan and formulary urls and store each in S3.

In [38]:
s3 = boto3.resource('s3')
bucket = s3.Bucket('w210')
check_map = {}

# reconstitute the dictionary from the file on disk
def load_urls(urlfile):
    urls = []
    with open(urlfile, 'r') as infile:
        for line in infile.readlines():
            urls.append(json.loads(line.strip()))
    return urls
            
for fname in ['provider-urls.txt','plan-urls.txt','formulary-urls.txt']:
    urls = load_urls(fname)
    for _url in urls:
        if _url['status'] == 'NEW':
            try:
                _url['s3key'] = process_url(_url['url'], 'w210', 'json/')
                _url['status'] = 'PROCESSED'
            except Exception as ex:
                _url['status'] = 'ERROR'
                print ex
    with open(fname, 'w') as outfile:
        for _url in urls:
            outfile.write("{0}\n".format(json.dumps(_url)))

Processing https://www.modahealth.com/cms-data/providers-AK.json


KeyboardInterrupt: 

--------------------------------------------------------------------------------------------

In [77]:
count = 0
for item in check_map:
    print check_map[item]['key']
    break
    if not item['hash']:
        count += 1
print "{0}/{1}".format(count, len(check_map))

json1985609760036727740
0/4193


In [66]:
hashes = []
for ppf_url in ppf_urls:
    hashes.append(str(hash(ppf_url)))

In [67]:
len(hashes)

71418

In [62]:
import boto3
import re
s3_hashes = []
s3 = boto3.resource('s3')
for bucket in s3.buckets.all():
    for obj in bucket.objects.filter(Prefix='json/'):
        s3_hashes.append(re.split('/',obj.key)[1])

In [63]:
len(s3_hashes)

2938

In [69]:
count = 0
for s3_hash in s3_hashes:
    if s3_hash not in hashes:
        print s3_hash
        
print count

-5261335633318247304
0


In [44]:
for i in range(10):
    print s3_hashes[i]

-1001508241897246742
-100756317028515332
-1012468138642023148
-1015077227650838858
-1018534659563486333
-1027155078582571361
-1058517716119348089
-1069132213514599781
-1087410947984936387
-1092631167391773407


In [65]:
for i in range(10):
    print hashes[i]

1985609760036727740
59549040364971953
9039898652487861151
-7179808801752382390
-3505892085904121982
4833110020626385507
-902456558468115489
-7107920627704830342
4684633488033199749
1985609760036727740
