In [1]:
from google.cloud import bigquery
from google.cloud import storage
import os, csv, xmltodict, json
import gzip, re
from datetime import datetime

In [2]:
# Point the Google client libraries at a service-account key file.
# The path is relative, so it is resolved against the kernel's working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../creds.json"

In [3]:
# BigQuery project id passed to the client
PROJECT = 'eng-reactor-287421'
# dataset that holds both tables named below
dataset = 'reference_data'
# table the instruments are loaded into
data_table = 'ice_nested'
# table the schema mapping (leaf nodes, repeated nodes, ...) is read from
map_table = 'ice_bq_map'

### Get schema and modification details from BigQuery

In [4]:
# BigQuery client bound to PROJECT; used below to read the mapping table.
bqclient = bigquery.Client(project=PROJECT,)

In [5]:
# Get the mapping schema from BigQuery: select every column of <dataset>.<map_table>.
# No ORDER BY / LIMIT is applied, so the row order is whatever BigQuery returns.
map_query = """
SELECT * FROM {}.{}

           """.format(dataset,map_table)

In [6]:
# Run the mapping query and wait for its result rows.
bq_map_result = bqclient.query(map_query).result()

In [7]:
# Materialise the row iterator into a list so that rows can be indexed.
bq_map_result = list(bq_map_result)

In [8]:
# Unpack the first mapping row positionally into its six fields.
# This depends on the column order returned by `SELECT *` and raises IndexError
# when the mapping table returned no rows.
ice_schema,map_version,leaf_nodes,repeated_record_nodes,address_key_list,bad_address_list = bq_map_result[0]

In [9]:
# reduce every leaf-node record to a [path, type, mode] list
leaf_nodes = [
    [node['path'], node['type'], node['mode']]
    for node in leaf_nodes
]

In [10]:
# reduce every address/key record to a [path, key] list
address_key_list = [
    [record['path'], record['key']]
    for record in address_key_list
]

In [11]:
# paths (first item) of the leaf nodes whose mode (third item) is REPEATED
repeated_leafs = [
    leaf[0]
    for leaf in leaf_nodes
    if leaf[2] == 'REPEATED'
]

In [12]:
# every repeated path: repeated leaves first, then repeated record nodes
all_repeated_nodes = [*repeated_leafs, *repeated_record_nodes]

### Modification functions

In [13]:
def repeated_enforcement(address_list,xml_dict):
    """Wrap the value of every listed node in a list unless it already is one.

    Args:
        address_list: iterable of '/'-separated node addresses. The segment
            before the first '/' is discarded, so '/a/b' addresses xml_dict['a']['b'].
        xml_dict: one ICE instrument as a (nested) python dict; modified in place.

    Returns:
        None. Addresses that cannot be fully resolved in xml_dict are skipped.
    """
    for raw_address in address_list:
        address = raw_address.split('/')[1:]
        if not address:
            # no node name after the leading '/', nothing to look up
            continue

        # Walk down to the parent of the target node. As soon as a level is not
        # a dict (missing node, text, None, list) the address does not apply to
        # this instrument, so stop and skip it. Previously a missing
        # intermediate node fell through to the leaf check on the partially
        # walked level and could wrap an unrelated same-named key.
        parent = xml_dict
        for node in address[:-1]:
            if not isinstance(parent, dict):
                break
            parent = parent.get(node)

        leaf = address[-1]
        if not isinstance(parent, dict) or leaf not in parent:
            continue

        if not isinstance(parent[leaf], list):
            parent[leaf] = [parent[leaf]]


In [14]:
def record_key_flatten(address_key_list,xml_dict):
    """Replace listed dict-valued nodes by the value stored under one of their keys.

    Args:
        address_key_list: iterable of [address, key] entries. `address` is a
            '/'-separated node address (the segment before the first '/' is
            discarded); `key` names the child whose value replaces the node.
        xml_dict: one ICE instrument as a (nested) python dict; modified in place.

    Returns:
        None. Addresses that cannot be fully resolved, and nodes whose value is
        not a dict, are left untouched.

    Raises:
        KeyError: if the addressed node is a dict that does not contain `key`.
    """
    for entry in address_key_list:
        address = entry[0].split('/')[1:]
        key = entry[1]
        if not address:
            # no node name after the leading '/', nothing to look up
            continue

        # Walk down to the parent of the target node; stop as soon as a level
        # is not a dict. Previously a missing intermediate node fell through to
        # the leaf check on the partially walked level and could flatten an
        # unrelated same-named key.
        parent = xml_dict
        for node in address[:-1]:
            if not isinstance(parent, dict):
                break
            parent = parent.get(node)

        leaf = address[-1]
        if not isinstance(parent, dict) or leaf not in parent:
            continue

        if isinstance(parent[leaf], dict):
            parent[leaf] = parent[leaf][key]


In [15]:
def derepeat_bad_fields(bad_address_list,xml_dict):
    """Collapse listed nodes that arrive as a list down to their first entry.

    This is a one-off for a poorly handled field in ICE data: it is usually not
    repeated, and when it is, only the first entry of the list is kept.

    Args:
        bad_address_list: iterable of '/'-separated node addresses (the segment
            before the first '/' is discarded).
        xml_dict: one ICE instrument as a (nested) python dict; modified in place.

    Returns:
        None. Addresses that cannot be fully resolved, and nodes whose value is
        not a list, are left untouched.
    """
    for raw_address in bad_address_list:
        address = raw_address.split('/')[1:]
        if not address:
            # no node name after the leading '/', nothing to look up
            continue

        # Walk down to the parent of the target node; stop as soon as a level
        # is not a dict. Previously a missing intermediate node fell through to
        # the leaf check on the partially walked level and could collapse an
        # unrelated same-named key.
        parent = xml_dict
        for node in address[:-1]:
            if not isinstance(parent, dict):
                break
            parent = parent.get(node)

        leaf = address[-1]
        if not isinstance(parent, dict) or leaf not in parent:
            continue

        if isinstance(parent[leaf], list):
            parent[leaf] = parent[leaf][0]
        

### Loading handlers

In [16]:
def get_timestamp_from_file_name(file_name):
    """Extract the timestamp embedded in an ICE file name.

    The name must contain a `YYYYMMDDTHHMM-NN` token, e.g.
    'gsm_init_muni_APFICC_GSMF10I.1.1_1.20201222T1700-05.xml.gz'.

    Args:
        file_name: file name to search.

    Returns:
        The timestamp formatted as 'YYYY-MM-DDTHH:MM:SSZ'.

    Raises:
        ValueError: if no timestamp token is found, or the token is not a valid date.
    """
    match = re.search('([0-9]{4}[0-9]{2}[0-9]{2}T[0-9]{4}-[0-9]{2})', file_name)
    if match is None:
        # fail with a clear message instead of "'NoneType' object is not subscriptable"
        raise ValueError('no YYYYMMDDTHHMM-NN timestamp found in file name: %r' % (file_name,))

    # NOTE(review): the two digits after '-' are parsed as seconds (%S). If they
    # are actually a UTC offset (e.g. -05), the returned value is not true UTC
    # despite the trailing 'Z' -- confirm against the ICE file naming spec.
    compact = match[0].replace('-','')
    parsed = datetime.strptime(compact, '%Y%m%dT%H%M%S')
    return datetime.strftime(parsed, '%Y-%m-%dT%H:%M:%SZ')

In [17]:
def get_xmls_list(gz_file_name):
    """Download a gzipped ICE XML file and split its payload into instrument fragments.

    The blob is fetched from the 'ice_init_xml' bucket into
    '/tmp/temp_gz_file.xml.gz', decompressed as text, and the text following the
    first '<payload>' tag is split on '</instrument>'.

    Args:
        gz_file_name: blob name inside the bucket, e.g.
            'gsm_update_muni_APFICC_GSMF10I.35.1_1.20201221T0800-05.xml.gz'.

    Returns:
        list of str. Every element but the last is one instrument with its
        closing '</instrument>' tag removed; the last element is whatever
        follows the final '</instrument>'.

    Raises:
        FileNotFoundError: if the blob does not exist in the bucket.
        ValueError: if the file contains no '<payload>' tag.
    """
    local_path = '/tmp/temp_gz_file.xml.gz'

    storage_client = storage.Client()
    # another cell of this notebook reads from the 'ref_data_1' bucket instead
    bucket = storage_client.bucket('ice_init_xml')
    blob = bucket.get_blob(gz_file_name)
    if blob is None:
        # get_blob returns None for a missing object
        raise FileNotFoundError('blob %r not found in bucket ice_init_xml' % (gz_file_name,))
    blob.download_to_filename(local_path)

    # context manager so the gzip handle is always closed
    with gzip.open(local_path, 'rt') as f:
        file_str = f.read()

    payload_parts = file_str.split('<payload>')
    if len(payload_parts) < 2:
        raise ValueError('no <payload> tag found in %r' % (gz_file_name,))
    return payload_parts[1].split('</instrument>')

In [18]:
def load_nested_table_uri_json(table_id,uri):
    """Load a newline-delimited JSON file from Cloud Storage into a BigQuery table.

    The destination table's current schema is fetched and passed to the load
    job, so the file must conform to that schema.

    Args:
        table_id: destination table id, e.g. 'project.dataset.table'.
        uri: 'gs://...' URI of the newline-delimited JSON file.

    Returns:
        The finished load job.

    Raises:
        Whatever the BigQuery client raises when the table lookup or the load
        job fails.
    """
    # One client for both calls. The original created this client but then used
    # two different module-level clients, one of which is only defined in a
    # later cell.
    client = bigquery.Client()
    ice_nested_table = client.get_table(table_id)

    job_config = bigquery.LoadJobConfig(
        schema=ice_nested_table.schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    )

    load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)
    # wait for completion so that a failed load raises here instead of being dropped
    load_job.result()
    return load_job

In [None]:
def get_nested_ndjson_as_uri(file_name,xmls,timestamp):
    """Convert instrument XML fragments to NDJSON, upload it, and return its gs:// URI.

    Each fragment gets its '</instrument>' tag re-appended, is parsed with
    xmltodict, stamped with `timestamp` under 'ice_file_date', and conformed
    with derepeat_bad_fields, record_key_flatten and repeated_enforcement (in
    that order) using the notebook-level mapping lists. The result is written
    to '/tmp/<file_name>nest.json' and uploaded to the 'ice_ndjsons' bucket.

    Args:
        file_name: source file name; used as prefix of the NDJSON file name.
        xmls: list of instrument fragments; the last element is skipped.
        timestamp: value stored in every row under 'ice_file_date'.

    Returns:
        'gs://ice_ndjsons/<file_name>nest.json'
    """
    rows = []
    # the last element of xmls is the text after the final '</instrument>', not an instrument
    for xml_fragment in xmls[:-1]:
        xml_str = xml_fragment+'</instrument>'
        instrument_dict = xmltodict.parse(xml_str,attr_prefix='',cdata_key='text')

        # add timestamp to dict, constant for the whole file
        instrument_dict['ice_file_date'] = timestamp

        # conform fields with bq schema
        derepeat_bad_fields(bad_address_list,instrument_dict)
        record_key_flatten(address_key_list,instrument_dict)
        repeated_enforcement(all_repeated_nodes,instrument_dict)

        rows.append(json.dumps(instrument_dict)+'\n')

    # join once instead of growing a string inside the loop
    ndjson = ''.join(rows)

    json_file_name = '%snest.json' % (file_name)
    local_path = '/tmp/%s' % (json_file_name)
    with open(local_path, 'wb') as f:
        f.write(ndjson.encode())

    # upload only after the file is closed, so all bytes are on disk
    storage_client = storage.Client()
    bucket = storage_client.bucket('ice_ndjsons')
    blob = bucket.blob(json_file_name)
    blob.upload_from_filename(local_path)

    return 'gs://ice_ndjsons/' + json_file_name


In [None]:
# End-to-end run for one hard-coded file: download and split the gz file,
# convert it to NDJSON in Cloud Storage, then load that NDJSON into the nested table.
#file_name = event['name']
file_name = 'gsm_init_muni_APFICC_GSMF10I.1.1_1.20201222T1700-05.xml.gz'
xmls = get_xmls_list(file_name)
uri = get_nested_ndjson_as_uri(file_name,xmls,get_timestamp_from_file_name(file_name))  
# the table id literal equals PROJECT.dataset.data_table as defined at the top of the notebook
load_nested_table_uri_json('eng-reactor-287421.reference_data.ice_nested',uri)

In [None]:
#import os
#gcloud functions deploy load_ice_data_from_gz_file \
#--runtime python37 \
#--trigger-resource ref_data_1 \
#--trigger-event google.storage.object.finalize
import xmltodict
import json
import time
import gzip
import shutil
import collections
import re

from datetime import datetime
from google.cloud import bigquery, storage
from datetime import datetime,timedelta
from google.api_core.exceptions import BadRequest

#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../creds.json"

# Module-level BigQuery client using the default project of the active credentials.
bq_client = bigquery.Client()
    





 
    

def main(event, context):
    """Entry point: load the file named in the trigger event into the 'ice' table.

    Args:
        event: mapping whose 'name' entry is the name of the gz file to load.
        context: unused.

    NOTE(review): get_ndjson_as_uri and load_table_uri_json are not defined in
    the visible part of this notebook (only the *_nested_* variants are) --
    confirm they exist where this function is deployed.
    """
    gz_name = event['name']
    instrument_fragments = get_xmls_list(gz_name)
    file_timestamp = get_timestamp_from_file_name(gz_name)
    ndjson_uri = get_ndjson_as_uri(gz_name, instrument_fragments, file_timestamp)
    load_table_uri_json('eng-reactor-287421.reference_data.ice', ndjson_uri)

### loading from json rows in batches

### BigQuery API

In [16]:
def add_to_bigquery(rows_to_insert, table_id):
    """Insert JSON-compatible rows into a BigQuery table with a fresh client.

    Args:
        rows_to_insert: list of row dicts.
        table_id: destination table id.

    Returns:
        The value returned by Client.insert_rows_json; the caller in this
        notebook treats an empty result as success.
    """
    # a new client is created on every call
    return bigquery.Client().insert_rows_json(table_id, rows_to_insert)

In [17]:
# fully qualified id of the destination table: <project>.<dataset>.<table>
table_id = '.'.join((PROJECT, dataset, data_table))

### test loading script

In [22]:
# Test load: convert instruments to row dicts and insert them batch by batch.
BATCH_SIZE = 100
# `range()` without an argument raises TypeError. One batch reproduces the
# recorded output of this cell (a single "100"). Raise this to load more, but
# note that the last element of xml_list is the text after the final
# '</instrument>' and is not an instrument.
N_BATCHES = 1

for hundreds in range(N_BATCHES):
    test_a_json = []
    for xml_instrument in xml_list[hundreds*BATCH_SIZE:(hundreds+1)*BATCH_SIZE]:
        xml_str = xml_instrument+'</instrument>'
        instrument_dict = xmltodict.parse(xml_str,attr_prefix='',cdata_key='text')

        # add timestamp to dict, constant for now
        instrument_dict['ice_file_date'] = '2020-12-22 17:00:05 UTC'

        # conform fields with bq schema
        derepeat_bad_fields(bad_address_list,instrument_dict)
        record_key_flatten(address_key_list,instrument_dict)
        repeated_enforcement(all_repeated_nodes,instrument_dict)

        # round-trip through json so the row holds only plain JSON types
        test_a_json.append(json.loads(json.dumps(instrument_dict)))

    error = add_to_bigquery(test_a_json,table_id)
    if len(error)>0:
        print(error)
    else:
        # progress: number of instruments sent so far
        print((hundreds+1)*BATCH_SIZE)


100


### get xml as list

In [18]:
# Local directory and name of the (already decompressed) XML file to read.
#save_path = 'C:/Users/vcrom/Desktop/ficc/Files/'
save_path = '/tmp/'
file_name = 'gsm_init_muni_APFICC_GSMF10I.1.1_1.20201222T1700-05.xml'

In [19]:
# read the whole XML file into memory
with open(save_path+file_name,'r') as f:
    file_str = f.read()

# keep the text that follows the first '<payload>' tag, then cut it at every
# '</instrument>'; the final element is whatever follows the last closing tag
clean_file = file_str.split('<payload>')[1]
xml_list = clean_file.split('</instrument>')

In [20]:
# number of fragments; includes the trailing fragment after the last '</instrument>'
len(xml_list)

11547

### Download XML file

In [None]:
from google.cloud import storage
# Download the XML file from the 'ref_data_1' bucket to the local save path.
# The cell that reads save_path+file_name from disk needs this file to exist first.
client = storage.Client()
bucket = client.get_bucket('ref_data_1')

#save_path = 'C:/Users/vcrom/Desktop/ficc/Files/'
save_path = '/tmp/'
file_name = 'gsm_init_muni_APFICC_GSMF10I.1.1_1.20201222T1700-05.xml'

# handle to the object; nothing is transferred until the download call below
blob = bucket.blob(file_name)

blob.download_to_filename(save_path+file_name)


