In [None]:
from collections import defaultdict
import pandas as pd
import io
import os

from google.cloud import bigquery
from google.cloud import storage
from lxml import etree

In [None]:
# Point the Google client libraries at a service-account key file, but only when the
# environment does not already provide GOOGLE_APPLICATION_CREDENTIALS. This keeps the
# notebook runnable on machines where the key lives somewhere other than this local path.
# Alternative key file previously used here: '/Users/user/ficc/ficc/mitas_creds.json'
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '/Users/user/base/ficc/creds.json')

# Clients are created after the credentials variable is in place.
bq_client = bigquery.Client()
storage_client = storage.Client()

In [None]:
# Tags shaped like <tag><element1></element1><element2></element2></tag>.
# process_instrument merges the children of these tags straight into the instrument's own dict.
TAGS_WITH_FLAT_ELEMENTS = [
    'creditEnhancement',
    'clearingSchedule',
    'derivedData',
    'dealInfo',
    'exdividendRecordDetails',
    'holidaySchedule',
    'instrumentRelation',
    'organization',
    'principalDetails',
    'regulationDetails',
    'tradingAndSellingRestrictionDetails',
]

# Tags shaped like <tag><element1></element1><element2></element2><element1></element1>...</tag>.
# process_instrument stores these as a list with one dict per occurrence of the tag.
TAGS_WITH_REPEATED_ELEMENTS = [
    'accretionDetails',
    'callSchedule',
    'cashFlowDetails',
    'defaultDetails',
    'entityDetails',
    'interestSchedule',
    'organizationIndustryClassification',
    'putSchedule',
    'remarketingDetails',
    'sedolDetails',
    'sinkSchedule',
]

# Tags shaped like <tag><element1></element1>...</tag><tag><element1></element1>...</tag>,
# i.e. the tag itself occurs several times under one instrument.
TAG_WITH_MULTIPLE_REPEATED_ELEMENTS = [
    'amountDetails',
]



In [None]:
def process_element(child_element):
    """Collect the direct children of `child_element` into a flat dict.

    Args:
        child_element: an XML element; iterating over it yields its direct children.

    Returns:
        dict mapping each child's tag (with any leading `{namespace}` part removed)
        to that child's `.text`.

    Notes:
        - Only direct children are read; grandchildren are not descended into.
        - If the same subtag occurs more than once, the last occurrence wins.
        - Comment / processing-instruction nodes are skipped.
    """
    child_details = {}
    for subchild in child_element:
        # Comments and processing instructions expose a callable `.tag` rather than a
        # string, so `.split` would raise AttributeError on them; they carry no field data.
        if not isinstance(subchild.tag, str):
            continue
        subtag = subchild.tag.split('}')[-1]    # '{namespace}name' -> 'name'
        child_details[subtag] = subchild.text
    return child_details


def process_instrument(element):
    """Convert one instrument element into a flat dict suitable for a DataFrame row.

    Each direct child of `element` is handled according to its tag (namespace removed):
      - tag in TAGS_WITH_REPEATED_ELEMENTS or TAG_WITH_MULTIPLE_REPEATED_ELEMENTS:
        the child is converted with process_element and appended to a list kept under that tag;
      - tag in TAGS_WITH_FLAT_ELEMENTS: the child's own children are merged directly into the
        result (a later key with the same name overwrites an earlier one);
      - anything else: the child's `.text` is stored under the tag.

    Args:
        element: an XML element; iterating over it yields its direct children.

    Returns:
        dict of field name -> value. Every repeated tag is always present as a key, holding
        a (possibly empty) list of dicts.
    """
    # Built once per call instead of re-concatenating the lists for every child.
    repeated_tags = TAGS_WITH_REPEATED_ELEMENTS + TAG_WITH_MULTIPLE_REPEATED_ELEMENTS
    repeated_tag_set = frozenset(repeated_tags)
    flat_tag_set = frozenset(TAGS_WITH_FLAT_ELEMENTS)

    instrument_data = {}
    repeated_elements = defaultdict(list)    # tag -> list with one dict per occurrence of that tag

    for child in element:
        # Comments and processing instructions expose a callable `.tag` rather than a
        # string, so `.split` would raise AttributeError on them; they carry no field data.
        if not isinstance(child.tag, str):
            continue
        tag = child.tag.split('}')[-1]    # '{namespace}name' -> 'name'
        if tag in repeated_tag_set:
            repeated_elements[tag].append(process_element(child))
        elif tag in flat_tag_set:
            instrument_data.update(process_element(child))
        else:
            instrument_data[tag] = child.text

    # Assign every repeated tag, including those absent from this instrument (the defaultdict
    # yields an empty list), so each instrument dict carries the same list-valued keys.
    for tag in repeated_tags:
        instrument_data[tag] = repeated_elements[tag]

    return instrument_data

In [None]:
# Name of the Cloud Storage bucket holding the reference-data XML (passed to storage_client.bucket).
BUCKET_NAME = 'sp_ref_data'
# Object path of the XML file inside BUCKET_NAME (passed to bucket.blob).
FILE_NAME = '2024/07/02/Muni_Standard-202407010.xml'

In [None]:
%%time
# Build handles for the bucket and the object, then download the whole object into memory.
# The parsing cell wraps `xml_bytes` in io.BytesIO, so nothing is written to local disk.
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(FILE_NAME)
xml_bytes = blob.download_as_bytes()

In [None]:
%%time
# Stream-parse the downloaded XML. With events=('end',) and the `tag` filter, the loop body runs
# once for each Instrument element (in the given namespace) after it has been read completely.
context = etree.iterparse(io.BytesIO(xml_bytes), events=('end',), tag='{http://dictionary.markit.com/ddgen/firef_xsd_consumer_1}Instrument')
data = []    # one dict per Instrument, in document order
for event, elem in context:
    # Extract the fields first: the element is emptied immediately afterwards.
    data.append(process_instrument(elem))
    elem.clear()    # clear the contents of the current element to free memory
    while elem.getprevious() is not None:    # delete previous siblings of the current element from its parent to free up more memory
        del elem.getparent()[0]
# Build the frame in one go from the accumulated dicts; a key missing from a given dict becomes
# a missing value in that row.
df = pd.DataFrame(data)

# Testing

What does the `cusip` column look like? Here is the value for the first instrument.

In [None]:
# `.iloc[0]` is positional: this shows the value from the first parsed instrument only.
df['cusip'].iloc[0]

How many line items were in this data? Specifically, how many CUSIPs do we have reference data for?

In [None]:
# One row was appended per parsed Instrument element, so this is the instrument count.
len(df)

What columns of reference data do we have?

In [None]:
# Number of distinct keys produced by process_instrument across all instruments.
len(df.columns)

In [None]:
# Convert to a plain list so the names are printed as a list rather than as a pandas Index repr.
print(list(df.columns))

What does the S&P ID look like? 

In [None]:
# `.iloc[0]` is positional: this shows the value from the first parsed instrument only.
df['id'].iloc[0]

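The cell below checks, for each column in `TAGS_WITH_REPEATED_ELEMENTS`, whether it is truly nested (at least one instrument has a non-empty list whose length is not 1) or can be flattened (every non-empty list has exactly one entry).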

In [None]:
def check_list_of_dicts_summary(df, column_name):
    """Print whether a list-valued column could be flattened to a single entry per row.

    Only rows whose value is a non-empty list are considered. The column is reported as
    "Flattenable" when every such list has exactly one entry, and as "Nested" otherwise.
    An example value is printed along with the status: the first nested list if there is
    one, otherwise the first non-empty list.

    Args:
        df: DataFrame containing `column_name`.
        column_name: name of a column whose cells are expected to hold lists.

    Returns:
        None; the summary is written to stdout.
    """
    is_non_empty_list = df[column_name].apply(lambda x: isinstance(x, list) and len(x) > 0)
    # astype(bool) keeps the mask boolean even when the frame has no rows.
    non_empty_values = df.loc[is_non_empty_list.astype(bool), column_name]

    if non_empty_values.empty:
        print(f"No non-empty rows found in column: {column_name}")
        return

    # Vectorized length check instead of iterating row by row.
    nested_values = non_empty_values[non_empty_values.apply(len) != 1]
    status = "Flattenable" if nested_values.empty else "Nested"

    print(f"Column: {column_name}, Status: {status}")
    if not nested_values.empty:
        print("Example nested value:")
        # Print the example itself so the header above is not left dangling.
        print(nested_values.iloc[0])
    else:
        print("First non-null value:")
        print(non_empty_values.iloc[0])

print("Nested Fields")
# Summarize each column in TAGS_WITH_REPEATED_ELEMENTS; the tag listed in
# TAG_WITH_MULTIPLE_REPEATED_ELEMENTS is not part of this loop.
for tag in TAGS_WITH_REPEATED_ELEMENTS:
    check_list_of_dicts_summary(df, tag)


What does the `amountDetails` column look like (i.e. the one member of TAG_WITH_MULTIPLE_REPEATED_ELEMENTS)?

In [None]:
# Keep only the instruments whose amountDetails list has at least one entry, then preview them.
has_amount_details = df['amountDetails'].map(lambda details: len(details) > 0)
non_empty_amount_details = df.loc[has_amount_details]
non_empty_amount_details['amountDetails'].head()

It is indeed of the form `<tag><element1></element1>...</tag><tag><element1></element1>...</tag>`, so we need to handle it specially.

In [None]:
# Value for the first parsed instrument; process_instrument always stores a list here,
# which is empty when the instrument has no amountDetails tags.
df['amountDetails'].iloc[0]

Which columns are null for the first row of the data?

In [None]:
# Walk the first row as (column, value) pairs. List-valued cells are excluded up front,
# so only the remaining scalar cells are passed to pd.isna.
first_row = df.iloc[0]
columns_with_nan_value = [
    column
    for column, value in first_row.items()
    if type(value) is not list and pd.isna(value)
]
print(columns_with_nan_value)

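Which columns are null for all of the data? There should be none, since the schema was built from the XML: a column exists only because its tag appeared in at least one instrument, so an all-null column would mean a tag that is present but never carries any text.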

In [None]:
# Reduce the per-cell null flags column by column, then keep the names whose flag is True.
all_null_mask = df.isnull().all()
columns_with_all_null = all_null_mask[all_null_mask].index.tolist()
print(columns_with_all_null)