In [1]:
import pandas as pd
import pandas.io.json
import numpy as np
import numba
import json

# Test Data

In [45]:
# Sample input: a dict keyed by document id. Each record carries scalar
# fields (rec_date, doc_date, document_id, instrument_type) plus three
# list-valued fields (grantor, grantee, legal). Some of those lists are
# intentionally empty so the empty-list path of every option is exercised.
src = {
   "517098": {
      "rec_date": "2019-07-08",
      "doc_date": "06/26/2019",
      "document_id": "517098",
      "grantor": [
         "HESS BAKKEN INVESTMENTS II LLC",
         "2 HESS BAKKEN INVESTMENTS II LLC",
         "3 HESS BAKKEN INVESTMENTS II LLC"
      ],
      "grantee": [
         "PUBLIC",
         "2PUBLIC",
         "3PUBLIC"
      ],
      "instrument_type": "Affidavit",
      "legal": [
         "SE 9 151 95",
         "SW 9 151 95",
         "NE 9 151 95"
      ]
   },
   "517099": {
      "rec_date": "2019-07-08",
      "doc_date": "06/26/2019",
      "document_id": "517099",
      "grantor": [],
      "grantee": [],
      "instrument_type": "Affidavit",
      "legal": [
         "SE 9 151 95",
         "SW 9 151 95",
         "NW SE 23 151 95",
         "NW SW 17 151 95"
      ]
   },
   "517101": {
      "rec_date": "2019-07-08",
      "doc_date": "06/26/2019",
      "document_id": "517101",
      "grantor": [
         "HESS BAKKEN INVESTMENTS II LLC"
      ],
      "grantee": [],
      "instrument_type": "Affidavit",
      "legal": [
         "SE 9 151 95",
         "SW 9 151 95",
         "NW SW 17 151 95"
      ]
   },
   "517115": {
      "rec_date": "2019-07-08",
      "doc_date": "07/01/2019",
      "document_id": "517115",
      "grantor": [],
      "grantee": [
         "HESS BAKKEN INVESTMENTS II LLC"          
      ],
      "instrument_type": "Affidavit",
      "legal": [
         "NE SW 19 152 96",
         "E NW 19 152 96",
         "SW NW 19 152 96"
      ]
   },
   "517135": {
      "rec_date": "2019-07-08",
      "doc_date": "07/01/2019",
      "document_id": "517135",
      "grantor": [
         "BERGEN ESTATE, LILLIAN",
         "BERGEM ESTATE AKA, LILLIAN M",
         "THORESON, JENNIFER"
      ],
      "grantee": [
         "PUBLIC",
         "THORESON, JENNIFER"
      ],
      "instrument_type": "Affidavit",
      "legal": []
   },
   "517865": {
      "rec_date": "2019-07-31",
      "doc_date": "03/29/2019",
      "document_id": "517865",
      "grantor": [
         "DEHOYOS, KIMBERLY",
         "DEHOYOS AKA, KIMBERLY A"
      ],
      "grantee": [
         "WELLS FARGO BANK NA"
      ],
      "instrument_type": "AFFIDAVIT",
      "legal": [
         "WATFORD CITY THIRD ADDITION 5 2"
      ]
   }
}

# Option 1: Use set operations (fastest)

In [91]:
%%timeit
def _melt_list_column(records, column):
    """Explode one list-valued column into a long table, one row per item.

    Parameters
    ----------
    records : pd.DataFrame
        Frame whose index holds the record id and whose `column` holds lists.
    column : str
        Name of the list-valued column to explode (e.g. 'grantor').

    Returns
    -------
    pd.DataFrame
        Columns 'id', '<column>_var' (position of the item within its list)
        and '<column>_new' (the item itself). Records with an empty list
        contribute no rows.
    """
    # Spread each list across positional columns; short lists are padded
    # with missing values.
    wide = pd.DataFrame(records[column].values.tolist(), index=records.index)
    wide['id'] = wide.index  # join key back to the parent record

    # Melt (columns to rows)
    long = wide.melt(
        id_vars='id',
        var_name=column + '_var',
        value_name=column + '_new',
    )
    # Remove the padding introduced for lists shorter than the longest one
    return long.dropna(subset=[column + '_new'])


# One row per source record, indexed by the record key
df1 = pd.DataFrame(
    data=src.values(),
    index=src.keys(),
)
# Use as join key
df1['id'] = df1.index

# Long (one row per list item) versions of the three list-valued fields
grantor = _melt_list_column(df1, 'grantor')
grantee = _melt_list_column(df1, 'grantee')
legal = _melt_list_column(df1, 'legal')

# Now use database style set operations. Left joins keep records whose
# list is empty (they get a single row with a missing value instead).
df_join1 = pd.merge(df1, grantor, how='left', on='id')
df_join2 = pd.merge(df_join1, grantee, how='left', on='id')
final = pd.merge(df_join2, legal, how='left', on='id')
final = final.loc[:, ['id', 'rec_date', 'doc_date', 'document_id',
                      'instrument_type', 'grantor_new', 'grantee_new',
                      'legal_new']]

# Sanity check: total grantor x grantee x legal combinations over all records
assert final.shape == (45, 8)
final

40.4 ms ± 1.81 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Option 2: Use pandas json_normalize (fast, but the timing below covers a single record, not the whole dataset)

In [281]:
%%timeit
# `pandas.io.json.json_normalize` has been deprecated since pandas 1.0 in
# favour of the top-level `pd.json_normalize`; prefer the latter when the
# installed pandas provides it and fall back to the old location otherwise.
json_normalize = getattr(pd, 'json_normalize', None)
if json_normalize is None:
    json_normalize = pandas.io.json.json_normalize


def _normalize_list_field(record, field, meta=()):
    """Flatten one list-valued field of a single record into rows.

    Parameters
    ----------
    record : dict
        One source record (a value of `src`).
    field : str
        Key of the list to expand, e.g. 'grantor'.
    meta : sequence of str, optional
        Scalar keys of `record` to repeat on every output row.

    Returns
    -------
    pd.DataFrame
        One row per list item, plus a constant 'key' column used below to
        build a cross product with the other fields.
    """
    frame = json_normalize(
        data=record,
        record_path=[field],
        meta=list(meta),
        record_prefix='new_' + field,
        meta_prefix='index',
        sep="_",
        errors='ignore',
    )
    frame['key'] = 1
    return frame


# Only a single record is processed in this option
rec_id = "517098"
record = src[rec_id]

grantor = _normalize_list_field(record, 'grantor')
grantee = _normalize_list_field(record, 'grantee')
# The scalar fields are attached once, on the legal frame
legals = _normalize_list_field(
    record,
    'legal',
    meta=['rec_date', 'doc_date', 'document_id', 'instrument_type'],
)

# Take cross product via the constant join key.
# NOTE(review): these are inner joins, so a record with an empty grantor,
# grantee or legal list would presumably produce no rows at all -- confirm
# before applying this approach to the other records.
df1 = pd.merge(legals, grantor, on='key')
final = pd.merge(df1, grantee, on='key')
final

16.3 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Option 3: Manual shredding functions (slowest)

In [225]:
def split_melt(index, df, pivot):
    """Split the list held in `pivot` into columns, then melt (cols->rows).

    Only the list found in the FIRST row of `df[pivot]` is used; each of its
    items is broadcast to every row before melting, so all rows of `df` are
    expected to carry the same list.

    Parameters
    ----------
    index : list of str
        Columns to keep as identifier columns in the melted result.
    df : pd.DataFrame
        Input frame. It is not modified; the work happens on a copy.
    pivot : str
        Name of the list-valued column to expand.

    Returns
    -------
    pd.DataFrame
        If the list is non-empty: the `index` columns plus '<pivot>_var'
        and '<pivot>_new', one row per (input row, list item).
        If the list is empty: a copy of `df` with '<pivot>_new' set to None.
    """
    # Work on a copy so the caller's frame (often a `.loc` slice) is neither
    # mutated nor the source of a SettingWithCopyWarning.
    work = df.copy()
    items = work[pivot].values[0]

    if not items:  # empty pivot
        work[pivot + '_new'] = None
        return work

    # One helper column per list item: pivot0, pivot1, ...
    for position, item in enumerate(items):
        work[pivot + str(position)] = item

    # Every column not listed in `index` (i.e. the helper columns) is melted.
    return work.melt(
        id_vars=index,
        value_vars=None,
        var_name=pivot + '_var',
        value_name=pivot + '_new',
    )

def shred_to_pandas(rec_id, raw):
    """Shred grantor, grantee, and legal of one record into unique rows.

    `raw` is a single source record (a dict); `rec_id` is stored in the
    'ID' column of the result. The three list fields are expanded one
    after the other with `split_melt`, each step feeding the next.
    """
    # Scalar fields carried through every step unchanged
    fixed = ['rec_date', 'doc_date', 'document_id', 'instrument_type']

    # Single-row frame: the record's keys become the columns
    record = pd.DataFrame(data=raw.values(), index=raw.keys()).transpose()

    # Step 1: grantor list -> 'grantor_new'
    id_cols = fixed + ['grantor', 'grantee', 'legal']
    by_grantor = split_melt(id_cols, record, pivot='grantor')

    # Step 2: grantee list -> 'grantee_new'
    id_cols = fixed + ['grantor_new', 'grantee', 'legal']
    by_grantee = split_melt(id_cols, by_grantor.loc[:, id_cols], pivot='grantee')

    # Step 3: legal list -> 'legal_new'
    id_cols = fixed + ['grantor_new', 'grantee_new', 'legal']
    shredded = split_melt(id_cols, by_grantee.loc[:, id_cols], pivot='legal')

    # Tag every row with the record ID and keep only the output columns
    shredded["ID"] = rec_id
    output_cols = ['ID'] + fixed + ['grantor_new', 'grantee_new', 'legal_new']
    return shredded.loc[:, output_cols]

# Run tests for option 3

In [226]:
%%timeit
# Shred every record, then stack the per-record frames into one table
df_full = [shred_to_pandas(k, v) for k, v in src.items()]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. A bare
# `.reindex()` (no arguments) is a no-op and would leave the repeated
# per-record labels in place.
df_concat = pd.concat(df_full, ignore_index=True)
assert df_concat.shape == (45, 8)

df_concat

155 ms ± 2.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
