Parse the raw credit pull tables from the credit DB
- query the credit pull response XML
- parse the inquiry table
- parse the tradeline table

In [2]:
!pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.12.0-py2.py3-none-any.whl (9.2 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0


In [1]:
import sys, os, json
import pandas as pd
from sqlalchemy import create_engine
import sys;
import xml.dom.minidom;
import json
import jmespath
import time
import gc

sys.path.insert(1, "../../src")
import experian_xml_parser, experian_tradeline_parser, experian_inquiry_parser, experian_profile_summary_parser

%load_ext autoreload
%autoreload 2

In [2]:
def getSQL_dw(sql, u = "", p = ""):
    """Run ``sql`` against the ``sofi_dw`` Postgres database on localhost:15494.

    Args:
        sql: SQL text to execute.
        u: Database user name (defaults to an empty string).
        p: Database password (defaults to an empty string).

    Returns:
        pandas.DataFrame holding the query result.
    """
    # Function-scope import so the cell does not depend on an extra top-level import.
    from urllib.parse import quote

    # "postgresql://" is the dialect name every SQLAlchemy release accepts; the
    # "postgres://" alias was removed in SQLAlchemy 1.4.
    # Credentials are percent-escaped so characters such as "@", ":" or "/" in a
    # password cannot be misread as URL delimiters.
    db_string = (
        "postgresql://" + quote(u, safe="") + ":" + quote(p, safe="")
        + "@localhost:15494/sofi_dw"
    )
    db = create_engine(db_string)
    try:
        return pd.read_sql_query(sql, con=db)
    finally:
        # A fresh engine is created on every call, so release its pooled
        # connections instead of leaving them open until garbage collection.
        db.dispose()

def getSQL_credit(sql, u = "", p = ""):
    """Run ``sql`` against the ``sofi_credit`` Postgres database on localhost:16029.

    Args:
        sql: SQL text to execute.
        u: Database user name (defaults to an empty string).
        p: Database password (defaults to an empty string).

    Returns:
        pandas.DataFrame holding the query result.
    """
    # Function-scope import so the cell does not depend on an extra top-level import.
    from urllib.parse import quote

    # "postgresql://" is the dialect name every SQLAlchemy release accepts; the
    # "postgres://" alias was removed in SQLAlchemy 1.4.
    # Credentials are percent-escaped so characters such as "@", ":" or "/" in a
    # password cannot be misread as URL delimiters.
    db_string = (
        "postgresql://" + quote(u, safe="") + ":" + quote(p, safe="")
        + "@localhost:16029/sofi_credit"
    )
    db = create_engine(db_string)
    try:
        return pd.read_sql_query(sql, con=db)
    finally:
        # A fresh engine is created on every call, so release its pooled
        # connections instead of leaving them open until garbage collection.
        db.dispose()

In [3]:
# Load id, start date, applicant type and linked credit pull id for every row of
# the gen4 base table; the result is kept in `data`.
query = """
select id,
       date_start,
       applicant_type,
       business_credit_pull_id
from dwanalyst.pl_gen4_base_202202
"""
data = getSQL_dw(sql=query)

In [4]:
# Tag every row with the calendar quarter of its start date.
data['quarter'] = pd.PeriodIndex(data.date_start, freq='Q')

# Distinct quarters in chronological order, rendered as strings such as '2021Q2'.
# The labels are read straight from the value_counts index: going through
# reset_index()['index'] only works while pandas names that column 'index', and
# pandas 2.0 names it after the Series ('quarter') instead, raising KeyError.
quarter_list = (
    data['quarter']
    .value_counts()
    .sort_index(ascending=True)
    .index.astype(str)
    .to_list()
)
quarter_list

['2021Q2', '2021Q3', '2021Q4', '2022Q1']

In [5]:
# Template for fetching raw Experian pull XML by id. The "--<placeholder>--" marker
# is replaced with a comma-separated list of pull ids before the query is run.
query = """
select id as business_credit_pull_id,created_dt,premier_raw_xml   from experian_credit_pull where id in ( --<placeholder>--) and created_dt>='2021-04-01' and created_dt<='2022-02-01'
"""

In [6]:
# Manual override: only these two quarters are processed by the parsing loop.
quarter_list = ["2021Q4", "2022Q1"]
quarter_list

['2021Q4', '2022Q1']

Note: two ids (applications) might share the same business_credit_pull_id
- solution 1: dedup the data after aggregation
- solution 2: df_credit = pd.merge(df_credit0, data[data.quarter == quarter][['id', 'business_credit_pull_id', 'applicant_type']], on = ['business_credit_pull_id'], how = 'left')

In [None]:
# For each quarter: fetch the raw Experian XML for that quarter's credit pulls,
# parse tradelines and inquiries out of every pull, and write one parquet file
# per table and quarter to S3.
for quarter in quarter_list:
    gc.collect()
    start = time.time()
    print("quarter: " + quarter)

    # Credit pull ids referenced by rows of this quarter. Several rows can point
    # at the same pull, so the ids are de-duplicated before being sent to the
    # database (the IN list matches the same rows either way).
    in_quarter = (~data.business_credit_pull_id.isnull()) & (data.quarter == quarter)
    pull_ids = data.loc[in_quarter, 'business_credit_pull_id'].astype(int).unique()
    if len(pull_ids) == 0:
        # An empty id list would render as "in ( )", which is invalid SQL.
        print("no credit pull ids for quarter %s, skipping" % quarter)
        continue

    df_credit0 = getSQL_credit(
        query.replace("--<placeholder>--", ','.join(str(pull_id) for pull_id in pull_ids))
    )
    # Attach id / applicant_type to every pull. The join is against the full
    # `data` frame, so a pull shared by several rows yields one output row per
    # matching `data` row, including rows whose own quarter differs.
    df_credit = pd.merge(
        df_credit0,
        data[['id', 'business_credit_pull_id', 'applicant_type']],
        on=['business_credit_pull_id'],
        how='left',
    )

    trade_line_parser = experian_tradeline_parser.TradeLineParser(ignore_closed=False)
    inquiry_parser = experian_inquiry_parser.InquiryParser()

    # 1 quarter TTD take ~60 mins
    # NOTE(review): the recorded run of this cell logged roughly 500-590 seconds
    # per 10,000 rows, so a full quarter may take far longer than the note above.
    n_rows = len(df_credit)
    print("Parsing %s xml files..." % n_rows)
    start_time_sec = time.time()
    prev_time_sec = start_time_sec
    n_failed = 0
    # itertuples avoids a chained label lookup per column per row and does not
    # depend on the frame having a 0..n-1 index.
    for i, row in enumerate(df_credit.itertuples(index=False)):
        if i > 0 and (i % 10000 == 0 or i == n_rows - 1):
            time_sec = time.time()
            print("%d/%d in %.2f (%.2f) seconds." % (i, n_rows, time_sec - start_time_sec, time_sec - prev_time_sec))
            prev_time_sec = time_sec

        xml_root = experian_xml_parser.parse_experian_xml(row.premier_raw_xml)
        # -99 is treated as the parser's failure sentinel, as in the original check.
        if xml_root != -99:
            # Keys shared by both parsers; named so the builtin `id` is not shadowed.
            record_keys = dict(
                id=str(row.id),
                applicant_type=row.applicant_type,
                credit_pull_id=str(row.business_credit_pull_id),
                credit_pull_date=row.created_dt,
            )
            trade_line_parser.parse_xml_dict(root=xml_root, **record_keys)
            inquiry_parser.parse_xml_dict(root=xml_root, **record_keys)
        else:
            n_failed += 1
            print("error in row: %d" % (i))

    if n_failed:
        print("%d of %d xml files could not be parsed" % (n_failed, n_rows))

    trade_line_df = trade_line_parser.to_data_frame()
    inquiry_df = inquiry_parser.to_data_frame()

    trade_line_df.to_parquet(f's3://sofi-data-science/hpeng/pl-gen4/data_dump/raw_tables/feature_creation/tradeline_{quarter}.parquet')
    inquiry_df.to_parquet(f's3://sofi-data-science/hpeng/pl-gen4/data_dump/raw_tables/feature_creation/inquiry_{quarter}.parquet')
    run_time = round((time.time() - start) / 60, 1)
    print('run_time: {0} mins'.format(run_time))

quarter: 2021Q4
Parsing 758088 xml files...
10000/758088 in 502.76 (502.76) seconds.
20000/758088 in 1031.44 (528.68) seconds.
30000/758088 in 1571.93 (540.49) seconds.
40000/758088 in 2128.92 (556.98) seconds.
50000/758088 in 2681.12 (552.21) seconds.
60000/758088 in 3238.83 (557.71) seconds.
70000/758088 in 3795.56 (556.73) seconds.
XML Parse Error

error in row: 71786
XML Parse Error

error in row: 72115
80000/758088 in 4351.28 (555.71) seconds.
90000/758088 in 4909.38 (558.10) seconds.
100000/758088 in 5498.96 (589.58) seconds.
110000/758088 in 6078.80 (579.84) seconds.
120000/758088 in 6669.10 (590.30) seconds.
130000/758088 in 7255.36 (586.26) seconds.
140000/758088 in 7841.48 (586.12) seconds.
150000/758088 in 8427.19 (585.72) seconds.
160000/758088 in 9008.92 (581.72) seconds.
170000/758088 in 9597.42 (588.50) seconds.
180000/758088 in 10180.82 (583.40) seconds.
190000/758088 in 10757.75 (576.93) seconds.
XML Parse Error

error in row: 195043
200000/758088 in 11329.33 (571.58) 

In [8]:
# Preview the first five parsed tradeline rows (from the last quarter processed).
trade_line_df.head(n=5)

Unnamed: 0,id,applicant_type,credit_pull_id,credit_pull_date,OpenOrClosed,Subcode,SubscriberDisplayName,AccountNumber,AccountTypeCode,AccountType,...,PaymentStatusCode,SpecialComment,InitialPaymentLevelDate,EnhancedAccountType,EnhancedAccountTypeCode,AmountQualifier_1,AmountValue_1,AmountQualifier_2,AmountValue_2,HasTrendedData
0,15866509,PRIMARY,21934834,2021-11-04 02:32:30.943684,Closed,1884330,DEPT OF EDUCATION/NELN,900000341212724,12,Education Loan,...,10,Account closed due to refinance,2020-02-01,Education Loan,12,Original,245.0,Unknown,,False
1,15866509,PRIMARY,21934834,2021-11-04 02:32:30.943684,Closed,3963206,GM FINANCIAL,452421365,0,Auto Loan,...,97,,2019-03-01,AUT\tAuto Loan,0,Original,19194.0,Charge off amount,8651.0,False
2,15866509,PRIMARY,21934834,2021-11-04 02:32:30.943684,Closed,1270246,CAPITAL ONE BANK USA N,517805******,18,"Credit Card, Terms REV",...,11,Account closed at consumer's request,2021-08-01,"Credit Card, Terms REV",18,Limit,400.0,High balance,304.0,False
3,15866509,PRIMARY,21934834,2021-11-04 02:32:30.943684,Open,8740622,DEL NORTE CU,1135085L0001,0,Auto Loan,...,11,,2021-11-01,AUT\tAuto Loan,0,Original,20327.0,Unknown,,False
4,15866509,PRIMARY,21934834,2021-11-04 02:32:30.943684,Open,1270246,CAPITAL ONE BANK USA N,517805******,18,"Credit Card, Terms REV",...,11,,2021-10-01,"Credit Card, Terms REV",18,Limit,500.0,High balance,528.0,False
