In [1]:
import numpy as np
import pandas as pd
from dstk.utils.data_cleaning import clean_columns

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set_style('dark')
plt.rcParams['figure.figsize'] = (9,6)
np.set_printoptions(suppress=True)
pd.options.display.max_columns = 150

In [2]:
pd.options.display.float_format =  lambda x: '{:,.3f}'.format(x)

In [3]:
def col_descrip(table, col):
    "Looks up column description for given table"
    print(table)
    print(col)
    return col_des.loc[(col_des.Table==table) & (col_des.Row==col.upper()),
            'Description'].values[0]

def anom_eval(df, col, filename):
    print(col_descrip(filename, col))
    print()

    max_diff_idx = df[col].sort_values().diff().nlargest(3).index
    max_diffs = df.loc[max_diff_idx, col]
    nlargest = df[col].nlargest()
    nsmallest = df[col].nsmallest()

    print('Max Diffs')
    print(max_diffs)
    print()
    print("Largest Vals")
    print(nlargest)
    print()
    print("Smallest Vals")
    print(nsmallest)
    df[col].hist()
    df[col].value_counts().sort_index()

    return max_diffs, nlargest

def pct_null(df):
    null_counts = df.isnull().sum()[df.isnull().sum()>0].sort_values(ascending=False).to_frame()
    if null_counts.empty:
        return None
    null_counts['pct_null'] = null_counts/df.shape[0]
    null_counts.columns = ['n_null', 'pct_null']
    return null_counts

In [4]:
col_des = pd.read_csv('HomeCredit_columns_description.csv', encoding='latin-1')
col_des.shape

(219, 5)

In [None]:
prev_apps = pd.read_csv("previous_application.csv.zip", dtype= {'SK_ID_CURR':str, 'SK_ID_BUREAU':str, 'SK_ID_PREV':str})
prev_apps.shape

In [None]:
prev_apps.head()

In [None]:
list(
    zip(col_des[col_des.Table=='previous_application.csv'].Row.tolist(),col_des[col_des.Table=='previous_application.csv'].Description.tolist())
)

### Column Types

In [None]:
prev_apps.columns = clean_columns(prev_apps)

One problem with this is that some int cols will be float cols if there are nas. But not too worried about this.

In [None]:
pks = ['sk_id_curr', 'sk_id_prev']

# nunique excludes nan!
indicator_cols = prev_apps.columns[prev_apps.nunique() == 2].tolist()

obj_cols = prev_apps.select_dtypes('O').columns.drop(pks).tolist()
obj_cols = [o for o in obj_cols if o not in indicator_cols]
int_cols = prev_apps.select_dtypes('int').columns.tolist()
int_cols = [i for i in int_cols if i not in indicator_cols]
float_cols = prev_apps.select_dtypes('float').columns.tolist()
float_cols = [f for f in float_cols if f not in indicator_cols]

numeric_cols = int_cols + float_cols

ordered_cols = sorted(pks) + sorted(obj_cols) + sorted(int_cols) + sorted(float_cols) + sorted(indicator_cols)
print(len(prev_apps.columns), len(ordered_cols))

prev_apps = prev_apps[ordered_cols]

In [None]:
for c in indicator_cols:
    print(c)
    print(prev_apps[c].unique())
    print()

In [None]:
prev_apps.dtypes

In [None]:
(prev_apps[numeric_cols]<0).any(0)

In [None]:
neg_cols = prev_apps[numeric_cols].columns[(prev_apps[numeric_cols]<0).any(0)]
neg_cols

In [None]:
# manually inspect
neg_cols = [
    'days_decision',  
    'days_first_due',
    'days_last_due_1st_version', 
    'days_last_due', 
    'days_termination'
]

In [None]:
for col in neg_cols:
    print(col)
    prev_apps.loc[prev_apps[col]<0, col] *= -1

In [None]:
# vals of -1 are assumed to be missing
prev_apps['sellerplace_area'] = prev_apps.sellerplace_area.replace(-1, np.nan)

In [None]:
# negative values (there are 2) are assumed 0
prev_apps.loc[prev_apps.amt_down_payment<0, 'amt_down_payment'] = 0

In [None]:
# negative values (there are 2) are assumed 0
prev_apps.loc[prev_apps.rate_down_payment<0, 'rate_down_payment'] = 0

In [None]:
# droppping this
prev_apps.days_first_drawing.hist(bins=100)
prev_apps.drop('days_first_drawing', axis=1, inplace=True)
float_cols.remove('days_first_drawing')
numeric_cols.remove('days_first_drawing')

In [None]:
(prev_apps[numeric_cols]<0).any()

### Anomalies

In [None]:
gaps = prev_apps[numeric_cols].apply(lambda s: s.sort_values().diff().max()/s.std())
gaps[gaps>2]


In [None]:
gaps[gaps>2].index.tolist()

In [None]:
# manual exploration
anom_cols = [
    'sellerplace_area',
    'amt_annuity',
    'days_first_due',
    'days_last_due_1st_version',
    'days_last_due',
    'days_termination'
]


Note should create feature of diff between amt applied for and amt granted

In [None]:
anom_eval(prev_apps, 'days_last_due_1st_version', 'previous_application.csv')

In [None]:
for col in anom_cols:
    print(col)
    max_diff = prev_apps[col].sort_values().diff().max()
    pwr = int(np.log10(max_diff))
    if pwr <2:
        cutoff = max_diff
    else:
        cutoff = int(max_diff/(10**pwr))*(10**pwr)
    
    anoms = prev_apps.loc[prev_apps[col]>=cutoff, col]
    rest = prev_apps.loc[~prev_apps.index.isin(anoms.index), col]

    # replace outliers with median from non-outlying data
    prev_apps.loc[anoms.index, col] = rest.median()
    prev_apps[col+'_anom'] = 0
    
    # indicator
    prev_apps.loc[anoms.index, col+'_anom'] = 1
    
    indicator_cols.append(col+'_anom')

### Fill NA

In [None]:
obj_nulls = pct_null(prev_apps[obj_cols])
obj_nulls

In [None]:
prev_apps.name_type_suite.unique()

In [None]:
# Just marking name_type_suite as null
prev_apps['name_type_suite'] = prev_apps.name_type_suite.astype(str)

In [None]:
# fill product_combination with mode
prev_apps.fillna({'product_combination':prev_apps.product_combination.mode().squeeze()}, inplace=True)

In [None]:
# numeric nulls fill w median
medians = prev_apps[numeric_cols].median().squeeze()
prev_apps.fillna(medians, inplace=True)

### Encode Cat Vars

In [None]:
prev_apps.columns[prev_apps.nunique()<2]

In [None]:
binary_cols = prev_apps[obj_cols].columns[prev_apps[obj_cols].nunique()==2]
binary_cols

In [None]:
non_binary_cols = obj_cols.copy()
non_binary_cols

In [None]:
for col in non_binary_cols:
    print(col)
    ohe_col = pd.get_dummies(prev_apps[col], drop_first=True)
    ohe_col.columns = col + '_' + ohe_col.columns
    prev_apps = pd.concat([prev_apps, ohe_col], axis=1)
    
    indicator_cols.extend(ohe_col.columns.tolist())
    del prev_apps[col]
    obj_cols.remove(col)

In [None]:
prev_apps.to_csv('clean_data/prev_apps.csv', index=False)

In [5]:
prev_apps = pd.read_csv('clean_data/prev_apps.csv', dtype={'sk_id_curr':str, 'sk_id_bureau':str, 'sk_id_prev':str,
                                                                'num_instalment_version':str})

Aggregate

In [7]:
cc.head()

Unnamed: 0,sk_id_curr,sk_id_prev,amt_credit_limit_actual,cnt_drawings_current,months_balance,sk_dpd,sk_dpd_def,amt_balance,amt_drawings_atm_current,amt_drawings_current,amt_drawings_other_current,amt_drawings_pos_current,amt_inst_min_regularity,amt_payment_current,amt_payment_total_current,amt_receivable_principal,amt_recivable,amt_total_receivable,cnt_drawings_atm_current,cnt_drawings_other_current,cnt_drawings_pos_current,cnt_instalment_mature_cum,amt_payment_current_anom,amt_payment_total_current_anom,cnt_drawings_atm_current_anom,name_contract_status_Approved,name_contract_status_Completed,name_contract_status_Demand,name_contract_status_Refused,name_contract_status_Sent proposal,name_contract_status_Signed
0,378907,2562384,135000,1,6,0,0,56.97,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,0.0,1.0,35.0,0,0,0,0,0,0,0,0,0
1,363914,2582071,45000,1,1,0,0,63975.555,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,0.0,0.0,69.0,0,0,0,0,0,0,0,0,0
2,371185,1740877,450000,0,7,0,0,31815.225,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0.0,0.0,30.0,0,0,0,0,0,0,0,0,0
3,337855,1389973,225000,1,4,0,0,236572.11,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,0.0,0.0,10.0,0,0,0,0,0,0,0,0,0
4,126868,1891521,450000,1,1,0,0,453919.455,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,0.0,1.0,101.0,0,0,0,0,0,0,0,0,0


In [8]:
agg_dic = dict.fromkeys(cc.select_dtypes('number').columns.tolist(), 'mean')
sum_cols = cc.columns[-9:].tolist()
drop_cols = ['sk_dpd_def']

for s in sum_cols:
    agg_dic[s] = 'sum'
for d in drop_cols:
    del agg_dic[d]
    
cc_agg = cc.groupby(['sk_id_curr','sk_id_prev']).agg(agg_dic).reset_index()
cc_agg.shape

(104307, 30)

In [9]:
payments.head()

Unnamed: 0,sk_id_curr,sk_id_prev,num_instalment_number,amt_instalment,amt_payment,days_entry_payment,days_instalment,days_entry_payment_anom,amt_instalment_anom,amt_payment_anom,num_instalment_version_1.0,num_instalment_version_10.0,num_instalment_version_11.0,num_instalment_version_12.0,num_instalment_version_13.0,num_instalment_version_14.0,num_instalment_version_15.0,num_instalment_version_16.0,num_instalment_version_17.0,num_instalment_version_178.0,num_instalment_version_18.0,num_instalment_version_19.0,num_instalment_version_2.0,num_instalment_version_20.0,num_instalment_version_21.0,num_instalment_version_22.0,num_instalment_version_23.0,num_instalment_version_24.0,num_instalment_version_25.0,num_instalment_version_26.0,num_instalment_version_27.0,num_instalment_version_28.0,num_instalment_version_29.0,num_instalment_version_3.0,num_instalment_version_30.0,num_instalment_version_31.0,num_instalment_version_32.0,num_instalment_version_33.0,num_instalment_version_34.0,num_instalment_version_35.0,num_instalment_version_36.0,num_instalment_version_37.0,num_instalment_version_38.0,num_instalment_version_39.0,num_instalment_version_4.0,num_instalment_version_40.0,num_instalment_version_41.0,num_instalment_version_42.0,num_instalment_version_43.0,num_instalment_version_44.0,num_instalment_version_45.0,num_instalment_version_46.0,num_instalment_version_47.0,num_instalment_version_48.0,num_instalment_version_49.0,num_instalment_version_5.0,num_instalment_version_50.0,num_instalment_version_51.0,num_instalment_version_52.0,num_instalment_version_53.0,num_instalment_version_54.0,num_instalment_version_55.0,num_instalment_version_56.0,num_instalment_version_57.0,num_instalment_version_58.0,num_instalment_version_59.0,num_instalment_version_6.0,num_instalment_version_61.0,num_instalment_version_68.0,num_instalment_version_7.0,num_instalment_version_72.0,num_instalment_version_73.0,num_instalment_version_8.0,num_instalment_version_9.0
0,161674,1054186,6,6948.36,6948.36,1187.0,1180.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,151639,1330831,34,1716.525,1716.525,2156.0,2156.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,193053,2085231,1,25425.0,25425.0,63.0,63.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,199697,2452527,3,24350.13,24350.13,2426.0,2418.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,167756,2714724,2,2165.04,2160.585,1366.0,1383.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# group by sk_id_prev
# drop installment number becaise it will aggregate otherwise
agg_dic = dict.fromkeys(payments.select_dtypes('number').columns.tolist(), 'sum')
agg_dic['days_instalment'] = 'mean' # can't sum number of days since installment
drop_cols = ['num_instalment_number']
for d in drop_cols:
    del agg_dic[d]

payments_agg = payments.drop('num_instalment_number', axis=1).groupby(['sk_id_curr', 'sk_id_prev']).agg(agg_dic).reset_index()
payments_agg.shape

(997752, 73)

In [41]:
payments.amt_payment.sum(), payments_agg.amt_payment.sum()

(234461380024.93488, 234461380024.9351)

In [42]:
pos.head()

Unnamed: 0,sk_id_prev,sk_id_curr,months_balance,cnt_instalment,cnt_instalment_future,sk_dpd,sk_dpd_def,name_contract_status_Active,name_contract_status_Amortized debt,name_contract_status_Approved,name_contract_status_Canceled,name_contract_status_Completed,name_contract_status_Demand,name_contract_status_Returned to the store,name_contract_status_Signed
0,1803195,182943,31,48.0,45.0,0,0,1,0,0,0,0,0,0,0
1,1715348,367990,33,36.0,35.0,0,0,1,0,0,0,0,0,0,0
2,1784872,397406,32,12.0,9.0,0,0,1,0,0,0,0,0,0,0
3,1903291,269225,35,48.0,42.0,0,0,1,0,0,0,0,0,0,0
4,2341044,334279,35,36.0,35.0,0,0,1,0,0,0,0,0,0,0


In [43]:
agg_dic = dict.fromkeys(pos.select_dtypes('number').columns.tolist(), 'sum')
mean_cols = ['months_balance', 'sk_dpd']
drop_cols = ['cnt_instalment', 'cnt_instalment_future', 'sk_dpd_def']
for c in mean_cols:
    agg_dic[c] = 'mean'
for d in drop_cols:
    del agg_dic[d]
pos_agg = pos.drop(drop_cols, axis=1).groupby(['sk_id_curr', 'sk_id_prev']).agg(agg_dic).reset_index()
pos_agg.shape

(936325, 10)

In [44]:
pos.name_contract_status_Active.sum(), pos_agg.name_contract_status_Active.sum()

(9151119, 9151119)

Agg Prev Apps

In [74]:
prev_apps.head()

Unnamed: 0,sk_id_curr,sk_id_prev,days_decision,hour_appr_process_start,sellerplace_area,amt_annuity,amt_application,amt_credit,amt_down_payment,amt_goods_price,cnt_payment,days_first_due,days_last_due,days_last_due_1st_version,days_termination,rate_down_payment,rate_interest_primary,rate_interest_privileged,flag_last_appl_per_contract,nflag_insured_on_approval,nflag_last_appl_in_day,sellerplace_area_anom,amt_annuity_anom,days_first_due_anom,days_last_due_1st_version_anom,days_last_due_anom,days_termination_anom,name_contract_type_Consumer loans,name_contract_type_Revolving loans,name_contract_type_XNA,weekday_appr_process_start_MONDAY,weekday_appr_process_start_SATURDAY,weekday_appr_process_start_SUNDAY,weekday_appr_process_start_THURSDAY,weekday_appr_process_start_TUESDAY,weekday_appr_process_start_WEDNESDAY,name_cash_loan_purpose_Business development,name_cash_loan_purpose_Buying a garage,name_cash_loan_purpose_Buying a holiday home / land,name_cash_loan_purpose_Buying a home,name_cash_loan_purpose_Buying a new car,name_cash_loan_purpose_Buying a used car,name_cash_loan_purpose_Car repairs,name_cash_loan_purpose_Education,name_cash_loan_purpose_Everyday expenses,name_cash_loan_purpose_Furniture,name_cash_loan_purpose_Gasification / water supply,name_cash_loan_purpose_Hobby,name_cash_loan_purpose_Journey,name_cash_loan_purpose_Medicine,name_cash_loan_purpose_Money for a third person,name_cash_loan_purpose_Other,name_cash_loan_purpose_Payments on other loans,name_cash_loan_purpose_Purchase of electronic equipment,name_cash_loan_purpose_Refusal to name the goal,name_cash_loan_purpose_Repairs,name_cash_loan_purpose_Urgent needs,name_cash_loan_purpose_Wedding / gift / holiday,name_cash_loan_purpose_XAP,name_cash_loan_purpose_XNA,name_contract_status_Canceled,name_contract_status_Refused,name_contract_status_Unused offer,name_payment_type_Cashless from the account of the employer,name_payment_type_Non-cash from your account,name_payment_type_XNA,code_reject_reason_HC,code_reject_reason_LIMIT,code_reject_reason_SCO,code_reject_reason_SCOFR,code_reject_reason_SYSTEM,code_reject_reason_VERIF,code_reject_reason_XAP,code_reject_reason_XNA,name_type_suite_Family,...,name_type_suite_Unaccompanied,name_type_suite_nan,name_client_type_Refreshed,name_client_type_Repeater,name_client_type_XNA,name_goods_category_Animals,name_goods_category_Audio/Video,name_goods_category_Auto Accessories,name_goods_category_Clothing and Accessories,name_goods_category_Computers,name_goods_category_Construction Materials,name_goods_category_Consumer Electronics,name_goods_category_Direct Sales,name_goods_category_Education,name_goods_category_Fitness,name_goods_category_Furniture,name_goods_category_Gardening,name_goods_category_Homewares,name_goods_category_House Construction,name_goods_category_Insurance,name_goods_category_Jewelry,name_goods_category_Medical Supplies,name_goods_category_Medicine,name_goods_category_Mobile,name_goods_category_Office Appliances,name_goods_category_Other,name_goods_category_Photo / Cinema Equipment,name_goods_category_Sport and Leisure,name_goods_category_Tourism,name_goods_category_Vehicles,name_goods_category_Weapon,name_goods_category_XNA,name_portfolio_Cars,name_portfolio_Cash,name_portfolio_POS,name_portfolio_XNA,name_product_type_walk-in,name_product_type_x-sell,channel_type_Car dealer,channel_type_Channel of corporate sales,channel_type_Contact center,channel_type_Country-wide,channel_type_Credit and cash offices,channel_type_Regional / Local,channel_type_Stone,name_seller_industry_Clothing,name_seller_industry_Connectivity,name_seller_industry_Construction,name_seller_industry_Consumer electronics,name_seller_industry_Furniture,name_seller_industry_Industry,name_seller_industry_Jewelry,name_seller_industry_MLM partners,name_seller_industry_Tourism,name_seller_industry_XNA,name_yield_group_high,name_yield_group_low_action,name_yield_group_low_normal,name_yield_group_middle,product_combination_Card X-Sell,product_combination_Cash,product_combination_Cash Street: high,product_combination_Cash Street: low,product_combination_Cash Street: middle,product_combination_Cash X-Sell: high,product_combination_Cash X-Sell: low,product_combination_Cash X-Sell: middle,product_combination_POS household with interest,product_combination_POS household without interest,product_combination_POS industry with interest,product_combination_POS industry without interest,product_combination_POS mobile with interest,product_combination_POS mobile without interest,product_combination_POS other with interest,product_combination_POS others without interest
0,271877,2030495,73,15,35.0,1730.43,17145.0,17145.0,0.0,17145.0,12.0,42.0,42.0,300.0,37.0,0.0,0.183,0.867,Y,0.0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,108129,2802425,164,11,65.0,25188.615,607500.0,679671.0,1638.0,607500.0,36.0,134.0,801.0,916.0,780.0,0.052,0.189,0.835,Y,1.0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,122040,2523466,301,11,65.0,15060.735,112500.0,136444.5,1638.0,112500.0,12.0,271.0,801.0,59.0,780.0,0.052,0.189,0.835,Y,1.0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,176158,2819243,512,7,65.0,10438.628,450000.0,470790.0,1638.0,450000.0,12.0,482.0,182.0,152.0,177.0,0.052,0.189,0.835,Y,1.0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,202054,1784265,781,9,65.0,31924.395,337500.0,404055.0,1638.0,337500.0,24.0,874.0,801.0,615.0,780.0,0.052,0.189,0.835,Y,,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [80]:
# how'd this happen?
# prev_apps.drop('flag_last_appl_per_contract', axis=1, inplace=True)

In [81]:
col_descrip("previous_application.csv", 'nflag_last_appl_in_day')

previous_application.csv
nflag_last_appl_in_day


'Flag if the application was the last application per day of the client. Sometimes clients apply for more applications a day. Rarely it could also be error in our system that one application is in the database twice'

In [82]:
prev_apps.columns.tolist().index('nflag_last_appl_in_day')

19

In [86]:
agg_dic = dict.fromkeys(prev_apps.select_dtypes('number').columns.tolist(), 'mean')
sum_cols = prev_apps.columns[19:].tolist()

for s in sum_cols:
    agg_dic[s] = 'sum'

    
prev_apps_agg = prev_apps.groupby(['sk_id_curr','sk_id_prev']).agg(agg_dic).reset_index()
prev_apps_agg.shape

(1670214, 153)

In [87]:
prev_apps_agg.shape

(1670214, 153)

Write all of the agg frames out

In [88]:
cc_agg.to_csv('clean_data/cc_agg.csv', index=False)
payments_agg.to_csv('clean_data/payments_agg.csv', index=False)
pos_agg.to_csv('clean_data/pos_agg.csv', index=False)
prev_apps_agg.to_csv('clean_data/prev_apps_agg.csv', index=False)