### Load libraries

In [None]:
import pandas as pd
import json
import numpy as np
import os

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# data folder used below is reachable as a regular path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Raise the pandas display limits so up to `n` rows and `n` columns are
# shown when a frame is printed.
n = 2000
pd.options.display.max_rows = n
pd.options.display.max_columns = n

### Load data

In [None]:
# Read the transactions CSV from the mounted Drive folder into `data`.
folder = '/content/drive/MyDrive/Loan_default/'
file = 'transactions.csv'
# `folder` already ends with a separator, so join() yields folder + file.
path = os.path.join(folder, file)
data = pd.read_csv(path)
data.shape

(152222, 709)

### Adding classification target

In [None]:
def binary_target(init):
    """
    Collapse a raw target label into one of two classes.

    Returns 'Default' when `init` equals 'Charged-off'; every other
    value (including missing ones) maps to 'Solvent'.
    """
    return 'Default' if init == 'Charged-off' else 'Solvent'
    

In [None]:
# Label every row via binary_target, then derive a 0/1 flag where 1 marks
# the 'Default' class.
i_targets = data['target'].values
data['binaryTarget'] = list(map(binary_target, i_targets))
data['binaryTargetNumeric'] = data['binaryTarget'].eq('Default').astype(int)

In [None]:
def get_year(init):
    """
    Return the first four characters of `init`.

    Presumably `init` is a date string that starts with the year
    (e.g. 'YYYY-MM-DD') -- confirm against the source data.
    """
    return init[:4]


In [None]:
# Take the leading four characters of each reporting-period start date
# and store them in the 'eventYear' column.
y_vals = data['reportingPeriodBeginningDateREvent'].values
years_vals = list(map(get_year, y_vals))
data['eventYear'] = years_vals


In [None]:
# Cast the indicator columns to integers (for boolean inputs this gives
# True -> 1 and False -> 0).
for flag_col in ('coObligorIndicatorLocRandom', 'underwritingIndicatorLocRandom'):
    data[flag_col] = data[flag_col].astype(int)

### Encode categorical features

In [None]:
# Names of the categorical columns that the following cell cleans up:
# each one gets missing values filled, is cast to str, lower-cased,
# stripped and has its spaces removed.
encode_features = ['vehicleModelNameLocRandom',
                   'vehicleManufacturerNameLocRandom',
                   'obligorGeographicLocationLocRandom',
                   'obligorCreditScoreTypeLocRandom',
                   'interestCalculationTypeCodeMLocRandom',
                   'obligorEmploymentVerificationCodeMLocRandom',
                   'vehicleTypeCodeMLocRandom',
                   'vehicleNewUsedCodeMLocRandom',
                   'paymentTypeCodeMLocRandom',
                   'obligorIncomeVerificationLevelCodeMLocRandom',
                   'vehicleValueSourceCodeMLocRandom']


In [None]:
# Normalise every categorical column: fill gaps with 'missing', cast to
# str, lower-case, trim surrounding whitespace and drop inner spaces.
for col in encode_features:
    data[col] = (
        data[col]
        .fillna('missing')
        .astype(str)
        .str.lower()
        .str.strip()
        .str.replace(' ', '')
    )
    

In [None]:
#encode categorical columns with two unique values into binary
data['vehicleNewUsedCodeMLocRandom']=(data['vehicleNewUsedCodeMLocRandom']=='new').astype(int)
data['obligorCreditScoreTypeLocRandom']=(data['obligorCreditScoreTypeLocRandom']=='creditbureauscore').astype(int)
data['obligorEmploymentVerificationCodeMLocRandom']=(data['obligorEmploymentVerificationCodeMLocRandom']=='stated,notverified').astype(int)

### Custom fields

In [None]:
# Mean vehicle value computed over strictly positive entries only.
mean_val = data.loc[data['vehicleValueAmountLocRandom'] > 0, 'vehicleValueAmountLocRandom'].mean()
mean_val

22729.71409454701

In [None]:
def fix_vehicle_value(init):
    """
    Substitute the global `mean_val` for a non-positive vehicle value.

    Values greater than zero are returned unchanged. Anything that fails
    the `> 0` comparison (zero, negatives, NaN) is replaced by `mean_val`.
    """
    return init if init > 0 else mean_val


In [None]:
# Run every vehicle value through fix_vehicle_value and write the
# results back to the same column.
v_vals = data['vehicleValueAmountLocRandom'].values
v_res = list(map(fix_vehicle_value, v_vals))
data['vehicleValueAmountLocRandom'] = v_res


In [None]:
# Loan-to-value ratio: original loan amount divided by vehicle value.
data['ltv'] = data['originalLoanAmountLocRandom'].div(data['vehicleValueAmountLocRandom'])

In [None]:
# Sanity check: average of the 'ltv' column.
data['ltv'].mean()

1.0664543114107385

In [None]:
# Sanity check: largest value in the 'ltv' column.
data['ltv'].max()

1.8460921933085501

In [None]:
# Sanity check: smallest value in the 'ltv' column.
data['ltv'].min()

0.08928478955039394

### Export

In [None]:
# Make sure the output folder exists before exporting.
f_1 = '/content/drive/MyDrive/Loan_default/prepared/'
# Records whether the folder was already present before this cell ran.
dir_bool = os.path.isdir(f_1)
if not dir_bool:
    # makedirs also creates missing parent folders, and exist_ok=True
    # avoids a failure if the folder appears between the check and the
    # creation (os.mkdir would raise in both situations).
    os.makedirs(f_1, exist_ok=True)

In [None]:
# Write the engineered frame to the prepared folder without the index.
e_file = 'train_engineered.csv'
e_path = f'{f_1}{e_file}'
data.to_csv(e_path, index=False)


In [None]:
# Print a completion marker once the preceding cells have run.
print('complete...')

complete...
