Quickly extract a sample of loans and the performance data for their first 12 payments

In [None]:
import pandas as pd
import numpy as np
import glob
import os

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
mpl.rc('axes', labelsize=18)
mpl.rc('xtick', labelsize=15)
mpl.rc('ytick', labelsize=15)
mpl.rc('legend',**{'fontsize':16.5})
mpl.rc('lines', linewidth=2)


from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


def save_fig(fig_id, tight_layout=True):
    """Write the current matplotlib figure to ``<fig_id>.png`` at 300 dpi.

    Args:
        fig_id: Base name of the output file; ``".png"`` is appended to it.
        tight_layout: When true, ``plt.tight_layout()`` is applied before
            the figure is saved.
    """
    png_name = fig_id + ".png"
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    save_options = {'format': 'png', 'dpi': 300}
    plt.savefig(png_name, **save_options)

# Let pandas render up to 50 columns and 50 rows before truncating output.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

In [None]:
# Column names applied, in file order, to the header-less origination file
# (passed as ``names=`` to ``pd.read_csv``). The last entry is a placeholder
# name for a trailing column.
orig_headers = [
    'CREDIT SCORE',
    'FIRST PAYMENT DATE',
    'FIRST TIME HOMEBUYER FLAG',
    'MATURITY DATE',
    'MSA',
    'MI %',
    'NUMBER OF UNITS',
    'OCCUPANCY STATUS',
    'ORIGINAL CLTV',
    'ORIGINAL DTI',
    'ORIGINAL UPB',
    'ORIGINAL LTV',
    'ORIGINAL INTEREST RATE',
    'CHANNEL',
    'PPM FLAG',
    'PRODUCT TYPE',
    'PROPERTY STATE',
    'PROPERTY TYPE',
    'POSTAL CODE',
    'LOAN SEQUENCE NUMBER',
    'LOAN PURPOSE',
    'ORIGINAL LOAN TERM',
    'NUMBER OF BORROWERS',
    'SELLER NAME',
    'SERVICER NAME',
    'UNKNOWN',
]

# Column names applied, in file order, to the header-less monthly
# performance files (passed as ``names=`` to ``pd.read_csv``).
mp_headers = [
    'LOAN SEQUENCE NUMBER',
    'MONTHLY REPORTING PERIOD',
    'CURRENT ACTUAL UPB',
    'CURRENT LOAN DELINQUENCY STATUS',
    'LOAN AGE',
    'REMAINING MONTHS TO LEGAL MATURITY',
    'REPURCHASE FLAG',
    'MODIFICATION FLAG',
    'ZERO BALANCE CODE',
    'ZERO BALANCE EFFECTIVE DATE',
    'CURRENT INTEREST RATE',
    'CURRENT DEFERRED UPB',
    'DDLPI',
    'MI RECOVERIES',
    'NET SALES PROCEEDS',
    'NON MI RECOVERIES',
    'EXPENSES',
    'LEGAL COSTS',
    'MAINTENANCE AND PRESERVATION COSTS',
    'TAXES AND INSURANCE',
    'MISCELLANEOUS EXPENSES',
    'ACTUAL LOSS CALCULATION',
    'MODIFICATION COST',
    'STEP MODIFICATION FLAG',
    'DEFERRED PAYMENT MODIFICATION',
    'ELTV',
    'ZERO BALANCE REMOVAL UPB',
    'DELINQUENT ACCRUED INTEREST',
]


In [None]:
# Performance-file columns kept by mp_concatter.
cols_p = ['LOAN SEQUENCE NUMBER', 'LOAN AGE', 'ZERO BALANCE CODE']

# Origination columns kept for modelling; 'LOAN SEQUENCE NUMBER' is retained
# because it is the key used to join against the performance data.
cols = [
    'CREDIT SCORE',
    'FIRST TIME HOMEBUYER FLAG',
    'MI %',
    'NUMBER OF UNITS',
    'OCCUPANCY STATUS',
    'ORIGINAL CLTV',
    'ORIGINAL DTI',
    'ORIGINAL UPB',
    'ORIGINAL INTEREST RATE',
    'CHANNEL',
    'PPM FLAG',
    'PROPERTY STATE',
    'PROPERTY TYPE',
    'LOAN SEQUENCE NUMBER',
    'LOAN PURPOSE',
    'ORIGINAL LOAN TERM',
    'NUMBER OF BORROWERS',
]

# The origination file is pipe-delimited and has no header row, so the column
# names are supplied explicitly; only the modelling columns are kept.
orig = pd.read_csv(
    'data/train/orig_train.txt',
    sep='|',
    names=orig_headers,
    low_memory=False,
)[cols]

In [None]:
# Load the labelled performance sample. It is joined to the origination data
# on 'LOAN SEQUENCE NUMBER' and supplies the 'ZERO BALANCE CODE' column that
# is later recoded into the target.
# NOTE(review): presumably this CSV was produced by mp_concatter plus a
# sampling step that is not shown here -- confirm.
mp_data = pd.read_csv('data/labelled_samp.csv', low_memory=False)
# Preview the first rows in the notebook output.
mp_data.head()

In [None]:
# Attach origination attributes to each performance row (left join on the
# loan identifier) and drop rows with any missing value, which also removes
# performance rows that found no matching origination record.
data = (
    mp_data.set_index('LOAN SEQUENCE NUMBER')
    .join(orig.set_index('LOAN SEQUENCE NUMBER'))
    .dropna()
)

# Keep only rows whose score / CLTV / DTI fall inside the accepted ranges
# (bounds inclusive). NOTE(review): values outside these ranges are presumably
# "not available" sentinel codes -- confirm against the data dictionary.
in_range = (
    data['CREDIT SCORE'].between(301, 850)
    & data['ORIGINAL CLTV'].between(0, 200)
    & data['ORIGINAL DTI'].between(0, 65)
)
# Explicit copy so the column assignments below write to an independent frame
# rather than to a view of the unfiltered data.
data = data[in_range].copy()

# Assign the replaced columns back instead of calling
# ``data[col].replace(..., inplace=True)``: the chained in-place form operates
# on a temporary Series and is a silent no-op under pandas Copy-on-Write.
# NOTE(review): 999 in 'MI %' is presumably a "not available" code -- confirm.
data['MI %'] = data['MI %'].replace(999, 0)
# Binarise the label: code 1 becomes 0, codes 2, 3, 6, 9 and 15 become 1.
data['ZERO BALANCE CODE'] = data['ZERO BALANCE CODE'].replace(
    {1: 0, 9: 1, 6: 1, 3: 1, 2: 1, 15: 1}
)

In [None]:
def mp_concatter(indir, outdir):
    """Merge pipe-delimited monthly performance files into a single CSV.

    Every file matched by the glob pattern is read with ``mp_headers`` as its
    column names, reduced to the ``cols_p`` columns, and stripped of rows that
    have a missing value in any of those columns. The pieces are concatenated
    and written to one CSV file without the index.

    Args:
        indir: Glob pattern selecting the input files
            (e.g. ``'data/mp/*.txt'``).
        outdir: Path of the CSV file to write (a file path, despite the name).

    Raises:
        ValueError: If the pattern matches no files.
    """
    # Sorted so the row order of the output does not depend on the arbitrary
    # order in which glob lists directory entries.
    filelist = sorted(glob.glob(indir))
    if not filelist:
        raise ValueError('No files match pattern {!r}'.format(indir))

    dflist = []
    for file in filelist:
        print(file)
        # usecols avoids materialising the columns that would be discarded
        # immediately afterwards.
        frame = pd.read_csv(file, delimiter='|', names=mp_headers,
                            usecols=cols_p, low_memory=False)
        # Re-select to pin the column order to cols_p, then drop incomplete
        # rows without mutating the frame in place.
        dflist.append(frame[cols_p].dropna())

    pd.concat(dflist, axis=0).to_csv(outdir, index=False)
    
    
# mp_concatter('data/mp/*.txt', 'data/labels.csv')
# mp_concatter('data/test/mp_test/*.txt', 'data/test/mp_test.csv')


In [None]:
def orig_concatter(path, out_path='orig.txt'):
    """Concatenate the text files matched by a glob pattern into one file.

    Files are appended in sorted name order. If a file does not end with a
    newline, one is added so that its last record is not fused with the first
    record of the next file.

    Args:
        path: Glob pattern selecting the input files
            (e.g. ``'data/orig/*.txt'``).
        out_path: File to write; defaults to ``'orig.txt'`` in the current
            working directory. Any existing file is overwritten.
    """
    # Never read the output file as an input: it is truncated when opened for
    # writing and would otherwise be consumed while it is being written.
    out_abs = os.path.abspath(out_path)
    files = sorted(
        name for name in glob.glob(path) if os.path.abspath(name) != out_abs
    )

    with open(out_path, 'w') as result:
        for file_ in files:
            last_line = ''
            # Context manager so each input handle is closed promptly.
            with open(file_, 'r') as source:
                for line in source:
                    result.write(line)
                    last_line = line
            if last_line and not last_line.endswith('\n'):
                result.write('\n')
                
# orig_concatter('data/orig/*.txt')
# orig_concatter('data/test/orig_test/*.txt')

In [None]:
# Separate the label column from the feature columns.
y = data['ZERO BALANCE CODE']
X = data.drop(columns=['ZERO BALANCE CODE'])

In [None]:
# Categorical feature columns that are one-hot encoded; every other column
# is passed through unchanged.
one_hot_cols = [
    'FIRST TIME HOMEBUYER FLAG', 'OCCUPANCY STATUS', 'CHANNEL',
    'PPM FLAG', 'PROPERTY STATE', 'PROPERTY TYPE', 'LOAN PURPOSE',
]

# handle_unknown='ignore': a category that is absent from the data the encoder
# was fitted on (e.g. a rare value missing from one cross-validation training
# fold) is encoded as all zeros instead of raising at transform time.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
    remainder='passthrough')

In [None]:
# Fit the column transformer on X and display the encoded feature matrix in
# the notebook output; the result is not stored.
column_trans.fit_transform(X)

In [None]:
# Encoding and classifier are chained in one pipeline so the encoder is
# re-fitted on the training part of every cross-validation split.
pipe_lr = make_pipeline(
    column_trans,
    LogisticRegression(class_weight='balanced', random_state=99),
)

# 5-fold cross-validation scored by ROC AUC, using all available cores.
cross_val_score(
    pipe_lr,
    X,
    y,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)