## **Importing Modules**

In [45]:
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import keras

import pandas as pd
import numpy as np
#import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import glob

import random
RANDOM_SEED = 42
TRAINING_SAMPLE = 0.99
BATCH_SIZE = 32
LATENT_DIMENSION = 16

from sklearn.preprocessing import StandardScaler, LabelBinarizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn_pandas import DataFrameMapper, gen_features

from scipy import sparse

np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [46]:
print("Available GPUS : ", keras.backend.tensorflow_backend._get_available_gpus())

Available GPUS :  ['/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1']


## **Loading Data**

In [47]:
ACQUISITION_FEATURES = ["LOAN IDENTIFIER", 
                             "ORIGINATION CHANNEL", 
                             "SELLER NAME", 
                             "ORIGINAL INTEREST RATE", 
                             "ORIGINAL UPB", 
                             "ORIGINAL LOAN TERM", 
                             "ORIGINATION DATE", 
                             "FIRST PAYMENT DATE", 
                             "ORIGINAL LOAN-TO-VALUE (LTV)", 
                             "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)", 
                             "NUMBER OF BORROWERS", 
                             "ORIGINAL DEBT TO INCOME RATIO", 
                             "BORROWER CREDIT SCORE AT ORIGINATION", 
                             "FIRST TIME HOME BUYER INDICATOR",
                             "LOAN PURPOSE",
                             "PROPERTY TYPE",
                             "NUMBER OF UNITS",
                             "OCCUPANCY TYPE",
                             "PROPERTY STATE",
                             "ZIP CODE SHORT",
                             "PRIMARY MORTGAGE INSURANCE PERCENT",
                             "PRODUCT TYPE", "CO-BORROWER CREDIT SCORE AT ORIGINATION",
                             "MORTGAGE INSURANCE TYPE",
                             "RELOCATION MORTGAGE INDICATOR"]

PERFORMANCE_FEATURES = ["LOAN IDENTIFIER",
                             "MONTHLY REPORTING PERIOD",
                             "SERVICER NAME",
                             "CURRENT INTEREST RATE",
                             "CURRENT ACTUAL UPB",
                             "LOAN AGE",
                             "REMAINING MONTHS TO LEGAL MATURITY",
                             "ADJUSTED MONTHS TO MATURITY",
                             "MATURITY DATE",
                             "METROPOLITAN STATISTICAL AREA (MSA)",
                             "CURRENT LOAN DELINQUENCY STATUS",
                             "MODIFICATION FLAG",
                             "ZERO BALANCE CODE",
                             "ZERO BALANCE EFFECTIVE DATE",
                             "LAST PAID INSTALLMENT DATE",
                             "FORECLOSURE DATE",
                             "DISPOSITION DATE",
                             "FORECLOSURE COSTS",
                             "PROPERTY PRESERVATION AND REPAIR COSTS",
                             "ASSET RECOVERY COSTS",
                             "MISCELLANEOUS HOLDING EXPENSES AND CREDITS",
                             "ASSOCIATED TAXES FOR HOLDING PROPERTY",
                             "NET SALE PROCEEDS",
                             "CREDIT ENHANCEMENT PROCEEDS",
                             "REPURCHASE MAKE WHOLE PROCEEDS",
                             "OTHER FORECLOSURE PROCEEDS",
                             "NON INTEREST BEARING UPB",
                             "PRINCIPAL FORGIVENESS AMOUNT",
                             "REPURCHASE MAKE WHOLE PROCEEDS FLAG",
                             "FORECLOSURE PRINCIPAL WRITE-OFF AMOUNT",
                             "SERVICING ACTIVITY INDICATOR"]

REL_ACQUISITION_FEATURES = ["LOAN IDENTIFIER",
                            "BORROWER CREDIT SCORE AT ORIGINATION",
                            "CO-BORROWER CREDIT SCORE AT ORIGINATION",
                            "FIRST TIME HOME BUYER INDICATOR",
                            "MORTGAGE INSURANCE TYPE",
                            "NUMBER OF BORROWERS",
                            "NUMBER OF UNITS",
                            "OCCUPANCY TYPE",
                            "ORIGINAL COMBINED LOAN-TO-VALUE (CLTV)",
                            "ORIGINAL DEBT TO INCOME RATIO",
                            "ORIGINATION CHANNEL",
                            "ORIGINAL INTEREST RATE",
                            "ORIGINAL LOAN TERM",
                            "ORIGINAL LOAN-TO-VALUE (LTV)",
                            "ORIGINAL UPB",
                            "PRIMARY MORTGAGE INSURANCE PERCENT",
                            "PRODUCT TYPE",
                            "PROPERTY TYPE",
                            "RELOCATION MORTGAGE INDICATOR",
                            "SELLER NAME"]

REL_PERFORMANCE_FEATURES = ["LOAN IDENTIFIER",
                            "ADJUSTED MONTHS TO MATURITY",
                            "CURRENT ACTUAL UPB",
                            "CURRENT INTEREST RATE",
                            "CURRENT LOAN DELINQUENCY STATUS",
                            "LOAN AGE",
                            "MODIFICATION FLAG",
                            "NON INTEREST BEARING UPB",
                            "PRINCIPAL FORGIVENESS AMOUNT",
                            "REMAINING MONTHS TO LEGAL MATURITY",
                            "SERVICING ACTIVITY INDICATOR",
                            "ZERO BALANCE CODE"]

In [48]:
path = "Data/2018" # use your path
performance_files = glob.glob(path + "/Performance_*.txt")
acquisition_files = glob.glob(path + "/Acquisition_*.txt")

print(performance_files)
print(acquisition_files)

def data_to_dataframe(files, features):
    li = []

    for filename in files:
        df = pd.read_csv(filename, sep = "|", names = features, index_col=None, header=0)
        li.append(df)

    frame = pd.concat(li, axis=0, ignore_index=True)
    return frame

acq_df = data_to_dataframe(acquisition_files, ACQUISITION_FEATURES)
per_df = data_to_dataframe(performance_files, PERFORMANCE_FEATURES)

['Data/2018/Performance_2018Q4.txt', 'Data/2018/Performance_2018Q3.txt', 'Data/2018/Performance_2018Q2.txt', 'Data/2018/Performance_2018Q1.txt']
['Data/2018/Acquisition_2018Q4.txt', 'Data/2018/Acquisition_2018Q3.txt', 'Data/2018/Acquisition_2018Q2.txt', 'Data/2018/Acquisition_2018Q1.txt']


In [49]:
per_df = per_df.loc[per_df["ZERO BALANCE CODE"].notnull(), REL_PERFORMANCE_FEATURES]
df = per_df.merge(acq_df[REL_ACQUISITION_FEATURES], on = "LOAN IDENTIFIER", how = "inner")

In [50]:
df.head()

Unnamed: 0,LOAN IDENTIFIER,ADJUSTED MONTHS TO MATURITY,CURRENT ACTUAL UPB,CURRENT INTEREST RATE,CURRENT LOAN DELINQUENCY STATUS,LOAN AGE,MODIFICATION FLAG,NON INTEREST BEARING UPB,PRINCIPAL FORGIVENESS AMOUNT,REMAINING MONTHS TO LEGAL MATURITY,...,ORIGINATION CHANNEL,ORIGINAL INTEREST RATE,ORIGINAL LOAN TERM,ORIGINAL LOAN-TO-VALUE (LTV),ORIGINAL UPB,PRIMARY MORTGAGE INSURANCE PERCENT,PRODUCT TYPE,PROPERTY TYPE,RELOCATION MORTGAGE INDICATOR,SELLER NAME
0,100013508071,0.0,266614.62,5.125,X,8,N,,,352,...,R,5.125,360,80,270000,,FRM,PU,N,PENNYMAC CORP.
1,100064828729,0.0,178000.0,5.0,X,4,N,,,356,...,R,5.0,360,80,178000,,FRM,PU,N,FREEDOM MORTGAGE CORP.
2,100070500618,0.0,85000.0,5.875,X,7,N,,,353,...,C,5.875,360,95,85000,16.0,FRM,SF,N,U.S. BANK N.A.
3,100080606733,0.0,387000.0,4.75,X,6,N,,,354,...,R,4.75,360,80,387000,,FRM,PU,N,FRANKLIN AMERICAN MORTGAGE COMPANY
4,100085155788,0.0,204101.1,4.875,X,14,N,,,346,...,C,4.875,360,80,208000,,FRM,PU,N,"LAKEVIEW LOAN SERVICING, LLC"


## **Data Preprocessing and Cleaning**

In [51]:
# Data Shape and Data Info
print(df.shape)
print(df.info())
# Checking for missing data
print(df.isnull().sum())

(413802, 31)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 413802 entries, 0 to 413801
Data columns (total 31 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   LOAN IDENTIFIER                          413802 non-null  int64  
 1   ADJUSTED MONTHS TO MATURITY              413801 non-null  float64
 2   CURRENT ACTUAL UPB                       413794 non-null  float64
 3   CURRENT INTEREST RATE                    413802 non-null  float64
 4   CURRENT LOAN DELINQUENCY STATUS          413304 non-null  object 
 5   LOAN AGE                                 413802 non-null  int64  
 6   MODIFICATION FLAG                        413802 non-null  object 
 7   NON INTEREST BEARING UPB                 1 non-null       float64
 8   PRINCIPAL FORGIVENESS AMOUNT             0 non-null       float64
 9   REMAINING MONTHS TO LEGAL MATURITY       413802 non-null  int64  
 10  SERVICING ACTIVITY 

In [52]:
df.drop(["ADJUSTED MONTHS TO MATURITY", 
                 "NON INTEREST BEARING UPB", 
                 "PRINCIPAL FORGIVENESS AMOUNT",
                 "SERVICING ACTIVITY INDICATOR",
                 "CO-BORROWER CREDIT SCORE AT ORIGINATION",
                 "MORTGAGE INSURANCE TYPE",
                 "PRIMARY MORTGAGE INSURANCE PERCENT",
                 "CURRENT LOAN DELINQUENCY STATUS"], axis = 1, inplace = True)

In [53]:
# Current UPB 
df["CURRENT ACTUAL UPB"].fillna(df["CURRENT ACTUAL UPB"].mean(), inplace = True)
df["BORROWER CREDIT SCORE AT ORIGINATION"].fillna(df["BORROWER CREDIT SCORE AT ORIGINATION"].mean(), inplace = True)
df["ORIGINAL DEBT TO INCOME RATIO"].fillna(df["ORIGINAL DEBT TO INCOME RATIO"].mean(), inplace = True)

In [54]:
## Encoding the target variable
# 0 - Defaulted, 1 - Not Defaulted
df["ZERO BALANCE CODE"].replace({
    1: 1,
    2: 0,
    3: 0,
    6: 0,
    9: 0,
    15: 1,
    16: 1
}, inplace = True)

In [55]:
#sns.distplot(LabelEncoder().fit_transform(df["ZERO BALANCE CODE"]))

In [56]:
df["ZERO BALANCE CODE"].value_counts()

1.0    411882
0.0      1920
Name: ZERO BALANCE CODE, dtype: int64

## **EDA**

In [57]:
# # Checking multicolinearity between continous features
# #correlation matrix
# corrmat = df.corr()
# f, ax = plt.subplots(figsize=(12, 9))
# sns.heatmap(corrmat, vmax=.8, square=True)

In [58]:
df.drop(["LOAN IDENTIFIER", "ORIGINAL UPB", "ORIGINAL LOAN-TO-VALUE (LTV)", "ORIGINAL LOAN TERM", "ORIGINAL INTEREST RATE"], axis = 1, inplace = True)

In [59]:
df.head()

Unnamed: 0,CURRENT ACTUAL UPB,CURRENT INTEREST RATE,LOAN AGE,MODIFICATION FLAG,REMAINING MONTHS TO LEGAL MATURITY,ZERO BALANCE CODE,BORROWER CREDIT SCORE AT ORIGINATION,FIRST TIME HOME BUYER INDICATOR,NUMBER OF BORROWERS,NUMBER OF UNITS,OCCUPANCY TYPE,ORIGINAL COMBINED LOAN-TO-VALUE (CLTV),ORIGINAL DEBT TO INCOME RATIO,ORIGINATION CHANNEL,PRODUCT TYPE,PROPERTY TYPE,RELOCATION MORTGAGE INDICATOR,SELLER NAME
0,266614.62,5.125,8,N,352,1.0,718.0,N,2,1,P,80.0,40.0,R,FRM,PU,N,PENNYMAC CORP.
1,178000.0,5.0,4,N,356,1.0,781.0,N,1,1,P,80.0,35.0,R,FRM,PU,N,FREEDOM MORTGAGE CORP.
2,85000.0,5.875,7,N,353,1.0,754.0,N,1,1,P,100.0,44.0,C,FRM,SF,N,U.S. BANK N.A.
3,387000.0,4.75,6,N,354,1.0,805.0,N,2,1,P,80.0,31.0,R,FRM,PU,N,FRANKLIN AMERICAN MORTGAGE COMPANY
4,204101.1,4.875,14,N,346,1.0,687.0,N,2,1,P,80.0,46.0,C,FRM,PU,N,"LAKEVIEW LOAN SERVICING, LLC"


## Preparing train and test splits¶

In [60]:
## Split the data into closed and defaulted 
paid_df = df[df["ZERO BALANCE CODE"] == 1]
default_df = df[df["ZERO BALANCE CODE"] == 0]

In [61]:
## Split into train test
# Shuffle the dataset 
paid_df = paid_df.sample(frac=1).reset_index(drop=True)

train_size = int(len(paid_df)*TRAINING_SAMPLE)

train_df = paid_df.iloc[:train_size]
test_df_paid = paid_df.iloc[train_size:].sample(frac=1).reset_index(drop=True)
test_df_default = default_df.sample(frac=1).reset_index(drop=True)

In [62]:
test_df_paid["ZERO BALANCE CODE"].value_counts()

1.0    4119
Name: ZERO BALANCE CODE, dtype: int64

In [63]:
test_df_default["ZERO BALANCE CODE"].value_counts()

0.0    1920
Name: ZERO BALANCE CODE, dtype: int64

## Encoding the Data

In [64]:
categorical_variables = ["MODIFICATION FLAG",
                         "FIRST TIME HOME BUYER INDICATOR",
                         "OCCUPANCY TYPE",
                         "ORIGINATION CHANNEL",
                         "PRODUCT TYPE",
                         "PROPERTY TYPE",
                         "RELOCATION MORTGAGE INDICATOR",
                         "SELLER NAME"]
target_variable = ["ZERO BALANCE CODE"]
numerical_variables = [[feature] for feature in df.columns if feature not in (categorical_variables + target_variable)]

In [65]:
def encode_data(dataframe, test_paid, test_default):
    
    cat_feature_def = gen_features(columns = categorical_variables, classes = [LabelBinarizer])
    num_feature_def = gen_features(columns = numerical_variables, classes = [StandardScaler])
    
    
    mapper = DataFrameMapper(cat_feature_def + num_feature_def, drop_cols = target_variable)
    
    mapper.fit(dataframe)
    train_data = mapper.transform(dataframe)
    test_paid_data = mapper.transform(test_paid)
    test_default_data = mapper.transform(test_default)
    
    return train_data, test_paid_data, test_default_data

In [68]:
train_data, test_input_paid, test_input_default = encode_data(train_df, test_df_paid, test_df_default)

In [69]:
train_data.shape

(407763, 48)

In [70]:
test_input_paid.shape

(4119, 48)

In [71]:
test_input_default.shape

(1920, 48)

In [72]:
# # Standardizing numerical features
# sc_num = StandardScaler()
# sc_num.fit(train_df[numerical_variables])

# train_num_variables_encoded = sc_num.transform(train_df[numerical_variables])
# test_num_paid = sc_num.transform(test_df_paid[numerical_variables])
# test_num_default = sc_num.transform(test_df_default[numerical_variables])

In [73]:
# #One hot encoding categorical variables
# one_cat = OneHotEncoder(handle_unknown="ignore")
# one_cat.fit(train_df[categorical_variables])

# train_cat_variables_encoded = one_cat.transform(train_df[categorical_variables])
# test_cat_paid = one_cat.transform(test_df_paid[categorical_variables])
# test_cat_default = one_cat.transform(test_df_default[categorical_variables])

## Autoencoder

In [74]:
# Create Input
# train_input = sparse.hstack((train_cat_variables_encoded, train_num_variables_encoded)).toarray()
train_data = np.expand_dims(train_data, axis = 2)
input_shape = (train_data.shape[1], train_data.shape[2])

In [75]:
# Model
def get_model(input_shape):
    autoencoder = tf.keras.models.Sequential()
    # Encoding Layers
    autoencoder.add(tf.keras.layers.Conv1D(64, 2, activation = "relu", padding = "same", input_shape = input_shape))
    autoencoder.add(tf.keras.layers.MaxPool1D(2))
    autoencoder.add(tf.keras.layers.BatchNormalization())
    autoencoder.add(tf.keras.layers.Conv1D(32, 2, activation = "relu", padding = "same"))
    autoencoder.add(tf.keras.layers.MaxPool1D(2))
    autoencoder.add(tf.keras.layers.BatchNormalization())
    autoencoder.add(tf.keras.layers.Flatten())
    autoencoder.add(tf.keras.layers.Dense(48, activation="relu"))
    
    # Compressed representation
    autoencoder.add(tf.keras.layers.Dense(LATENT_DIMENSION, activation="relu"))
    
    # Decoding layers
    autoencoder.add(tf.keras.layers.Dense(48, activation="relu"))
    autoencoder.add(tf.keras.layers.Reshape((12, 4)))
    autoencoder.add(tf.keras.layers.Conv1D(32, 2, activation = "relu", padding = "same"))
    autoencoder.add(tf.keras.layers.UpSampling1D(2))
    autoencoder.add(tf.keras.layers.BatchNormalization())
    autoencoder.add(tf.keras.layers.Conv1D(64, 2, activation = "relu", padding = "same"))
    autoencoder.add(tf.keras.layers.UpSampling1D(2))
    autoencoder.add(tf.keras.layers.BatchNormalization())

    autoencoder.add(tf.keras.layers.Conv1D(1, 2, padding = "same"))
    
    return autoencoder

In [76]:
# Compiling Model
model = get_model(input_shape)
model.compile(optimizer = tf.keras.optimizers.Adam(0.01), loss = "mae")
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_6 (Conv1D)            (None, 48, 64)            192       
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
batch_normalization_5 (Batch (None, 24, 64)            256       
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 24, 32)            4128      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 12, 32)            0         
_________________________________________________________________
batch_normalization_6 (Batch (None, 12, 32)            128       
_________________________________________________________________
flatten_2 (Flatten)          (None, 384)               0         
__________

In [77]:
callback=[tf.keras.callbacks.EarlyStopping(patience=5, monitor="val_loss")]

In [78]:
model.fit(train_data, train_data, epochs = 100, batch_size = BATCH_SIZE, callbacks=callback, validation_split=0.25)

Train on 305822 samples, validate on 101941 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


<tensorflow.python.keras._impl.keras.callbacks.History at 0x2b042640fb38>

## Reconstructing Test Data

In [79]:
reconstructions_paid = model.predict(np.expand_dims(test_input_paid, axis = 2))
reconstructions_default = model.predict(np.expand_dims(test_input_default, axis = 2))

In [84]:
mae_default.shape

(1920,)

In [82]:
mae_paid = np.mean(np.abs(np.squeeze(test_input_paid) - np.squeeze(reconstructions_paid)), axis=1)
mae_default = np.mean(np.abs(np.squeeze(test_input_default) - np.squeeze(reconstructions_default)), axis=1)

In [89]:
mae_paid.mean()

0.06295656544415648

In [86]:
mae_default.mean()

0.06928936402903059

## Plan of Action for the Next Two Weeks



### 1. Clustering of latent representations(Suggestion by Junyi)
#### Train the autoencoder on paid mortgages
#### Extract the latent representations for both paid and defaulted mortgages
#### Cluster these representations using Kmeans
#### t-distributed stochastic neighbor embedding (t-SNE) for visualizing high dimensional data

### 2. Categorical data embedding  

### 3. Compare performance with existing work
#### Deep Learning for Anomaly Detection: A Survey - Chalapathy et.al