In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
np.random.seed(203)
from tqdm import tqdm
import datetime
from collections import Counter
import re

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from keras.models import Model,Sequential
from keras.layers import *
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import backend as K
from keras.utils import to_categorical
from keras.optimizers import RMSprop, Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Single place to change where the competition CSVs live; previously the same
# absolute directory was repeated in every read_csv call.
DATA_DIR = '/Users/s0c02nj/Desktop/Mortgage Modelling'

# Train, test and submission-format files, all read from DATA_DIR.
data_train = pd.read_csv(DATA_DIR + '/CAX_MortgageModeling_Train.csv')
data_test = pd.read_csv(DATA_DIR + '/CAX_MortgageModeling_Test.csv')
data_sub = pd.read_csv(DATA_DIR + '/CAX_MortgageModeling_SubmissionFormat.csv')

In [4]:
# Remove the identifier columns from both frames; the test frame also loses
# its 'RESULT' column.
data_train = data_train.drop(columns=['Unique_ID', 'MORTGAGE NUMBER'])
data_test = data_test.drop(columns=['Unique_ID', 'MORTGAGE NUMBER', 'RESULT'])

In [5]:
# Split the training frame into the feature frame and the response series.
x_train = data_train.drop(columns=['RESULT'])
y = data_train['RESULT']


In [None]:
# Display the entry selected by [1] from the per-class counts of the response.
data_train['RESULT'].value_counts()[1]

In [6]:
# Stack the training features on top of the test rows so that both receive
# the same pre-processing (the original row indexes are kept as-is).
x_comb = pd.concat(objs=[x_train, data_test], sort=False)

In [7]:
# Columns treated as continuous features.
cont_cols = [
    'PROPERTY VALUE',
    'MORTGAGE PAYMENT',
    'GDS',
    'LTV',
    'TDS',
    'AMORTIZATION',
    'MORTGAGE AMOUNT',
    'INCOME',
    'CREDIT SCORE',
    'RATE',
]

In [8]:
# Columns treated as categorical features (label-encoded, then embedded).
cat_cols = [
    'PAYMENT FREQUENCY',
    'PROPERTY TYPE',
    'FSA',
    'TERM',
    'AGE RANGE',
    'GENDER',
    'INCOME TYPE',
    'NAICS CODE',
    'MORTGAGE PURPOSE',
]

In [9]:
# Columns that receive a log1p transform before modelling.
log_transformed = [
    'PROPERTY VALUE',
    'MORTGAGE PAYMENT',
    'LTV',
    'INCOME',
    'CREDIT SCORE',
    'MORTGAGE AMOUNT',
]

In [10]:
# NOTE(review): this list is not referenced by any other visible cell, and
# 'LTV' also appears in log_transformed — confirm which list it belongs to.
non_log_cols = [
    'GDS',
    'LTV',
    'TDS',
    'AMORTIZATION',
    'RATE',
]

In [11]:
# Replace every categorical column with integer codes. A fresh encoder is
# fitted per column on the combined train+test values.
for col in tqdm(cat_cols):
    le = LabelEncoder()
    le.fit(x_comb[col])
    x_comb[col] = le.transform(x_comb[col])

100%|██████████| 9/9 [00:00<00:00, 38.40it/s]


In [12]:
# Encode the response labels as integers. le_y is kept so that predicted
# class indices can be mapped back to the original labels later.
le_y = LabelEncoder().fit(y)
y_cat = le_y.transform(y)

In [13]:
# Apply log(1 + x) in place to each listed column.
# Note: re-running this cell transforms the already-transformed values again.
for feature in log_transformed:
    x_comb[feature] = np.log1p(x_comb[feature])


In [14]:
# For every continuous column add a "<col>count" feature holding how many
# rows of x_comb share that row's value; the new column names are collected
# in col_count.
col_count = []

# Iterate the list itself rather than enumerate(...): the index was unused and
# wrapping enumerate hid the length from tqdm, so no total could be shown.
for col in tqdm(cont_cols):
    count_name = str(col) + 'count'
    counter = Counter(x_comb[col])
    col_count.append(count_name)
    # Counter returns 0 for keys it has never seen.
    x_comb[count_name] = x_comb[col].apply(lambda value: counter[value])

10it [00:00, 14.68it/s]


In [15]:
# Column list for the dense-input frame: the continuous columns followed by
# the (label-encoded) categorical columns.
cont_cols_new = [*cont_cols, *cat_cols]

In [16]:
# Undo the train/test concatenation positionally. The boundary is taken from
# x_train instead of a hard-coded row count, so the split stays correct if the
# input files change size.
n_train = len(x_train)
train_x = x_comb.iloc[:n_train]
test_x = x_comb.iloc[n_train:]

In [17]:
# Categorical-only views of the train and test frames.
train_cat, test_cat = train_x.loc[:, cat_cols], test_x.loc[:, cat_cols]

# Frames fed to the dense input of the model (columns listed in cont_cols_new).
train_cont, test_cont = train_x.loc[:, cont_cols_new], test_x.loc[:, cont_cols_new]

In [18]:
def get_train_test_data_cat(df, cols=None):
    """Return selected columns of ``df`` as a list, one entry per column.

    Args:
        df: Frame to pull the columns from.
        cols: Column names to extract, in the order they should appear in the
            result. Defaults to the notebook-level ``cat_cols`` list, which
            matches the previous behaviour.

    Returns:
        list with ``df[col]`` for every requested column, in order.
    """
    if cols is None:
        cols = cat_cols

    return [df[col] for col in cols]

In [19]:
# One list of per-column categorical inputs for each of the train and test frames.
x_train_cat, x_test_cat = (
    get_train_test_data_cat(frame) for frame in (train_x, test_x)
)

In [20]:
# Final model inputs: the per-column categorical inputs followed by the
# dense-input frame as the last element.
x_train_comb = [*x_train_cat, train_cont]
x_test_comb = [*x_test_cat, test_cont]

In [29]:
# One-hot encode the integer class ids; y_def is the target passed to model1.fit.
y_def = to_categorical(y_cat)

In [21]:
# Number of columns in the dense-input frame (continuous + categorical lists).
len(cont_cols_new)

19

In [30]:
def model_deep():
    """Build the (uncompiled) embedding + dense Keras model.

    One single-value input with its own Embedding layer is created for every
    column in ``cat_cols``. A further input takes the ``cont_cols_new``
    columns and feeds a 30-unit ReLU Dense layer. All branches are
    concatenated and passed through Dense(15, relu), Dropout(0.5) and a
    2-unit softmax output.

    Reads the notebook-level names ``cat_cols``, ``cont_cols_new`` and
    ``x_comb`` (the latter for each categorical column's cardinality).

    Returns:
        A Keras ``Model`` whose inputs are ordered as one input per entry of
        ``cat_cols`` followed by the dense input.
    """
    layer_cat = []
    input_cat = []

    # Categorical branches. Iterating the list directly (the enumerate index
    # was unused) lets tqdm report a total.
    for categorical_var in tqdm(cat_cols):
        no_of_unique_cat = x_comb[categorical_var].nunique()
        # Embedding width: half the cardinality rounded up, capped at 5.
        embedding_size = int(min(np.ceil(no_of_unique_cat / 2), 5))

        # One scalar input and one embedding per categorical column.
        input_catg = Input(shape=(1,))
        layer_catg = Embedding(no_of_unique_cat + 1, embedding_size,
                               input_length=1, trainable=True)(input_catg)
        layer_catg = Flatten()(layer_catg)
        layer_cat.append(layer_catg)
        input_cat.append(input_catg)

    # Dense branch. Its width follows cont_cols_new instead of a hard-coded 19,
    # so changing the feature list no longer breaks the input shape.
    input_cont = Input(shape=(len(cont_cols_new),))
    layer_cont = Dense(30, activation='relu')(input_cont)

    # Merge every embedding branch with the dense branch.
    layer_comb = concatenate(layer_cat + [layer_cont])

    layer_dense = Dense(15, activation='relu')(layer_comb)
    layer_dense = Dropout(0.5)(layer_dense)

    # Final output: two-class softmax.
    layer_output = Dense(2, activation='softmax')(layer_dense)

    # Combined inputs, in the same order the input data list is assembled.
    input_comb = input_cat + [input_cont]

    model = Model(inputs=input_comb, outputs=layer_output)

    return model
        
    

In [31]:
# Instantiate the network and print its layer-by-layer summary.
model1 = model_deep()
model1.summary()

9it [00:00, 50.97it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_15 (

In [32]:
def f1(y_true, y_pred):
    """F1-style metric built from Keras backend ops.

    Precision and recall are derived from counts obtained by clipping the
    tensors to [0, 1], rounding, and summing over every element (no axis is
    given, so all entries of the tensors contribute). ``K.epsilon()`` is added
    to each denominator to avoid division by zero.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Prediction tensor, same shape as ``y_true``.

    Returns:
        Scalar tensor ``2 * p * r / (p + r + eps)``.
    """
    eps = K.epsilon()

    # Entries that are positive in both the rounded prediction and the target.
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    # Entries that are positive in the target / in the rounded prediction.
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_score = true_pos / (predicted_pos + eps)
    recall_score = true_pos / (actual_pos + eps)

    return 2*((precision_score*recall_score)/(precision_score+recall_score+eps))


In [33]:
# Configure training: Adam at learning rate 0.01, categorical cross-entropy
# loss, and the custom f1 function reported as a metric.
model1.compile(
    optimizer=Adam(lr=0.01),
    loss="categorical_crossentropy",
    metrics=[f1],
)

In [38]:
# Train for 10 epochs in batches of 256, with validation_split=0.16 setting
# aside part of the training data for validation.
history = model1.fit(
    x_train_comb,
    y_def,
    epochs=10,
    batch_size=256,
    verbose=1,
    validation_split=0.16,
)


Train on 38339 samples, validate on 7303 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
# Score the test inputs; the model's output layer is a 2-unit softmax.
pred = model1.predict(x_test_comb)

In [None]:
# Predicted class index per row: position of the largest value along the last axis.
y_classes = np.argmax(pred, axis=-1)

In [None]:
# Map the predicted class indices back to the original response labels.
y_final = list(le_y.inverse_transform(y_classes))

In [None]:
# Store the decoded class labels. `pred` is the raw output of a 2-unit softmax
# (two values per row) and cannot be assigned to a single column; y_final holds
# the one-label-per-row predictions derived from it.
# NOTE(review): the column name looks carried over from another project —
# confirm it against the submission-format file before submitting.
data_sub['amount_spent_per_room_night_scaled'] = y_final

In [None]:
# Write the submission frame without the index column.
# NOTE(review): the absolute path points at a different directory than the one
# the inputs were read from — confirm the intended output location.
data_sub.to_csv('/Users/s0c02nj/Desktop/Mahindra/Sub_cat7.csv',index=False)