In [22]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
np.random.seed(203)
from tqdm import tqdm
import datetime
from collections import Counter
import re

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Dropout, Dense, concatenate, Embedding
from keras.layers import BatchNormalization, Flatten, Activation
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import backend as K
from keras.utils import to_categorical
from keras.optimizers import RMSprop, Adam

In [2]:
train_data = pd.read_csv('/Users/s0c02nj/Desktop/LTFS/train_aox2Jxw/train.csv')
test_data =  pd.read_csv('/Users/s0c02nj/Desktop/LTFS/test_bqCt9Pv.csv')
sub_data = pd.read_csv('/Users/s0c02nj/Desktop/LTFS/sample_submission_24jSKY6.csv')

In [3]:
train_data.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,01-01-84,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,31-07-85,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,24-08-85,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,30-12-93,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,09-12-77,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


In [5]:
x_train = train_data.drop('loan_default',axis=1)
y_def = train_data['loan_default']

In [6]:
x_train = x_train.drop('UniqueID',axis=1)
x_test = test_data.drop('UniqueID',axis=1)

In [7]:
x_comb = pd.concat([x_train,x_test],sort=False)

In [8]:
x_comb = x_comb.replace(np.nan, 'unknown', regex=True)

In [9]:
from datetime import timedelta, date
col = 'date'
x_comb['Date.of.Birth'] = pd.to_datetime(x_comb['Date.of.Birth'])
future = x_comb['Date.of.Birth'] > date(year=2050,month=1,day=1)
x_comb.loc[future, 'Date.of.Birth'] -= timedelta(days=365.25*100)

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  after removing the cwd from sys.path.


In [10]:
from datetime import timedelta, date
col = 'date'
x_comb['DisbursalDate'] = pd.to_datetime(x_comb['DisbursalDate'])
future = x_comb['DisbursalDate'] > date(year=2050,month=1,day=1)
x_comb.loc[future, 'DisbursalDate'] -= timedelta(days=365.25*100)

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  after removing the cwd from sys.path.


In [11]:
x_comb['age'] = 2018-x_comb['Date.of.Birth'].dt.year

In [12]:
x_comb['disbursion_wk'] = x_comb['DisbursalDate'].dt.week
x_comb['disbursion_mnth'] = x_comb['DisbursalDate'].dt.month
x_comb['disbursion_day'] = x_comb['DisbursalDate'].dt.day

In [13]:
x_comb['avg_acct_age_yr'] =  x_comb['AVERAGE.ACCT.AGE'].apply(lambda x:re.findall(r'\d+', x)[0]).astype(int)
x_comb['avg_acct_age_mon'] = x_comb['AVERAGE.ACCT.AGE'].apply(lambda x:re.findall(r'\d+', x)[1]).astype(int)

x_comb['cred_his_yr'] =  x_comb['CREDIT.HISTORY.LENGTH'].apply(lambda x:re.findall(r'\d+', x)[0]).astype(int)
x_comb['cred_his_mon'] = x_comb['CREDIT.HISTORY.LENGTH'].apply(lambda x:re.findall(r'\d+', x)[1]).astype(int)

In [14]:
x_comb = x_comb.drop(['Date.of.Birth', 'DisbursalDate','AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH'], axis=1)

In [15]:
x_comb['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_cat'] = x_comb['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS']

In [None]:
x_comb.columns

In [16]:
com_count={}
for col in tqdm(list(x_comb.columns)):
    val = len(x_comb[col].unique())
    com_count[col]=val

100%|██████████| 44/44 [00:00<00:00, 280.53it/s]


In [17]:
total_cols = list(x_comb.columns)

In [56]:
cat_cols = ['branch_id','supplier_id','manufacturer_id','Current_pincode_ID','Employment.Type',
           'State_ID','Employee_code_ID','Aadhar_flag','PAN_flag','VoterID_flag','Driving_flag','MobileNo_Avl_Flag',
           'Passport_flag','PERFORM_CNS.SCORE.DESCRIPTION','DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_cat',
           'disbursion_wk','disbursion_mnth','disbursion_day']

In [57]:
cont_cols = list(set(total_cols)-set(cat_cols))

In [58]:
for i,col in tqdm(enumerate(cont_cols)):
    counter = Counter(x_comb[col])
    x_comb[str(col)+'count'] = x_comb[col].apply(lambda x:counter[x])

26it [00:04,  5.56it/s]


In [59]:
cont_cols_new = list(set(list(x_comb.columns))-set(cat_cols))

1.Label Encoding Categorical Columns

In [60]:
for col in tqdm(cat_cols):
    le = LabelEncoder()
    x_comb[col] = le.fit_transform(x_comb[col])

100%|██████████| 18/18 [00:00<00:00, 56.92it/s]


In [61]:
df_cat = x_comb[cat_cols]
df_cont = x_comb[cont_cols_new]

In [62]:
df_cont[cont_cols] = StandardScaler().fit_transform(df_cont[cont_cols])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [63]:
df_cont.index = range(0,len(df_cont))
df_cat.index = range(0,len(df_cat))

In [64]:
df_cont_train = df_cont[0:233154]
df_cont_test = df_cont[233154:]

df_cat_train = df_cat[0:233154]
df_cat_test = df_cat[233154:]

In [65]:
df_branch_id_train = df_cat_train['branch_id']
df_supplier_id_train = df_cat_train['supplier_id']
df_manufacturer_id_train = df_cat_train['manufacturer_id']
df_Current_pincode_ID_train=df_cat_train['Current_pincode_ID']
df_Employment_Type_train = df_cat_train['Employment.Type']
df_State_ID_train = df_cat_train['State_ID']
df_Employee_code_ID_train = df_cat_train['Employee_code_ID']
df_Aadhar_flag_train = df_cat_train['Aadhar_flag']
df_PAN_flag_train = df_cat_train['PAN_flag']
df_VoterID_flag_train = df_cat_train['VoterID_flag']
df_Driving_flag_train = df_cat_train['Driving_flag']
df_Passport_flag_train = df_cat_train['Passport_flag']
df_PERFORM_CNS_SCORE_DESCRIPTION_train = df_cat_train['PERFORM_CNS.SCORE.DESCRIPTION']
df_DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS_cat_train =df_cat_train['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_cat']
df_disbursion_wk_train = df_cat_train['disbursion_wk']
df_disbursion_mnth_train = df_cat_train['disbursion_mnth']
df_disbursion_day_train  = df_cat_train['disbursion_day']


In [66]:
df_branch_id_test = df_cat_test['branch_id']
df_supplier_id_test = df_cat_test['supplier_id']
df_manufacturer_id_test = df_cat_test['manufacturer_id']
df_Current_pincode_ID_test=df_cat_test['Current_pincode_ID']
df_Employment_Type_test = df_cat_test['Employment.Type']
df_State_ID_test = df_cat_test['State_ID']
df_Employee_code_ID_test = df_cat_test['Employee_code_ID']
df_Aadhar_flag_test = df_cat_test['Aadhar_flag']
df_PAN_flag_test = df_cat_test['PAN_flag']
df_VoterID_flag_test = df_cat_test['VoterID_flag']
df_Driving_flag_test = df_cat_test['Driving_flag']
df_Passport_flag_test = df_cat_test['Passport_flag']
df_PERFORM_CNS_SCORE_DESCRIPTION_test = df_cat_test['PERFORM_CNS.SCORE.DESCRIPTION']
df_DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS_cat_test=df_cat_test['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_cat']
df_disbursion_wk_test = df_cat_test['disbursion_wk']
df_disbursion_mnth_test = df_cat_test['disbursion_mnth']
df_disbursion_day_test = df_cat_test['disbursion_day']


In [67]:
com_cont={}
for col in tqdm(list(df_cont.columns)):
    val = len(df_cont[col].unique())
    com_cont[col]=val

100%|██████████| 54/54 [00:00<00:00, 320.49it/s]


In [68]:
#com_cont

In [69]:
y_cat = to_categorical(y_def)

In [72]:
def model_deep():
    
    #Defining the input-----> branch_id
    inputs1 = Input(shape=(1,))
    layer1 = Embedding(com_count['branch_id'] ,80,input_length=1,trainable=True)(inputs1)
    layer1 = Flatten()(layer1)
    
    #Defining the input-----> supplier_id
    inputs2 = Input(shape=(1,))
    layer2 = Embedding(com_count['supplier_id'] ,3089,input_length=1,trainable=True)(inputs2)
    layer2 = Flatten()(layer2)
    
    #Defining the input-----> manufacturer_id
    inputs3 = Input(shape=(1,))
    layer3 = Embedding(com_count['manufacturer_id'] ,12,input_length=1,trainable=True)(inputs3)
    layer3 = Flatten()(layer3)
    
    #Defining the input-----> Current_pincode_ID
    inputs4 = Input(shape=(1,))
    layer4 = Embedding(com_count['Current_pincode_ID'] ,7096,input_length=1,trainable=True)(inputs4)
    layer4 = Flatten()(layer4)
    
    #Defining the input-----> Employment.Type
    inputs5 = Input(shape=(1,))
    layer5 = Embedding(com_count['Employment.Type'] ,2,input_length=1,trainable=True)(inputs5)
    layer5 = Flatten()(layer5)
    
    #Defining the input-----> State_ID
    inputs6 = Input(shape=(1,))
    layer6 = Embedding(com_count['State_ID'] ,22,input_length=1,trainable=True)(inputs6)
    layer6 = Flatten()(layer6)
    
    #Defining the input-----> Employee_code_ID
    inputs7 = Input(shape=(1,))
    layer7 = Embedding(com_count['Employee_code_ID'] ,3398,input_length=1,trainable=True)(inputs7)
    layer7 = Flatten()(layer7)
    
    #Defining the input-----> Aadhar_flag
    inputs8 = Input(shape=(1,))
    layer8 = Embedding(com_count['Aadhar_flag'] ,2,input_length=1,trainable=True)(inputs8)
    layer8 = Flatten()(layer8)
    
    #Defining the input-----> PAN_flag
    inputs9 = Input(shape=(1,))
    layer9 = Embedding(com_count['PAN_flag'] ,2,input_length=1,trainable=True)(inputs9)
    layer9 = Flatten()(layer9)
    
    #Defining the input-----> VoterID_flag
    inputs10 = Input(shape=(1,))
    layer10 = Embedding(com_count['VoterID_flag'] ,2,input_length=1,trainable=True)(inputs10)
    layer10 = Flatten()(layer10)
    
    #Defining the input-----> Driving_flag
    inputs11 = Input(shape=(1,))
    layer11 =  Embedding(com_count['Driving_flag'] ,2,input_length=1,trainable=True)(inputs11)
    layer11 =  Flatten()(layer11)
    
    #Defining the input-----> Passport_flag
    inputs12 = Input(shape=(1,))
    layer12 = Embedding(com_count['Passport_flag'] ,2,input_length=1,trainable=True)(inputs12)
    layer12 = Flatten()(layer12)
    
    #Defining the input-----> PERFORM_CNS.SCORE.DESCRIPTION
    inputs13 = Input(shape=(1,))
    layer13 = Embedding(com_count['PERFORM_CNS.SCORE.DESCRIPTION'] ,20,input_length=1,trainable=True)(inputs13)
    layer13 = Flatten()(layer13)
    
    #Defining the input-----> DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_cat
    inputs14 = Input(shape=(1,))
    layer14 = Embedding(com_count['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS_cat'] ,16,input_length=1,
                        trainable=True)(inputs14)
    layer14 = Flatten()(layer14)
    
    #Defining the input-----> disbursion_wk
    inputs15 = Input(shape=(1,))
    layer15 = Embedding(com_count['disbursion_wk'] ,25,input_length=1,trainable=True)(inputs15)
    layer15 = Flatten()(layer15)
    
    #Defining the input-----> disbursion_mnth
    inputs16 = Input(shape=(1,))
    layer16 = Embedding(com_count['disbursion_mnth'] ,10,input_length=1,trainable=True)(inputs16)
    layer16 = Flatten()(layer16)
    
    #Defining the input-----> disbursion_day
    inputs17 = Input(shape=(1,))
    layer17 = Embedding(com_count['disbursion_day'] ,15,input_length=1,trainable=True)(inputs17)
    layer17 = Flatten()(layer17)
    
    #Defining the input-----> Consumer-complaint-summary
    inputs_cont = Input(shape=(54,))
    #layer_cont =  Dense(50, activation='relu')(inputs_cont)
    
    #Merge,Cocatenating the inputes
    layer_merge= concatenate([layer1, layer2,layer3,layer4,layer5,layer6,layer7,layer8,layer9, 
                              layer10,layer11,layer12,layer13,layer14,layer15,layer16,layer17,
                              inputs_cont])
    
    #Dense Layers
    layer_dense = Dense(512, activation='relu')(layer_merge)
    layer_dense = Dropout(0.5)(layer_dense)

    #Output Layer
    probabilities = Dense(2,activation='softmax')(layer_dense)

    model = Model(inputs=[inputs1,inputs2,inputs3,inputs4,inputs5,inputs6,inputs7,inputs8,inputs9,
                          inputs10,inputs11,inputs12,inputs13,inputs14,inputs15,inputs16,inputs17,inputs_cont]
                          ,outputs=probabilities)
    return model



In [77]:
df_cont_train.shape

(233154, 54)

In [78]:
model1 = model_deep()
model1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_18 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_19 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_22 (

In [79]:
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [80]:
model1.compile(loss = "binary_crossentropy", optimizer = Adam(lr=0.001), metrics = [auc])

In [81]:
#Deep Learning Model
history = model1.fit([df_branch_id_train,
                     df_supplier_id_train,
                     df_manufacturer_id_train,
                     df_Current_pincode_ID_train,
                     df_Employment_Type_train,
                     df_State_ID_train,
                     df_Employee_code_ID_train,
                     df_Aadhar_flag_train,
                     df_PAN_flag_train,
                     df_VoterID_flag_train,
                     df_Driving_flag_train,
                     df_Passport_flag_train,
                     df_PERFORM_CNS_SCORE_DESCRIPTION_train,
                     df_DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS_cat_train,
                     df_disbursion_wk_train,
                     df_disbursion_mnth_train,
                     df_disbursion_day_train,
                     df_cont_train] ,
                     y_cat, batch_size = 4096, epochs = 2, validation_split=0.1, 
                     verbose = 1)


Train on 209838 samples, validate on 23316 samples
Epoch 1/2
Epoch 2/2


In [None]:
test_pred = model1.predict([df_branch_id_test,
                     df_supplier_id_test,
                     df_manufacturer_id_test,
                     df_Current_pincode_ID_test,
                     df_Employment_Type_test,
                     df_State_ID_test,
                     df_Employee_code_ID_test,
                     df_Aadhar_flag_test,
                     df_PAN_flag_test,
                     df_VoterID_flag_test,
                     df_Driving_flag_test,
                     df_Passport_flag_test,
                     df_PERFORM_CNS_SCORE_DESCRIPTION_test,
                     df_DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS_cat_test,
                     df_disbursion_wk_test,
                     df_disbursion_mnth_test,
                     df_disbursion_day_test,
                     df_cont_test])

In [None]:
pred_ones = test_pred[:,1]

In [None]:
pred_ones

In [None]:
#train_data['loan_default'].value_counts()

In [None]:
#pred_final = (pred_ones < 0.5).astype(int)

In [None]:
sub_data['loan_default'] = pred_ones

In [None]:
sub_data.to_csv('/Users/s0c02nj/Desktop/LTFS/Submission4.csv',index=False)