In [1]:
%matplotlib inline
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import tensorflow
from keras.layers import Dense, Dropout
from keras.models import Model, Sequential
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import ReduceLROnPlateau,EarlyStopping, ModelCheckpoint
import seaborn as sns
from termcolor import colored
import warnings

Using TensorFlow backend.


In [2]:
credit_train = pd.read_csv('../input/data-storm-10/credit_card_default_train.csv')
credit_train.drop('Client_ID',axis=1,inplace=True)

credit_test = pd.read_csv('../input/data-storm-10/credit_card_default_test.csv')
credit_test.drop('Client_ID',axis=1,inplace=True)

In [3]:
def gender_map(dataframe):
    gender_map = {'M': 1, 'F':0}
    dataframe['Gender'] = dataframe['Gender'].map(gender_map)
    
def education_map(dataframe):
    edu_map = {'Graduate':0, 'High School':1, 'Other':2}
    dataframe['EDUCATION_STATUS'] = dataframe['EDUCATION_STATUS'].map(edu_map)
    
def marriage_map(dataframe):
    marry_map = {'Single':0, 'Other':1}
    dataframe['MARITAL_STATUS'] = dataframe['MARITAL_STATUS'].map(marry_map)
    
def age_map(dataframe):
    age_map = {'Less than 30':0, '31-45':1, '46-65':2, 'More than 65':3}
    dataframe['AGE'] = dataframe['AGE'].map(age_map)
    
def normalize(df):
    col_to_norm = ['PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 
                   'PAID_AMT_OCT','PAID_AMT_NOV','DUE_AMT_AVG', 'PAID_AVG']
    df[col_to_norm] = df[col_to_norm].apply(lambda x : (x-np.min(x)/(np.max(x)-np.min(x))))
   
    
def pay_month(dataframe):
    cols = ['PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT', 'PAY_NOV', 'PAY_DEC']
    for col in cols:
        dataframe[col] = dataframe[col].mask(dataframe[col] < 1, -1)
        dataframe[col].mask(dataframe[col] >= 1, 1)

In [4]:
#pre-processing the features of the training dataset
train_df = credit_train.copy()
gender_map(train_df)
education_map(train_df)
marriage_map(train_df)
age_map(train_df)
pay_month(train_df)

In [5]:
#pre-processing the features of the test dataset
test_df = credit_test.copy()
gender_map(test_df)
education_map(test_df)
marriage_map(test_df)
age_map(test_df)
pay_month(test_df)

In [6]:
#mapping the Balance_Limit_V1 categorically
train_test_data = [train_df, test_df]
balance_limit_mapping = {"100K": -2, "200K": -1, "300K": -1, "400K": 0, " 500K":1, "1M":2, "1.5M":3, "2.5M":4}
for dataset in train_test_data:
    dataset['Balance_Limit_V1'] = dataset['Balance_Limit_V1'].map(balance_limit_mapping)

In [7]:
#taking Balance_Limit_V1 as continuous values
credit_train.Balance_Limit_V1 = (credit_train.Balance_Limit_V1.replace(r'[KM]+$', '', regex=True).astype(float) * credit_train.Balance_Limit_V1.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [10**3, 10**6]).astype(int))
credit_train['Balance_Limit_V1'] = pd.to_numeric(credit_train['Balance_Limit_V1'])

credit_test.Balance_Limit_V1 = (credit_test.Balance_Limit_V1.replace(r'[KM]+$', '', regex=True).astype(float) * credit_test.Balance_Limit_V1.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K','M'], [10**3, 10**6]).astype(int))
credit_test['Balance_Limit_V1'] = pd.to_numeric(credit_test['Balance_Limit_V1'])

In [8]:
#Feature Engineering 
def feature_eng(dataframe1, dataframe2):
    dataframe1['PAY_VALUE'] =  dataframe1.iloc[:, 5:11].sum(axis=1)*(6*dataframe2['Balance_Limit_V1'] - (dataframe1.iloc[:, 11:17].sum(axis=1)  - dataframe1.iloc[:, 17:23].sum(axis=1)))
    dataframe1['PAID_AVG'] = dataframe1.iloc[:,17:23].mean(axis=1)
    dataframe1['DUE_AMT_AVG'] = ((dataframe1['PAY_JULY']) * dataframe1['DUE_AMT_JULY'] + (dataframe1['PAY_AUG']) * dataframe1['DUE_AMT_AUG'] + (dataframe1['PAY_SEP']) * dataframe1['DUE_AMT_SEP'] + (dataframe1['PAY_OCT']) * dataframe1['DUE_AMT_OCT'] + (dataframe1['PAY_NOV']) * dataframe1['DUE_AMT_NOV'])/5 

In [9]:
feature_eng(train_df, credit_train)
feature_eng(test_df, credit_test)

#Correlations
corr_matrix = train_df.corr()
print(corr_matrix['NEXT_MONTH_DEFAULT'].sort_values(ascending=False))

NEXT_MONTH_DEFAULT    1.000000
PAY_JULY              0.398579
PAY_AUG               0.331788
PAY_SEP               0.286168
PAY_VALUE             0.274284
PAY_OCT               0.271352
PAY_NOV               0.265157
PAY_DEC               0.250302
DUE_AMT_AVG           0.227013
EDUCATION_STATUS      0.038902
AGE                   0.005968
DUE_AMT_DEC          -0.006521
DUE_AMT_NOV          -0.007614
DUE_AMT_OCT          -0.009755
DUE_AMT_SEP          -0.014722
DUE_AMT_AUG          -0.015180
DUE_AMT_JULY         -0.020010
MARITAL_STATUS       -0.027905
Gender               -0.041649
PAID_AMT_DEC         -0.052466
PAID_AMT_NOV         -0.053849
PAID_AMT_OCT         -0.056143
PAID_AMT_SEP         -0.057808
PAID_AMT_AUG         -0.059039
PAID_AMT_JULY        -0.072891
PAID_AVG             -0.101068
Balance_Limit_V1     -0.165948
Name: NEXT_MONTH_DEFAULT, dtype: float64


In [10]:
#features selected to train the model
feature_columns = ['Balance_Limit_V1', 'PAY_JULY','PAY_AUG','PAY_SEP','PAY_OCT','PAY_DEC','PAY_NOV', 'PAID_AMT_JULY', 'PAID_AMT_AUG', 
                   'PAID_AMT_SEP', 'PAID_AMT_OCT','PAID_AMT_NOV','PAID_AVG','PAY_VALUE', 'DUE_AMT_AVG']

In [11]:
train_df = train_df[feature_columns]
normalize(train_df)
train_df = train_df.join(credit_train['NEXT_MONTH_DEFAULT'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [12]:
test_df = test_df[feature_columns]
normalize(test_df)

In [13]:
X = train_df.iloc[:,:-1].values
Y = train_df.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)

In [14]:
#checking whether the train and validation have similar distribution in terms of outcome
print(f'Proportion of credit defaults in training set : {np.sum(y_train==1)/len(y_train)}')
print(f'Proportion of credit defaults in test set : {np.sum(y_test==1)/len(y_test)}')

Proportion of credit defaults in training set : 0.22265625
Proportion of credit defaults in test set : 0.21979166666666666


In [15]:
#Sequencial Dense network
model = Sequential()
model.add(Dense(128, input_dim=len(feature_columns,), activation='tanh'))
model.add(Dense(64, activation='tanh'))
model.add(Dense(64))
model.add(Dropout(0.35))
model.add(Dense(32, activation='tanh'))
model.add(Dense(32, activation='tanh'))
model.add(Dense(32))
model.add(Dropout(0.35))
model.add(Dense(8, activation='tanh'))
model.add(Dense(8, activation='tanh'))
model.add(Dropout(0.25))
model.add(Dense(2, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               2176      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_6 (Dense)              (None, 32)               

In [16]:
#Settting callbacks
early_stop = EarlyStopping(monitor='val_acc', patience=30, restore_best_weights=True)
reducer = ReduceLROnPlateau(monitor='val_acc', factor=0.01, patience=20)
callbacks = [early_stop, reducer]

In [17]:
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001, beta_1=0.97, decay=1e-16), metrics=['acc'])
history = model.fit(x_train, y_train, validation_data=(x_test, y_test) ,epochs=1000, batch_size=16, callbacks=callbacks)

Train on 19200 samples, validate on 4800 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000


In [18]:
y_pred = model.predict(x_test)

roc=metrics.roc_auc_score(y_test, y_pred>0.5)
acc = metrics.accuracy_score(y_test, y_pred>0.5)
prec = metrics.precision_score(y_test, y_pred>0.5)
rec = metrics.recall_score(y_test, y_pred>0.5)
f1 = metrics.f1_score(y_test, y_pred>0.5)

model_performance = pd.DataFrame([['Neural', acc,prec,rec, f1,roc]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])

model_performance

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC
0,Neural,0.81625,0.632466,0.391469,0.483607,0.663692


In [19]:
y_pred = model.predict(test_df)
y_pred = y_pred>0.5

In [20]:
test_file = pd.read_csv('/kaggle/input/data-storm-10/credit_card_default_test.csv')
test_file.loc[:,'NEXT_MONTH_DEFAULT'] = np.uint8(y_pred)
submission = test_file[['Client_ID', 'NEXT_MONTH_DEFAULT']]
submission.to_csv("sub9.csv",index=False)