# Credit Risk


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
# Read the customer data CSV into a DataFrame.
customer_data = pd.read_csv(
    '../input/credit-risk-classification-dataset/customer_data.csv'
)


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import Dropout
from keras import regularizers


In [None]:
def create_binary_model(input_dim=13):
    """Build and compile a small feed-forward binary classifier.

    The network stacks two L2-regularised ReLU layers (16 and 8 units), each
    followed by 25% dropout, and ends in a single sigmoid output unit.

    Args:
        input_dim: Number of input features fed to the first layer.
            Defaults to 13, the width this definition previously hard-coded.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        ``'rmsprop'`` optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(16, input_dim=input_dim, kernel_initializer='normal',
                    kernel_regularizer=regularizers.l2(0.001), activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(8, kernel_initializer='normal',
                    kernel_regularizer=regularizers.l2(0.001), activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation='sigmoid'))

    # The model is compiled with 'rmsprop'. An ``Adam(lr=0.001)`` instance used
    # to be created here but was never handed to compile(), and it relied on
    # the deprecated ``lr`` keyword, so that dead local has been removed.
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

binary_model = create_binary_model()

# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only added a stray "None" line to the cell output.
binary_model.summary()

In [None]:
# DataFrame.info() writes its report to stdout and returns None; calling it
# directly avoids the extra "None" line that print() would append.
customer_data.info()

In [None]:
# Discard every row that contains a missing value.
customer_data = customer_data.dropna()

# Fit a first binomial regression. A 70/30 train/test split is made up front
# so the final model can be evaluated on data it has not seen.
import statsmodels.api as sm

exog = customer_data[
    [
        'fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5', 'fea_6',
        'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11',
    ]
]
endog = customer_data['label']
exog_train, exog_test, endog_train, endog_test = train_test_split(
    exog,
    endog,
    train_size=0.70,
    test_size=0.30,
    random_state=1,
)

# NOTE(review): no constant column is added to the design matrix, so the GLM
# appears to be fitted without an intercept -- confirm this is intended.
binomial_model = sm.GLM(
    endog_train,
    exog_train,
    family=sm.families.Binomial(),
)
binomial_results = binomial_model.fit()
print(binomial_results.summary())

In [None]:
#Only feature 4 has a significant z-value, but let us check whether the variables are correlated with each other,
#because multicollinearity can make individual variables appear less significant

In [None]:
# Pairwise correlation matrix of the training feature columns.
exog_train.corr()

In [None]:
# Screen the correlation matrix for pairs with correlation above 0.2:
#   fea_1 ~ fea_6;  fea_2 ~ fea_3 and fea_4;  fea_3 ~ fea_7;
#   fea_7 ~ fea_10;  fea_10 ~ fea_11.
# From each correlated group, drop the feature with the lower z-value,
# which removes fea_6, fea_2, fea_3 and fea_7.
exog = exog_train[
    ['fea_1', 'fea_4', 'fea_5', 'fea_8', 'fea_9', 'fea_10', 'fea_11']
]

In [None]:
# Refit the binomial GLM on the current ``exog`` selection and print the
# fit report.
binomial_model = sm.GLM(
    endog_train,
    exog,
    family=sm.families.Binomial(),
)
binomial_results = binomial_model.fit()
print(binomial_results.summary())

In [None]:
# Drop further features that are clearly insignificant, keeping only
# fea_1, fea_4, fea_8 and fea_11, then refit.
exog = exog_train[['fea_1', 'fea_4', 'fea_8', 'fea_11']]
binomial_model = sm.GLM(
    endog_train,
    exog,
    family=sm.families.Binomial(),
)
binomial_results = binomial_model.fit()
print(binomial_results.summary())

In [None]:
# There is an argument for keeping feature 1: once feature 11 is removed,
# feature 1's significance might rise. The correlation between the two is
# small (11%), though, so this may not make the difference we hope for.
exog = exog_train[['fea_1', 'fea_4', 'fea_8']]
binomial_model = sm.GLM(
    endog_train,
    exog,
    family=sm.families.Binomial(),
)
binomial_results = binomial_model.fit()
print(binomial_results.summary())

In [None]:
# Drop feature 1 as well and refit on fea_4 and fea_8 only.
exog = exog_train[['fea_4', 'fea_8']]
binomial_model = sm.GLM(
    endog_train,
    exog,
    family=sm.families.Binomial(),
)
binomial_results = binomial_model.fit()
print(binomial_results.summary())


In [None]:
# Hard-coded coefficient pair -- presumably the fea_4 and fea_8 estimates
# copied from the GLM summary printed above; TODO confirm against
# binomial_results.params. Not referenced again in the visible cells.
coeff=[-9.133e-06,-0.0045]

In [None]:
# Show the two retained feature columns of the held-out test split.
exog_test[['fea_4','fea_8']]

In [None]:
# Hard-coded coefficients laid out as a 2x1 column vector, one row per
# selected feature column.
matrix = np.array([-9.133e-06, -0.0045]).reshape(2, 1)
# Matching two-column slice of the held-out features.
test = exog_test[['fea_4', 'fea_8']]


In [None]:
# ``test`` has one row per held-out sample and two columns; ``matrix`` is a
# 2x1 column vector, so the product is an (n_samples, 1) array of linear
# scores and no transpose is required.
test_as_matrix = np.array(test)
linear_predictor = np.matmul(test_as_matrix, matrix)

# Make the continuous scores binary so they can be compared with the true
# labels later: strictly positive scores become 1, everything else 0.
# This is done in one vectorised step instead of looping over a hard-coded
# 293 rows, so it stays correct if the size of the test split changes.
predictions = (linear_predictor > 0).astype(linear_predictor.dtype)


In [None]:
# Copy the held-out labels into a plain NumPy array, then display it.
endog_test_matrix = np.array(endog_test)
endog_test_matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# accuracy_score takes (y_true, y_pred): pass the true labels first, as the
# neural-network evaluation further down does, and flatten the (n, 1)
# prediction column to 1-D so both arguments have the same shape.
score = accuracy_score(endog_test_matrix, predictions.ravel())
score
# This model is very good at predicting the zeros in the data; unfortunately
# it is pretty bad at predicting the ones.
# What if we try an approach other than regression? The reason the model is
# not good might be that the data is simply not appropriate to regress on.

In [None]:
# Let's try a different approach: a neural network built with Keras.
def create_binary_model(input_dim=11):
    """Build and compile a small feed-forward binary classifier.

    The network stacks two L2-regularised ReLU layers (16 and 8 units), each
    followed by 25% dropout, and ends in a single sigmoid output unit.

    Args:
        input_dim: Number of input features fed to the first layer.
            Defaults to 11, the width this definition previously hard-coded.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        ``'rmsprop'`` optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(16, input_dim=input_dim, kernel_initializer='normal',
                    kernel_regularizer=regularizers.l2(0.001), activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(8, kernel_initializer='normal',
                    kernel_regularizer=regularizers.l2(0.001), activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation='sigmoid'))

    # The model is compiled with 'rmsprop'. An ``Adam(lr=0.001)`` instance used
    # to be created here but was never handed to compile(), and it relied on
    # the deprecated ``lr`` keyword, so that dead local has been removed.
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

binary_model = create_binary_model()

# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only added a stray "None" line to the cell output.
binary_model.summary()

In [None]:
# Train for 50 epochs with a batch size of 10, passing the held-out split as
# validation data; the returned History object is kept for plotting.
history = binary_model.fit(
    exog_train,
    endog_train,
    validation_data=(exog_test, endog_test),
    epochs=50,
    batch_size=10,
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Model accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'])
plt.show()


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Round the network outputs to the nearest integer to obtain class labels,
# then report accuracy and the per-class metrics on the held-out split.
predicted_scores = binary_model.predict(exog_test)
binary_pred = np.round(predicted_scores).astype(int)

print('Results for Binary Model')
print(accuracy_score(endog_test, binary_pred))
print(classification_report(endog_test, binary_pred))

In [None]:
#The model is not better than regression