In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw credit-risk table into `data`.
csv_path = '../input/credit-risk/original.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print, in order: the frame's shape, its column index, and the null count per column.
for summary in (data.shape, data.columns, data.isnull().sum()):
    print(summary)

In [None]:
# Print the (min, max) pair of each column, one line per column.
for column in ('age', 'income', 'loan', 'default'):
    print(data[column].min(), data[column].max())

In [None]:
# Class balance of the target column.
print(data.default.value_counts())

# Flip the sign of negative ages, leaving every other row untouched.
negative_age = data.age < 0
data.loc[negative_age, 'age'] = -data.loc[negative_age, 'age']

In [None]:
# Replace missing ages with the mean of the ages that are present.
age_missing = data.age.isnull()
mean_known_age = data.loc[~age_missing, 'age'].mean()
data.loc[age_missing, 'age'] = mean_known_age
data.reset_index(drop=True, inplace=True)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Bar chart of how many rows fall in each `default` class.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
sns.countplot(x='default', data=data)
plt.tight_layout()

In [None]:
# Print the frame's column dtypes and non-null counts.
data.info()

In [None]:
# Frequency of each distinct age value.
print(data.age.value_counts())
# Number of rows whose age is still missing. `.sum()` counts the True flags;
# `.count()` on the boolean mask would only return the total number of rows.
print(data.age.isnull().sum())

In [None]:
# Left panel: income histogram with a KDE overlay.
# Right panel: income distribution split by the `default` flag.
fig, ax = plt.subplots(1,2,figsize=(10,5))
# histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot and draws the same density histogram + KDE curve.
f1 = sns.histplot(data['income'], bins=50, kde=True, stat='density', ax=ax[0])
f1.set_title("history income ")
f2 = sns.violinplot(data=data, x='default',y='income',ax=ax[1])
f2.set_title('distribution default and income')

In [None]:
# Left panel: age histogram with a KDE overlay.
# Right panel: age distribution split by the `default` flag.
fig, ax = plt.subplots(1,2,figsize=(10,5))
# histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot and draws the same density histogram + KDE curve.
f1 = sns.histplot(data['age'], bins=50, kde=True, stat='density', ax=ax[0])
f1.set_title("history age ")
f2 = sns.violinplot(data=data, x='default',y='age',ax=ax[1])
f2.set_title('distribution default and age')

In [None]:
# Left panel: loan histogram with a KDE overlay.
# Right panel: loan distribution split by the `default` flag.
fig, ax = plt.subplots(1,2,figsize=(10,5))
# histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot and draws the same density histogram + KDE curve.
f1 = sns.histplot(data['loan'], bins=50, kde=True, stat='density', ax=ax[0])
f1.set_title("history loan ")
f2 = sns.violinplot(data=data, x='default',y='loan',ax=ax[1])
f2.set_title('distribution default and loan')

In [None]:
print(data.clientid.unique())

# Number of client rows per `default` class, as a two-column frame.
group1 = data.groupby(['default'])['clientid'].count().reset_index()

plt.bar(x=group1['default'], height=group1['clientid'])
plt.title('Count of defaut')
plt.show()

In [None]:
# Remove the identifier column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
data = data.drop(columns=['clientid'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1].to_numpy()
Y = data.iloc[:, -1].to_numpy()

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a 50-tree random forest on the training split (fit() returns the estimator).
randomforest = RandomForestClassifier(n_estimators=50, random_state=0).fit(x_train, y_train)

# Hard class predictions for both splits.
predict_train = randomforest.predict(x_train)
predict_test = randomforest.predict(x_test)

# Accuracy in percent, train first and then test.
for label, truth, guess in (
    ('accuracy Score predict train', y_train, predict_train),
    ('accuracy Score predict test', y_test, predict_test),
):
    print(label, accuracy_score(truth, guess)*100)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

random_state = 20

# Candidate models, in the same order as `classifier_param` below.
# Logistic regression uses the liblinear solver because its grid searches both
# 'l1' and 'l2' penalties, and the default lbfgs solver rejects 'l1'.
classifier = [DecisionTreeClassifier(random_state=random_state),
              SVC(random_state=random_state),
              LogisticRegression(random_state=random_state, solver='liblinear'),
              KNeighborsClassifier()]

dt_param_grid = {'min_samples_split': range(10, 500, 20), 'max_depth': range(1, 20, 2)}
svc_param_grid = {'kernel': ['rbf'], 'gamma': [0.001, 0.01, 0.1, 1], 'C': [1, 10, 50, 100, 200, 300, 1000]}
logreg_param_grid = {'C': np.logspace(-3, 3, 7), 'penalty': ['l1', 'l2']}
# 'manhattan' must be spelled exactly; a misspelled metric makes every fit
# that uses it fail.
knn_param_grid = {'n_neighbors': np.linspace(1, 19, 10, dtype=int).tolist(),
                  'weights': ['uniform', 'distance'],
                  'metric': ['euclidean', 'manhattan']}

classifier_param = [dt_param_grid,
                    svc_param_grid,
                    logreg_param_grid,
                    knn_param_grid]

# Best cross-validated accuracy and best refitted estimator per model,
# stored in the same order as `classifier`.
cv_result = []
best_estimators = []

for estimator, param_grid in zip(classifier, classifier_param):
    clf = GridSearchCV(estimator, param_grid=param_grid, cv=StratifiedKFold(n_splits=10),
                       scoring='accuracy', n_jobs=-1, verbose=1)
    clf.fit(x_train, y_train)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(clf.best_score_)

In [None]:
# One row per tuned model; the names are listed in the order the models were
# searched (decision tree, SVM, logistic regression, KNN).
cv_results = pd.DataFrame({'Cross Validation Means':cv_result, 'ML Models':['DecisionTreeClassifier','SVM','LogisticRegression','KNeighborsClassifier']})
# x/y are passed by keyword: current seaborn releases reject positional x/y.
graph1 = sns.barplot(x='Cross Validation Means', y='ML Models', data=cv_results)
graph1.set_xlabel('mean Accuracy')
# The y axis lists the models, not a score.
graph1.set_ylabel('ML Models')

In [None]:
# Soft-voting ensemble over the tuned decision tree (index 0), logistic
# regression (index 2) and KNN (index 3) from best_estimators. The SVC at
# index 1 is left out; it was built without probability=True, which soft
# voting needs.
votingC = VotingClassifier(estimators=[("dt", best_estimators[0]),
                                       ("lr", best_estimators[2]),
                                       ("knn", best_estimators[3])],
                           voting="soft", n_jobs=-1)
votingC = votingC.fit(x_train, y_train)
# The metric is accuracy (not precision); arguments are in (y_true, y_pred) order.
print('Voting classifier test accuracy:', 100*accuracy_score(y_test, votingC.predict(x_test)))

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize features for the neural network. The scaler is fitted on the
# training split only and then applied to the test split; re-fitting it on the
# test data would scale the two splits differently and leak test statistics.
# NOTE: x_train / x_test are overwritten with their scaled versions here.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

import keras

from keras.models import Sequential
from keras.layers import Dense

# Small feed-forward binary classifier: two hidden ReLU layers and a sigmoid output.
classifier = Sequential()
# The input width is taken from the data rather than hard-coded.
classifier.add(Dense(units = 3, kernel_initializer = 'uniform', activation = 'relu', input_dim=x_train.shape[1]))

classifier.add(Dense(units = 4, kernel_initializer = 'uniform', activation= 'relu'))

classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation='sigmoid'))

classifier.compile(optimizer = 'adam', loss= 'binary_crossentropy',metrics = ['accuracy'])

classifier.fit(x_train,y_train,batch_size =10, epochs=50)

In [None]:
# Sigmoid outputs of the network, flattened to 1-D score arrays. These stay
# as raw scores so they can be ranked later.
predict_train_ann = classifier.predict(x_train).flatten()
predict_test_ann = classifier.predict(x_test).flatten()

# Threshold the scores at 0.5 to get hard 0/1 labels next to the true labels.
train_pred = pd.DataFrame({'y_train': y_train,
                           'y_train_pred': (predict_train_ann >= 0.5).astype(int)})
test_pred = pd.DataFrame({'y_test': y_test,
                          'y_test_pred': (predict_test_ann >= 0.5).astype(int)})

# Label each line with its split so the two numbers can be told apart.
print('ann accuracy Score train', accuracy_score(y_train, train_pred.y_train_pred)*100)
print('ann accuracy Score test', accuracy_score(y_test, test_pred.y_test_pred)*100)

In [None]:
# Cumulative-gain (CAP-style) curves on the training split: rank rows by model
# score, then plot how many positives have been captured after each row.
total = len(y_train)
one_count = np.sum(y_train)

plt.figure(figsize=(10,5))

# Reference lines: random ranking (diagonal) and a perfect ranking.
plt.plot([0, total], [0, one_count], c='b', linestyle='--', label='Random model')
plt.plot([0, one_count, total], [0, one_count, one_count], c='grey', linewidth=2, label='Perfect model')

for scores, colour, name in ((predict_train_ann, 'b', 'ann'),
                             (predict_train, 'red', 'Random forest')):
    # Rank by score only, with a stable sort. Sorting (score, label) tuples
    # would break ties using the true label and inflate the curve, which
    # matters most for the forest's hard 0/1 predictions.
    order = np.argsort(-np.asarray(scores, dtype=float), kind='stable')
    captured = np.append([0], np.cumsum(np.asarray(y_train)[order]))
    plt.plot(np.arange(0, total + 1), captured, c=colour, label=name, linewidth=2)

plt.title('Cumulative gain (train)')
plt.xlabel('Rows ranked by predicted score')
plt.ylabel('Cumulative positives captured')
plt.legend()

In [None]:
# Cumulative-gain (CAP-style) curves on the test split: rank rows by model
# score, then plot how many positives have been captured after each row.
total = len(y_test)
one_count = np.sum(y_test)

plt.figure(figsize=(10,5))

# Reference lines: random ranking (diagonal) and a perfect ranking.
plt.plot([0, total], [0, one_count], c='b', linestyle='--', label='Random model')
plt.plot([0, one_count, total], [0, one_count, one_count], c='grey', linewidth=2, label='Perfect model')

for scores, colour, name in ((predict_test_ann, 'b', 'ann'),
                             (predict_test, 'red', 'Random forest')):
    # Rank by score only, with a stable sort. Sorting (score, label) tuples
    # would break ties using the true label and inflate the curve, which
    # matters most for the forest's hard 0/1 predictions.
    order = np.argsort(-np.asarray(scores, dtype=float), kind='stable')
    captured = np.append([0], np.cumsum(np.asarray(y_test)[order]))
    plt.plot(np.arange(0, total + 1), captured, c=colour, label=name, linewidth=2)

plt.title('Cumulative gain (test)')
plt.xlabel('Rows ranked by predicted score')
plt.ylabel('Cumulative positives captured')
plt.legend()