In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings ('ignore')

## Load Dataframe

In [None]:
#load data
dataframe = pd.read_csv ('../input/german-credit-data-with-risk/german_credit_data.csv')

## Data Exploration

In [None]:
dataframe.head()

In [None]:
dataframe.info()

* There are a feature type that doesn't match, that's a Job feature.
* The saving account and checking account features have missing value.
* There are a features that are not used, that's Unnamed: 0

In [None]:
print ('Unique value of Saving accounts\n', dataframe['Saving accounts'].unique())
print ('Unique value of Checking account\n', dataframe['Checking account'].unique())
print ('Unique value of Purpose\n', dataframe['Purpose'].unique())

It can be seen that the features of saving and checking account are categorical data of ordinal type. So to handle it we need to use Labeling.

Meanwhile, the Purpose feature is categorical data of nominal type. So to handle it we need one hot encoding

In [None]:
dataframe[['Age', 'Credit amount', 'Duration']].describe()

In [None]:
#Check distribution of target
sns.set_style (style = "whitegrid")

plt.figure(figsize = (10, 5))
sns.countplot(dataframe['Risk'])
plt.title('Distribution of Target', color = 'blue', loc = 'center', fontsize = 20)
plt.xlabel('Category of Risk', color = 'black', fontsize = 14)
plt.ylabel('Count', color = 'black', fontsize = 14)
plt.show()

It appears that the distribution of the targets is not quite balanced 700 for good risk and 300 risk. 

In [None]:
#Check Distribution of numeric features
numeric_features = ['Age', 'Credit amount', 'Duration']
fig, ax = plt.subplots(1, 3, figsize=(15, 6))
dataframe[numeric_features][dataframe['Risk'] == "good"].hist(bins=10, color = 'Blue', alpha=0.5, ax = ax)
dataframe[numeric_features][dataframe['Risk'] == "bad"].hist(bins=10, color = 'Black', alpha=0.5, ax = ax)
plt.show()

Feature Age, Credit amount and duration there doesn't appear to be any inclination towards the Risk Credit of customers.

In [None]:
#Check distribution of feature
cols = ['Sex', 'Job', 'Housing', 'Purpose']

fig, axarr = plt.subplots(2, 3, figsize=(15, 15))
for i in cols:
    index = cols.index(i)
    plt.subplot(2, 3, index + 1)
    sns.countplot(x = i, data = dataframe, hue="Risk", palette = "deep")
    plt.xticks(rotation=90)

There is a tendency that bad risk will occur to people who are male, the type of work is 2,type housing is own housing, and the purpose of the credit loan is to pay off a car.

In [None]:
#Check outlier
fig, axarr = plt.subplots(1, 3, figsize=(10, 5))
cols = ['Age', 'Credit amount', 'Duration']
for i in cols:
    index = cols.index(i)
    plt.subplot(1,3,index + 1)
    sns.boxplot(dataframe[i])

## Data Preprocessing

In [None]:
#Drop unnecessary column
dataframe.drop('Unnamed: 0', axis = 1, inplace = True)

In [None]:
#Handle Missing Value
dataframe['Saving accounts'].fillna('None', inplace = True)
dataframe['Checking account'].fillna('None', inplace = True)

In [None]:
print ("Before handling ordinal type feature Saving accounts", dataframe['Saving accounts'].unique())
print ('Before handling ordinal type feature Checking accounts', dataframe['Checking account'].unique())
print ('Before handling ordinal type feature Checking accounts', dataframe['Risk'].unique())

In [None]:
le = LabelEncoder()
dataframe['Saving accounts'] = le.fit_transform(dataframe['Saving accounts'])
dataframe['Checking account'] = le.fit_transform(dataframe['Checking account'])
dataframe['Risk'] = le.fit_transform(dataframe['Risk'])

print ("After handling ordinal type feature Saving accounts", dataframe['Saving accounts'].unique())
print ('After handling ordinal type feature Checking accounts', dataframe['Checking account'].unique())
print ('After handling ordinal type feature Checking accounts', dataframe['Risk'].unique())

In [None]:
#Change data type Job
dataframe['Job'] = dataframe['Job'].astype(object)

In [None]:
#Handling outlier
Q1 = (dataframe[['Age', 'Credit amount', 'Duration']]).quantile(0.25)
Q3 = (dataframe[['Age', 'Credit amount', 'Duration']]).quantile(0.75)

IQR = Q3 - Q1

maximum = Q3 + (1.5*IQR)
minimum = Q3 - (1.5*IQR)

more_than = (dataframe[['Age', 'Credit amount', 'Duration']] > maximum)
lower_than = (dataframe[['Age', 'Credit amount', 'Duration']] < minimum)

dataframe[['Age', 'Credit amount', 'Duration']] = dataframe[['Age', 'Credit amount', 'Duration']].mask(more_than, maximum, axis=1)
dataframe[['Age', 'Credit amount', 'Duration']] = dataframe[['Age', 'Credit amount', 'Duration']].mask(lower_than, minimum, axis=1)

In [None]:
fig, axarr = plt.subplots(1, 3, figsize=(10, 5))
cols = ['Age', 'Credit amount', 'Duration']
for i in cols:
    index = cols.index(i)
    plt.subplot(1,3,index + 1)
    sns.boxplot(dataframe[i])

In [None]:
#One Hot Encoding
dataframe = pd.get_dummies(dataframe)

In [None]:
#Spliting Dataframe
X = dataframe.drop('Risk', axis = 1)
y = dataframe['Risk']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
print ('Size of X train', X_train.shape)
print ('Size of X test', X_test.shape)
print ('Size of y train', y_train.shape)
print ('Size of y test', y_test.shape)

## Modelling

In [None]:
DecisionTree = DecisionTreeClassifier()
DecisionTree.fit(X_train, y_train)
y_pred_train = DecisionTree.predict(X_train)
y_pred_test = DecisionTree.predict(X_test)

print ('Accuracy of model based on training set', accuracy_score(y_train, y_pred_train))
print ('Accuracy of model based on testing set', accuracy_score(y_test, y_pred_test))

print ('Classification report based on training set\n',classification_report(y_train, y_pred_train))
print ('Classification report based on testing set\n', classification_report(y_test, y_pred_test))

# Form confusion matrix as a DataFrame
confusion_matrix_train_df = pd.DataFrame((confusion_matrix(y_train, y_pred_train)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_train_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Training Model\n(Decision Tree)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

confusion_matrix_testing_df = pd.DataFrame((confusion_matrix(y_test, y_pred_test)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_testing_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Testing Model\n(Decision Tree)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

In [None]:
NaiveBayes = GaussianNB()
NaiveBayes.fit(X_train, y_train)
y_pred_train = NaiveBayes.predict(X_train)
y_pred_test = NaiveBayes.predict(X_test)

print ('Accuracy of model based on training set', accuracy_score(y_train, y_pred_train))
print ('Accuracy of model based on testing set', accuracy_score(y_test, y_pred_test))

print ('Classification report based on training set\n', classification_report(y_train, y_pred_train))
print ('Classification report based on testing set\n', classification_report(y_test, y_pred_test))

# Form confusion matrix as a DataFrame
confusion_matrix_train_df = pd.DataFrame((confusion_matrix(y_train, y_pred_train)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_train_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Training Model\n(Naive Bayes Classifier)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

confusion_matrix_testing_df = pd.DataFrame((confusion_matrix(y_test, y_pred_test)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_testing_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Testing Model\n(Naive Bayes Classifier)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

In [None]:
RandomForest = RandomForestClassifier()
RandomForest.fit(X_train, y_train)
y_pred_train = RandomForest.predict(X_train)
y_pred_test = RandomForest.predict(X_test)

print ('Accuracy of model based on training set', accuracy_score(y_train, y_pred_train))
print ('Accuracy of model based on testing set', accuracy_score(y_test, y_pred_test))

print ('Classification report based on training set\n', classification_report(y_train, y_pred_train))
print ('Classification report based on testing set\n', classification_report(y_test, y_pred_test))

# Form confusion matrix as a DataFrame
confusion_matrix_train_df = pd.DataFrame((confusion_matrix(y_train, y_pred_train)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_train_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Training Model\n(Random Forest Classifier)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

confusion_matrix_testing_df = pd.DataFrame((confusion_matrix(y_test, y_pred_test)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_testing_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Testing Model\n(Random Forest Classifier)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

In [None]:
LogisticReg = LogisticRegression()
LogisticReg.fit(X_train, y_train)
y_pred_train = LogisticReg.predict(X_train)
y_pred_test = LogisticReg.predict(X_test)

print ('Accuracy of model based on training set', accuracy_score(y_train, y_pred_train))
print ('Accuracy of model based on testing set', accuracy_score(y_test, y_pred_test))

print ('Classification report based on training set\n', classification_report(y_train, y_pred_train))
print ('Classification report based on training set\n', classification_report(y_test, y_pred_test))

# Form confusion matrix as a DataFrame
confusion_matrix_train_df = pd.DataFrame((confusion_matrix(y_train, y_pred_train)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_train_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Training Model\n(Logistic Regression)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

confusion_matrix_testing_df = pd.DataFrame((confusion_matrix(y_test, y_pred_test)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_testing_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Testing Model\n(Logistic Regression)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

In [None]:
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
y_pred_train = KNN.predict(X_train)
y_pred_test = KNN.predict(X_test)

print ('Accuracy of model based on training set', accuracy_score(y_train, y_pred_train))
print ('Accuracy of model based on testing set', accuracy_score(y_test, y_pred_test))

print ('Classification report based on training set\n', classification_report(y_train, y_pred_train))
print ('Classification report based on testing set\n', classification_report(y_test, y_pred_test))

# Form confusion matrix as a DataFrame
confusion_matrix_train_df = pd.DataFrame((confusion_matrix(y_train, y_pred_train)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_train_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Training Model\n(KNN Classifier)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

confusion_matrix_testing_df = pd.DataFrame((confusion_matrix(y_test, y_pred_test)), ('Bad', 'Good'), ('Bad', 'Good'))
# Plot confusion matrix
plt.figure()
heatmap = sns.heatmap(confusion_matrix_testing_df, annot=True, annot_kws={'size': 14}, fmt='d', cmap='YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title('Confusion Matrix for Testing Model\n(KNN Classifier)', fontsize=18, color='darkblue')
plt.ylabel('True abel', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

## Evaluation Model

Based on the modeling above, we choose Logistic Regression. This is because the performance of the Logistic Regression model tends to be able to predict equally well in the training and testing phases. On the other hand, other algorithms tend to over-fit their performance.