In [39]:
# load libraries

import operator

import numpy as np
import pandas as pd
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
import ipywidgets as widgets
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [26]:
# load the dataset

# Read the German credit CSV (path is relative to the notebook's working
# directory) and report how many rows were loaded.
credit = pd.read_csv('german_credit_data.csv')
print(f"The dataset has {len(credit)} credit records.")

The dataset has 1000 credit records.


In [27]:
# Preview the first two rows to check the column layout of the raw file.
credit.head(n=2)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad


In [28]:
# The CSV carries a leftover, unnamed index column that pandas labels
# 'Unnamed: 0'. Drop it by name rather than by position: a positional
# `iloc[:, 1:]` removes one more real column (Age, then Sex, ...) every time
# this cell is re-run, whereas dropping by name is a no-op on the second run.
leftover_index_cols = [col for col in credit.columns if str(col).startswith('Unnamed')]
credit = credit.drop(columns=leftover_index_cols)
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.2+ KB


In [29]:
# Sex vs age cross tabulation

"""A box plot is a statistical representation of numerical data through their quartiles. 
The ends of the box represent the lower and upper quartiles, 
while the median (second quartile) is marked by a line inside the box."""

# Only the two plotted columns are needed for this figure.
SA = credit[['Sex', 'Age']]

# One box per sex; points="all" also draws every individual observation.
fig = px.box(SA, x="Sex", y="Age", color="Sex", points="all")

# Centre the title horizontally near the top of the figure and label the axes.
fig.update_layout(
    title=dict(
        text="Sex Vs Age Cross tabulation",
        x=.5,
        y=.95,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title="Sex",
    yaxis_title="Age",
)
fig.show()

In [30]:
# Purpose distribution

"""A histogram is a representation of the distribution of numerical data, 
where the data are binned and the count for each bin is represented."""

# Stand-alone handle on the Purpose column; the figure below reads the column
# straight from `credit` and does not use this variable.
Purpose = credit['Purpose']

# One bar per loan purpose, coloured by purpose.
fig = px.histogram(credit, x="Purpose", color="Purpose")

# Centre the title horizontally near the top of the figure.
fig.update_layout(
    title=dict(
        text="Purpose breakdown",
        x=.5,
        y=.95,
        xanchor='center',
        yanchor='top',
    )
)
fig.show()

In [31]:
# Purpose Vs Credit Amount Cross tabulation

# Only the two plotted columns are needed for this figure.
SC = credit[['Purpose', 'Credit amount']]

# One box of credit amounts per loan purpose.
fig = px.box(SC, x="Purpose", y="Credit amount", color="Purpose")

# Quartile algorithm: "exclusive" here; "inclusive" is the alternative and
# "linear" is plotly's default.
fig.update_traces(quartilemethod="exclusive")

# Centre the title horizontally near the top of the figure and label the axes.
fig.update_layout(
    title=dict(
        text="Purpose Vs Credit Amount Cross tabulation",
        x=.5,
        y=.95,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title="Purpose",
    yaxis_title="Credit amount",
)
fig.show()

In [32]:
# Risk Vs attributes Cross tabulation

"""The parallel categories diagram is a visualization of multi-dimensional categorical data sets. 
Each variable in the data set is represented by a column of rectangles, where each rectangle corresponds to a 
discrete value taken on by that variable. The relative heights of the rectangles reflect the 
relative frequency of occurrence of the corresponding value."""

# Create dimensions: one column of rectangles per categorical attribute.
gender_dim = go.parcats.Dimension(values=credit.Sex, label="Sex")
Housing_dim = go.parcats.Dimension(values=credit.Housing, label="Housing")
Checking_account_dim = go.parcats.Dimension(values=credit['Checking account'], label="Checking account")
Purpose_dim = go.parcats.Dimension(values=credit.Purpose, label="Purpose")
Risk_dim = go.parcats.Dimension(values=credit.Risk, label="Risk")

# Colour settings for the ribbons. They are defined here but not passed to the
# trace below, so the figure is drawn with plotly's default ribbon colour.
color = np.zeros(len(credit), dtype='uint8')
colorscale = [[0, 'gray'], [0.33, 'firebrick'],
              [0.33, 'firebrick'], [0.66, 'blue'],
              [0.66, 'blue'], [1.0, 'green']]

# Create parcats trace. Dimensions are listed in left-to-right display order.
parcats_trace = go.Parcats(
    dimensions=[gender_dim, Housing_dim, Checking_account_dim, Purpose_dim, Risk_dim],
    hoveron='color',
    hoverinfo='count+probability',
    labelfont={'size': 18, 'family': 'Times'},
    tickfont={'size': 16, 'family': 'Times'},
    arrangement='freeform',
)
fig = go.Figure(data=[parcats_trace])
fig.show()

In [33]:
# Encode the target as integers: 1 = 'bad' risk, 0 = 'good' risk.
# The guard makes this cell safe to re-run: pushing the already-encoded 0/1
# values through the string lookup again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(credit['Risk']):
    credit['Risk'] = credit['Risk'].map({'bad':1, 'good':0})

In [34]:
# Missing account information becomes its own category, 'Others', instead of
# being left as NaN.
for account_col in ('Saving accounts', 'Checking account'):
    credit[account_col] = credit[account_col].fillna('Others')

In [35]:
# Work on an independent (deep) copy so later feature engineering does not
# modify `credit`.
credit_clean = credit.copy(deep=True)

In [36]:
# Column groups: categorical features are one-hot encoded, numeric columns
# (including the already-encoded target 'Risk') are passed through unchanged.
cat_features = ['Sex','Housing', 'Saving accounts', 'Checking account','Purpose']
num_features=['Age', 'Job', 'Credit amount', 'Duration', 'Risk']

# get_dummies encodes every listed column in a single call, so no loop over
# cat_features is needed (looping only rebuilt the same frame once per column).
dummies = pd.get_dummies(credit_clean[cat_features])
df1 = pd.concat([credit_clean[num_features], dummies], axis=1)

# Separate the target (Risk) from the feature matrix (df2).
Risk = df1['Risk']
df2 = df1.drop(columns=['Risk'])

In [37]:
# Splitting data for training and testing

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df2,
    Risk,
    test_size=0.20,
    random_state=30,
)

## Model-building process:

Risk-prediction is a standard supervised classification task.

1. Supervised: The labels are included in the training data and the goal is to train a model to learn to predict the labels from the features.
2. Classification: The label is a binary variable: 0 (good risk; the loan is expected to be repaid on time) or 1 (bad risk; the borrower is likely to have difficulty repaying the loan).

In [40]:
# Baseline model: a random forest with only random_state set (all other
# hyperparameters left at their defaults), fitted on the training split.
random_forest = RandomForestClassifier(random_state=100)
random_forest.fit(X=X_train, y=Y_train)

RandomForestClassifier(random_state=100)

## Model optimization:

Hyperparameters are model-specific parameters whose values are set before the learning process begins. In RandomForestClassifier, the hyperparameters include the number of trees in the forest (n_estimators) and the maximum depth of the tree (max_depth) as described within the model specifications.

Hyperparameter tuning is the process of searching for the set of hyperparameter values that gives the best model performance, for example the highest precision and accuracy.

There are several parameter tuning techniques, but two of the most widely-used parameter optimizing techniques are:
1. Grid search: The candidate values of each hyperparameter are laid out as a grid (like a matrix), and every combination of values on that grid is evaluated.

2. Random search: Random combinations of hyperparameter values are sampled and evaluated, and the combination that performs best under the chosen scoring metric is kept.

The following Random Forest (RF) hyperparameters are tuned using random search:

1. n_estimators = number of trees in the forest
2. max_features = max number of features considered for splitting a node
3. max_depth = max number of levels in each decision tree
4. min_samples_split = min number of data points placed in a node before the node is split
5. min_samples_leaf = min number of data points allowed in a leaf node
6. bootstrap = method for sampling data points (with or without replacement)

