https://corporatefinanceinstitute.com/resources/knowledge/credit/purpose-of-credit-risk-analysis/
https://corporatefinanceinstitute.com/resources/knowledge/credit/credit-risk-analysis/

In [1]:
import pandas as pd
import numpy as np

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df_credit = pd.read_csv("german_credit_data.csv", index_col=0)
df_credit.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [4]:
df_credit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 62.5+ KB


In [5]:
df_credit.nunique()

Age                  53
Sex                   2
Job                   4
Housing               3
Saving accounts       4
Checking account      3
Credit amount       921
Duration             33
Purpose               8
Risk                  2
dtype: int64

In [6]:
#Credit amount column
interval = (18, 25, 35, 60, 120)
cats = ['Student', 'Young', "Adult", "Senior"]

In [7]:
df_credit["Age_cat"] = pd.cut(df_credit.Age, interval, labels = cats)
df_credit["Age_cat"].value_counts()

Young      398
Adult      367
Student    190
Senior      45
Name: Age_cat, dtype: int64

In [8]:
df_bad = df_credit[df_credit["Risk"]== 'bad']
df_bad.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_cat
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,Student
4,53,male,2,free,little,little,4870,24,car,bad,Adult
9,28,male,3,own,little,moderate,5234,30,car,bad,Young
10,25,female,2,rent,little,moderate,1295,12,car,bad,Student
11,24,female,2,rent,little,little,4308,48,business,bad,Student


In [9]:
df_good = df_credit[df_credit["Risk"]=='good']
df_good.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_cat
0,67,male,2,own,,little,1169,6,radio/TV,good,Senior
2,49,male,1,own,little,,2096,12,education,good,Adult
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,Adult
5,35,male,1,free,,,9055,36,education,good,Young
6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good,Adult


In [10]:
import plotly.offline as py
py.init_notebook_mode(connected = True)

In [11]:
import plotly.graph_objects as go
import plotly.tools as tls
import warnings
from collections import Counter

In [12]:
trace0 = go.Bar(
    x = df_credit[df_credit["Risk"] == 'good']["Risk"].value_counts().index.values,
    y = df_credit[df_credit["Risk"] == 'good']["Risk"].value_counts().values,
    name = "Good credit"
)

In [13]:
trace0

Bar({
    'name': 'Good credit', 'x': array(['good'], dtype=object), 'y': array([700], dtype=int64)
})

In [14]:
trace1 = go.Bar(
    x = df_credit[df_credit["Risk"] == 'bad']["Risk"].value_counts().index.values,
    y = df_credit[df_credit["Risk"] == 'bad']["Risk"].value_counts().values,
    name = "Bad credit"
)
trace1

Bar({
    'name': 'Bad credit', 'x': array(['bad'], dtype=object), 'y': array([300], dtype=int64)
})

In [15]:
data = [trace0, trace1]
layout = go.Layout(
    yaxis=dict(
        title='Count'
    ),
    xaxis=dict(
        title='Risk Variable'
    ),
    title='Target variable distribution'
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='grouped-bar')


In [16]:
df_good = df_credit.loc[df_credit["Risk"] == 'good']['Age'].values.tolist()
df_bad = df_credit.loc[df_credit["Risk"] == 'bad']['Age'].values.tolist()
df_age = df_credit['Age'].values.tolist()

In [17]:
#the good vs age was converted into list format

In [18]:
#First plot
trace0 = go.Histogram(
    x=df_good,
    histnorm='probability',
    name="Good Credit"
)
trace0

Histogram({
    'histnorm': 'probability',
    'name': 'Good Credit',
    'x': [67, 49, 45, ..., 40, 38, 27]
})

In [19]:
#Second plot
trace1 = go.Histogram(
    x=df_bad,
    histnorm='probability',
    name="Bad Credit"
)
trace1

Histogram({
    'histnorm': 'probability',
    'name': 'Bad Credit',
    'x': [22, 53, 28, ..., 33, 26, 23]
})

In [20]:
#Third plot
trace2 = go.Histogram(
    x=df_age,
    histnorm='probability',
    name="Overall Age"
)
trace2

Histogram({
    'histnorm': 'probability',
    'name': 'Overall Age',
    'x': [67, 22, 49, ..., 38, 23, 27]
})

In [21]:
#Creating the grid
fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                          subplot_titles=('Good','Bad', 'General Distribuition'))
#setting the figs
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 2, 1)

fig['layout'].update(showlegend=True, title='Age Distribuition', bargap=0.05)
py.iplot(fig, filename='custom-sized-subplot-with-subplot-titles')


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [22]:
#Let's look the Credit Amount column
interval = (18, 25, 35, 60, 120)

cats = ['Student', 'Young', 'Adult', 'Senior']
df_credit["Age_cat"] = pd.cut(df_credit.Age, interval, labels=cats)


df_good = df_credit[df_credit["Risk"] == 'good']
df_bad = df_credit[df_credit["Risk"] == 'bad']

In [23]:
trace0 = go.Box(
    y=df_good["Credit amount"],
    x=df_good["Age_cat"],
    name='Good credit',
    marker=dict(
        color='#3D9970'
    )
)

trace1 = go.Box(
    y=df_bad['Credit amount'],
    x=df_bad['Age_cat'],
    name='Bad credit',
    marker=dict(
        color='#FF4136'
    )
)

data = [trace0, trace1]
layout = go.Layout(
    yaxis  = dict(
        title = 'Credit amount (US Dollar)',
        zeroline = False
    ), 
    xaxis = dict(
        title = "Age categorical"
    ),
     boxmode = 'group'
)
fig = go.Figure(data = data , layout = layout)
py.iplot(fig, filename = 'box-age-cat')

## Distribution of Housing own and rent by Risk

In [24]:
trace0 = go.Bar(
    x = df_credit[df_credit["Risk"]== "good"]["Housing"].value_counts().index.values,
    #x = array(['own', 'rent', 'free'], dtype=object)
    y = df_credit[df_credit["Risk"] == "good"]["Housing"].value_counts().values,
    #y = array([527, 109,  64], dtype=int64)
    name = "Good Credit"
)
trace1 = go.Bar(
    x = df_credit[df_credit["Risk"] == "bad"]["Housing"].value_counts().index.values,
    y = df_credit[df_credit["Risk"] == "bad"]["Housing"].value_counts().values,
    name = "Bad Credit"
)

data = [trace0, trace1]

layout = go.Layout(
    title = "Housing Distribution"
)
fig = go.Figure(data = data , layout = layout)
py.iplot(fig, filename = "Housing_Grouped")

### Distribution of Credit Amount by Housing

In [25]:
fig = {
    "data": [
        {
            "type": "violin",
            "x": df_good["Housing"],
            "y": df_good["Credit amount"],
            "legendgroup": "Good Credit",
            "scalegroup": "No",
            "name": "Good Credit",
            "side": "negative",
            "box": {
                "visible": True
            },
            "meanline":{
               "visble" : True
            },
            "line": {
                "color": "blue"
            }
        },
        {
            "type": "violin",
            "x": df_bad["Housing"],
            "y": df_bad["Credit amount"],
            "legendgroup": "Bad Credit",
            "scalegroup": "No",
            "name": "Bad Credit",
            "side": "positive",
            "box": {
                "visible": True
            },
            "meanline": {
                "visible": True
            },
            "line": {
                "color": "green"
            }
        }
    ],
    "layout": {
        "yaxis":{
            "zeroline": False,
        },
        "violingap": 0,
        "violinmode": "overlay"
    }
}

py.iplot(fig, filename = "violin/split", validate = False)

### Looking the difference by Sex

In [28]:
y = df_credit[df_credit["Risk"] == "bad"]["Sex"].value_counts().values
y

array([191, 109], dtype=int64)

In [30]:
#First plot
trace0 = go.Bar(
    x = df_credit[df_credit["Risk"] == 'good']['Sex'].value_counts().index.values,
    y = df_credit[df_credit["Risk"] == 'good']['Sex'].value_counts().values,
    name = "Good Credit"
)
#First plot2
trace1 = go.Bar(
    x = df_credit[df_credit["Risk"] == "bad"]["Sex"].value_counts().index.values,
    y = df_credit[df_credit["Risk"] == "bad"]["Sex"].value_counts().values,
    name = "Good Credit"
)

#Second Plot
trace2 = go.Box(
    x = df_credit[df_credit["Risk"] == "good"]["Sex"],
    y = df_credit[df_credit["Risk"] == "good"]["Credit amount"],
    name = trace0.name
)

#second plot2
trace3 = go.Box(
    x = df_credit[df_credit["Risk"] == "bad"]["Sex"],
    y = df_credit[df_credit["Risk"] == "bad"]["Credit amount"],
    name = trace1.name
)

data = [trace0, trace1, trace2, trace3]

fig = tls.make_subplots(rows = 1, cols = 2,
                       subplot_titles = ("sex Count", "Credit Amount by Sex"))

fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig.append_trace(trace3, 1, 2)

fig["layout"].update(height = 400, width = 800, title = "Sex Distribution", boxmode = 'group')
py.iplot(fig, filename = "sex-subplot")


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [31]:
# How can i set boxplot in different places? how can i use the same legend to both graph?
#i will create categories of Age and look the distribution of Credit Amount by Risk...
#Distribution
#crossed by Credit Amount
#crossed by Age

https://www.kaggle.com/kabure/predicting-credit-risk-model-pipeline/notebook