<a href="https://colab.research.google.com/github/shila121/projects/blob/main/creditRisk_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go

In [2]:
# Load the bank-loan data from a CSV file (relative path) and preview the first 10 rows.
df = pd.read_csv('bankloans.csv')
df.head(10)

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1.0
1,27,1,10,6,31,17.3,1.362202,4.000798,0.0
2,40,1,15,14,55,5.5,0.856075,2.168925,0.0
3,41,1,15,14,120,2.9,2.65872,0.82128,0.0
4,24,2,2,0,28,17.3,1.787436,3.056564,1.0
5,41,2,5,5,25,10.2,0.3927,2.1573,0.0
6,39,1,20,9,67,30.6,3.833874,16.668126,0.0
7,43,1,12,11,38,3.6,0.128592,1.239408,0.0
8,24,1,3,4,19,24.4,1.358348,3.277652,1.0
9,36,1,0,13,25,19.7,2.7777,2.1473,0.0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
count,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,1150.0,700.0
mean,35.235652,1.695652,8.781739,8.485217,47.982609,10.063391,1.605111,3.103844,0.261429
std,8.089961,0.927051,6.914762,6.977725,40.508814,6.584288,2.135967,3.529273,0.439727
min,20.0,1.0,0.0,0.0,13.0,0.1,0.011696,0.045584,0.0
25%,29.0,1.0,3.0,3.0,24.0,5.2,0.415584,1.047996,0.0
50%,35.0,1.0,7.0,7.0,36.0,8.75,0.89913,2.038053,0.0
75%,41.0,2.0,13.0,12.0,56.75,13.6,1.89882,3.86496,1.0
max,56.0,5.0,33.0,34.0,446.0,41.3,20.56131,35.1975,1.0


In [4]:
# Number of distinct values in each column.
df.nunique()

age          37
ed            5
employ       33
address      32
income      129
debtinc     245
creddebt    842
othdebt     848
default       2
dtype: int64

In [7]:
# Number of missing values in each column.
df.isnull().sum()

age           0
ed            0
employ        0
address       0
income        0
debtinc       0
creddebt      0
othdebt       0
default     450
dtype: int64

In [8]:
# Drop every row that has a missing value in any column and rebind `df` to the result.
# NOTE: this overwrites the raw frame; reload the CSV to get the dropped rows back.
df =  df.dropna()

In [14]:
# Columns that create_bar_chart draws as category bar charts;
# every other column is drawn as a binned histogram.
_CATEGORICAL_FEATURES = ['ed', 'employ', 'address', 'default']

In [13]:
# Data type of each column.
df.dtypes

age           int64
ed            int64
employ        int64
address       int64
income        int64
debtinc     float64
creddebt    float64
othdebt     float64
default     float64
dtype: object

In [15]:
def separate_categorical_feature(df: pd.DataFrame, col: str, target: str = 'default') -> list:
    """Count the values of ``col`` separately for the two classes of ``target``.

    Args:
        df: Frame that holds both ``col`` and ``target``.
        col: Name of the categorical column whose values are counted.
        target: Name of the outcome column; rows are split on ``target == 0``
            and ``target == 1``.

    Returns:
        A two-element list ``[counts_for_0, counts_for_1]``. Both Series share
        the same index: every distinct value of ``col`` found in ``df``, in the
        order produced by ``value_counts``. A value that never occurs in one
        class gets a count of 0 there.
    """
    # Index built from the whole frame so both classes line up bar for bar.
    common_index = df[col].value_counts().index

    return [
        df.loc[df[target] == flag, col].value_counts().reindex(common_index, fill_value=0)
        for flag in range(2)
    ]

In [16]:
def separate_numerical_feature(df: pd.DataFrame, col: str, target: str = 'default') -> list:
    """Bin a numerical column and count the rows of each ``target`` class per bin.

    The ``'age'`` column is split into bins of width 5 starting at its minimum;
    any other column is split into 10 equal-width bins spanning
    ``[min, max + 1]``. All bins are half-open: closed on the left, open on the
    right.

    Args:
        df: Frame that holds both ``col`` and ``target``.
        col: Name of the numerical column to bin.
        target: Name of the outcome column; rows are split on ``target == 0``
            and ``target == 1``.

    Returns:
        A three-element list ``[counts_for_0, counts_for_1, xbins]``. The two
        Series are indexed by the bin intervals (labels as formatted by
        ``pd.cut``) and hold the number of rows per bin. ``xbins`` is a dict
        with ``'start'``, ``'end'`` and ``'size'`` keys describing the same bins.
    """
    start, end = df[col].min(), df[col].max()

    # set special properties for age chart
    if col == 'age':
        size = 5  # values will be divided in groups by 5 years
        # Bins are half-open, so the last right edge must lie strictly above
        # the maximum; otherwise rows equal to the maximum fall outside every
        # bin whenever (max - min) is an exact multiple of the bin size.
        n_bins = int((end - start) // size) + 1
        bins = start + size * np.arange(n_bins + 1)
        xbins = {'start': start, 'end': bins[-1], 'size': size}
    else:
        end += 1  # push the last (open) edge past the maximum value
        nbins = 10  # values will be divided in 10 groups
        xbins = {'start': start, 'end': end, 'size': (end - start) / nbins}
        bins = np.linspace(start, end, nbins + 1)

    # Assign each row to a bin using the exact edges. Counting by comparing
    # against the interval labels would be wrong: pd.cut rounds the labels,
    # so values close to an edge could be missed or land in the wrong bin.
    binned = pd.cut(df[col], bins, right=False)
    intervals = binned.cat.categories
    codes = binned.cat.codes.to_numpy()  # -1 marks rows outside every bin (e.g. NaN)

    counts_by_class = []
    for flag in range(2):
        in_class = (df[target] == flag).to_numpy() & (codes >= 0)
        counts = np.bincount(codes[in_class], minlength=len(intervals))
        counts_by_class.append(pd.Series(counts, index=intervals))

    return counts_by_class + [xbins]

In [17]:
def set_text_args(df_not_default: pd.Series, df_default: pd.Series, args: dict) -> None:
    """Add default-rate bar labels to a trace's keyword arguments.

    The rate is ``default / (default + not default) * 100`` per index entry.
    ``args`` is modified in place; nothing is returned. Entries whose total is
    zero produce NaN (0 / 0) in the ``'text'`` Series.

    Args:
        df_not_default: Counts of non-defaulted rows per group.
        df_default: Counts of defaulted rows per group.
        args: Trace keyword arguments that receive the ``'text'``,
            ``'texttemplate'`` and ``'textposition'`` entries.
    """
    group_total = df_default + df_not_default
    args.update({
        'text': (df_default / group_total) * 100,
        'texttemplate': '%{text:.0f}%',
        'textposition': 'outside',
    })

In [18]:
def add_bar_trace(df: pd.DataFrame, col: str, traces: list, chart_args: list) -> None:
    """Append one ``go.Bar`` trace per entry of ``chart_args`` for a categorical column.

    The first entry is drawn from the non-default counts. Every later entry is
    drawn from the default counts and additionally gets default-rate text
    labels via ``set_text_args``.

    Args:
        df: Frame holding ``col`` and the ``'default'`` column.
        col: Name of the categorical column to plot.
        traces: List that the new traces are appended to (modified in place).
        chart_args: List of keyword-argument dicts, one per trace. The dicts
            for default traces are modified in place by ``set_text_args``.
    """
    # separation rows on default and not default
    df_not_default, df_default = separate_categorical_feature(df, col)

    for j, args in enumerate(chart_args):
        if j:
            counts = df_default

            # the text labels must be in `args` before the trace is created
            set_text_args(df_not_default, df_default, args)
        else:
            counts = df_not_default

        # create Bar trace
        traces.append(
            go.Bar(
                x=counts.index,
                y=counts,
                **args,
            ),
        )

In [19]:
def add_histogram_trace(df: pd.DataFrame, col: str, traces: list, chart_args: list) -> None:
    """Append one ``go.Histogram`` trace per entry of ``chart_args`` for a numerical column.

    Entry ``k`` of ``chart_args`` is drawn from the rows where
    ``df['default'] == k``. Every entry after the first also receives
    default-rate text labels via ``set_text_args``.

    Args:
        df: Frame holding ``col`` and the ``'default'`` column.
        col: Name of the numerical column to plot.
        traces: List that the new traces are appended to (modified in place).
        chart_args: List of keyword-argument dicts, one per trace; later
            entries are modified in place by ``set_text_args``.
    """
    # per-bin counts for both classes plus the bin specification shared by all traces
    counts_not_default, counts_default, bin_spec = separate_numerical_feature(df, col)

    for flag, trace_kwargs in enumerate(chart_args):
        if flag != 0:
            set_text_args(counts_not_default, counts_default, trace_kwargs)

        class_values = df[df['default'] == flag][col]
        histogram = go.Histogram(
            x=class_values,
            histfunc='count',
            xbins=bin_spec,
            **trace_kwargs,
        )
        traces.append(histogram)

In [20]:
def create_bar_chart(df: pd.DataFrame) -> None:
    """Show a stacked default / not-default chart with a dropdown to switch columns.

    Two traces ("not default" and "default") are built for every column of
    ``df``. Columns listed in ``_CATEGORICAL_FEATURES`` become bar traces on a
    category axis; all others become histogram traces on a linear axis. Each
    dropdown button shows only its column's pair of traces and swaps in the
    matching title and x-axis settings. The first column is shown initially.

    Args:
        df: Frame to plot; read by the trace helpers, which expect a
            ``'default'`` column.
    """
    columns = df.columns
    total_traces = 2 * len(columns)

    traces = []
    buttons = []

    for position, col in enumerate(columns):
        # only the first column's pair of traces starts out visible
        is_first = position == 0

        # per-trace properties: [not default, default]
        chart_args = [
            {'name': label, 'visible': is_first, 'marker_color': color}
            for label, color in (('not default', '#3366CC'), ('default', '#DC3912'))
        ]

        # pick the axis type and trace kind based upon feature
        if col in _CATEGORICAL_FEATURES:
            xaxis = {'type': 'category', 'categoryorder': 'array'}
            add_bar_trace(df, col, traces, chart_args)
        else:
            xaxis = {'type': 'linear'}
            add_histogram_trace(df, col, traces, chart_args)

        # layout applied when this column's button is selected
        layout_attrs = {
            'title': {
                'text': f'Default rate by {col}',
                'xanchor': 'center',
                'yanchor': 'top',
                'x': 0.5,
                'y': 0.9,
            },
            'bargap': 0.2,
            'xaxis': xaxis,
        }

        # traces 2 * position and 2 * position + 1 belong to this column
        visible_flags = [slot // 2 == position for slot in range(total_traces)]

        buttons.append(dict(
            method='update',
            label=col,
            args=[{'visible': visible_flags}, layout_attrs],
        ))

    dropdown = {
        'buttons': buttons,
        'direction': 'down',
        'pad': {'r': 10, 't': 10},
        'showactive': True,
        'x': 0.9,
        'xanchor': 'left',
        'y': 1.2,
        'yanchor': 'top',
    }

    # start with the first column's layout attributes
    initial_layout = buttons[0]['args'][-1]
    layout = go.Layout(
        updatemenus=[dropdown],
        barmode='stack',
        **initial_layout,
    )

    go.Figure(data=traces, layout=layout).show()

In [21]:
# Render the interactive default-rate chart for `df` (rows with missing values already dropped).
create_bar_chart(df)

In [24]:
# Class Distribution Analysis

# divide clients into age categories of 5 years
def calc_cat_age(x: pd.Series) -> str:
    """Return the 5-year age bracket label for one row.

    Args:
        x: Row (or any mapping) with an ``'age'`` entry.

    Returns:
        A label of the form ``'<lower> - <lower + 5>'``, where ``lower`` is the
        age rounded down to a multiple of 5, e.g. ``'25 - 30'`` for age 27.
    """
    lower_bound = int(x['age'] // 5 * 5)
    upper_bound = lower_bound + 5
    return '{} - {}'.format(lower_bound, upper_bound)

In [26]:
# Mean of every other column for each distinct age, with `age` restored as a regular column.
df.groupby('age').mean().reset_index()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,20,2.0,2.0,0.5,15.5,6.0,0.122388,0.752112,0.5
1,21,1.8,1.4,1.1,20.8,9.87,0.562147,1.440753,0.3
2,22,1.833333,1.583333,1.833333,20.75,10.8,0.745415,1.480169,0.416667
3,23,1.777778,2.111111,2.111111,23.444444,9.944444,0.692862,1.514304,0.555556
4,24,1.541667,3.083333,2.291667,20.791667,9.858333,0.688836,1.318956,0.5
5,25,1.75,3.65,2.9,23.35,11.13,0.746883,1.706917,0.45
6,26,1.565217,4.565217,3.391304,26.869565,9.430435,0.944614,1.737951,0.173913
7,27,2.142857,4.5,4.357143,33.392857,9.557143,1.062909,2.218591,0.357143
8,28,1.918919,3.864865,4.864865,28.405405,12.148649,1.258811,2.203325,0.378378
9,29,1.613636,5.613636,5.227273,32.340909,9.463636,1.061284,2.026534,0.318182
