### A beginner's guide to modelling the German Credit Risk data

In [None]:
### Necessary imports
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import fbeta_score

import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
import shap

import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the German credit data; the first CSV column is used as the row index.
df_credit = pd.read_csv("../input/german-credit-data-with-risk/german_credit_data.csv", index_col=0)

### The renaming below only keeps the column names consistent with the author's local copy of the data
df_credit.rename(columns={'Checking account': 'Credit History', 'Sex': 'Gender'}, inplace=True)

y = df_credit['Risk']
X = df_credit.drop(columns=['Risk'])

# Hold out 10% of the rows for testing. A fixed random_state makes the split
# (and every number computed downstream) reproducible across runs; stratifying
# on the target keeps the good/bad ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Re-attach the target so EDA and preprocessing can work on one frame per split.
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

print(df_train.shape, df_test.shape)

In [None]:
df_train.head()

Train and Test Distribution comparison 

In [None]:
# Overlay density-normalised train and test histograms of 'Credit History'
# so the two splits can be compared by eye.
shared_style = {'alpha': 0.5, 'density': True}
df_train['Credit History'].hist(label='Train', **shared_style)
df_test['Credit History'].hist(label='Test', **shared_style)
plt.xlabel('Credit History')
plt.legend()

In [None]:
# Overlay density-normalised train and test histograms of 'Age'
# so the two splits can be compared by eye.
shared_style = {'alpha': 0.5, 'density': True}
df_train['Age'].hist(label='Train', **shared_style)
df_test['Age'].hist(label='Test', **shared_style)
plt.xlabel('Age')
plt.legend()

In [None]:
# Overlay density-normalised train and test histograms of 'Job'
# so the two splits can be compared by eye.
shared_style = {'alpha': 0.5, 'density': True}
df_train['Job'].hist(label='Train', **shared_style)
df_test['Job'].hist(label='Test', **shared_style)
plt.xlabel('Job')
plt.legend()

In [None]:
# Overlay density-normalised train and test histograms of 'Duration'
# so the two splits can be compared by eye.
shared_style = {'alpha': 0.5, 'density': True}
df_train['Duration'].hist(label='Train', **shared_style)
df_test['Duration'].hist(label='Test', **shared_style)
plt.xlabel('Duration')
plt.legend()

In [None]:
# Overlay density-normalised train and test histograms of 'Credit amount'
# so the two splits can be compared by eye.
shared_style = {'alpha': 0.5, 'density': True}
df_train['Credit amount'].hist(label='Train', **shared_style)
df_test['Credit amount'].hist(label='Test', **shared_style)
plt.xlabel('Credit amount')
plt.legend()

In [None]:
# Overlay density-normalised train and test histograms of 'Saving accounts'
# so the two splits can be compared by eye.
shared_style = {'alpha': 0.5, 'density': True}
df_train['Saving accounts'].hist(label='Train', **shared_style)
df_test['Saving accounts'].hist(label='Test', **shared_style)
plt.xlabel('Saving accounts')
plt.legend()

In [None]:
from scipy.stats import ks_2samp
# Two-sample Kolmogorov-Smirnov test comparing the train and test values of
# each numeric feature; both results are displayed as cell output.
ks_2samp(df_train['Age'], df_test['Age'])
ks_2samp(df_train['Credit amount'], df_test['Credit amount'])

### Preliminary data analysis

In [None]:
df_train.head()

In [None]:
df_train.nunique()

In [None]:
df_train.Risk.value_counts() ### Good = 1 (credit worthy), Bad = 0 (not worthy)

In [None]:
df_train.Risk.value_counts()

In [None]:
df_train_summ = df_train.describe()
df_train_summ

In [None]:
plt.hist(df_train['Credit amount'])
plt.title('Credit amount distribution')

In [None]:
### the credit amount is highly skewed distribution, lets analyse the extreme values beyond 3 sigma
def extreme_count(sig_factor, feat, df=None):
    """Print how many rows have ``feat`` above mean + ``sig_factor`` * std.

    The mean and (sample) standard deviation are computed from the frame that
    is being counted, so the cutoff can never be out of sync with the data.

    Parameters
    ----------
    sig_factor : number
        How many standard deviations above the mean the cutoff sits.
    feat : str
        Name of a numeric column.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``df_train``.

    Returns
    -------
    None
        The result is only printed.
    """
    if df is None:
        df = df_train
    values = df[feat]
    sig_cutoff = values.mean() + sig_factor * values.std()
    sig_count = int((values > sig_cutoff).sum())
    print(f"instances of {feat} greater than {sig_factor} sigma ({sig_cutoff} cutoff) are {sig_count}")
    return

extreme_count(3, feat = 'Credit amount')

In [None]:
plt.hist(df_train['Age'])
plt.title('Age distribution')
extreme_count(3, feat = 'Age')

In [None]:
plt.hist(df_train['Duration'])
plt.title('Duration distribution')
extreme_count(3, feat = 'Duration')

In [None]:
''' Even if there are certain instances where the above features are beyond 3 sigma of their mean value, they don't appear to be 
outliers, as it is legitimate to have certain loans with a high credit value or a longer loan duration, or for older people 
to seek a loan. Hence, these rows are not eliminated.'''

### Finding Missing values, checking if they are legitimate and applying apt transformation

In [None]:
df_train.isnull().sum()

In [None]:
### A missing value is meaningful here: per the data, NaN stands for "no saving account"

df_train['Saving accounts'].value_counts()
df_train['Saving accounts'].unique()

### So turn NaN into an explicit category: 'no account' / 'no history'
df_train['Saving accounts'] = df_train['Saving accounts'].fillna('no account')
df_train['Credit History'] = df_train['Credit History'].fillna('no history')

### Same two views again, now showing the new category
df_train['Saving accounts'].value_counts()
df_train['Saving accounts'].unique()

### Per-column NaN counts after the replacement
df_train.isnull().sum()

### Checking data types and categorical states of features for encoding

In [None]:
df_train.dtypes

In [None]:
df_train.nunique()

### Feature Selection for label and one hot encoding

In [None]:
# Flag every column of the raw frame whose dtype is 'object'; those are the
# ones that need encoding.
is_object = df_credit.dtypes == 'object'
df_dtypes = is_object.to_frame(name='obj_type')
obj_list = df_dtypes.index[df_dtypes['obj_type']]
print("Features for label encoding:", obj_list)

### Label Encoding

In [None]:
df_train[obj_list].head()

In [None]:
def le_col(df, col):
    """Label-encode one column of ``df`` and return the frame with its encoder.

    A fresh ``LabelEncoder`` is fitted on ``df[col]`` and the column is
    overwritten in ``df`` with the encoded values.

    Returns
    -------
    tuple
        ``(df, encoder)`` - the same frame that was passed in, and the fitted
        encoder so the mapping can be applied to other data.
    """
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    return df, encoder

# Encode each categorical column of the training frame, keeping every fitted
# encoder under its own name so the same mapping can be applied again later.
df_train, le_gender = le_col(df_train, 'Gender')
df_train, le_housing = le_col(df_train, 'Housing')
df_train, le_sa = le_col(df_train, 'Saving accounts')
df_train, le_purpose = le_col(df_train, 'Purpose')
df_train, le_ch = le_col(df_train, 'Credit History')
# The target column is encoded the same way.
df_train, le_risk = le_col(df_train, 'Risk')

# Show the selected columns again, after encoding.
df_train[obj_list].head()

### Distribution of Risk variable

In [None]:
df_train.Risk.value_counts()

In [None]:
df_train.corr()

### EDA 1 : More credit history is equivalent to credit worthiness

Conclusion: As the credit history increases, the good risk increases proportionately, i.e. credit worthiness improves sharply

In [None]:
# Side-by-side histogram of 'Credit History' for the two encoded risk classes.
bad_risk_values = df_train.loc[df_train['Risk'] == 0, 'Credit History'].values
good_risk_values = df_train.loc[df_train['Risk'] == 1, 'Credit History'].values
plt.hist([bad_risk_values, good_risk_values], alpha=0.5, label=['Bad Risk', 'Good Risk'])
plt.legend(loc='upper right')

In [None]:
df_train[df_train['Risk'] == 0]['Credit History'].value_counts()

In [None]:
df_train[df_train['Risk'] == 1]['Credit History'].value_counts()

### EDA 2 : Are young people more credit worthy?

In [None]:
# Side-by-side histogram of 'Age' for the two encoded risk classes.
bad_risk_values = df_train.loc[df_train['Risk'] == 0, 'Age'].values
good_risk_values = df_train.loc[df_train['Risk'] == 1, 'Age'].values
plt.hist([bad_risk_values, good_risk_values], alpha=0.5, label=['Bad Risk', 'Good Risk'])
plt.legend(loc='upper right')

In [None]:
# Collapse raw ages into three ordinal bands: 0 (<= 30), 1 (31-44), 2 (>= 45).
#
# The same mapping is applied to the hold-out frame: the model is fitted on
# df_train's banded Age, so leaving df_test with raw ages would feed the model
# values on a completely different scale at prediction time.
#
# Writes go through .loc on the frame itself rather than chained indexing
# (df.Age[mask] = ...), which raises SettingWithCopyWarning and has no effect
# under pandas Copy-on-Write.
#
# NOTE: not idempotent - run this cell once per fresh split.
for frame in (df_train, df_test):
    raw_age = frame['Age'].copy()  # freeze raw values so every mask is independent of earlier writes
    frame.loc[raw_age <= 30, 'Age'] = 0
    frame.loc[(raw_age > 30) & (raw_age < 45), 'Age'] = 1
    frame.loc[raw_age >= 45, 'Age'] = 2

In [None]:
df_train[df_train['Risk'] == 0]['Age'].value_counts()
df_train[df_train['Risk'] == 1]['Age'].value_counts()

### Modelling

X_train, y_train prep

In [None]:
# Split the prepared training frame back into features and target.
X_train = df_train.drop(columns='Risk')
y_train = df_train['Risk']

Test data prep

In [None]:
df_test.isnull().sum()
### Turn NaN into the explicit categories 'no account' and 'no history'
df_test['Saving accounts'] = df_test['Saving accounts'].fillna('no account')
df_test['Credit History'] = df_test['Credit History'].fillna('no history')
df_test.isnull().sum()

In [None]:
# Apply the already-fitted encoders to the test frame (transform only, no
# refitting), in the same column order as before; the target comes last.
fitted_encoders = {
    'Gender': le_gender,
    'Housing': le_housing,
    'Saving accounts': le_sa,
    'Purpose': le_purpose,
    'Credit History': le_ch,
    'Risk': le_risk,
}
for column, encoder in fitted_encoders.items():
    df_test[column] = encoder.transform(df_test[column])

In [None]:
df_test.head()

In [None]:
# Split the prepared test frame back into features and target.
X_test = df_test.drop(columns='Risk')
y_test = df_test['Risk']

In [None]:
import numpy as np
from sklearn.metrics import fbeta_score, make_scorer
# Scorer object wrapping fbeta_score with beta = 1/5.
# NOTE(review): the name says "ftwo" but beta is 0.2, not 2 - confirm which was intended.
ftwo_scorer = make_scorer(fbeta_score, beta=1/5)

In [None]:
### Assumption: classifying a customer as good when they are actually bad is the costly error,
### i.e. the objective is to reduce false positives, so precision matters more than recall.
### Hence beta = 1/5 is used with fbeta_score as the evaluation metric.
# Seed the forest so the fitted trees, and therefore the reported score, are
# the same on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
fbeta_score(y_test, y_pred, beta=1/5)