# German Credit Risk Analysis

In [None]:
#importing the libraries
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

from IPython.display import display

In [None]:
# Let pandas show up to 100 columns and 100 rows when a frame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [None]:
# Read the CSV and discard its first column in one step
# (presumably a saved row index - confirm against the file).
credit_df = pd.read_csv(
    '../input/german-credit-data-with-risk/german_credit_data.csv'
).iloc[:, 1:]
credit_df.head()

In [None]:
credit_df.info()

In [None]:
credit_df.describe()

In [None]:
print (credit_df.shape)

# Reading and understanding Data

In [None]:
# checking for number of unique values in each column
credit_df.nunique()

Separating out numeric and categorical variables

In [None]:
credit_df_numeric = credit_df.select_dtypes(include=['int64'])
credit_df_numeric.head()

In [None]:
credit_df_categorical = credit_df.select_dtypes(include=['object'])
credit_df_categorical.head()

Changing datatype of job from numeric to object

In [None]:
credit_df['Job'] = credit_df['Job'].astype(object)

In [None]:
credit_df_numeric = credit_df.select_dtypes(include=['int64'])
credit_df_numeric.head()

In [None]:
# Object-typed feature columns only; 'Risk' is the target, so it is left out.
credit_df_categorical = (
    credit_df
    .select_dtypes(include=['object'])
    .drop(columns=['Risk'])
)
credit_df_categorical.head()

In [None]:
# Column labels of the categorical and numeric subsets, reused by the EDA loops.
credit_categorical_col, credit_numeric_col = (
    credit_df_categorical.columns,
    credit_df_numeric.columns,
)

In [None]:
print (credit_categorical_col)
print (credit_numeric_col)

# EDA

In [None]:
def showLabels(ax, d=None):
    """Annotate every bar on ``ax`` with its height formatted as a percentage.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes whose bars (``ax.patches``) are annotated. All work is done
        through this object, so it does not need to be the current axes.
    d : dict, optional
        Extra value per x tick label. Keys are compared via ``str(key)``
        against the tick label text; when the tick under a bar has an entry,
        that value is written on a second line below the percentage.

    Returns
    -------
    None
    """
    ax.margins(0.2, 0.2)

    # Tick label text, in order; bar i is matched with tick i.
    tick_texts = [tick.get_text() for tick in ax.get_xticklabels()]
    counts = {} if d is None else {str(key): value for key, value in d.items()}

    for i, rect in enumerate(ax.patches):
        # The label sits at the end of the bar, horizontally centred on it.
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2

        # Offset in points between bar end and label; flipped for negative
        # bars so the label goes below the bar instead of above it.
        space = 5
        va = 'bottom'
        if y_value < 0:
            space *= -1
            va = 'top'

        label = "{:.1f}%".format(y_value)
        # Only add the extra line when this bar has a tick with a known key;
        # bars without a match keep the plain percentage.
        if i < len(tick_texts) and tick_texts[i] in counts:
            label += '\n' + str(counts[tick_texts[i]])

        ax.annotate(
            label,
            (x_value, y_value),
            xytext=(0, space),          # vertical shift by `space`
            textcoords="offset points", # interpret `xytext` in points
            ha='center',
            va=va)

In [None]:
def plot_distribution(dataframe, col):
    """Show a histogram of ``col`` next to a box plot of ``col`` split by 'Risk'.

    Opens its own 15x5 figure with two side-by-side panels and displays it.
    ``dataframe`` must contain both ``col`` and a 'Risk' column.
    """
    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: distribution of the column on its own.
    sns.histplot(dataframe[col], ax=hist_ax)

    # Right panel: the same column, one box per risk class.
    sns.boxplot(x=dataframe[col], y=dataframe['Risk'], data=dataframe, ax=box_ax)

    plt.show()

In [None]:
def plot_percentages(dataframe, by, sortbyindex=False):
    """Draw two bar charts of how the categories of ``by`` are distributed.

    The left panel shows each category's share of all rows in ``dataframe``;
    the right panel shows each category's share of the rows whose 'Risk' is
    'bad'. Both panels are drawn on the current figure (1x2 grid) and every
    bar is annotated through ``showLabels``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to summarise; must contain ``by`` and a 'Risk' column holding
        the string 'bad' for bad risks.
    by : str
        Name of the categorical column to summarise.
    sortbyindex : bool, default False
        Order bars by category label instead of by frequency.
    """
    # Use the frame that was passed in (previously the global credit_df was
    # read here, so the argument had no effect).
    panels = (
        (dataframe, '% in dataset'),
        (dataframe.loc[dataframe['Risk'] == 'bad'], '% of bad risks'),
    )
    for position, (subset, ylabel) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        values = subset[by].value_counts(normalize=True) * 100
        if sortbyindex:
            values = values.sort_index()
        ax = values.plot.bar(color=sns.color_palette('husl', 16))
        ax.set_ylabel(ylabel, fontsize=16)
        ax.set_xlabel(by, fontsize=12)
        showLabels(ax)

In [None]:
def plotRiskStatus(dataframe, by, risk='bad'):
    """Bar chart of the percentage of ``risk`` rows inside each category of ``by``.

    Each bar is annotated (via ``showLabels``) with the percentage and the
    number of ``risk`` rows in that category. Returns the Axes drawn on.
    """
    # Row counts per (Risk, category) pair and per category overall.
    risk_category_counts = dataframe.groupby(['Risk', by])[by].count()
    category_totals = dataframe.groupby(by)[by].count()

    # After unstack: one row per Risk value, one column per category.
    share_by_risk = risk_category_counts.unstack() * 100 / category_totals

    palette = sns.color_palette('husl', 16)
    ax = share_by_risk.loc[risk].plot.bar(color=palette)
    ax.set_ylabel('% of ' + risk + ' risks')
    showLabels(ax, risk_category_counts[risk].to_dict())

    plt.margins(0.2, 0.2)
    plt.tight_layout()
    return ax

In [None]:
grp = credit_df.groupby(['Risk','Sex'])['Sex'].count()
grp

In [None]:
# Two figures per categorical feature:
#   1. plot_percentages - category shares in the whole dataset and among bad risks
#   2. plotRiskStatus   - percentage of bad risks inside each category
# Each call gets a fresh figure because both helpers draw on the current one.
for col in credit_categorical_col:
    plt.figure(figsize=(15,10))
    plot_percentages(credit_df, col)
    plt.figure(figsize=(15,10))
    plotRiskStatus(credit_df, col)

**Some insights for categorical variables:**
1. There are more males than females in the dataset. Males account for the larger share of all bad risks, but the proportion of bad risks is higher among females than among males.

In [None]:
# One histogram + box plot pair per numeric feature.
# plot_distribution opens its own figure, so no plt.figure() call is made
# here; an extra one would only leave an empty figure behind per column.
for col in credit_numeric_col:
    plot_distribution(credit_df, col)

# Data Cleaning

Handling missing values

In [None]:
round(credit_df.isnull().sum().sort_values(ascending=False)/len(credit_df.index)*100, 2)

Since close to 40% of the rows have a null `Checking account`, we keep them and put them into a new category `None`.

In [None]:
# Missing 'Checking account' values become their own category, the string 'None'.
credit_df = credit_df.fillna({'Checking account': 'None'})

In [None]:
round(credit_df.isnull().sum().sort_values(ascending=False)/len(credit_df.index)*100, 2)

We now drop the remaining rows with missing values, i.e. the rows where the savings account is null.

In [None]:
# Drop every row that still has a missing value in any column.
credit_df = credit_df.dropna()

In [None]:
round(credit_df.isnull().sum().sort_values(ascending=False)/len(credit_df.index)*100, 2)

In [None]:
credit_df.info()

Handling outliers

In [None]:
for column in credit_numeric_col:
    # Fences are built from the 10th and 90th percentiles (not the quartiles),
    # widened by 1.5 times the distance between them.
    low_pct = credit_df[column].quantile(0.1)
    high_pct = credit_df[column].quantile(0.9)
    pct_range = high_pct - low_pct

    # Keep rows inside the fences (both ends inclusive). The frame is
    # filtered column by column, so later columns use the already-reduced data.
    inside_fences = credit_df[column].between(low_pct - 1.5*pct_range,
                                              high_pct + 1.5*pct_range)
    credit_df = credit_df[inside_fences]
    print(credit_df.shape)

Adding dummy columns for categorical variables

In [None]:
binary_var = ['Sex']

# Encode each two-valued column as 1/0. The value that appears first in the
# column is mapped to 1 and the second to 0, so the coding follows row order.
for col in binary_var:
    ulist = credit_df[col].unique()
    value_to_code = {ulist[0] : 1, ulist[1] : 0}
    credit_df[col] = credit_df[col].map(value_to_code)

# Target encoding: 'bad' risk is the positive class (1), 'good' is 0.
risk_to_code = {'good' : 0, 'bad' : 1}
credit_df['Risk'] = credit_df['Risk'].map(risk_to_code)

In [None]:
credit_categorical_col = list(credit_categorical_col)

In [None]:
# 'Sex' is already encoded as 0/1, so it must not be one-hot encoded again.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# run is a no-op rather than a ValueError.
credit_categorical_col = [name for name in credit_categorical_col if name != 'Sex']

In [None]:
credit_df_dummies = pd.get_dummies(credit_df[credit_categorical_col], drop_first=True)

In [None]:
credit_df_dummies.head()

In [None]:
# Swap the raw categorical columns for their dummy-encoded counterparts.
credit_df = pd.concat(
    (credit_df.drop(columns=credit_categorical_col), credit_df_dummies),
    axis=1,
)
credit_df.head()

# Feature Engineering

In [None]:
credit_df_fe = credit_df.copy()
credit_df.head()

We can combine credit amount and duration to a variable credit/month

In [None]:
# credit_per_month = credit amount divided by duration; the two source
# columns are dropped once the ratio has been added.
credit_df_fe = (
    credit_df_fe
    .assign(credit_per_month=credit_df_fe['Credit amount'] / credit_df_fe['Duration'])
    .drop(columns=['Credit amount', 'Duration'])
)

In [None]:
credit_df_fe.head()

# Data Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# X is the feature frame itself (not a copy); popping 'Risk' removes the
# target column from it and hands it back as y.
X = credit_df_fe
y = X.pop('Risk')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [None]:
X_train.info()

In [None]:
X_test.info()

In [None]:
y_train.count()

Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
col = X.columns
index = X.index

In [None]:
# The scaler is fitted on the training split only and then applied to both
# splits. The scaler returns plain arrays, so the frames are rebuilt with the
# original column names AND the original row index; without the index the
# scaled frames would be renumbered from 0 and would no longer line up by
# label with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X_test.columns, index=X_test.index)

In [None]:
X_train.head()

Class imbalance

print("Number of churners before sampling: ", sum(y_train==1))
print("Number of non-churners before sampling: ", sum(y_train==0))
print("Churn rate before sampling: ", sum(y_train==1)/len(y_train)*100)

#performing SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=100)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

print("Shape of X_train = ", X_train_res.shape)
print("Shape of y_train = ", y_train_res.shape)

print("Number of churners after sampling: ", sum(y_train_res==1))
print("Number of non-churners after sampling: ", sum(y_train_res==0))
print("Churn rate after sampling: ", sum(y_train_res==1)/len(y_train_res)*100)

# Modelling

In [None]:
#import libraries for modeling
import sklearn.preprocessing
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
def getModelMetrics(actual=False, pred=False):
    """Print ROC AUC and confusion-matrix rates for binary labels.

    ``actual`` holds the true labels and ``pred`` the predicted labels; class
    1 is treated as the positive class. Nothing is returned - all metrics are
    printed.
    """
    cm = confusion_matrix(actual, pred)

    # Rows are the actual class, columns the predicted class.
    true_pos = cm[1,1]
    false_pos = cm[0,1]
    true_neg = cm[0,0]
    false_neg = cm[1,0]

    actual_pos = float(true_pos + false_neg)
    actual_neg = float(true_neg + false_pos)

    print("Roc_auc_score : {}".format(metrics.roc_auc_score(actual,pred)))
    # Share of actual positives that were predicted positive.
    print('Sensitivity/Recall : {}'.format(true_pos / actual_pos))
    # Share of actual negatives that were predicted negative.
    print('Specificity: {}'.format(true_neg / actual_neg))
    # Share of actual negatives that were wrongly predicted positive.
    print('False Positive Rate: {}'.format(false_pos / actual_neg))
    # Share of predicted positives that are actual positives.
    print('Positive predictive value: {}'.format(true_pos / float(true_pos + false_pos)))
    # Share of predicted negatives that are actual negatives.
    print('Negative Predictive value: {}'.format(true_neg / float(true_neg + false_neg)))
    # Same quantity as the positive predictive value, as computed by sklearn.
    print('sklearn precision score value: {}'.format(metrics.precision_score(actual, pred )))

In [None]:
def plot_accuracies(scores,param):
    """Plot mean train and mean test score against one searched parameter.

    ``scores`` is a mapping with the keys "param_<param>", "mean_train_score"
    and "mean_test_score" (e.g. ``cv_results_`` of a grid search run with
    ``return_train_score=True``); ``param`` is the parameter name.
    """
    plt.figure()
    param_values = scores["param_"+param]
    curves = (
        ("mean_train_score", "training accuracy"),
        ("mean_test_score", "test accuracy"),
    )
    for score_key, legend_text in curves:
        plt.plot(param_values, scores[score_key], label=legend_text)
    plt.xlabel(param)
    plt.ylabel("f1")
    plt.legend()
    plt.show()

In [None]:
def predictRiskWithCutOff(model,X,y,prob):
    """Classify rows as risky using a custom probability cut-off.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X : features passed to ``model.predict_proba``
    y : true labels for the rows of ``X``
    prob : float
        Cut-off; a row is predicted 1 when its class-1 probability is
        strictly greater than ``prob``, otherwise 0.

    Returns
    -------
    pandas.DataFrame
        Columns 'risk' (true label), 'risk_Prob' (class-1 probability) and
        'final_predicted' (label after applying the cut-off). The metrics for
        these predictions are printed via ``getModelMetrics``.
    """
    # Probability of the positive class (second column of predict_proba).
    pred_probs = model.predict_proba(X)[:,1]

    y_df = pd.DataFrame({'risk':y, 'risk_Prob':pred_probs})

    # Read back the columns created above (the former churn / churn_Prob
    # lookups referred to columns that do not exist in this frame).
    y_df['final_predicted'] = (y_df['risk_Prob'] > prob).astype(int)

    getModelMetrics(y_df['risk'], y_df['final_predicted'])
    return y_df

In [None]:
def findOptimalCutoff(df):
    """Tabulate and plot accuracy, sensitivity and specificity per cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Must hold the true 0/1 label and the predicted class-1 probability,
        either as 'risk' / 'risk_Prob' (the columns built by
        ``predictRiskWithCutOff``) or as 'churn' / 'churn_Prob'.
        One 0/1 prediction column per cut-off (named 0.0, 0.1, ..., 0.9) is
        added to ``df`` in place.

    Returns
    -------
    pandas.DataFrame
        One row per cut-off with columns 'prob', 'accuracy', 'sensi' and
        'speci'. The table is also printed and plotted.
    """
    # Prefer the risk column names; fall back to the churn names so frames
    # using the older naming keep working.
    if 'risk_Prob' in df.columns:
        actual_col, prob_col = 'risk', 'risk_Prob'
    else:
        actual_col, prob_col = 'churn', 'churn_Prob'

    cutoffs = [x / 10 for x in range(10)]
    rows = []
    for cutoff in cutoffs:
        # Predicted label at this cut-off (strictly greater than the cut-off).
        df[cutoff] = (df[prob_col] > cutoff).astype(int)

        # Confusion matrix layout: [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
        cm1 = metrics.confusion_matrix(df[actual_col], df[cutoff])
        accuracy = (cm1[0,0] + cm1[1,1]) / cm1.sum()
        speci = cm1[0,0] / (cm1[0,0] + cm1[0,1])
        sensi = cm1[1,1] / (cm1[1,0] + cm1[1,1])
        rows.append([cutoff, accuracy, sensi, speci])

    # Build the table once instead of growing it row by row.
    cutoff_df = pd.DataFrame(rows,
                             columns=['prob','accuracy','sensi','speci'],
                             index=cutoffs)
    print(cutoff_df)

    # Accuracy, sensitivity and specificity as a function of the cut-off.
    cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
    plt.show()
    return cutoff_df

**Logistic Regression**

In [None]:
logisticModel = LogisticRegression(verbose=1)

In [None]:
logisticModel.fit(X_train, y_train)

In [None]:
y_train_pred = logisticModel.predict(X_train)

In [None]:
getModelMetrics(y_train, y_train_pred)

In [None]:
y_test_pred = logisticModel.predict(X_test)

In [None]:
getModelMetrics(y_test, y_test_pred)

The results are not great for logistic regression.

**SVM**

In [None]:
svmLin = SVC(C=1, kernel='linear')

In [None]:
# Fit the linear SVM and report its metrics on the data it was trained on.
y_train_pred = svmLin.fit(X_train, y_train).predict(X_train)
getModelMetrics(y_train, y_train_pred)

Hyper parameter tuning for SVM

In [None]:
params = {"C" : [0.1, 1, 50]}

In [None]:
svmLin1 = SVC(kernel='linear')

In [None]:
# 3-fold grid search over `params` for the linear-kernel SVM, scored on
# recall; train scores are kept so they can be plotted next to test scores.
model_cv = GridSearchCV(
    estimator=svmLin1,
    param_grid=params,
    scoring='recall',
    cv=3,
    verbose=5,
    n_jobs=8,
    return_train_score=True,
)
model_cv.fit(X_train, y_train)

In [None]:
plot_accuracies(model_cv.cv_results_, 'C')

In [None]:
model_cv.best_estimator_

In [None]:
# Refit a linear SVM with C fixed at 50 and evaluate it on the training data.
# NOTE(review): C is hard-coded here rather than read from model_cv - confirm
# it matches the grid-search result.
svm_final = SVC(C=50, kernel='linear').fit(X_train, y_train)

y_pred = svm_final.predict(X_train)
getModelMetrics(y_train,y_pred)

In [None]:
#predict on test
y_pred = svm_final.predict(X_test)
getModelMetrics(y_test, y_pred)

Linear kernel is having a hard time

In [None]:
svmNonLin = SVC(kernel='rbf')

In [None]:
# Same 3-fold, recall-scored grid search as for the linear kernel, now run
# on the RBF-kernel SVM.
model_cv = GridSearchCV(
    estimator=svmNonLin,
    param_grid=params,
    scoring='recall',
    cv=3,
    verbose=5,
    n_jobs=8,
    return_train_score=True,
)
model_cv.fit(X_train, y_train)

In [None]:
model_cv.best_estimator_

In [None]:
plot_accuracies(model_cv.cv_results_, 'C')

In [None]:
# Refit an RBF SVM with C fixed at 1 and evaluate it on the training data.
# NOTE(review): C is hard-coded here rather than read from model_cv - confirm
# it matches the grid-search result.
svm_final = SVC(C=1, kernel='rbf').fit(X_train, y_train)

y_train_pred = svm_final.predict(X_train)
getModelMetrics(y_train,y_train_pred)

In [None]:
y_test_pred = svm_final.predict(X_test)
getModelMetrics(y_test,y_test_pred)

We see that the model is overfitting even with SVM. I think we need more features to explain the variance in the dataset so that the model generalizes better.

Do let me know if there is any other way of avoiding overfitting.