# Import the libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ROCAUC
import plotly.graph_objects as go

# Read the German Credit Data and extract the features for building multiple models

In [None]:
# Load the German credit data; 'Unnamed: 0' (presumably a saved row index) is dropped right away
df = pd.read_csv(
    '/kaggle/input/german-credit-data-with-risk/german_credit_data.csv'
).drop(columns=['Unnamed: 0'])
df.head()

In [None]:
# Replace spaces in column names with underscores so every column is a valid identifier
df.columns = [name.replace(' ', '_') for name in df.columns]

In [None]:
# Inspect the column dtypes before the categorical columns are encoded
df.dtypes

In [None]:
# Convert the discrete columns to pandas' category dtype
categorical_columns = ['Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose']
df[categorical_columns] = df[categorical_columns].astype('category')

In [None]:
# Replace each categorical column by its integer category code.
# NOTE: `cat.codes` encodes missing values as -1, so NaNs (if any) become their own level.
for column in ['Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose']:
    # Only encode columns that are still categorical: this makes the cell safe to
    # re-run (the original raised AttributeError on `.cat` the second time).
    if isinstance(df[column].dtype, pd.CategoricalDtype):
        df[column] = df[column].cat.codes

# Encode the target: good -> 0, bad -> 1.
# Values that are not keys of the mapping (e.g. already-encoded 0/1 on a re-run)
# pass through unchanged; a plain `.map(dict)` would silently turn them into NaN.
risk_codes = {'good': 0, 'bad': 1}
df['Risk'] = df['Risk'].map(lambda label: risk_codes.get(label, label))

In [None]:
# Preview the frame after the categorical columns and the target have been encoded
df.head()

### Now our data is ready to build models. Here `Risk` is the target variable. Let's split the data into train and test sets.

In [None]:
# Target is 'Risk'; every other column is a predictor
y = df['Risk']
X = df.drop(columns=['Risk'])

# Hold out 20% of the rows for testing (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Column names of the training predictors, reused later to select model inputs
features = list(X_train.columns)

In [None]:
# Display the features used for model training (every column except the target 'Risk')
features

In [None]:
# Row/column counts of the train and test predictor frames
print(X_train.shape, X_test.shape, sep='\n')

# Model 1: Gradient Boosting

In [None]:
# Gradient boosting classifier with 500 estimators and a fixed seed
gbm = GradientBoostingClassifier(n_estimators=500, random_state=2)

# The ROC/AUC visualizer wraps the estimator: fit on train, score on test, then draw.
# NOTE(review): no separate `gbm.fit` exists in this notebook, so the later
# `gbm.predict_proba` call relies on `visualizer.fit` fitting `gbm` in place - confirm
# against the installed yellowbrick version.
visualizer = ROCAUC(gbm, classes=["good", "bad"])
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Model 2: Random Forest

In [None]:
# Random forest classifier with 500 trees and a fixed seed
rf = RandomForestClassifier(n_estimators=500, random_state=2)

# The ROC/AUC visualizer wraps the estimator: fit on train, score on test, then draw.
# NOTE(review): no separate `rf.fit` exists in this notebook, so the later
# `rf.predict_proba` call relies on `visualizer.fit` fitting `rf` in place - confirm
# against the installed yellowbrick version.
visualizer = ROCAUC(rf, classes=["good", "bad"])
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Model Selection Basis: KS statistics, Lift and Gain Charts.

# K-S 
The K-S (Kolmogorov-Smirnov) chart measures the performance of classification models. More precisely, K-S measures the degree of separation between the score distributions of the positive and negative classes. The K-S is 100 if the scores partition the population into two separate groups, one containing all the positives and the other all the negatives. On the other hand, if the model cannot differentiate between positives and negatives, it is as if the model selects cases randomly from the population, and the K-S would be 0. In most classification models the K-S falls between 0 and 100, and the higher the value, the better the model is at separating the positive from the negative cases.
A model with a higher K-S statistic is preferred over a model with a lower K-S statistic.

In [None]:

# Attach each model's predicted probability of the "bad" class (Risk = 1) and the
# ground truth to the test frame.
# `assign` builds a new frame instead of writing into the slice returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning. The cell is still
# safe to re-run because the models only ever see X_test[features].
X_test = X_test.assign(
    gbm_predicted_probability=gbm.predict_proba(X_test[features])[:, 1],
    rf_predicted_probability=rf.predict_proba(X_test[features])[:, 1],
    Risk=y_test,  # ground truth, aligned on the index
)
X_test.head()

In [None]:
def k_s_statistics_gain_lift(data, predicted_probability, ground_truth, response_name='Risk', n_bins=10):
    """
    Build a K-S / gain / lift table from model scores.

    Rows are split into `n_bins` quantile groups of the predicted probability and
    listed from the highest-scoring group to the lowest. The K-S statistic is the
    difference between the cumulative share of events and the cumulative share of
    non-events; its maximum marks the best separation point between the two groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Scored data set. It is not modified.
    predicted_probability : str
        Name of the column holding the model's predicted probability of the event.
    ground_truth : str
        Name of the column holding the actual labels as integers (1 = event, 0 = non-event).
    response_name : str, default 'Risk'
        Name of the event (e.g. default, fraud, churn); only used in the output column names.
    n_bins : int, default 10
        Number of quantile groups (10 = deciles).

    Returns
    -------
    pandas.DataFrame
        One row per quantile group with the columns 'Total count', 'Number of <name>',
        'Number of Non-<name>', '<name>_Rate%', 'Percent of <name>%',
        'Percent of Non-<name>%', 'ks_stats', 'max_ks' ('*****' on the row(s) with the
        largest K-S), 'Gain' (cumulative % of events captured) and 'Lift' (cumulative
        gain divided by the cumulative share of the population).

    Raises
    ------
    ValueError
        Raised by `pandas.qcut` when the quantile bin edges are not unique
        (too many tied probabilities for the requested number of bins).
    """
    event_col = 'Number of ' + response_name
    non_event_col = 'Number of ' + 'Non-' + response_name

    # Work on a two-column copy so the caller's frame never gains a helper column.
    scored = data[[predicted_probability, ground_truth]].copy()
    scored['decile_group'] = pd.qcut(scored[predicted_probability], q=n_bins)

    # Per group: number of rows and number of events. Only the label column is aggregated.
    KS_data = (
        scored.groupby('decile_group', observed=False)[ground_truth]
        .agg(['count', 'sum'])
        .sort_index(ascending=False)  # highest-probability group first
    )
    KS_data.columns = ['Total count', event_col]
    KS_data[non_event_col] = KS_data['Total count'] - KS_data[event_col]

    # Unrounded shares; everything cumulative is derived from these so that rounding
    # errors do not accumulate (a cumsum of rounded values can end at 99.99).
    event_share = KS_data[event_col] / KS_data[event_col].sum()
    non_event_share = KS_data[non_event_col] / KS_data[non_event_col].sum()
    population_share = KS_data['Total count'] / KS_data['Total count'].sum()
    cum_event_share = event_share.cumsum()

    KS_data[response_name + '_Rate' + '%'] = (100 * (KS_data[event_col] / KS_data['Total count'])).round(2)
    KS_data['Percent of ' + response_name + '%'] = (100 * event_share).round(2)
    KS_data['Percent of ' + 'Non-' + response_name + '%'] = (100 * non_event_share).round(2)

    KS_data['ks_stats'] = (100 * (cum_event_share - non_event_share.cumsum())).round(2)
    KS_data['max_ks'] = np.where(KS_data['ks_stats'] == KS_data['ks_stats'].max(), '*****', '')

    # Gain = cumulative percentage of all events captured down to this group.
    KS_data['Gain'] = (100 * cum_event_share).round(2)
    # Lift = cumulative gain / cumulative share of the population actually covered.
    # Using the real group sizes keeps the lift correct when ties make the groups unequal.
    KS_data['Lift'] = (cum_event_share / population_share.cumsum()).round(2)
    return KS_data


In [None]:
# K-S / gain / lift table for the Gradient Boosting scores on the test set
gbm_ks_data = k_s_statistics_gain_lift(X_test, 'gbm_predicted_probability', 'Risk')
gbm_ks_data

In [None]:
# K-S / gain / lift table for the Random Forest scores on the test set
rf_ks_data = k_s_statistics_gain_lift(X_test, 'rf_predicted_probability', 'Risk')
rf_ks_data

## KS statistics Inference:
The KS statistic for Random Forest is 39.67, which is higher than the GBM model's KS value of 37.26.
Hence the Random Forest model is performing better than GBM. The same check can be used to compare the performance of other models.

# Model selection from  Gain and Lift graph   

# Gain and Lift Charts

Gain and Lift charts help us visualise the performance of our model in comparison to the base model (no model). However, in contrast to the confusion matrix, which evaluates models on the whole population, a gain or lift chart evaluates model performance on a portion of the population, i.e. Gain and Lift charts can help us understand how our model is performing on different sections of the data.

1. [source](https://www.datavedas.com/model-evaluation-classification-models/#:~:text=Higher%20K%2DS%20value%20means%20that,separate%20class%20label%20of%20observations.)
2. [source](https://towardsdatascience.com/how-to-determine-the-best-model-6b9c584d0db4)

In [None]:
def model_selection_by_gain_chart(model_gains_dict):
    """
    Plot cumulative gain curves for several models against a random baseline.

    Parameters
    ----------
    model_gains_dict : dict
        Maps a model name to its sequence of cumulative gain values (in %), one per
        decile. The sequences are not modified.

    Returns
    -------
    None
        The plotly figure is displayed with `fig.show()`.
    """
    # 0%, 10%, ..., 100% of the data set
    population_pct = list(range(0, 100 + 10, 10))

    fig = go.Figure()
    # Random model: capturing x% of the data yields x% of the events (diagonal).
    fig.add_trace(go.Scatter(x=population_pct, y=population_pct,
                    mode='lines+markers', name='Random Model'))
    for model_name, model_gains in model_gains_dict.items():
        # Prepend the (0, 0) origin in a new list. The original used
        # `model_gains.insert(0, 0)`, which mutated the caller's list and made a
        # second call with the same dictionary plot one point too many.
        gains_from_origin = [0, *model_gains]
        fig.add_trace(go.Scatter(x=population_pct, y=gains_from_origin,
                    mode='lines+markers', name=model_name))
    fig.update_xaxes(title_text="% of Data Set")
    fig.update_yaxes(title_text="% of Gain")
    fig.update_layout(title='Gain Charts')
    fig.show()
    

In [None]:
# Compare the cumulative gain curves of both models
model_selection_by_gain_chart(
    model_gains_dict={
        'GradientBoosting': gbm_ks_data['Gain'].tolist(),
        'RandomForest': rf_ks_data['Gain'].tolist(),
    }
)

In [None]:
def model_selection_by_lift_chart(model_lift_dict):
    """
    Plot lift curves for several models against the constant lift of 1.

    model_lift_dict maps a model name to its sequence of lift values, one per decile.
    The figure is displayed with `fig.show()`; nothing is returned.
    """
    # 10%, 20%, ..., 100% of the data set
    decile_axis = list(range(10, 100 + 10, 10))

    fig = go.Figure()
    # Baseline: a lift of 1 at every decile, labelled 'Random Lift'
    baseline = go.Scatter(x=decile_axis, y=np.repeat(1, 10),
                          mode='lines+markers', name='Random Lift')
    fig.add_trace(baseline)

    for label in model_lift_dict:
        curve = go.Scatter(x=decile_axis, y=model_lift_dict[label],
                           mode='lines+markers', name=label)
        fig.add_trace(curve)

    fig.update_layout(title='Lift Charts')
    fig.update_xaxes(title_text="% of Data Set")
    fig.update_yaxes(title_text="Lift")
    fig.show()

In [None]:
# Compare the lift curves of both models
model_selection_by_lift_chart(
    model_lift_dict={
        'GradientBoosting': gbm_ks_data['Lift'].tolist(),
        'RandomForest': rf_ks_data['Lift'].tolist(),
    }
)

# Gain and Lift chart Inference:
The Gain and Lift charts also show that the RF model has more gain and lift than GBM.
So based on KS, Gain and Lift, RF is the best model in this scenario. You can try other models and compare them in the same way.

# More Inference based on the RF model:


In [None]:
# Re-display the Random Forest decile table for the interpretation that follows
rf_ks_data

1. Lift for the 1st decile is 2.71, which means decile 1 of the Random Forest model captures 2.71 times as many risky customers as random selection would.
2. From the table it is clear that we are able to capture 78% of the risky customers within the first 5 deciles.

### Please do comment if you have any concerns, and do vote if you like my work :-)
### Thank You