# Import the libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ROCAUC
import plotly.graph_objects as go

# Read the Data and extract the features

In [None]:
# Load the German credit data and discard the 'Unnamed: 0' column
# (presumably a row index written out with the CSV - verify against the file).
df = pd.read_csv(
    '/kaggle/input/german-credit-data-with-risk/german_credit_data.csv'
).drop(columns=['Unnamed: 0'])
df.head()

In [None]:
# Replace spaces in the column names with underscores (e.g. 'Saving accounts' -> 'Saving_accounts').
df.columns = [column_name.replace(' ', '_') for column_name in df.columns]

In [None]:
# Columns treated as categorical features; cast them to pandas' 'category' dtype.
categorical_columns = ['Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose']
df[categorical_columns] = df[categorical_columns].astype('category')

In [None]:
# Replace every categorical feature with its integer category code.
# NOTE: pandas assigns the code -1 to missing values, if the data contains any.
for feature in ('Sex', 'Job', 'Housing', 'Saving_accounts', 'Checking_account', 'Purpose'):
    df[feature] = df[feature].cat.codes

# Binary target: good -> 0, bad -> 1 (so "bad" is the positive class).
risk_encoding = {'good': 0, 'bad': 1}
df['Risk'] = df['Risk'].map(risk_encoding)

In [None]:
# Preview the first rows of the encoded data frame.
df.head(n=5)

# Use Risk as the target variable, then make a train-test split

In [None]:
# Features are every column except the target 'Risk'.
y = df['Risk']
X = df.drop(columns=['Risk'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a Gradient Boosting Machine and examine the ROC AUC graph.

In [None]:
# Fix the seed so that the fitted model - and therefore the ROC curves and the
# gain/lift figures derived from it - are reproducible between runs.
gbm = GradientBoostingClassifier(n_estimators=500, random_state=42)

# Class labels follow the encoding of Risk: 0 -> "good", 1 -> "bad".
visualizer = ROCAUC(gbm, classes=["good", "bad"])
visualizer.fit(X_train, y_train)   # fit the wrapped model on the training split
visualizer.score(X_test, y_test)   # evaluate on the held-out split
visualizer.show()                  # render the ROC AUC figure

# Calculate the Lift and Gain Charts.

In [None]:

# Work on an explicit copy: X_test is derived from X by train_test_split, and
# adding columns to such a derived frame in place can raise pandas'
# SettingWithCopyWarning.
X_test = X_test.copy()

# Score only the columns the model was trained on, so that this cell can be
# re-run after the helper columns below have already been added to X_test.
X_test['predicted_probability'] = gbm.predict_proba(X_test[X_train.columns])[:, 1]  # Predicted proba for bads (=1)
X_test['Risk'] = y_test  # Ground truth, aligned on the row index
X_test.head()

In [None]:
# Order the scored test rows from the highest to the lowest predicted probability of "bad".
X_test = X_test.sort_values(
    by=['predicted_probability'],
    ascending=False,
)
X_test.head()

In [None]:
# Assign each row to one of 10 quantile bins (deciles) of its predicted probability.
predicted = X_test['predicted_probability']
X_test['decile_group'] = pd.qcut(predicted, q=10)
X_test.head()

In [None]:
# Build the gain/lift table: one row per decile of predicted probability,
# ordered from the highest-probability decile down to the lowest.
lift = (
    X_test.groupby('decile_group', observed=False)['Risk']  # only the ground truth is needed
    .agg([
        'count',  # The total number of customers (data points) in the decile
        'sum',    # The total number of bad customers (Risk=1) in the decile
    ])
    .sort_index(ascending=False)
)
# Risk is 1 for "bad", so the per-decile sum counts bads (not goods).
lift.columns = ['Number of customers', 'Number of bads']
lift['Cumulative bads'] = lift['Number of bads'].cumsum()  # Cumulative sum of the number of bads

# Gain = cumulative percent of events (bads) captured up to and including each decile
lift['Percent of Events'] = lift['Number of bads'] / lift['Number of bads'].sum() * 100
lift['Gain'] = lift['Percent of Events'].cumsum()

# Lift = Gain divided by the cumulative percent of customers examined so far,
# i.e. how many times more bads are captured than a random ordering would capture.
# The denominator is computed from the actual decile sizes (10, 20, ..., 100 when
# the deciles are equally sized) rather than being hard-coded.
cumulative_percent_customers = (
    lift['Number of customers'].cumsum() / lift['Number of customers'].sum() * 100
)
lift['Lift'] = lift['Gain'] / cumulative_percent_customers
lift.head(10)

#  Inference
### Gain - The cumulative percentage of bads (Risk=1) captured up to each decile of predicted probability, starting from the decile with the highest predicted probabilities. 84% of bads can be found across the top 6 deciles.
### Lift - Compares the predictive model with a random model. For example, in the first 10% of the data one can find ~2.2 times as many bads by using the gradient boosted trees model as with a random predictor.

# Plot the Charts

In [None]:
# Gain chart: cumulative % of bads captured versus % of the data set examined.
percent_of_data = list(range(0, 100 + 10, 10))
gain = [0] + lift.Gain.tolist()  # prepend 0 so the model curve starts at the origin

fig = go.Figure()
# Baseline: with a random ordering, the % of bads captured equals the % of data examined.
fig.add_trace(go.Scatter(x=percent_of_data, y=percent_of_data,
                         mode='lines+markers',
                         name='Random model'))
# Model curve; each trace gets a distinct name so the legend tells them apart.
fig.add_trace(go.Scatter(x=percent_of_data, y=gain,
                         mode='lines+markers',
                         name='Gradient boosting model'))

fig.update_xaxes(title_text="% of Data Set")
fig.update_yaxes(title_text="% of Gain")
fig.update_layout(title='Gain Charts')

fig.show()

In [None]:
# Lift chart: lift of the model at each cumulative decile versus a random model.
percent_of_data = list(range(10, 100 + 10, 10))

fig = go.Figure()
# Baseline: a random ordering has a lift of 1 at every depth.
fig.add_trace(go.Scatter(x=percent_of_data, y=[1] * len(percent_of_data),
                         mode='lines+markers',
                         name='Random model'))
# Model curve; each trace gets a distinct name so the legend tells them apart.
fig.add_trace(go.Scatter(x=percent_of_data, y=lift.Lift,
                         mode='lines+markers',
                         name='Gradient boosting model'))

fig.update_xaxes(title_text="% of Data Set")
fig.update_yaxes(title_text="Lift")
fig.update_layout(title='Lift Charts')

fig.show()