<h1>**Preprocessing on German Credit Risk data set**</h1>

## 1.) Imports

In [None]:
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd

from scipy import stats 
import math

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.preprocessing import label_binarize

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

#from colorsetup import colors, palette

## 2.) Read the data

In [None]:
# Raw credit data; the first CSV column is used as the row index (index_col=0).
raw_csv_path = "../input/german-credit-data-with-risk/german_credit_data.csv"
gcr_data = pd.read_csv(raw_csv_path, index_col=0)
#gcr_data = pd.read_csv("gcr_data_imputed.csv")
#gcr_data = pd.read_csv("gcr_processed.csv")

In [None]:
gcr_data.head()

In [None]:
gcr_data.nunique()

In [None]:
gcr_data.info()

In [None]:
gcr_data.isnull().sum()

In [None]:
gcr_data['Checking account'].value_counts()

**Credit amount** is the maximum amount the lender has committed to lend.

In [None]:
# Every column except the 'Risk' target is treated as a feature.
feature_cols = list(gcr_data.columns[gcr_data.columns != 'Risk'])
X = gcr_data.loc[:, feature_cols]
y = gcr_data.loc[:, 'Risk']

In [None]:
X.head()

## 3.) Visualize the data

In [None]:
# Pearson correlation is only defined for numeric columns. Selecting them
# explicitly keeps this cell working on pandas >= 2.0, where DataFrame.corr()
# no longer drops non-numeric columns silently and raises instead.
numeric_corr = gcr_data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True);

In [None]:
# Credit amount per job category, split by sex.
sns.barplot(data=gcr_data, x="Job", y="Credit amount", hue="Sex");

In [None]:
# Credit amount per sex, split by risk class.
sns.barplot(data=gcr_data, x="Sex", y="Credit amount", hue="Risk");

In [None]:
# Loan duration per housing type, one line per sex.
sex_palette = {"male": "blue", "female": "pink"}
sns.pointplot(
    data=gcr_data, x="Housing", y="Duration", hue="Sex",
    palette=sex_palette,
    markers=["*", "o"],
    linestyles=["-", "--"],
);

In [None]:
# Binary target for the correlation: 1 where Risk == 'good', 0 otherwise.
y_tar = (gcr_data['Risk'] == 'good').astype(int)

# Correlate the numeric feature columns with the target. As before, the last
# column is excluded (the code assumes it is the target). Selecting numeric
# dtypes explicitly avoids the error pandas >= 2.0 raises when corrwith()
# meets text columns.
numeric_features = gcr_data[list(gcr_data.columns[:-1])].select_dtypes(include='number')
correlations = numeric_features.corrwith(y_tar).sort_values()

sns.set_context('talk')
#sns.set_palette(palette)
sns.set_style('white')

sns.pairplot(gcr_data, hue='Risk');

In [None]:
# Bar chart of each feature's Pearson correlation with the target.
ax = correlations.plot.bar()
ax.set_ylim(-1, 1)
ax.set_ylabel('pearson correlation');

## 4.) Feature Engineering

### Label encoding categorical variables

In [None]:
# Split the columns by dtype. select_dtypes replaces the comparison against
# the 'int' alias, whose width is platform-dependent (it can mean int32) and
# can therefore miss the int64 columns pandas creates when reading the CSV.
cat_cols = gcr_data.select_dtypes(include='object').columns
num_cols = gcr_data.select_dtypes(include=np.integer).columns
#ordinal_cols = [ 'Housing', 'Saving accounts', 'Checking account']

#nominal_cols = ['Purpose']


In [None]:
print(cat_cols)

In [None]:
# Map the Housing labels to integer codes (free=1, rent=2, own=3).
housing_codes = {'free': 1, 'rent': 2, 'own': 3}
replace_map = {'Housing': housing_codes}
gcr_data = gcr_data.replace(replace_map)

In [None]:
# Map the 'Saving accounts' labels to integer codes 1..4; missing values are
# not in the map and therefore stay missing.
saving_codes = {'little': 1, 'moderate': 2, 'quite rich': 3, 'rich': 4}
replace_map = {'Saving accounts': saving_codes}
gcr_data = gcr_data.replace(replace_map)

In [None]:
# Map the 'Checking account' labels to integer codes 1..3; missing values are
# not in the map and therefore stay missing.
checking_codes = {'little': 1, 'moderate': 2, 'rich': 3}
replace_map = {'Checking account': checking_codes}
gcr_data = gcr_data.replace(replace_map)

In [None]:
gcr_data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the two binary text columns. One encoder is re-fitted for
# each column, so afterwards `le` only holds the classes of the last column.
binary_cols = ['Sex', 'Risk']
le = LabelEncoder()

encoded_binary = {column: le.fit_transform(gcr_data[column]) for column in binary_cols}
gcr_data = gcr_data.assign(**encoded_binary)

In [None]:
gcr_data.head()

In [None]:
# One-hot encode the remaining categorical columns, keeping every level
# (drop_first=False).
# NOTE(review): get_dummies defaults to dummy_na=False, so a row whose value is
# missing gets 0 in every dummy column of that feature instead of NaN. Confirm
# this is intended, since it leaves nothing for the imputation step below.
nominal_cols = ['Housing', 'Saving accounts', 'Checking account', 'Purpose']
gcr_data = pd.get_dummies(data=gcr_data, columns=nominal_cols, drop_first=False)

In [None]:
gcr_data.head()

In [None]:
gcr_data.nunique()

<h2>Categorical Missing Values Imputation</h2>


### 1) SimpleImputer

from sklearn.impute import SimpleImputer

values = gcr_data.values 
imputer = SimpleImputer(missing_values= np.nan, strategy='constant', fill_value='missing') 
transformed_values = imputer.fit_transform(values) 

gcr_data_imputed = pd.DataFrame(transformed_values, columns=gcr_data.columns)

gcr_data_imputed["Credit amount"] = gcr_data_imputed["Credit amount"].astype(int)
gcr_data_imputed["Duration"] = gcr_data_imputed["Duration"].astype(int)
gcr_data_imputed["Job"] = gcr_data_imputed["Job"].astype(int)
gcr_data_imputed["Age"] = gcr_data_imputed["Age"].astype(int)

outputfile = 'gcr_data_imputed.csv'
gcr_data_imputed.to_csv(outputfile, index=False)

### 2) IterativeImputer

In [None]:
# Re-derive features and target from the encoded frame: 'Risk' is the target,
# every other column is a feature.
feature_cols = list(gcr_data.columns[gcr_data.columns != 'Risk'])
X = gcr_data.loc[:, feature_cols]
y = gcr_data.loc[:, 'Risk']

In [None]:
X.head()

In [None]:
# Multivariate imputer: each feature with missing values is modelled from the
# other features with a Bayesian ridge regressor; features are visited from the
# fewest to the most missing values ('ascending').
imputer = IterativeImputer(
    estimator=BayesianRidge(),
    n_nearest_features=None,
    imputation_order='ascending',
)

In [None]:
# fit on the dataset
imputer.fit(X)

In [None]:
# transform the dataset
Xtrans = imputer.transform(X)

In [None]:
# Rebuild a labelled frame from the imputer's ndarray output. The original
# index is kept along with the column names so that X still aligns row-for-row
# with y when the two are concatenated on the index later in this notebook.
X = pd.DataFrame(Xtrans, columns=X.columns, index=X.index)

In [None]:
X.isnull().sum()

### Log-transforming skewed variables

In [None]:
X.dtypes

In [None]:
#num_cols = X.columns[X.dtypes == 'float']
num_cols = X.columns
num_cols

In [None]:
# Skewness of every candidate column.
skew_vals = X[num_cols].skew()

# Keep the columns whose absolute skew exceeds the limit, most positive first.
skew_limit = 0.75
skew_table = skew_vals.sort_values(ascending=False).to_frame(name='Skew')
skew_cols = skew_table[skew_table['Skew'].abs() > skew_limit]
skew_cols

In [None]:
# Replace every column flagged as skewed with log(1 + x).
for skewed_col in skew_cols.index:
    X[skewed_col] = np.log1p(X[skewed_col])

In [None]:
X.head()

In [None]:
X.isnull().sum()

In [None]:
gcr_data_imputed = pd.concat([X, y], axis=1)

In [None]:
outputfile = 'gcr_data_imputed2.csv'
gcr_data_imputed.to_csv(outputfile, index=False)

<h1>**AdaBoost Classifier on German Credit Risk data set**</h1>

# Table of Contents:

**1. [Introduction](#intro_abc)** <br>
    - Information about the data set <br>
**2. [Reason for using this model](#reasons_abc)** <br>
    - The purpose of this specific model <br>
**3. [Libraries](#libraries_abc)** <br>
    - Importing Libraries <br>
    - Importing Dataset <br>
**4. [Preprocess](#preprocessing_abc)** <br>
    - 4.1 Separating feature and target variables <br>
    - 4.2 [Feature Selection](#feature_selection_abc)<br>
    - 4.3 [Splitting X and y into train and test](#split_abc)<br>
**5. [Models](#modelling_abc)**<br>
    - 5.1 AdaBoostClassifier with GridSearchCV<br>
    - 5.2 [Metrics](#metrics_abc)<br>
    - 5.3 [Confusion Matrix and Classification Report](#conmat_abc)<br>
    - 5.4 ROC curve and Precision Recall curve<br>
**6. [Conclusion and Benefits of the model](#summary_abc)** <br>
    The summary of the model implementation

<a id="intro_abc"></a> <br>
# **1. Introduction:** 
<h2>Context</h2>
The original dataset contains 1000 entries with 9 feature variables. Each entry represents a person who takes credit from a bank. Each person is classified as a good or bad credit risk according to the set of attributes.

<h2>Content</h2>
I have already cleaned and preprocessed the data, and I have also created a more relevant feature from two of the most important features in the data set, i.e. Credit amount and Duration. The preprocessed data set is saved in a CSV file, which we use for model training and testing. The selected variables from the original data set are:

<b>Age </b>(numeric)<br>
<b>Sex </b>(text: male, female)<br>
<b>Job </b>(numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)<br>
<b>Housing</b> (text: own, rent, or free)<br>
<b>Saving accounts</b> (text - little, moderate, quite rich, rich)<br>
<b>Checking account </b>(text: little, moderate, rich)<br>
<b>Credit amount</b> (numeric, in DM)<br>
<b>Duration</b> (numeric, in month)<br>
<b>Purpose</b> (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)<br>
<b>Risk </b> (Value target - Good or Bad Risk)<br>

<a id="reasons_abc"></a> <br>
# **2. Reason for using this model**
<h2>Our goal is to: </h2>

- Implement AdaBoostClassifier with GridSearchCV.
- Moreover, we are going to assess various metrics for the model and plot area-under-curve and precision-recall curve.
- We are going to estimate the best estimator i.e. the best hyperparameters for our model.
- The false positive rate is calculated from the confusion matrix to better understand the potential losses incurred by giving loans to people who will default.

<a id="libraries_abc"></a> <br>
# **3. Libraries**

### Imports

In [None]:
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd

from scipy import stats 
import math

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.preprocessing import label_binarize

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

#from colorsetup import colors, palette

### Read the Data

In [None]:
gcr_data = pd.read_csv("gcr_data_imputed2.csv")
#gcr_data = pd.read_csv("gcr_data_imputed.csv")
#gcr_data = pd.read_csv("gcr_processed.csv")

In [None]:
gcr_data.head()

In [None]:
# Derived feature: the 'Credit amount' column divided element-wise by 'Duration'.
gcr_data['Credit_amount/duration'] = gcr_data['Credit amount'].div(gcr_data['Duration'])

In [None]:
gcr_data.head()

<a id="preprocessing_abc"></a> <br>
# **4. Preprocess**

<h2>4.1 Separating feature and target</h2>

In [None]:
#X = gcr_data.drop(['Risk','Credit_amount/duration'], axis=1)
#X = gcr_data.drop(['Risk', 'Credit amount'], axis=1)
# Target is the 'Risk' column; the features are all remaining columns.
y = gcr_data['Risk']

X = gcr_data.drop(columns=['Risk'])

In [None]:
X.shape

<a id="feature_selection_abc"></a>
<h2>4.2 Feature Selection</h2>

### 1. Univariate Selection

In [None]:
# Score every feature against the target with the chi-squared test
# (the selector itself is configured to keep k=6 features).
bestfeatures = SelectKBest(score_func=chi2, k=6)
fit = bestfeatures.fit(X, y)
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

# Put names and scores side by side and show the 10 highest-scoring features.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
top_scores = featureScores.nlargest(10, 'Score')
print(top_scores)

In [None]:
#X = gcr_data[featureScores.nlargest(23, 'Score')['Specs'].values]

### 2. Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Tree-ensemble feature importances. The seed is fixed because this ranking
# decides which feature subsets are evaluated further down; without it the
# ranking can change on every run.
model_fi = ExtraTreesClassifier(random_state=42)
model_fi.fit(X, y)

# Horizontal bar chart of the 14 most important features.
feat_importances = pd.Series(model_fi.feature_importances_, index=X.columns)
ax = feat_importances.nlargest(14).plot(kind='barh')
ax.set(title='ExtraTrees feature importance (top 14)', xlabel='importance')
plt.show()

In [None]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the cross-validated accuracy scores of `model` on (X, y).

    Uses repeated stratified k-fold (10 splits x 3 repeats, random_state=42),
    runs the folds in parallel (n_jobs=-1) and re-raises any fitting error
    (error_score='raise').
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )

# AdaBoost over depth-1 trees, evaluated on the i most important features
# for i = 1 .. number of features.
model_eval = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))

results = []
for i in range(1, X.shape[1] + 1):
    top_features = feat_importances.nlargest(i).index
    scores = evaluate_model(model_eval, X[top_features], y)
    results.append(scores)
    print('> %s) %.3f (%.3f)' % (i, mean(scores), std(scores)))

In [None]:
# Box plot of the cross-validation accuracies, one box per feature count.
no_of_features = list(map(str, range(1, X.shape[1] + 1)))
plt.figure(figsize=(8, 6))
plt.boxplot(results, showmeans=True, labels=no_of_features)
plt.title('No. of features vs. Average Accuracy')
plt.xticks(rotation=75)
plt.show()

### 3. Correlation Matrix with Heatmap

### 4. RFE

In [None]:
# get a list of models to evaluate
def get_models():
    """Build one RFE -> AdaBoost pipeline per candidate RFE estimator.

    Returns:
        dict: maps a short key ('lr', 'per', 'cart', 'rf', 'gbm') to a Pipeline
        whose step 's' is an RFE keeping 5 features, ranked by that key's
        estimator, and whose step 'm' is an AdaBoost classifier over depth-1
        decision trees.
    """
    rfe_estimators = {
        'lr': LogisticRegression,
        'per': Perceptron,
        'cart': DecisionTreeClassifier,
        'rf': RandomForestClassifier,
        'gbm': GradientBoostingClassifier,
    }

    models = dict()
    for key, estimator_cls in rfe_estimators.items():
        selector = RFE(estimator=estimator_cls(), n_features_to_select=5)
        booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))
        models[key] = Pipeline(steps=[('s', selector), ('m', booster)])
    return models
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the cross-validated accuracy scores of `model` on (X, y).

    Uses repeated stratified k-fold (10 splits x 3 repeats) and runs the folds
    in parallel (n_jobs=-1). This definition uses random_state=1 and leaves
    cross_val_score's error_score at its default.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1)

In [None]:
# Cross-validate every RFE pipeline; results[i] holds the scores of names[i].
models = get_models()
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    print('>%s) %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
# Accuracy distribution per RFE estimator (showmeans=True adds the mean marker).
plt.boxplot(results, showmeans=True, labels=names)
plt.title('RFE Estimator vs. Average Accuracy')
plt.show()

In [None]:
# get a list of models to evaluate
def best_estimator():
    """Build RFE -> AdaBoost pipelines for every feature count from 2 upwards.

    Reads the notebook-level `X` to find the number of columns.

    Returns:
        dict: maps str(i), for i in 2 .. X.shape[1], to a Pipeline whose step
        's' is a gradient-boosting RFE keeping i features and whose step 'm'
        is an AdaBoost classifier over depth-1 decision trees.
    """
    def build_pipeline(n_selected):
        selector = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=n_selected)
        booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))
        return Pipeline(steps=[('s', selector), ('m', booster)])

    return {str(i): build_pipeline(i) for i in range(2, X.shape[1] + 1)}

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the cross-validated accuracy scores of `model` on (X, y).

    Uses repeated stratified k-fold (10 splits x 3 repeats, random_state=42),
    runs the folds in parallel (n_jobs=-1) and re-raises any fitting error
    (error_score='raise').
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )

# Cross-validate one pipeline per feature count; results[i] belongs to names[i].
models = best_estimator()
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    print('> %s) %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
# Accuracy distribution per number of RFE-selected features.
plt.figure(figsize=(8, 6))
plt.boxplot(results, showmeans=True, labels=names)
plt.title('No. of features vs. Average Accuracy')
plt.xticks(rotation=75)
plt.show()

In [None]:
# define RFE (seeded, because it decides which 8 features the final model uses)
rfe = RFE(estimator=GradientBoostingClassifier(random_state=42), n_features_to_select=8)
# fit RFE
rfe.fit(X, y)
# summarize all features by position and name; ranking_ holds integers and
# selected features have rank 1
for i, column_name in enumerate(X.columns):
    print('Column: %d (%s), Selected %s, Rank: %d' % (i, column_name, rfe.support_[i], rfe.ranking_[i]))

<a id="split_abc"></a>
<h2>4.3 StratifiedShuffleSplit</h2>

In [None]:
#X = gcr_data[feat_importances.nlargest(12).index]
# Keep only the columns RFE selected. The column list is rebuilt from gcr_data
# without 'Risk' (the frame RFE was fitted on) instead of from the current X,
# so re-running this cell never applies the full-length support_ mask to an
# X that has already been reduced.
rfe_input_columns = gcr_data.drop(columns=['Risk']).columns
X = gcr_data[rfe_input_columns[rfe.support_]]

X.head()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 70/30 split with a fixed seed.
strat_shuf_split = StratifiedShuffleSplit(n_splits=1,
                                          test_size=0.3,
                                          random_state=42)

# split() yields positional row numbers, so rows are picked with .iloc;
# .loc would only be correct while the index happens to be 0..n-1.
train_idx, test_idx = next(strat_shuf_split.split(X, gcr_data['Risk']))

# Create the dataframes
X_train = X.iloc[train_idx]
y_train = gcr_data['Risk'].iloc[train_idx]

X_test  = X.iloc[test_idx]
y_test  = gcr_data['Risk'].iloc[test_idx]

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

<a id="modelling_abc"></a> <br>
# **5. Models**

#### Suppressing any warnings

In [None]:
import warnings

# Ignore every UserWarning and RuntimeWarning from here on.
for noisy_category in (UserWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

<h2>5.1 AdaBoostClassifier with GridSearchCV</h2>

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Base model for the search: AdaBoost over depth-1 decision trees.
ABC = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))

param_grid = {'n_estimators': [100, 150, 200],
              'learning_rate': [0.01, 0.001]}

ABC_GCV = GridSearchCV(ABC,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=-1)

ABC_GCV = ABC_GCV.fit(X_train, y_train)

# The best model
print(ABC_GCV.best_estimator_)

# Refit with fixed hyperparameters. This rebinds ABC_GCV from the search object
# to a plain classifier, so the following cells evaluate this model.
# The base learner is passed positionally, as in the first AdaBoostClassifier
# call above: the `base_estimator=` keyword was renamed to `estimator` in
# scikit-learn 1.2 and removed in 1.4, whereas the positional form works on both.
ABC_GCV = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.01)
ABC_GCV = ABC_GCV.fit(X_train, y_train)

<a id="metrics_abc"></a>
<h2>5.2 Metrics</h2>

In [None]:
y_pred = list()
y_prob = list()

labels = ['ABC_GCV']
models = [ABC_GCV]

for lab, mod in zip(labels, models):
    y_pred.append(pd.Series(mod.predict(X_test), name=lab))
    # Score of the positive class (label 1). The row-wise max of predict_proba
    # is the confidence of whichever class won, which is not a valid ranking
    # score for ROC / precision-recall analysis.
    positive_col = list(mod.classes_).index(1)
    y_prob.append(pd.Series(mod.predict_proba(X_test)[:, positive_col], name=lab))

y_pred = pd.concat(y_pred, axis=1)
y_prob = pd.concat(y_prob, axis=1)

metrics = list()
cm = dict()

for lab in labels:

    # Support-weighted precision, recall and f-score
    precision, recall, fscore, _ = score(y_test, y_pred[lab], average='weighted')

    # The usual way to calculate accuracy
    accuracy = accuracy_score(y_test, y_pred[lab])

    # ROC-AUC is computed from the positive-class probabilities; feeding it
    # hard 0/1 predictions would only measure a single operating point.
    auc = roc_auc_score(y_test, y_prob[lab])

    # Last, the confusion matrix
    cm[lab] = confusion_matrix(y_test, y_pred[lab])

    metrics.append(pd.Series({'precision':precision, 'recall':recall,
                              'fscore':fscore, 'accuracy':accuracy,
                              'auc':auc},
                             name=lab))

metrics = pd.concat(metrics, axis=1)

metrics

<a id="conmat_abc"></a>
<h2>5.3 Confusion Matrix and Classification Report</h2>

In [None]:
sns.set_context('talk')

# Two side-by-side axes, one per label in `labels`.
fig, axList = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
axList = axList.flatten()

# The last axes is switched off.
axList[-1].axis('off')

for ax, lab in zip(axList, labels):
    sns.heatmap(cm[lab], annot=True, fmt='d', ax=ax)
    ax.set_title(lab)

plt.tight_layout()

In [None]:
from sklearn.metrics import classification_report, f1_score

banner = '#'*60
print(banner)

# Test-set predictions of the fitted classifier and the derived scores.
y_pred_gb = ABC_GCV.predict(X_test)
test_accuracy = round(accuracy_score(y_test, y_pred_gb), 3)
test_f1 = round(f1_score(y_test, y_pred_gb), 3)

print('AdaBoostClassifier')
print(classification_report(y_test, y_pred_gb))
print('Accuracy score: ', test_accuracy)
print('F1 Score: ', test_f1)

print('\n')
print(banner)

<h2>5.4 ROC curve and Precision-Recall curve</h2>

In [None]:
sns.set_context('talk')

# Name and positive-class scores of the model being plotted. Both are derived
# here explicitly instead of relying on a loop variable left over from an
# earlier cell; the curves need the probability of class 1, not the maximum
# class probability.
lab = labels[0]
positive_col = list(ABC_GCV.classes_).index(1)
pos_scores = ABC_GCV.predict_proba(X_test)[:, positive_col]

fig, axList = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(10, 5)

# Plot the ROC-AUC curve

ax = axList[0]
fpr, tpr, thresholds = roc_curve(y_test, pos_scores)
ax.plot(fpr, tpr, linewidth=5)

# It is customary to draw a diagonal dotted line in ROC plots.
# This is to indicate completely random prediction. Deviation from this
# dotted line towards the upper left corner signifies the power of the model.
ax.plot([0, 1], [0, 1], ls='--', color='black', lw=.3)
ax.set(xlabel='FPR',
       ylabel='TPR',
       xlim=[-.01, 1.01], ylim=[-.01, 1.01],
       title='ROC curve: {}'.format(lab))
ax.grid(True)

# Plot the precision-recall curve

ax = axList[1]
precision, recall, _ = precision_recall_curve(y_test, pos_scores)
ax.plot(recall, precision, linewidth=5)
ax.set(xlabel='Recall', ylabel='Precision',
       xlim=[-.01, 1.01], ylim=[-.01, 1.01],
       title='Precision-Recall curve: {}'.format(lab))
ax.grid(True)

plt.tight_layout()

<a id="summary_abc"></a> <br>
# **6. Conclusion and Benefits of the model**

- The AdaBoostClassifier with GridSearchCV gives a false positive rate (FPR) of 20% and an accuracy of 71.0%.
- Changing the number of important features does not affect the metrics.

########################################################################################################################################