# Data
South German Credit. (2019). UCI Machine Learning Repository. https://doi.org/10.24432/C5X89F.

# Preliminaries

Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings; warnings.filterwarnings('ignore')

Read the data file and load it into a DataFrame

In [2]:
# Load the space-delimited South German Credit file into a DataFrame.
df = pd.read_csv('SouthGermanCredit.asc', sep=' ')

Read and print the provided attribute information (code table)

In [3]:
# Attributes Information
# Read the whole code table into memory, then print it once the file is closed.
with open('codetable.txt') as codetable_file:
    lines = codetable_file.read()
print(lines)

$`laufkont = status`
                                               
 1 : no checking account                       
 2 : ... < 0 DM                                
 3 : 0<= ... < 200 DM                          
 4 : ... >= 200 DM / salary for at least 1 year

$`laufzeit = duration`
     

$`moral = credit_history`
                                                
 0 : delay in paying off in the past            
 1 : critical account/other credits elsewhere   
 2 : no credits taken/all credits paid back duly
 3 : existing credits paid back duly till now   
 4 : all credits at this bank paid back duly    

$`verw = purpose`
                        
 0 : others             
 1 : car (new)          
 2 : car (used)         
 3 : furniture/equipment
 4 : radio/television   
 5 : domestic appliances
 6 : repairs            
 7 : education          
 8 : vacation           
 9 : retraining         
 10 : business          

$`hoehe = amount`
     

$`sparkont = savings`
                     

Translate the attribute names from German to English to make the data easier to understand

In [4]:
# English names for the 21 columns, assigned positionally (order must match the file).
df.columns = [
    'status', 'duration', 'credit_history', 'purpose', 'amount', 'saving',
    'employment_duration', 'installment_rate', 'personal_status_sex',
    'other_debtors', 'present_residence', 'property', 'age',
    'other_installment_plans', 'housing', 'number_credits', 'job',
    'people_liable', 'telephone', 'foreign_worker', 'credit_risk',
]

In [5]:
# Display the DataFrame (pandas abbreviates the rendering of large frames).
df

Unnamed: 0,status,duration,credit_history,purpose,amount,saving,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,24,2,3,1987,1,3,2,3,1,...,1,21,3,1,1,2,1,1,2,0
996,1,24,2,0,2303,1,5,4,3,2,...,1,45,3,2,1,3,2,1,2,0
997,4,21,4,0,12680,5,5,4,3,1,...,4,30,3,3,1,4,2,2,2,0
998,2,12,2,3,6468,5,1,2,3,1,...,4,52,3,2,1,4,2,2,2,0


# Exploratory Data Analysis (EDA)

Check the data for missing values and inspect the data types

In [6]:
# Print each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   status                   1000 non-null   int64
 1   duration                 1000 non-null   int64
 2   credit_history           1000 non-null   int64
 3   purpose                  1000 non-null   int64
 4   amount                   1000 non-null   int64
 5   saving                   1000 non-null   int64
 6   employment_duration      1000 non-null   int64
 7   installment_rate         1000 non-null   int64
 8   personal_status_sex      1000 non-null   int64
 9   other_debtors            1000 non-null   int64
 10  present_residence        1000 non-null   int64
 11  property                 1000 non-null   int64
 12  age                      1000 non-null   int64
 13  other_installment_plans  1000 non-null   int64
 14  housing                  1000 non-null   int64
 15  numbe

Inspect the descriptive statistics to spot any abnormal values

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,status,duration,credit_history,purpose,amount,saving,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.577,20.903,2.545,2.828,3271.248,2.105,3.384,2.973,2.682,1.145,...,2.358,35.542,2.675,1.928,1.407,2.904,1.845,1.404,1.963,0.7
std,1.257638,12.058814,1.08312,2.744439,2822.75176,1.580023,1.208306,1.118715,0.70808,0.477706,...,1.050209,11.35267,0.705601,0.530186,0.577654,0.653614,0.362086,0.490943,0.188856,0.458487
min,1.0,4.0,0.0,0.0,250.0,1.0,1.0,1.0,1.0,1.0,...,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,1.0,12.0,2.0,1.0,1365.5,1.0,3.0,2.0,2.0,1.0,...,1.0,27.0,3.0,2.0,1.0,3.0,2.0,1.0,2.0,0.0
50%,2.0,18.0,2.0,2.0,2319.5,1.0,3.0,3.0,3.0,1.0,...,2.0,33.0,3.0,2.0,1.0,3.0,2.0,1.0,2.0,1.0
75%,4.0,24.0,4.0,3.0,3972.25,3.0,5.0,4.0,3.0,1.0,...,3.0,42.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,1.0
max,4.0,72.0,4.0,10.0,18424.0,5.0,5.0,4.0,4.0,3.0,...,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,1.0


# Data Preparation

Some categorical variables are encoded as ordinal integers but are actually nominal. These variables are:
* purpose
* personal_status_sex
* other_debtors
* property
* other_installment_plans
* housing
* job

The following code converts them into a form more suitable for nominal variables.

In [8]:
# Human-readable labels for the integer codes of each nominal column.
nominal_labels = {
    'purpose': {
        0: 'others',
        1: 'car_new',
        2: 'car_used',
        3: 'furniture/equipment',
        4: 'radio/television',
        5: 'domestic appliances',
        6: 'repairs',
        7: 'education',
        8: 'vacation',
        9: 'retraining',
        10: 'business',
    },
    'personal_status_sex': {
        1: 'male : divorced/separated',
        2: 'female : non-single or male : single',
        3: 'male : married/widowed',
        4: 'female : single',
    },
    'other_debtors': {
        1: 'none',
        2: 'co-applicant',
        3: 'guarantor',
    },
    'property': {
        1: 'unknown / no property',
        2: 'car or other',
        3: 'building soc. savings agr./life insurance',
        4: 'real estate',
    },
    'other_installment_plans': {
        1: 'bank',
        2: 'stores',
        3: 'none',
    },
    'housing': {
        1: 'for free',
        2: 'rent',
        3: 'own',
    },
    'job': {
        1: 'unemployed/unskilled - non-resident',
        2: 'unskilled - resident',
        3: 'skilled employee/official',
        4: 'manager/self-empl./highly qualif. employee',
    },
}

# Swap every integer code for its label, one column at a time.
for column, labels in nominal_labels.items():
    df[column] = df[column].replace(labels)

In [9]:
# Nominal columns to one-hot encode.
nominals = ['purpose', 'personal_status_sex', 'other_debtors', 'property', 'other_installment_plans', 'housing', 'job']

# Encode all nominal columns in one call instead of concatenating inside a loop
# and dropping in place. `columns=` replaces each listed column with its dummy
# columns (prefixed with the column name) appended after the remaining columns,
# which is the same layout the loop + drop produced.
df = pd.get_dummies(df, columns=nominals)

In [10]:
# Display the DataFrame again to check the dummy columns created for the nominal variables.
df

Unnamed: 0,status,duration,credit_history,amount,saving,employment_duration,installment_rate,present_residence,age,number_credits,...,other_installment_plans_bank,other_installment_plans_none,other_installment_plans_stores,housing_for free,housing_own,housing_rent,job_manager/self-empl./highly qualif. employee,job_skilled employee/official,job_unemployed/unskilled - non-resident,job_unskilled - resident
0,1,18,4,1049,1,2,4,4,21,1,...,0,1,0,1,0,0,0,1,0,0
1,1,9,4,2799,1,3,2,2,36,2,...,0,1,0,1,0,0,0,1,0,0
2,2,12,2,841,2,4,2,4,23,1,...,0,1,0,1,0,0,0,0,0,1
3,1,12,4,2122,1,3,3,2,39,2,...,0,1,0,1,0,0,0,0,0,1
4,1,12,4,2171,1,3,4,4,38,2,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,24,2,1987,1,3,2,4,21,1,...,0,1,0,1,0,0,0,0,0,1
996,1,24,2,2303,1,5,4,1,45,1,...,0,1,0,0,0,1,0,1,0,0
997,4,21,4,12680,5,5,4,4,30,1,...,0,1,0,0,1,0,1,0,0,0
998,2,12,2,6468,5,1,2,1,52,1,...,0,1,0,0,0,1,1,0,0,0


Split the data into training and test sets with an 80:20 ratio

In [11]:
# Separate the target column (credit_risk) from the feature columns.
y = df['credit_risk']
X = df.drop(columns=['credit_risk'])

In [12]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class balance, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

# Model Development

Create a baseline logistic regression model using a pipeline and cross-validation

In [13]:
# 5-fold stratified splitter, shuffled with a fixed seed; shared by the baseline
# cross-validation and the hyperparameter search.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)

In [14]:
# Continuous features that are min-max scaled; every other column passes through unchanged.
numerical = ['duration', 'amount', 'age']
# Standalone scaler instance; the pipeline below creates its own MinMaxScaler.
scaler = MinMaxScaler()

preprocessor = ColumnTransformer(
    transformers=[('scale', MinMaxScaler(), numerical)],
    remainder='passthrough',  # pass through features not listed
)

# Baseline model: preprocessing followed by a default logistic regression.
pipeline1 = Pipeline([
    ('scale', preprocessor),
    ('estimator', LogisticRegression()),
])

In [15]:
# Cross-validate the baseline pipeline on the training set, keeping the
# per-fold training scores so train and validation accuracy can be compared.
cv_score1 = cross_validate(
    pipeline1,
    X_train,
    y_train,
    cv=skf,
    scoring=['accuracy'],
    return_train_score=True,
)

mean_train_accuracy = 100 * cv_score1['train_accuracy'].mean()
mean_validation_accuracy = 100 * cv_score1['test_accuracy'].mean()
print(f'Train accuracy      : {round(mean_train_accuracy, 2)}%')
print(f'Validation accuracy : {round(mean_validation_accuracy, 2)}%')

Train accuracy      : 79.47%
Validation accuracy : 75.63%


## Hyperparameter Tuning

The model's hyperparameters are tuned next. In this case, only the regularization hyperparameter C is searched.

In [16]:
# Search space: C is sampled uniformly between 0 and 1.
params = {
    'estimator__C': stats.uniform(0, 1),
    # Solver is left at its default; candidates considered: 'lbfgs', 'liblinear', 'newton-cholesky'.
}

# 50 random draws of C, each scored by accuracy over the shared stratified folds.
search = RandomizedSearchCV(
    pipeline1,
    params,
    n_iter=50,
    cv=skf,
    scoring='accuracy',
    return_train_score=True,
    random_state=123,
)
# fit() returns the fitted search object; assigning it keeps the cell output empty.
rscv = search.fit(X_train, y_train)

In [17]:
# Hyperparameter setting of the best candidate found by the randomized search.
rscv.best_params_

{'estimator__C': 0.4385722446796244}

In [18]:
# Mean cross-validated train/validation accuracy of the best candidate.
trainScore = rscv.cv_results_['mean_train_score'][rscv.best_index_]
valScore = rscv.cv_results_['mean_test_score'][rscv.best_index_]

# Reuse the pipeline the search already refit on the full training set with the
# best C (RandomizedSearchCV refits by default). This applies exactly the
# preprocessing that was tuned and avoids overwriting X_train / X_test columns
# in place, so the cell can be re-run safely and earlier cells keep seeing the
# original, unscaled splits.
model = rscv.best_estimator_
y_pred = model.predict(X_test)

print('Train accuracy      : {}%'.format(round(100*trainScore,2)))
print('Validation accuracy : {}%'.format(round(100*valScore,2)))
print('Test accuracy       : {}%'.format(round(100*accuracy_score(y_test, y_pred) ,2)))

Train accuracy      : 79.31%
Validation accuracy : 76.0%
Test accuracy       : 77.0%


Hyperparameter tuning slightly increases the validation accuracy and slightly reduces the variance (the gap between training and validation accuracy), which indicates better generalization. In this case, the logistic regression model reaches 77% accuracy on the test set.