In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import arff
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
import scikitplot as skplt
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion, make_union
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelBinarizer # one hot encoding
from sklearn.preprocessing import PolynomialFeatures # add polynomial features

## load our cleaned german credit dataset

In [4]:
df = pd.read_csv('german_credit.csv')

In [5]:
df.dtypes

Creditability                        int64
Account Balance                      int64
Duration of Credit (month)           int64
Payment Status of Previous Credit    int64
Purpose                              int64
Credit Amount                        int64
Value Savings/Stocks                 int64
Length of current employment         int64
Instalment per cent                  int64
Sex & Marital Status                 int64
Guarantors                           int64
Duration in Current address          int64
Most valuable available asset        int64
Age (years)                          int64
Concurrent Credits                   int64
Type of apartment                    int64
No of Credits at this Bank           int64
Occupation                           int64
No of dependents                     int64
Telephone                            int64
Foreign Worker                       int64
dtype: object

In [7]:
## summary statistics of our continuous data

df[['Age (years)', 'Duration of Credit (month)', 'Credit Amount']].describe()

Unnamed: 0,Age (years),Duration of Credit (month),Credit Amount
count,1000.0,1000.0,1000.0
mean,35.542,20.903,3271.248
std,11.35267,12.058814,2822.75176
min,19.0,4.0,250.0
25%,27.0,12.0,1365.5
50%,33.0,18.0,2319.5
75%,42.0,24.0,3972.25
max,75.0,72.0,18424.0


## merging and expanding numerical data types
 - creating polynomial features

In [13]:
pf = PolynomialFeatures(degree=2, interaction_only=False,  
                        include_bias=False)
result = pf.fit_transform(df[['Age (years)', 'Duration of Credit (month)', 'Credit Amount']])

In [14]:
pd.DataFrame(result).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,21.0,18.0,1049.0,441.0,378.0,22029.0,324.0,18882.0,1100401.0
1,36.0,9.0,2799.0,1296.0,324.0,100764.0,81.0,25191.0,7834401.0
2,23.0,12.0,841.0,529.0,276.0,19343.0,144.0,10092.0,707281.0
3,39.0,12.0,2122.0,1521.0,468.0,82758.0,144.0,25464.0,4502884.0
4,38.0,12.0,2171.0,1444.0,456.0,82498.0,144.0,26052.0,4713241.0


In [16]:
result = pd.DataFrame(result)
result.columns = ['Age (years)', 'Duration of Credit (month)', 'Credit Amount', 'Age^2', 'AgexCreditDuration', 
                  'AgexCreditAmount', 'CreditDuration^2', 'CreditDurationxCreditAmount', 'CreditAmount^2' ]

In [34]:
X, y = df.loc[:, df.columns != 'Creditability'], df['Creditability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.74


In [35]:
new_df = df.drop(['Age (years)', 'Duration of Credit (month)', 'Credit Amount'], axis=1)
new_df = pd.concat([new_df, result[['Age^2', 'AgexCreditDuration', 
                  'AgexCreditAmount', 'CreditDuration^2', 'CreditDurationxCreditAmount', 'CreditAmount^2']]], axis=1)

In [42]:
X, y = new_df.loc[:, new_df.columns != 'Creditability'], new_df['Creditability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.76
