# Demo German Credit dataset

Author: <alberto.suarez@uam.es> 


In [None]:
# The packages imported need to be installed in the Python environment.

import pandas as pd # data analysis tools
import matplotlib.pyplot as plt # plotting utilities
import seaborn as sns # statistical data visualization
%matplotlib inline

# See also: https://www.kaggle.com/code/diegoeliascosta/soen691-germancreditreport

In [None]:
# Load the German Credit dataset (semicolon-separated CSV file)
# Source: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
#
# NOTE: other cells of this notebook index columns by labels that end with a
# space (e.g. 'credit_amount ', 'age '), so the labels are kept exactly as
# they are read from the file.

data = pd.read_csv('./data/german_credit_data.csv', sep=';')

# Show the column labels, then preview the first 10 rows
print(data.columns)
data.head(10)

In [None]:
# Load the version of the dataset in which the attributes and the class
# labels are numerically encoded, and preview its first 10 rows
data_numeric = pd.read_csv(
    './data/german_credit_data_numeric.csv',
    sep=';',
)
data_numeric.head(10)

In [None]:
# Data exploration
#
# Draws one panel per low-cardinality attribute of `data`: a histogram when
# the attribute is numeric, a bar chart of the value counts otherwise.

import numpy as np

n_plots_per_row = 5

n_unique = data.nunique()

# For displaying purposes, pick attributes that have between 1 and 50 unique values
data_plot = data[[col for col in data if 1 < n_unique[col] < 50]]

n_examples, n_attributes = np.shape(data_plot)
attribute_names = list(data_plot)

# Every attribute needs its own panel: round n_attributes (not
# n_attributes - 1) up to a whole number of rows. Using n_attributes - 1
# leaves the grid one panel short whenever
# n_attributes % n_plots_per_row == 1. Keep at least one row so that the
# figure can always be created.
n_rows_fig = max(1, int(np.ceil(n_attributes / n_plots_per_row)))

fig, axes = plt.subplots(
    nrows=n_rows_fig,
    ncols=n_plots_per_row,
    figsize=(6 * n_plots_per_row, 12 * n_rows_fig),
    dpi=80,
    facecolor='w',
    edgecolor='k',
    squeeze=False,  # always return a 2-D array of axes, even for a single row
)

fontsize = 20

for index_attribute in range(n_attributes):
    row_plot, column_plot = divmod(index_attribute, n_plots_per_row)
    ax = axes[row_plot, column_plot]

    values_plot = data_plot.iloc[:, index_attribute]
    # The type of the first value decides the kind of plot
    if np.issubdtype(type(values_plot.iloc[0]), np.number):
        values_plot.plot.hist(ax=ax)
    else:
        values_plot = values_plot.value_counts()
        values_plot.plot.bar(ax=ax)

    ax.tick_params(axis='x', rotation=90, labelsize=fontsize)
    ax.set_ylabel('counts', fontsize=fontsize)
    ax.set_title(
        '{} (column {})'.format(
            attribute_names[index_attribute],
            index_attribute
        ),
        fontsize=fontsize,
    )

# Hide the panels of the grid that are not used by any attribute
for unused_ax in axes.flat[n_attributes:]:
    unused_ax.set_visible(False)

plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
plt.show()

In [None]:
# Data encoding
#
# Replace the text labels of the checking-account status by integer codes.

account_status_column = 'account_check_status '
account_status_codes = {
    'no checking account': 0,
    '< 0 DM': 1,
    '0 <= ... < 200 DM': 2,
    '>= 200 DM / salary assignments for at least 1 year': 3,
}

# Series.map turns every value that is not a key of the dictionary into NaN.
# Mapping a column that has already been encoded (e.g. when this cell is run
# a second time) would therefore replace all the codes by NaN. Encode only
# while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(data[account_status_column]):
    data[account_status_column] = data[account_status_column].map(
        account_status_codes
    )

data.head(10)

In [None]:
# Correlations between attributes
#
# Scatter plot of the credit amount against the age.

x_attribute = 'credit_amount '
y_attribute = 'age '

# Create the figure before drawing on it: calling plt.figure() after
# plt.scatter() opens a second, empty figure.
fig, ax = plt.subplots()
ax.scatter(data[x_attribute], data[y_attribute])
ax.set_xlabel(x_attribute)
ax.set_ylabel(y_attribute)

plt.show()

In [None]:
# Grid of pairwise plots of the columns of `data`
# (seaborn selects the numeric columns by default)
sns.pairplot(data=data)

In [None]:
# Analyzing segmented data
#
# Split the examples in two segments by thresholding one attribute and
# display, side by side, the distribution of another attribute in each
# segment.

attribute_name = 'credit_amount '  # attribute that defines the segments
threshold = 5000.0
plot_attribute = 'purpose '  # attribute whose distribution is displayed

# Defined here so that the cell does not rely on a value left over from
# another cell
fontsize = 20

masks = [data[attribute_name] <= threshold, data[attribute_name] > threshold]
comparisons = ['<=', '>']  # one per mask, used to label the panels

fig, axes = plt.subplots(nrows=1, ncols=len(masks), figsize=(12, 4))

for mask, comparison, ax in zip(masks, comparisons, axes):
    data_plot = data[mask]
    values_plot = data_plot.loc[:, plot_attribute]

    # Label each panel with the segment it shows
    ax.set_title(
        '{} {} {}'.format(attribute_name, comparison, threshold),
        fontsize=fontsize,
    )

    if values_plot.empty:
        # Nothing to plot for a segment without examples
        ax.text(
            0.5, 0.5, 'no data',
            ha='center', va='center',
            transform=ax.transAxes, fontsize=fontsize,
        )
        continue

    # Decide from the dtype of the column: histogram for numeric values,
    # bar chart of the value counts otherwise (booleans are counted)
    is_numeric = (
        pd.api.types.is_numeric_dtype(values_plot)
        and not pd.api.types.is_bool_dtype(values_plot)
    )
    if is_numeric:
        values_plot.plot.hist(ax=ax)
    else:
        values_plot = values_plot.value_counts()
        values_plot.plot.bar(ax=ax)

    ax.tick_params(axis='x', rotation=90, labelsize=fontsize)
    ax.set_ylabel('counts', fontsize=fontsize)

fig.suptitle('Distribution of {}'.format(plot_attribute), fontsize=fontsize)
fig.tight_layout()

plt.show()

In [None]:
# Clustering
#
# Cluster the numerically encoded examples with k-means and display the
# clusters (a) on two of the original attributes and (b) on the first two
# components of a kernel PCA projection.

from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import normalize

n_clusters = 8
random_state = 0  # fixed seed, so that the clusters are the same in every run

# The first panel colors row i of `data` with the cluster of row i of
# `data_numeric`: both tables must describe the same examples.
assert len(data) == len(data_numeric), (
    'data and data_numeric must have the same number of rows'
)

# normalize() rescales each row (example) to unit Euclidean norm.
# NOTE(review): data_numeric is described as containing the class labels as
# well as the attributes, so the labels take part in the clustering --
# confirm that this is intended.
X_norm = normalize(data_numeric)
y = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    random_state=random_state,
).fit_predict(X_norm)

# Alternative: spectral clustering
# spectral = SpectralClustering(
#     n_clusters=n_clusters, eigen_solver='arpack',
#     affinity="nearest_neighbors")
# y = spectral.fit_predict(X_norm)

# Dimensionality reduction PCA

# Alternative: linear PCA
# X_PCA = PCA(n_components=10).fit_transform(X_norm)

# Nonlinear (kernel PCA)
# The kernel has to be given explicitly: the default kernel of KernelPCA is
# linear, and gamma is ignored by the linear kernel.
gamma = 100000
X_PCA = KernelPCA(
    n_components=10,
    kernel='rbf',
    gamma=gamma,
).fit_transform(X_norm)

# Display clusters

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
x1_attribute = 'credit_amount '
x2_attribute = 'age '

fontsize = 14

# Clusters on two of the original attributes
axs[0].scatter(data[x1_attribute], data[x2_attribute], c=y)
axs[0].set_xlabel(x1_attribute, fontsize=fontsize)
axs[0].set_ylabel(x2_attribute, fontsize=fontsize)

# Clusters on the first two principal components
axs[1].scatter(X_PCA[:, 0], X_PCA[:, 1], c=y)
axs[1].set_xlabel('1st principal component', fontsize=fontsize)
axs[1].set_ylabel('2nd principal component', fontsize=fontsize)

plt.show()