In [None]:
import operator as op
import random
random.seed(123)

import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.cluster import KMeans
import sklearn.metrics as skm
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12, 8)})

from sklearn.exceptions import ConvergenceWarning
import warnings
# Suppress every warning category from this point on so cell outputs stay clean.
warnings.filterwarnings("ignore")
# NOTE(review): while the blanket filter above is active, this targeted
# ConvergenceWarning filter is redundant; it only matters if the line above
# is removed.
warnings.filterwarnings("ignore", category=ConvergenceWarning,
                        module="sklearn")

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# 1. Data Import

In [None]:
# Read the German credit CSV into the working frame and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/german-credit/german_credit_data.csv'
)
df.head(5)

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count of missing values in each column.
df.isna().sum()

# 2. Feature Engineering
### From the snapshot of the data frame and its info, we need to:
* Drop a redundant column ('Unnamed: 0').
* Encode the categorical variables.
* Impute the missing values.

In [None]:
# Drop the repeated 'Unnamed: 0' column.
# Rebinding `df` with errors='ignore' (instead of an in-place drop) keeps this
# cell idempotent: re-running it after the column is gone no longer raises
# a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Imputing missing values
# Step 1: map every missing-value marker ('?', the string 'NaN', and real NaN)
# to the sentinel -1 across the whole frame, so the imputer below has a single
# value to look for.
df.replace(['?', 'NaN', np.nan], -1, inplace=True) 
# NOTE(review): despite the name, `num_vars` holds the two account columns that
# are cast to categorical below; the name is kept because other cells may use it.
num_vars= ['Saving accounts', 'Checking account']
for i in num_vars:
    df[i] =  df[i].astype('category')
# Step 2: replace the -1 sentinel in each of the two columns with that column's
# most frequent value.
imp = SimpleImputer(missing_values=-1, strategy='most_frequent')
df[num_vars] = imp.fit_transform(df[num_vars])

In [None]:
# Encoding: add an integer-coded "<column>_cat" twin for every categorical column.
cat_vars = ['Sex', 'Job', 'Housing','Purpose', 'Saving accounts', 'Checking account']
encoded = {
    f"{col}_cat": df[col].astype('category').cat.codes
    for col in cat_vars
}
for new_col, codes in encoded.items():
    df[new_col] = codes
df.head()

# 3. Exploratory Data Analysis

## What does the relationship between variables look like?

In [None]:
# Correlate the numeric columns only. The frame still holds string columns
# (e.g. 'Sex', 'Housing'), and pandas >= 2.0 raises when asked to correlate
# them; selecting numeric dtypes explicitly works on every pandas version.
corr = df.select_dtypes(include='number').corr()

# Masking to show only one side of the matrix: hide the upper triangle
# (diagonal included) so each pair of variables appears once.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Actual correlation matrix as a heatmap
sns.heatmap(corr, annot=True, mask=mask)
plt.show()

## Based on the 0.62 coefficient, how do Credit amount and Duration relate on a scatterplot?

In [None]:
# Scatterplot of the most strongly correlated pair: Duration vs. Credit amount
fig, ax = plt.subplots()
sns.scatterplot(x='Duration', y='Credit amount', data=df, ax=ax)
plt.show()

## Does Age imply greater purchasing power and thus a higher credit amount?

In [None]:
# Line plot of Credit amount against Age
fig, ax = plt.subplots()
sns.lineplot(x='Age', y='Credit amount', data=df, ax=ax)
plt.show()

## What are the demographics of our borrowers?

In [None]:
# Two side-by-side panels with independent axes: Sex counts (left) and the
# Age distribution with a KDE overlay (right).
fig, (ax1, ax2) = plt.subplots(
    nrows=1, ncols=2,
    sharex=False, sharey=False,
    gridspec_kw={'hspace': 0.2, 'wspace': 0.2},
)

sns.histplot(data=df, x='Sex', ax=ax1)
sns.histplot(data=df, x='Age', bins=10, kde=True, ax=ax2)
plt.show()

## How do different Job types fare in terms of Credit Amount and Duration?

In [None]:
# Two panels sharing the Job x-axis: Credit amount (left) and Duration (right),
# each drawn as violins split by Sex.
fig, (ax1, ax2) = plt.subplots(
    nrows=1, ncols=2,
    sharex='all', sharey=False,
    gridspec_kw={'hspace': 0.2, 'wspace': 0.2},
)

for panel, target in ((ax1, 'Credit amount'), (ax2, 'Duration')):
    sns.violinplot(ax=panel, data=df, x='Job', y=target,
                   hue='Sex', split=True)

plt.show()

# 4. Clustering

In [None]:
# Registry for the performance metrics of each model, starting empty
models = dict()

## K-Means

In [None]:
class kmeans():
    """Elbow-method helper built on scikit-learn's ``KMeans``.

    Typical use: ``kmeans(df).pre_processing().fit_pred()``.

    Attributes:
        df: The input data frame, stored as given.
        X_scaled: Standardised feature matrix; ``None`` until
            ``pre_processing`` has run.
        sse: Inertia (sum of squared errors) for each cluster count tried by
            the last ``fit_pred`` call; ``None`` before that.
    """

    def __init__(self, df):
        self.df = df
        self.X_scaled = None
        self.sse = None

    def pre_processing(self, drop_cols=None):
        """Drop the given columns and standardise what remains.

        Args:
            drop_cols: Column labels to remove before scaling. Defaults to the
                notebook-level ``cat_vars`` list.

        Returns:
            ``self``, so calls can be chained.
        """
        if drop_cols is None:
            drop_cols = cat_vars
        X = self.df.drop(columns=list(drop_cols)).values
        self.X_scaled = StandardScaler().fit_transform(X)
        return self

    def fit_pred(self, max_k=20):
        """Fit KMeans for k = 1..max_k and plot SSE against k (elbow plot).

        The per-k inertia values are kept in ``self.sse``. Nothing is returned,
        so calling this as the last statement of a cell shows only the plot.

        Args:
            max_k: Largest number of clusters to try. It is capped at the
                number of samples, because KMeans cannot fit more clusters
                than there are rows.

        Raises:
            ValueError: If ``max_k`` is smaller than 1.
        """
        if max_k < 1:
            raise ValueError("max_k must be at least 1")

        # Scale lazily, so calling fit_pred() first no longer fails on a
        # missing X_scaled attribute.
        if getattr(self, "X_scaled", None) is None:
            self.pre_processing()

        max_k = min(max_k, len(self.X_scaled))
        cluster_counts = list(range(1, max_k + 1))

        kmeans_kwargs = {"init": "random",
                         "n_init": 10,
                         "max_iter": 100,
                         "random_state": 2}

        # `estimator` rather than `kmeans`, to avoid shadowing the class name.
        self.sse = []
        for k in cluster_counts:
            estimator = KMeans(n_clusters=k, **kmeans_kwargs)
            estimator.fit(self.X_scaled)
            self.sse.append(estimator.inertia_)

        # `marker="o"` draws a point per k; `markers=True` only has an effect
        # together with a `style` variable.
        ax = sns.lineplot(x=cluster_counts, y=self.sse, marker="o")
        ax.set_xticks(cluster_counts)
        ax.set_xlabel("Number of Clusters")
        ax.set_ylabel("SSE")
        plt.show()

# Build the helper, scale the features, then draw the elbow plot.
model = kmeans(df)
model.pre_processing()
model.fit_pred()