# Dataset Information
----
This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005.

Content
There are 25 variables:

* **ID**: ID of each client
* **LIMIT_BAL**: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
* **SEX**: Gender (1=male, 2=female)
* **EDUCATION**: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
* **MARRIAGE**: Marital status (1=married, 2=single, 3=others)
* **AGE**: Age in years
* **PAY_0**: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
* **PAY_2**: Repayment status in August, 2005 (scale same as above)
* **PAY_3**: Repayment status in July, 2005 (scale same as above)
* **PAY_4**: Repayment status in June, 2005 (scale same as above)
* **PAY_5**: Repayment status in May, 2005 (scale same as above)
* **PAY_6**: Repayment status in April, 2005 (scale same as above)
* **BILL_AMT1**: Amount of bill statement in September, 2005 (NT dollar)
* **BILL_AMT2**: Amount of bill statement in August, 2005 (NT dollar)
* **BILL_AMT3**: Amount of bill statement in July, 2005 (NT dollar)
* **BILL_AMT4**: Amount of bill statement in June, 2005 (NT dollar)
* **BILL_AMT5**: Amount of bill statement in May, 2005 (NT dollar)
* **BILL_AMT6**: Amount of bill statement in April, 2005 (NT dollar)
* **PAY_AMT1**: Amount of previous payment in September, 2005 (NT dollar)
* **PAY_AMT2**: Amount of previous payment in August, 2005 (NT dollar)
* **PAY_AMT3**: Amount of previous payment in July, 2005 (NT dollar)
* **PAY_AMT4**: Amount of previous payment in June, 2005 (NT dollar)
* **PAY_AMT5**: Amount of previous payment in May, 2005 (NT dollar)
* **PAY_AMT6**: Amount of previous payment in April, 2005 (NT dollar)
* **default.payment.next.month**: Default payment (1=yes, 0=no)

### Inspiration
Some ideas for exploration:

1. How does the probability of default payment vary by categories of different demographic variables?
2. Which variables are the strongest predictors of default payment?

### Acknowledgements
Any publications based on this dataset should acknowledge the following:

Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

The original dataset can be found at the UCI Machine Learning Repository.

### Import
Import all the libraries needed for the work, from EDA through model creation. Additionally, we load the CSV file (UCI Credit Card).

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import metrics 
from scipy.spatial.distance import cdist 
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


We set options to display all the columns from the dataset.

In [None]:
# Let pandas render up to 999 columns so wide frames are not truncated.
pd.options.display.max_columns = 999

In [None]:
import pandas as pd
from kmodes.kmodes import KModes
# Load the credit-card dataset from the Kaggle input directory into a DataFrame.
UCI_Credit_Card = pd.read_csv("../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv")

In [None]:
# Preview the first rows of the dataset.
UCI_Credit_Card.head()

In [None]:
# Summary statistics of the columns.
UCI_Credit_Card.describe()

In [None]:
# Column names, dtypes and non-null counts.
UCI_Credit_Card.info()

In [None]:
# Distribution plot (top) and box plot (bottom) of the credit limit.
fig, axes = plt.subplots(nrows=2, figsize=(20, 20))
g = sns.distplot(UCI_Credit_Card['LIMIT_BAL'], ax=axes[0])
# set_xlim is a method and must be called; assigning a tuple to it (as the
# previous code did) only overwrote the method and left the limits untouched.
g.set_xlim(0, 1000000)
# The series is passed as `x=` so the box is drawn along the x axis.
g = sns.boxplot(x=UCI_Credit_Card['LIMIT_BAL'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()

In [None]:
# Bucket the credit limit into five ordinal bands using its own quantiles:
#   1: below Q25, 2: [Q25, Q50), 3: [Q50, Q75), 4: [Q75, Q95), 5: the rest.
amt = UCI_Credit_Card['LIMIT_BAL']
q25, q50, q75, q95 = (amt.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['LIM_CREDITO'] = np.select(
    [
        amt < q25,
        (amt >= q25) & (amt < q50),
        (amt >= q50) & (amt < q75),
        (amt >= q75) & (amt < q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Number of clients in each credit-limit band.
# The series is passed as `x=`: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=UCI_Credit_Card['LIM_CREDITO'])

In [None]:
# Number of clients per gender code (1=male, 2=female).
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['SEX'])

In [None]:
# Number of clients per education code.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['EDUCATION'])

In [None]:
# Number of clients per marital-status code.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['MARRIAGE'])

In [None]:
# Distribution plot (top) and box plot (bottom) of client age, then its summary.
fig, axes = plt.subplots(nrows=2, figsize=(20, 20))
g = sns.distplot(UCI_Credit_Card['AGE'], ax=axes[0])
# set_xlim must be called; the previous assignment `set_xlim = (0, 90)` only
# replaced the method with a tuple and never changed the axis range.
g.set_xlim(0, 90)
g = sns.boxplot(x=UCI_Credit_Card['AGE'], ax=axes[1])
g.set_xlim(0, 90)
plt.show()

UCI_Credit_Card['AGE'].describe()

In [None]:
# Bucket age into five ordinal bands using its own quantiles:
#   1: up to Q25, 2: (Q25, Q50], 3: (Q50, Q75], 4: (Q75, Q95], 5: the rest.
age = UCI_Credit_Card['AGE']
q25, q50, q75, q95 = (age.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['EDAD'] = np.select(
    [
        age <= q25,
        (age > q25) & (age <= q50),
        (age > q50) & (age <= q75),
        (age > q75) & (age <= q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Number of clients in each age band.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['EDAD'])

In [None]:
# Frequency of each September repayment-status code, followed by its summary.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAY_0'])
plt.show()
UCI_Credit_Card['PAY_0'].describe()

In [None]:
# Collapse the September repayment status into groups:
#   1 = negative code, 2 = zero, 3 = positive code (months of delay).
# 4 is the fallback when none of the comparisons holds (e.g. a missing value).
status = UCI_Credit_Card['PAY_0']
UCI_Credit_Card['PAT_0'] = np.select(
    [status < 0.0, status == 0.0, status > 0.0],
    [1, 2, 3],
    default=4,
)

In [None]:
# Show the first ten rows, including the derived columns added so far.
UCI_Credit_Card.head(10)

In [None]:
# Frequency of the grouped September repayment status.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAT_0'])
plt.show()
# NOTE(review): in a notebook only the last expression of a cell is echoed,
# so the describe() result below is computed but not shown.
UCI_Credit_Card['PAT_0'].describe()
UCI_Credit_Card['PAT_0'].value_counts()

In [None]:
# Frequency of each August repayment-status code, followed by its summary.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAY_2'])
plt.show()
UCI_Credit_Card['PAY_2'].describe()

In [None]:
# Collapse the August repayment status into groups:
#   1 = negative code, 2 = zero, 3 = positive code (months of delay).
# 4 is the fallback when none of the comparisons holds (e.g. a missing value).
status = UCI_Credit_Card['PAY_2']
UCI_Credit_Card['PAT_2'] = np.select(
    [status < 0.0, status == 0.0, status > 0.0],
    [1, 2, 3],
    default=4,
)

In [None]:
# Frequency of each July repayment-status code, followed by its summary.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAY_3'])
plt.show()
UCI_Credit_Card['PAY_3'].describe()

In [None]:
# Collapse the July repayment status into groups:
#   1 = negative code, 2 = zero, 3 = positive code (months of delay).
# 4 is the fallback when none of the comparisons holds (e.g. a missing value).
status = UCI_Credit_Card['PAY_3']
UCI_Credit_Card['PAT_3'] = np.select(
    [status < 0.0, status == 0.0, status > 0.0],
    [1, 2, 3],
    default=4,
)

In [None]:
# Frequency of the grouped July repayment status.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAT_3'])

In [None]:
# Frequency of each June repayment-status code, followed by its summary.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAY_4'])
plt.show()
UCI_Credit_Card['PAY_4'].describe()

In [None]:
# Collapse the June repayment status into groups:
#   1 = negative code, 2 = zero, 3 = positive code (months of delay).
# 4 is the fallback when none of the comparisons holds (e.g. a missing value).
status = UCI_Credit_Card['PAY_4']
UCI_Credit_Card['PAT_4'] = np.select(
    [status < 0.0, status == 0.0, status > 0.0],
    [1, 2, 3],
    default=4,
)

In [None]:
# Frequency of each May repayment-status code, followed by its summary.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAY_5'])
plt.show()
UCI_Credit_Card['PAY_5'].describe()

In [None]:
# Collapse the May repayment status into groups:
#   1 = negative code, 2 = zero, 3 = positive code (months of delay).
# 4 is the fallback when none of the comparisons holds (e.g. a missing value).
status = UCI_Credit_Card['PAY_5']
UCI_Credit_Card['PAT_5'] = np.select(
    [status < 0.0, status == 0.0, status > 0.0],
    [1, 2, 3],
    default=4,
)

In [None]:
# Frequency of each April repayment-status code, followed by its summary.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['PAY_6'])
plt.show()
UCI_Credit_Card['PAY_6'].describe()

In [None]:
# Collapse the April repayment status into groups:
#   1 = negative code, 2 = zero, 3 = positive code (months of delay).
# 4 is the fallback when none of the comparisons holds (e.g. a missing value).
status = UCI_Credit_Card['PAY_6']
UCI_Credit_Card['PAT_6'] = np.select(
    [status < 0.0, status == 0.0, status > 0.0],
    [1, 2, 3],
    default=4,
)

In [None]:
# Summary statistics again, now including the derived PAT_* columns.
UCI_Credit_Card.describe()

In [None]:
# Distribution plot (top) and box plot (bottom) of the September bill amount,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['BILL_AMT1'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
# NOTE(review): values outside [0, 1e6] now fall outside the visible range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['BILL_AMT1'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['BILL_AMT1'].describe()

In [None]:
# Bucket the September bill amount into five ordinal bands by its quantiles:
#   1: below Q25, 2: [Q25, Q50), 3: [Q50, Q75), 4: [Q75, Q95), 5: the rest.
amt = UCI_Credit_Card['BILL_AMT1']
q25, q50, q75, q95 = (amt.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['B_AMT1'] = np.select(
    [
        amt < q25,
        (amt >= q25) & (amt < q50),
        (amt >= q50) & (amt < q75),
        (amt >= q75) & (amt < q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

# Show the first ten rows with the new column.
UCI_Credit_Card.head(10)

In [None]:
# Summary statistics, now including B_AMT1.
UCI_Credit_Card.describe()

In [None]:
# Number of clients in each September bill-amount band.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=UCI_Credit_Card['B_AMT1'])

In [None]:
# Distribution plot (top) and box plot (bottom) of the August bill amount,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['BILL_AMT2'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
# NOTE(review): values outside [0, 1e6] now fall outside the visible range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['BILL_AMT2'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['BILL_AMT2'].describe()

In [None]:
# Bucket the August bill amount into five ordinal bands by its quantiles:
#   1: below Q25, 2: [Q25, Q50), 3: [Q50, Q75), 4: [Q75, Q95), 5: the rest.
amt = UCI_Credit_Card['BILL_AMT2']
q25, q50, q75, q95 = (amt.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['B_AMT2'] = np.select(
    [
        amt < q25,
        (amt >= q25) & (amt < q50),
        (amt >= q50) & (amt < q75),
        (amt >= q75) & (amt < q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

# Show the first ten rows with the new column.
UCI_Credit_Card.head(10)

In [None]:
# Distribution plot (top) and box plot (bottom) of the July bill amount,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['BILL_AMT3'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
# NOTE(review): values outside [0, 1e6] now fall outside the visible range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['BILL_AMT3'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['BILL_AMT3'].describe()

In [None]:
# Bucket the July bill amount into five ordinal bands by its quantiles:
#   1: below Q25, 2: [Q25, Q50), 3: [Q50, Q75), 4: [Q75, Q95), 5: the rest.
amt = UCI_Credit_Card['BILL_AMT3']
q25, q50, q75, q95 = (amt.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['B_AMT3'] = np.select(
    [
        amt < q25,
        (amt >= q25) & (amt < q50),
        (amt >= q50) & (amt < q75),
        (amt >= q75) & (amt < q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

# Show the first ten rows with the new column.
UCI_Credit_Card.head(10)

In [None]:
# Distribution plot (top) and box plot (bottom) of the June bill amount,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['BILL_AMT4'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
# NOTE(review): values outside [0, 1e6] now fall outside the visible range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['BILL_AMT4'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['BILL_AMT4'].describe()

In [None]:
# Bucket the June bill amount into five ordinal bands by its quantiles:
#   1: below Q25, 2: [Q25, Q50), 3: [Q50, Q75), 4: [Q75, Q95), 5: the rest.
amt = UCI_Credit_Card['BILL_AMT4']
q25, q50, q75, q95 = (amt.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['B_AMT4'] = np.select(
    [
        amt < q25,
        (amt >= q25) & (amt < q50),
        (amt >= q50) & (amt < q75),
        (amt >= q75) & (amt < q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

# Show the first ten rows with the new column.
UCI_Credit_Card.head(10)

In [None]:
# Distribution plot (top) and box plot (bottom) of the May bill amount,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['BILL_AMT5'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
# NOTE(review): values outside [0, 1e6] now fall outside the visible range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['BILL_AMT5'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['BILL_AMT5'].describe()

In [None]:
# Bucket the May bill amount into five ordinal bands by its quantiles:
#   1: below Q25, 2: [Q25, Q50), 3: [Q50, Q75), 4: [Q75, Q95), 5: the rest.
amt = UCI_Credit_Card['BILL_AMT5']
q25, q50, q75, q95 = (amt.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['B_AMT5'] = np.select(
    [
        amt < q25,
        (amt >= q25) & (amt < q50),
        (amt >= q50) & (amt < q75),
        (amt >= q75) & (amt < q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

# Show the first ten rows with the new column.
UCI_Credit_Card.head(10)

In [None]:
# Distribution plot (top) and box plot (bottom) of the April bill amount,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['BILL_AMT6'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
# NOTE(review): values outside [0, 1e6] now fall outside the visible range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['BILL_AMT6'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['BILL_AMT6'].describe()

In [None]:
# Bucket the April bill amount into five ordinal bands by its quantiles:
#   1: below Q25, 2: [Q25, Q50), 3: [Q50, Q75), 4: [Q75, Q95), 5: the rest.
amt = UCI_Credit_Card['BILL_AMT6']
q25, q50, q75, q95 = (amt.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['B_AMT6'] = np.select(
    [
        amt < q25,
        (amt >= q25) & (amt < q50),
        (amt >= q50) & (amt < q75),
        (amt >= q75) & (amt < q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

# Show the first ten rows with the new column.
UCI_Credit_Card.head(10)

In [None]:
# Distribution plot (top) and box plot (bottom) of PAY_AMT1,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['PAY_AMT1'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['PAY_AMT1'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['PAY_AMT1'].describe()

In [None]:
# Bucket PAY_AMT1 into five ordinal bands using its own quantiles:
#   1: up to Q25, 2: (Q25, Q50], 3: (Q50, Q75], 4: (Q75, Q95], 5: the rest.
paid = UCI_Credit_Card['PAY_AMT1']
q25, q50, q75, q95 = (paid.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['P_AMT1'] = np.select(
    [
        paid <= q25,
        (paid > q25) & (paid <= q50),
        (paid > q50) & (paid <= q75),
        (paid > q75) & (paid <= q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Compare the raw payment amount with its band for the first ten rows.
UCI_Credit_Card[['PAY_AMT1','P_AMT1']].head(10)

In [None]:
# Distribution plot (top) and box plot (bottom) of PAY_AMT2,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['PAY_AMT2'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['PAY_AMT2'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['PAY_AMT2'].describe()

In [None]:
# Bucket PAY_AMT2 into five ordinal bands using its own quantiles:
#   1: up to Q25, 2: (Q25, Q50], 3: (Q50, Q75], 4: (Q75, Q95], 5: the rest.
paid = UCI_Credit_Card['PAY_AMT2']
q25, q50, q75, q95 = (paid.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['P_AMT2'] = np.select(
    [
        paid <= q25,
        (paid > q25) & (paid <= q50),
        (paid > q50) & (paid <= q75),
        (paid > q75) & (paid <= q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Distribution plot (top) and box plot (bottom) of PAY_AMT3,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['PAY_AMT3'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['PAY_AMT3'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['PAY_AMT3'].describe()

In [None]:
# Bucket PAY_AMT3 into five ordinal bands using its own quantiles:
#   1: up to Q25, 2: (Q25, Q50], 3: (Q50, Q75], 4: (Q75, Q95], 5: the rest.
# Every bound is taken from PAY_AMT3 itself. The previous version mixed in
# PAY_AMT1 values and PAY_AMT1 quantiles for the upper bounds (copy-paste
# slip), so the bands did not describe PAY_AMT3 consistently.
paid = UCI_Credit_Card['PAY_AMT3']
q25, q50, q75, q95 = (paid.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['P_AMT3'] = np.select(
    [
        paid <= q25,
        (paid > q25) & (paid <= q50),
        (paid > q50) & (paid <= q75),
        (paid > q75) & (paid <= q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Distribution plot (top) and box plot (bottom) of PAY_AMT4,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['PAY_AMT4'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['PAY_AMT4'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['PAY_AMT4'].describe()

In [None]:
# Bucket PAY_AMT4 into five ordinal bands using its own quantiles:
#   1: up to Q25, 2: (Q25, Q50], 3: (Q50, Q75], 4: (Q75, Q95], 5: the rest.
paid = UCI_Credit_Card['PAY_AMT4']
q25, q50, q75, q95 = (paid.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['P_AMT4'] = np.select(
    [
        paid <= q25,
        (paid > q25) & (paid <= q50),
        (paid > q50) & (paid <= q75),
        (paid > q75) & (paid <= q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Distribution plot (top) and box plot (bottom) of PAY_AMT5,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['PAY_AMT5'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['PAY_AMT5'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['PAY_AMT5'].describe()

In [None]:
# Bucket PAY_AMT5 into five ordinal bands using its own quantiles:
#   1: up to Q25, 2: (Q25, Q50], 3: (Q50, Q75], 4: (Q75, Q95], 5: the rest.
paid = UCI_Credit_Card['PAY_AMT5']
q25, q50, q75, q95 = (paid.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['P_AMT5'] = np.select(
    [
        paid <= q25,
        (paid > q25) & (paid <= q50),
        (paid > q50) & (paid <= q75),
        (paid > q75) & (paid <= q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Distribution plot (top) and box plot (bottom) of PAY_AMT6,
# followed by its summary statistics.
fig, axes = plt.subplots(nrows=2, figsize=(10, 10))
g = sns.distplot(UCI_Credit_Card['PAY_AMT6'], ax=axes[0])
# set_xlim must be called; assigning a tuple to it (the previous code) only
# overwrote the method and never changed the axis range.
g.set_xlim(0, 1000000)
g = sns.boxplot(x=UCI_Credit_Card['PAY_AMT6'], ax=axes[1])
g.set_xlim(0, 1000000)
plt.show()
UCI_Credit_Card['PAY_AMT6'].describe()

In [None]:
# Bucket PAY_AMT6 into five ordinal bands using its own quantiles:
#   1: up to Q25, 2: (Q25, Q50], 3: (Q50, Q75], 4: (Q75, Q95], 5: the rest.
paid = UCI_Credit_Card['PAY_AMT6']
q25, q50, q75, q95 = (paid.quantile(p) for p in (0.25, 0.50, 0.75, 0.95))
UCI_Credit_Card['P_AMT6'] = np.select(
    [
        paid <= q25,
        (paid > q25) & (paid <= q50),
        (paid > q50) & (paid <= q75),
        (paid > q75) & (paid <= q95),
    ],
    [1, 2, 3, 4],
    default=5,
)

In [None]:
# Column overview after all derived band/group columns have been added.
UCI_Credit_Card.info()

In [None]:
#pd.get_dummies(df_test, columns=['PAY_0'])

In [None]:
# Keep only the derived categorical features used for clustering:
# grouped repayment status, bill-amount bands and payment-amount bands.
status_cols = ['PAT_' + str(m) for m in (0, 2, 3, 4, 5, 6)]
bill_cols = ['B_AMT' + str(m) for m in range(1, 7)]
paid_cols = ['P_AMT' + str(m) for m in range(1, 7)]
df_test = UCI_Credit_Card.filter(status_cols + bill_cols + paid_cols)

In [None]:
# One-hot encode every clustering feature (one indicator column per category).
dummy_cols = (
    ['PAT_' + str(m) for m in (0, 2, 3, 4, 5, 6)]
    + ['B_AMT' + str(m) for m in range(1, 7)]
    + ['P_AMT' + str(m) for m in range(1, 7)]
)
df_test_dummies = pd.get_dummies(df_test, columns=dummy_cols)

In [None]:
 df_test_dummies.info()

km = KModes(n_clusters=4, init='Huang', n_init=1, verbose=1)
clusters = km.fit_predict(df_test_dummies)
kmodes = km.cluster_centroids_
shape = kmodes.shape
for i in range(shape[0]):
    if sum(kmodes[i,:]) == 0:
        print("\ncluster " + str(i) + ": ")
        print("no-skills cluster")
    else:
        print("\ncluster " + str(i) + ": ")
        cent = kmodes[i,:]
        for j in df_test_dummies.columns[np.nonzero(cent)]:
            print(j)

Parameters
    -----------
    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter : int, default: 300
        Maximum number of iterations of the k-modes algorithm for a
        single run.
    cat_dissim : func, default: matching_dissim
        Dissimilarity function used by the algorithm for categorical variables.
        Defaults to the matching dissimilarity function.
    init : {'Huang', 'Cao', 'random' or an ndarray}, default: 'Cao'
        Method for initialization:
        'Huang': Method in Huang [1997, 1998]
        'Cao': Method in Cao et al. [2009]
        'random': choose 'n_clusters' observations (rows) at random from
        data for the initial centroids.
        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centroids.
    n_init : int, default: 10
        Number of times the k-modes algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of cost.
    verbose : int, optional
        Verbosity mode.

In [None]:
# Elbow search: fit K-Modes (Cao initialisation, one run) with 1, 2 and 3
# clusters and record the cost reported by each fitted model.
cost = []
for num_clusters in range(1, 4):
    kmode = KModes(n_clusters=num_clusters, init="Cao", n_init=1, verbose=1)
    kmode.fit_predict(df_test_dummies)
    cost += [kmode.cost_]

In [None]:
# Plot the recorded cost against the number of clusters tried (1, 2, 3).
y = np.arange(1, 4)
plt.plot(y, cost)

In [None]:
# Final model: K-Modes with two clusters (Cao initialisation, single run).
km = KModes(n_clusters = 2, init = "Cao", n_init = 1, verbose=1)
# Cluster label assigned to each row of the one-hot encoded frame.
fitClusters = km.fit_predict(df_test_dummies)

In [None]:
# Attach each row's cluster label to the full dataset.
# The two reset_index() calls add helper columns ('index', then 'level_0'),
# which are dropped again once the frames have been joined side by side.
df_test_fin = UCI_Credit_Card.reset_index()
clustersDf = pd.DataFrame(fitClusters, columns=['cluster'])
combinedDf = (
    pd.concat([df_test_fin, clustersDf], axis=1)
    .reset_index()
    .drop(['index', 'level_0'], axis=1)
)

In [None]:
# Preview the dataset with the cluster label attached.
combinedDf.head()

In [None]:
# Column overview of the combined frame.
combinedDf.info()

In [None]:
# Number of clients assigned to each cluster.
# Passed as `x=` because recent seaborn treats the first positional argument as `data`.
sns.countplot(x=combinedDf['cluster'])

In [None]:
# Clients per education code, split by cluster; bars ordered by frequency.
fig, ax = plt.subplots(figsize=(20, 10))
education_order = combinedDf['EDUCATION'].value_counts().index
sns.countplot(
    x=combinedDf['EDUCATION'],
    hue=combinedDf['cluster'],
    order=education_order,
    ax=ax,
)
plt.show()

In [None]:
# Clients per marital-status code, split by cluster; bars ordered by frequency.
fig, ax = plt.subplots(figsize=(10, 5))
marriage_order = combinedDf['MARRIAGE'].value_counts().index
sns.countplot(
    x=combinedDf['MARRIAGE'],
    hue=combinedDf['cluster'],
    order=marriage_order,
    ax=ax,
)
plt.show()

In [None]:
# Keep the client ID, the September (PAY_0) and April (PAY_6) repayment
# status and the cluster label for the transition analysis below.
dissimilarity = combinedDf[['ID','PAY_0','PAY_6','cluster']]

In [None]:
# Rows assigned to cluster label 0 (the "_c1" suffix counts clusters from 1).
dissimilarity_c1 = dissimilarity.loc[dissimilarity['cluster'].eq(0)]

In [None]:
# September vs April repayment status within dissimilarity_c1, with totals.
pd.crosstab(
    index=dissimilarity_c1['PAY_0'],
    columns=dissimilarity_c1['PAY_6'],
    margins=True,
)

In [None]:
# Rows assigned to cluster label 1.
dissimilarity_c2 = dissimilarity.loc[dissimilarity['cluster'].eq(1)]

In [None]:
# September vs April repayment status within dissimilarity_c2, with totals.
pd.crosstab(
    index=dissimilarity_c2['PAY_0'],
    columns=dissimilarity_c2['PAY_6'],
    margins=True,
)

In [None]:
# Rows assigned to cluster label 2.
# NOTE(review): the model that produced the labels is fitted with
# n_clusters = 2, so label 2 may never occur and this frame may be empty.
dissimilarity_c3 = dissimilarity.loc[dissimilarity['cluster'].eq(2)]

In [None]:
# September vs April repayment status within dissimilarity_c3, with totals.
pd.crosstab(
    index=dissimilarity_c3['PAY_0'],
    columns=dissimilarity_c3['PAY_6'],
    margins=True,
)

In [None]:
# Rows assigned to cluster label 3.
# NOTE(review): the model that produced the labels is fitted with
# n_clusters = 2, so label 3 may never occur and this frame may be empty.
dissimilarity_c4 = dissimilarity.loc[dissimilarity['cluster'].eq(3)]

In [None]:
# September vs April repayment status within dissimilarity_c4, with totals.
pd.crosstab(
    index=dissimilarity_c4['PAY_0'],
    columns=dissimilarity_c4['PAY_6'],
    margins=True,
)

In [None]:
# September vs April repayment status over ALL rows, with totals.
# NOTE(review): despite the "_c1" suffix this uses the unfiltered frame,
# not a single cluster -- confirm that this is intended.
transicion_c1 = pd.crosstab(
    index=dissimilarity['PAY_0'],
    columns=dissimilarity['PAY_6'],
    margins=True,
)