"""
Perform clustering on Bankruptcy.csv and discuss your results. You can change the
number of clusters to different values, and also feel free to use a subset of the attributes in the
data.
You can refer to the following link for the descriptions of the attributes (X1~X18).
https://www.kaggle.com/datasets/utkarshx27/american-companies-bankruptcy-prediction-dataset



Approach
===================
1. Read the data into a dataframe
2. Optionally fix/massage the data if needed (Here we'll not do it) as the data is pretty clean. 
We'll only remove some columns which are random and doesn't play any role in clustering. This will help in
meaningful clustering.
3. Convert categorical fields into numerical field for clustering
4. Normalize the data for each column so that scale of difference for each column is between 0 and 1. This way 
each field will get equivalent influence in clustering.
5. Optionally add weight to specific column, if their possible values are too far from each other and they can 
have handful possible values.
6. Run KMeans analyis on the weighted and unweighted dataset
    a) k=3
        a.1: KMeans on unweighted dataset
        a.2: KMeans on weighted dataset
        
    b) k=4
        b.1: KMeans on unweighted dataset
        b.2: KMeans on weighted dataset

"""

In [1]:
# Import required packages
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler 

In [2]:
# 1. Read the data into a dataframe
# Read the input file into a dataframe
data = pd.read_csv('Bankruptcy.csv')
# Examine the data
data.head(10)

Unnamed: 0,company_name,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18
0,C_6,6838.0,18138.0,995.0,1303.0,594.0,-471.0,738.0,2597.5755,22170.0,25088.0,9253.0,308.0,4032.0,8780.0,-8362.0,22170.0,29033.0,20867.0
1,C_14,502.627,566.984,32.812,15.625,137.076,-316.121,256.91,103.1365,711.359,767.024,302.916,-17.187,144.375,302.518,-127.622,711.359,671.739,695.734
2,C_20,192.518,1064.284,18.792,24.846,13.07,2.513,89.897,167.2437,1099.54,246.872,0.0,6.054,35.256,86.074,139.197,1099.54,95.56,1074.694
3,C_38,1.677,3.088,1.339,-1.644,0.0,-3.613,0.671,26.6986,3.572,30.881,2.144,-2.983,0.484,4.329,-1.411,3.572,11.572,5.216
4,C_53,70.018,349.167,24.162,37.433,19.004,-109.681,39.979,13.3995,423.92,230.324,175.184,13.271,74.753,54.886,-119.894,423.92,272.259,386.487
5,C_131,7.436,15.279,1.34,-5.654,3.267,-9.769,2.348,1.3248,21.799,14.165,4.892,-6.994,6.52,11.041,-63.158,21.799,15.933,27.453
6,C_139,10.981,2.813,0.126,-7.022,2.329,-7.143,0.375,25.647,5.328,12.249,0.0,-7.148,2.515,1.587,-52.772,5.328,1.587,12.35
7,C_154,207.229,205.114,9.006,-50.556,118.205,-0.341,16.566,50.452,355.958,357.354,25.767,-59.562,150.844,140.459,-66.668,355.958,264.469,406.514
8,C_186,109.314,21.934,0.448,-118.65,4.325,-118.312,4.341,76.7372,18.023,119.206,223.087,-119.098,-3.911,31.616,-536.219,18.023,257.503,136.673
9,C_195,21.877,70.493,6.036,10.711,9.33,-6.959,10.981,1.2555,88.408,52.367,0.005,4.675,17.915,87.771,-50.366,88.408,88.308,77.697


In [3]:
# 2. Optionally fix/massage the data if needed (Here we'll not do it) as the data is pretty clean. 
# We'll only remove some columns which are random and doesn't play any role in clustering. This will help in
# meaningful clustering.

# Setup data for clustering
# The id doesn't have any significance here in grouping, so dropping id column.
df = data.drop('id',axis=1)
df.head(10)

Unnamed: 0,age,region,income,married,children,save_act
0,48,INNER_CITY,17546.0,NO,1,NO
1,40,TOWN,30085.1,YES,3,NO
2,51,INNER_CITY,16575.4,YES,0,YES
3,23,TOWN,20375.4,YES,3,NO
4,57,RURAL,50576.3,YES,0,YES
5,57,TOWN,37869.6,YES,2,YES
6,22,RURAL,8877.07,NO,0,NO
7,58,TOWN,24946.6,YES,0,YES
8,37,SUBURBAN,25304.3,YES,2,NO
9,54,TOWN,24212.1,YES,2,YES


In [4]:
# 3. Convert categorical fields into numerical field for clustering
# Analysis of data before clustering
# Here age and income are numerical
# But there are multiple categorical or nomninal fields.
# - region
# - married
# - save_act

# So first act will be to convert them into numerical columns. To do that first we need to find unique values 
# in each categorical columns.
for column in ('region','married','save_act'):
    print("\n\nUnique values in column " + column + ":\n")
    print(df[column].unique())




Unique values in column region:

['INNER_CITY' 'TOWN' 'RURAL' 'SUBURBAN']


Unique values in column married:

['NO' 'YES']


Unique values in column save_act:

['NO' 'YES']


In [5]:
# Based on this create new columns -

df['region_INNER_CITY'] = (df['region'] == 'INNER_CITY').astype(int) # astype converts True to 1 and False to 0
df['region_TOWN'] = (df['region'] == 'TOWN').astype(int) # astype converts True to 1 and False to 0
df['region_RURAL'] = (df['region'] == 'RURAL').astype(int) # astype converts True to 1 and False to 0
df['region_SUBURBAN'] = (df['region'] == 'SUBURBAN').astype(int) # astype converts True to 1 and False to 0


df['Married'] = (df['married'] == 'YES').astype(int) # astype converts True to 1 and False to 0


df['Has_save_act'] = (df['save_act'] == 'YES').astype(int) # astype converts True to 1 and False to 0


df.head(10)

# drop reduandant categorical columns
df.drop(['region','married','save_act'], axis=1, inplace=True)

df.head(10)

Unnamed: 0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act
0,48,17546.0,1,1,0,0,0,0,0
1,40,30085.1,3,0,1,0,0,1,0
2,51,16575.4,0,1,0,0,0,1,1
3,23,20375.4,3,0,1,0,0,1,0
4,57,50576.3,0,0,0,1,0,1,1
5,57,37869.6,2,0,1,0,0,1,1
6,22,8877.07,0,0,0,1,0,0,0
7,58,24946.6,0,0,1,0,0,1,1
8,37,25304.3,2,0,0,0,1,1,0
9,54,24212.1,2,0,1,0,0,1,1


In [6]:
# 4. Normalize the data for each column so that scale of difference for each column is between 0 and 1. 
# This way each field will get equivalent influence in clustering.
# For example income can vary a lot if we just go by numbers, so represent them with a value between 0 and 1
# With max value being considered as 1 and min value being 0. After that every value is normalized to -
#                      current_value
#   new_value = -----------------------------
#                  (max value - min value)
# This is known as MinMax Scaler
# 
# We'll apply this on age, inclome and children

sc = MinMaxScaler()
df[['age', 'income', 'children']] = sc.fit_transform(df[['age', 'income', 'children']])
df.head(10)

Unnamed: 0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act
0,0.612245,0.215634,0.333333,1,0,0,0,0,0
1,0.44898,0.431395,1.0,0,1,0,0,1,0
2,0.673469,0.198933,0.0,1,0,0,0,1,1
3,0.102041,0.26432,1.0,0,1,0,0,1,0
4,0.795918,0.783987,0.0,0,0,1,0,1,1
5,0.795918,0.565343,0.666667,0,1,0,0,1,1
6,0.081633,0.066468,0.0,0,0,1,0,0,0
7,0.816327,0.342977,0.0,0,1,0,0,1,1
8,0.387755,0.349132,0.666667,0,0,0,1,1,0
9,0.734694,0.330338,0.666667,0,1,0,0,1,1


In [7]:
# 5. Optionally add weight to specific column, if their possible values are too far from each other 
# and they can have handful possible values.
# 
# Analysis
# ================
# The region columns have value 0 and 1 and their impact positive or negative is both huge. There are 4 potential 
# values to region ['INNER_CITY' 'TOWN' 'RURAL' 'SUBURBAN']. So ideally their inflence should be 1/4= 0.25
# So these columns can be multiplied by 4.
#
# Similarly Married, Unmarried can be weighted as 0.5(if married, instead of being 0)
#
# And finally having savings account can have a weight of 0.5(instead of being 0)
# So a new dataset will be created with weight. We are naming it df_weight.

df_weight = df.copy() #make a copy of only values
df_weight['region_INNER_CITY'] = df_weight['region_INNER_CITY']*0.25
df_weight['region_TOWN']=df_weight['region_TOWN']*0.25
df_weight['region_RURAL']=df_weight['region_RURAL']*0.25
df_weight['region_SUBURBAN']=df_weight['region_SUBURBAN']*0.25
df_weight['Married']=df_weight['Married']*0.5
df_weight['Has_save_act']=df_weight['Has_save_act']*0.5
        
df_weight.head(10)

Unnamed: 0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act
0,0.612245,0.215634,0.333333,0.25,0.0,0.0,0.0,0.0,0.0
1,0.44898,0.431395,1.0,0.0,0.25,0.0,0.0,0.5,0.0
2,0.673469,0.198933,0.0,0.25,0.0,0.0,0.0,0.5,0.5
3,0.102041,0.26432,1.0,0.0,0.25,0.0,0.0,0.5,0.0
4,0.795918,0.783987,0.0,0.0,0.0,0.25,0.0,0.5,0.5
5,0.795918,0.565343,0.666667,0.0,0.25,0.0,0.0,0.5,0.5
6,0.081633,0.066468,0.0,0.0,0.0,0.25,0.0,0.0,0.0
7,0.816327,0.342977,0.0,0.0,0.25,0.0,0.0,0.5,0.5
8,0.387755,0.349132,0.666667,0.0,0.0,0.0,0.25,0.5,0.0
9,0.734694,0.330338,0.666667,0.0,0.25,0.0,0.0,0.5,0.5


In [8]:
#Run KMeans analyis on the weighted and unweighted dataset 
# 
# a) k=3
# a.1: KMeans on unweighted dataset
# a.2: KMeans on weighted dataset
#


# a.1: KMeans on unweighted dataset
# random_state 30 ensures random numbers are consistent across multiple runs
km_unweighted = KMeans(n_clusters=3, random_state=30).fit(df)

# Add the cluster column in the dataframe

df['cluster'] = km_unweighted.labels_
df.head(10)


Unnamed: 0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act,cluster
0,0.612245,0.215634,0.333333,1,0,0,0,0,0,0
1,0.44898,0.431395,1.0,0,1,0,0,1,0,1
2,0.673469,0.198933,0.0,1,0,0,0,1,1,0
3,0.102041,0.26432,1.0,0,1,0,0,1,0,1
4,0.795918,0.783987,0.0,0,0,1,0,1,1,2
5,0.795918,0.565343,0.666667,0,1,0,0,1,1,1
6,0.081633,0.066468,0.0,0,0,1,0,0,0,2
7,0.816327,0.342977,0.0,0,1,0,0,1,1,1
8,0.387755,0.349132,0.666667,0,0,0,1,1,0,2
9,0.734694,0.330338,0.666667,0,1,0,0,1,1,1


In [9]:
#Run KMeans analyis on the weighted and unweighted dataset 
# a.2: KMeans on weighted dataset


# a.2 : KMeans on weighted dataset
# random_state 30 ensures random numbers are consistent across multiple runs
km_weighted = KMeans(n_clusters=3, random_state=30).fit(df_weight)

# Add the cluster column in the dataframe
df_weight['cluster'] = km_weighted.labels_
df_weight.head(10)

Unnamed: 0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act,cluster
0,0.612245,0.215634,0.333333,0.25,0.0,0.0,0.0,0.0,0.0,2
1,0.44898,0.431395,1.0,0.0,0.25,0.0,0.0,0.5,0.0,0
2,0.673469,0.198933,0.0,0.25,0.0,0.0,0.0,0.5,0.5,2
3,0.102041,0.26432,1.0,0.0,0.25,0.0,0.0,0.5,0.0,0
4,0.795918,0.783987,0.0,0.0,0.0,0.25,0.0,0.5,0.5,1
5,0.795918,0.565343,0.666667,0.0,0.25,0.0,0.0,0.5,0.5,0
6,0.081633,0.066468,0.0,0.0,0.0,0.25,0.0,0.0,0.0,2
7,0.816327,0.342977,0.0,0.0,0.25,0.0,0.0,0.5,0.5,1
8,0.387755,0.349132,0.666667,0.0,0.0,0.0,0.25,0.5,0.0,0
9,0.734694,0.330338,0.666667,0.0,0.25,0.0,0.0,0.5,0.5,0


In [10]:
# Print the cluster size and cluster mean for unweighted dataset having 3 clusters
print("Cluster wise total") 
print(df.groupby('cluster').size())
df.groupby('cluster').mean()

Cluster wise total
cluster
0    269
1    173
2    158
dtype: int64


Unnamed: 0_level_0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.489644,0.375625,0.317224,1.0,0.0,0.0,0.0,0.66171,0.643123
1,0.493925,0.374633,0.333333,0.0,1.0,0.0,0.0,0.66474,0.739884
2,0.516146,0.421147,0.375527,0.0,0.0,0.607595,0.392405,0.651899,0.71519


In [11]:
# Print the cluster size and cluster mean for weighted dataset having 3 clusters
print("Cluster wise total") 
print(df_weight.groupby('cluster').size())
df_weight.groupby('cluster').mean()

Cluster wise total
cluster
0    178
1    192
2    230
dtype: int64


Unnamed: 0_level_0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.449094,0.345176,0.794007,0.103933,0.070225,0.054775,0.021067,0.317416,0.339888
1,0.784332,0.591099,0.194444,0.110677,0.070312,0.036458,0.032552,0.317708,0.429688
2,0.296451,0.249841,0.102899,0.119565,0.075,0.031522,0.023913,0.35,0.278261


In [12]:
#Run KMeans analyis on the weighted and unweighted dataset b) k=4

# b.1: KMeans on unweighted dataset
# b.2: KMeans on weighted dataset

df_k4 = df.copy()
df_k4.drop('cluster', axis=1)
df_weight_k4 = df_weight.copy()
df_weight_k4.drop('cluster', axis=1)
print("Copied to new dataset and dropped previous cluster column before applying KMeans")

Copied to new dataset and dropped previous cluster column before applying KMeans


In [13]:

# b.1: KMeans on unweighted dataset
# random_state 30 ensures random numbers are consistent across multiple runs
km_unweighted = KMeans(n_clusters=4, random_state=30).fit(df_k4)

# Add the cluster column in the dataframe

df_k4['cluster'] = km_unweighted.labels_
df_k4.head(10)

Unnamed: 0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act,cluster
0,0.612245,0.215634,0.333333,1,0,0,0,0,0,0
1,0.44898,0.431395,1.0,0,1,0,0,1,0,3
2,0.673469,0.198933,0.0,1,0,0,0,1,1,0
3,0.102041,0.26432,1.0,0,1,0,0,1,0,3
4,0.795918,0.783987,0.0,0,0,1,0,1,1,2
5,0.795918,0.565343,0.666667,0,1,0,0,1,1,3
6,0.081633,0.066468,0.0,0,0,1,0,0,0,2
7,0.816327,0.342977,0.0,0,1,0,0,1,1,3
8,0.387755,0.349132,0.666667,0,0,0,1,1,0,1
9,0.734694,0.330338,0.666667,0,1,0,0,1,1,3


In [14]:

# b.2: KMeans on weighted dataset
# random_state 30 ensures random numbers are consistent across multiple runs
km_unweighted = KMeans(n_clusters=4, random_state=30).fit(df_weight_k4)

# Add the cluster column in the dataframe

df_weight_k4['cluster'] = km_unweighted.labels_
df_weight_k4.head(10)

Unnamed: 0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act,cluster
0,0.612245,0.215634,0.333333,0.25,0.0,0.0,0.0,0.0,0.0,1
1,0.44898,0.431395,1.0,0.0,0.25,0.0,0.0,0.5,0.0,2
2,0.673469,0.198933,0.0,0.25,0.0,0.0,0.0,0.5,0.5,3
3,0.102041,0.26432,1.0,0.0,0.25,0.0,0.0,0.5,0.0,2
4,0.795918,0.783987,0.0,0.0,0.0,0.25,0.0,0.5,0.5,0
5,0.795918,0.565343,0.666667,0.0,0.25,0.0,0.0,0.5,0.5,2
6,0.081633,0.066468,0.0,0.0,0.0,0.25,0.0,0.0,0.0,1
7,0.816327,0.342977,0.0,0.0,0.25,0.0,0.0,0.5,0.5,0
8,0.387755,0.349132,0.666667,0.0,0.0,0.0,0.25,0.5,0.0,2
9,0.734694,0.330338,0.666667,0.0,0.25,0.0,0.0,0.5,0.5,2


In [15]:
# Print the cluster size and cluster mean for unweighted dataset having 4 clusters
print("Cluster wise total") 
print(df_k4.groupby('cluster').size())
df_k4.groupby('cluster').mean()

Cluster wise total
cluster
0    269
1     62
2     96
3    173
dtype: int64


Unnamed: 0_level_0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.489644,0.375625,0.317224,1.0,0.0,0.0,0.0,0.66171,0.643123
1,0.525016,0.40681,0.322581,0.0,0.0,0.0,1.0,0.677419,0.693548
2,0.510417,0.430405,0.409722,0.0,0.0,1.0,0.0,0.635417,0.729167
3,0.493925,0.374633,0.333333,0.0,1.0,0.0,0.0,0.66474,0.739884


In [16]:
# Print the cluster size and cluster mean for weighted dataset having 4 clusters
print("Cluster wise total") 
print(df_weight_k4.groupby('cluster').size())
df_weight_k4.groupby('cluster').mean()

Cluster wise total
cluster
0    192
1    102
2    178
3    128
dtype: int64


Unnamed: 0_level_0,age,income,children,region_INNER_CITY,region_TOWN,region_RURAL,region_SUBURBAN,Married,Has_save_act
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.784332,0.591099,0.194444,0.110677,0.070312,0.036458,0.032552,0.317708,0.429688
1,0.303121,0.250297,0.104575,0.129902,0.056373,0.036765,0.026961,0.343137,0.0
2,0.449094,0.345176,0.794007,0.103933,0.070225,0.054775,0.021067,0.317416,0.339888
3,0.291135,0.249478,0.101563,0.111328,0.089844,0.027344,0.021484,0.355469,0.5
