In [None]:
!mkdir /root/.kaggle/
!pip install pandas

import scipy.stats
import pandas as pd
import numpy as np
import os
import json
from sklearn.cluster import KMeans


# Installing the Kaggle package
!pip install kaggle 

#Important Note: complete this with your own key - after running this for the first time remmember to **remove** your API_KEY
api_token = {"username":"username","key":"keykeykey"}

# creating kaggle.json file with the personal API-Key details 
# You can also put this file on your Google Drive
with open('/root/.kaggle/kaggle.json', 'w') as file:
  json.dump(api_token, file)
!chmod 600 /root/.kaggle/kaggle.json



In [None]:
# download the dataset from Kaggle and unzip it
!kaggle datasets download -d arjunbhasin2013/ccdata -p ./datasets/credit/
!unzip ./datasets/credit/*.zip  -d ./datasets/credit/

credits_df = pd.read_csv('/content/datasets/credit/CC GENERAL.csv')


Downloading ccdata.zip to ./datasets/credit
  0% 0.00/340k [00:00<?, ?B/s]
100% 340k/340k [00:00<00:00, 48.2MB/s]
Archive:  ./datasets/credit/ccdata.zip
  inflating: ./datasets/credit/CC GENERAL.csv  


# 1. K-means (k=3) on the raw features, missing values filled with 0

In [None]:
# Replace every missing value in the table with 0 (all columns at once).
credits_df = credits_df.fillna(0)

In [None]:
def k_mean(df, k, random_state=None):
  """Cluster the rows of ``df`` with k-means and return their cluster labels.

  Args:
    df: DataFrame whose columns are the features to cluster on.
    k: number of clusters to fit.
    random_state: optional seed forwarded to ``KMeans``; pass an int to get
      the same labels on every run. The default keeps the unseeded behaviour.

  Returns:
    A one-column DataFrame named ``"{k}_mean"`` that shares ``df``'s index and
    holds the cluster label assigned to each row.
  """
  k_est = KMeans(n_clusters=k, random_state=random_state)
  k_est.fit(df)
  # Build the result as its own frame instead of writing a label column into
  # `df`: callers pass a column slice, and assigning into it can raise
  # SettingWithCopyWarning and silently changes the caller's object.
  return pd.DataFrame({f"{k}_mean": k_est.labels_}, index=df.index)

In [None]:
# Cluster on every column except the customer identifier.
k_mean(credits_df.loc[:, credits_df.columns != "CUST_ID"], 3)

Unnamed: 0,3_mean
0,0
1,1
2,1
3,1
4,0
...,...
8945,0
8946,0
8947,0
8948,0


# 2. K-means (k=3) after min-max normalising every feature

In [None]:
def norm_col(df_obj, col_name):
  """Min-max scale one column of ``df_obj`` to the range [0, 1], in place.

  Args:
    df_obj: DataFrame holding the column; it is modified in place.
    col_name: name of the numeric column to rescale.

  Returns:
    The same ``df_obj`` object, with ``col_name`` replaced by
    ``(value - min) / (max - min)``. Missing values stay missing and are
    ignored when finding the minimum and maximum. A column whose values are
    all equal becomes all zeros.
  """
  column = df_obj[col_name]
  # Series.min/max skip NaN; the builtin min()/max() give results that depend
  # on where a NaN happens to sit in the data.
  min_val, max_val = column.min(), column.max()
  span = max_val - min_val
  if span == 0:
    # Constant column: avoid 0/0, which would fill the column with NaN.
    df_obj[col_name] = 0.0
  else:
    df_obj[col_name] = (column - min_val) / span
  return df_obj

In [None]:
# Min-max scale every feature column in place, skipping the customer id and
# any cluster-label ("_mean") columns.
for column in list(credits_df.columns):
  if column == 'CUST_ID' or '_mean' in column:
    continue
  norm_col(credits_df, column)

In [None]:
# Same clustering as before, now on the normalised features (customer id excluded).
k_mean(credits_df.drop(columns="CUST_ID", errors="ignore"), 3)

Unnamed: 0,3_mean
0,1
1,1
2,2
3,1
4,1
...,...
8945,0
8946,0
8947,0
8948,1


# 3. Total and average Euclidean distance of each point to its cluster centre

In [None]:
def km_euclidean(df, k, random_state=None):
  """Fit k-means on ``df`` and measure Euclidean point-to-centroid distances.

  Args:
    df: DataFrame of numeric feature columns; it is not modified.
    k: number of clusters to fit.
    random_state: optional seed forwarded to ``KMeans``; pass an int to make
      the labels and distances reproducible. The default keeps the unseeded
      behaviour.

  Returns:
    A tuple ``(labels, dist, avg)``: the cluster label of every row, the sum
    over all rows of the Euclidean distance to the row's own cluster centre,
    and that sum divided by the number of rows.
  """
  features = np.asarray(df, dtype=float)
  km = KMeans(n_clusters=k, random_state=random_state)
  km.fit(df)
  labels = km.labels_
  # Indexing the centres by label lines each row up with its own centroid,
  # so all distances are computed in one vectorised step.
  offsets = features - km.cluster_centers_[labels]
  dist = np.sqrt((offsets ** 2).sum(axis=1)).sum()
  avg = dist / len(features)
  return labels, dist, avg
      
# Feature table: every column except the customer identifier.
df2 = credits_df.drop(columns="CUST_ID", errors="ignore")
# Three clusters; report the mean and the total distance to the centroids.
labels, total_dis, avg_dis = km_euclidean(df2, 3)
print('avg_dis:', avg_dis, '\ntotal_dis:', total_dis)

avg_dis: 0.5239800160681672 
total_dis: 4689.621143810097


# 4. The same measurement with Manhattan distance, then both metrics for several values of k

In [None]:
def km_man(df, k, random_state=None):
  """Fit k-means on ``df`` and measure Manhattan point-to-centroid distances.

  Args:
    df: DataFrame of numeric feature columns; it is not modified.
    k: number of clusters to fit.
    random_state: optional seed forwarded to ``KMeans``; pass an int to make
      the labels and distances reproducible. The default keeps the unseeded
      behaviour.

  Returns:
    A tuple ``(labels, dist, avg)``: the cluster label of every row, the sum
    over all rows of the Manhattan (sum of absolute differences) distance to
    the row's own cluster centre, and that sum divided by the number of rows.
  """
  features = np.asarray(df, dtype=float)
  km = KMeans(n_clusters=k, random_state=random_state)
  km.fit(df)
  labels = km.labels_
  # Indexing the centres by label lines each row up with its own centroid,
  # so all distances are computed in one vectorised step.
  dist = np.abs(features - km.cluster_centers_[labels]).sum()
  avg = dist / len(features)
  return labels, dist, avg
      
# Feature table: every column except the customer identifier.
df2 = credits_df.loc[:, credits_df.columns != "CUST_ID"]
# Three clusters; report the mean and the total Manhattan distance to the centroids.
labels, total_dis, avg_dis = km_man(df2, 3)
print('avg_dis:', avg_dis, '\ntotal_dis:', total_dis)

avg_dis: 1.3313869866820274 
total_dis: 11915.913530804144


In [None]:
import plotly.express as px

# Sweep the odd cluster counts below `rng` and record, for each k, the average
# point-to-centroid distance under both metrics.
rng = 11
k_values = list(range(1, rng, 2))
euc = []
man = []
for k in k_values:
  # Euclidean first, then Manhattan, for every k.
  euc.append(km_euclidean(df2, k)[2])
  man.append(km_man(df2, k)[2])

# One line chart per metric: k on the x axis, average distance on the y axis.
fig = (
  px.line(x=k_values, y=euc, title='Euclidean')
  .update_xaxes(title_text="<b>K")
  .update_yaxes(title_text="<b>Euclidean Distance")
)
fig.show()

fig2 = (
  px.line(x=k_values, y=man, title='Manhattan')
  .update_xaxes(title_text="<b>K")
  .update_yaxes(title_text="<b>Manhattan Distance")
)
fig2.show()

