# K-means

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.cluster import KMeans

# Data of age and salary

In [9]:
age = [20,27,21,37,46,53,55,47,52,32,39,41,39,48,48]
salary = [1000,1200,2900,1850,900,950,2000,2100,3000,5900,4100,5100,7000,5000,6500]

In [10]:
plot = px.scatter(x=age,y=salary)
plot.show()

In [45]:
data_age_salary = np.array(list(zip(age,salary)))
data_age_salary

array([[  20, 1000],
       [  27, 1200],
       [  21, 2900],
       [  37, 1850],
       [  46,  900],
       [  53,  950],
       [  55, 2000],
       [  47, 2100],
       [  52, 3000],
       [  32, 5900],
       [  39, 4100],
       [  41, 5100],
       [  39, 7000],
       [  48, 5000],
       [  48, 6500]])

In [46]:
scaler_salary = StandardScaler()
data_age_salary_stand = scaler_salary.fit_transform(data_age_salary)
data_age_salary_stand

array([[-1.87963884, -1.11413572],
       [-1.23255006, -1.01725435],
       [-1.78719758, -0.19376273],
       [-0.30813751, -0.70238991],
       [ 0.52383377, -1.1625764 ],
       [ 1.17092255, -1.13835606],
       [ 1.35580506, -0.62972888],
       [ 0.61627503, -0.5812882 ],
       [ 1.0784813 , -0.14532205],
       [-0.77034379,  1.25945777],
       [-0.12325501,  0.38752547],
       [ 0.0616275 ,  0.8719323 ],
       [-0.12325501,  1.79230528],
       [ 0.70871628,  0.82349162],
       [ 0.70871628,  1.55010187]])

In [47]:
kmeans_salary = KMeans(n_clusters=3)
kmeans_salary.fit(data_age_salary_stand)

In [48]:
centroids = kmeans_salary.cluster_centers_
centroids

array([[ 0.94906354, -0.73145432],
       [-1.301881  , -0.75688568],
       [ 0.07703438,  1.11413572]])

In [49]:
centroids_inverse = scaler_salary.inverse_transform(centroids)
centroids_inverse

array([[  50.6       , 1790.        ],
       [  26.25      , 1737.5       ],
       [  41.16666667, 5600.        ]])

In [50]:
labels = kmeans_salary.labels_
labels

array([1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2])

In [62]:
plot1 = px.scatter(x = age, y = salary, color = labels)
#plot1.show()
plotc = px.scatter(x = centroids_inverse[:,0], y= centroids_inverse[:,1], size = [10,10,10], color=[0,1,2])
final_plot = go.Figure(data = plot1.data+plotc.data)
final_plot.show()

# Random data

In [63]:
from sklearn.datasets import make_blobs

In [75]:
X_random, Y_random = make_blobs(n_samples=200, centers=5, random_state=1)
X_random, Y_random

(array([[-1.96576392e+00,  5.23446451e+00],
        [-5.16022348e+00, -7.04217141e+00],
        [-6.17937069e+00, -2.16733539e+00],
        [-7.39138168e+00, -9.49590389e+00],
        [-6.38481234e+00, -8.47302970e+00],
        [-6.26144310e+00, -3.78347905e+00],
        [-2.04278768e+00,  3.07660864e-01],
        [-4.46426086e+00, -4.39451238e+00],
        [-6.40386190e+00, -6.36106990e+00],
        [ 2.42271161e-04,  5.14853403e+00],
        [-5.99212006e+00, -3.91488289e+00],
        [-6.46137477e+00, -3.14560994e+00],
        [-4.10185174e+00, -1.16625450e+00],
        [-1.01341572e+01, -4.07240274e+00],
        [-5.99004766e+00, -2.82631801e+00],
        [-1.14663009e+00,  4.10839703e+00],
        [-1.17104176e+00,  4.33091816e+00],
        [-9.19585147e+00, -3.90678125e+00],
        [-8.30173556e+00, -7.27738918e+00],
        [-7.56309575e+00, -8.46421308e+00],
        [-1.86845414e+00,  4.99311306e+00],
        [-4.22362233e+00, -2.50312346e+00],
        [-1.90838667e+00,  5.861

In [82]:
px.scatter(x=X_random[:,0],y=X_random[:,1], color=Y_random, title="Before K-Means")

In [77]:
kmeans_random = KMeans(n_clusters=5)
kmeans_random.fit(X_random)

In [78]:
labels = kmeans_random.predict(X_random)

In [79]:
centroids = kmeans_random.cluster_centers_
centroids

array([[-5.90368078, -3.04489641],
       [-2.17069756,  1.02591979],
       [-6.87958999, -8.11648104],
       [-9.85620522, -3.91021738],
       [-1.58338528,  4.50520457]])

In [85]:
centroids[:,0]

array([-5.90368078, -2.17069756, -6.87958999, -9.85620522, -1.58338528])

In [90]:
plot1 = px.scatter(x=X_random[:,0],y=X_random[:,1], color=labels, title="After K-Means")
plot2 = px.scatter(x=centroids[:,0],
                 y=centroids[:,1],
                 size=[10,10,10,10,10],
                 color = [0,1,2,3,4])
go.Figure(data = plot1.data+plot2.data)

# Credit Card Clients data
## 1 atribute

In [93]:
credit_data = pd.read_csv('credit_card_clients.csv',header=1)
credit_data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [95]:
credit_data['TOTAL_BILL'] = credit_data['BILL_AMT1']+credit_data['BILL_AMT2']+credit_data['BILL_AMT3']+credit_data['BILL_AMT4']+credit_data['BILL_AMT5']+credit_data['BILL_AMT6']
credit_data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,TOTAL_BILL
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,689,0,0,0,0,1,7704
1,2,120000,2,2,2,26,-1,2,0,0,...,3455,3261,0,1000,1000,1000,0,2000,1,17077
2,3,90000,2,2,2,34,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,101653
3,4,50000,2,2,1,37,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,0,231334
4,5,50000,1,2,1,57,-1,0,-1,0,...,19146,19131,2000,36681,10000,9000,689,679,0,109339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,31237,15980,8500,20000,5003,3047,5000,1000,0,725349
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,5190,0,1837,3526,8998,129,0,0,0,21182
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20582,19357,0,0,22000,4200,2000,3100,1,70496
29998,29999,80000,1,3,1,41,1,-1,0,0,...,11855,48944,85900,3409,1178,1926,52964,1804,1,266611


In [96]:
X_credit = credit_data.iloc[:,[1,25]].values
X_credit

array([[ 20000,   7704],
       [120000,  17077],
       [ 90000, 101653],
       ...,
       [ 30000,  70496],
       [ 80000, 266611],
       [ 50000, 230874]], dtype=int64)

In [97]:
scaler_credit = StandardScaler()
X_credit = scaler_credit.fit_transform(X_credit)
X_credit

array([[-1.13672015, -0.69069198],
       [-0.3659805 , -0.66599747],
       [-0.59720239, -0.44316987],
       ...,
       [-1.05964618, -0.52525745],
       [-0.67427636, -0.00856436],
       [-0.90549825, -0.10271861]])

In [99]:
wcss = []

for k in range(1,11):
    print(k)
    kmeans_credit = KMeans(n_clusters=k,
                           random_state=0)
    kmeans_credit.fit(X_credit)
    wcss.append(kmeans_credit.inertia_)

1
2
3
4
5
6
7
8
9
10


In [100]:
wcss

[59999.99999999988,
 35197.834391257304,
 20128.20263562338,
 15892.249716910732,
 10708.603774943513,
 8604.404544864527,
 7684.904846136834,
 6593.001391781401,
 5669.721586002743,
 5055.083531903292]

In [101]:
# elbow plot
px.line(x=range(1,11),y=wcss) #k = 4 or 5

In [104]:
kmeans_credit = KMeans(n_clusters=4,
                       random_state=0)
labels = kmeans_credit.fit_predict(X_credit)

In [105]:
px.scatter(x = X_credit[:,0],
           y = X_credit[:,1],
           color=labels)

In [106]:
X_credit_original = scaler_credit.inverse_transform(X_credit)
X_credit_original

array([[ 20000.,   7704.],
       [120000.,  17077.],
       [ 90000., 101653.],
       ...,
       [ 30000.,  70496.],
       [ 80000., 266611.],
       [ 50000., 230874.]])

In [107]:
px.scatter(x = X_credit_original[:,0], #limit
           y = X_credit_original[:,1], #bill
           color=labels,)

## 2 atributes

In [112]:
credit_data.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month', 'TOTAL_BILL'],
      dtype='object')

In [133]:
X_credit2 = credit_data.iloc[:,[1,2,3,4,5,25]].values
X_credit2

array([[ 20000,      2,      2,      1,     24,   7704],
       [120000,      2,      2,      2,     26,  17077],
       [ 90000,      2,      2,      2,     34, 101653],
       ...,
       [ 30000,      1,      2,      2,     37,  70496],
       [ 80000,      1,      3,      1,     41, 266611],
       [ 50000,      1,      2,      1,     46, 230874]], dtype=int64)

In [134]:
scaler_credit2 = StandardScaler()
X_credit2 = scaler_credit2.fit_transform(X_credit2)
X_credit2

array([[-1.13672015,  0.81016074,  0.18582826, -1.05729503, -1.24601985,
        -0.69069198],
       [-0.3659805 ,  0.81016074,  0.18582826,  0.85855728, -1.02904717,
        -0.66599747],
       [-0.59720239,  0.81016074,  0.18582826,  0.85855728, -0.16115646,
        -0.44316987],
       ...,
       [-1.05964618, -1.23432296,  0.18582826,  0.85855728,  0.16430256,
        -0.52525745],
       [-0.67427636, -1.23432296,  1.45111372, -1.05729503,  0.59824792,
        -0.00856436],
       [-0.90549825, -1.23432296,  0.18582826, -1.05729503,  1.14067961,
        -0.10271861]])

In [135]:
wcss2 = []

for k in range(1,11):
    print(k)
    kmeans_credit2 = KMeans(n_clusters=k,
                            random_state=0)
    kmeans_credit2.fit(X_credit2)
    wcss2.append(kmeans_credit2.inertia_)

1
2
3
4
5
6
7
8
9
10


In [136]:
# elbow plot
px.line(x=range(1,11),y=wcss2) #k = 4 or 5

In [144]:
kmeans_credit2 = KMeans(n_clusters=4,
                       random_state=0)
labels2 = kmeans_credit2.fit_predict(X_credit2)

In [145]:
labels2

array([0, 1, 1, ..., 3, 0, 0])

### PCA

Reduce dimention

In [139]:
from sklearn.decomposition import PCA

In [146]:
pca = PCA(n_components=2)
X_credit2_pca = pca.fit_transform(X_credit2)

In [147]:
X_credit2_pca.shape

(30000, 2)

In [148]:
X_credit2_pca

array([[-0.74082054, -1.13671858],
       [-1.48027121, -0.30100547],
       [-0.94737386, -0.48666789],
       ...,
       [-0.79468657, -0.90012663],
       [ 1.17562376, -1.54746987],
       [ 1.13614987, -1.14039836]])

In [149]:
px.scatter(x = X_credit2_pca[:,0],
           y = X_credit2_pca[:,1],
           color=labels2)