# K-Means Clustering - Cardio Train
- 10122088 - Azhar Fachrezi
- 10122094 - Mochamad Nabil Ramdhani
- 10122095 - Muhamad Singgih Prasetyo
- 10122099 - Muhammad Raffy Abdillah

# Import Library

In [788]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy

# Input Data

In [789]:
# Load the dataset and drop the columns that are not needed.
df = pd.read_excel("cardio_train.xlsx").drop(columns=['id', 'CLASS cardio'])

# Convert age from days to whole years.
df['age'] = df['age'] // 365

df

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active
0,50,2,168,62.0,1,1,0,0,1
1,55,1,156,85.0,3,1,0,0,1
2,51,1,165,64.0,3,1,0,0,0
3,48,2,169,82.0,1,1,0,0,1
4,47,1,156,56.0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
69995,52,2,168,76.0,1,1,1,0,1
69996,61,1,158,126.0,2,2,0,0,1
69997,52,2,183,105.0,3,1,0,1,0
69998,61,1,163,72.0,1,2,0,0,0


# Pengecekan Data

In [790]:
# Boolean mask of missing cells; report the True/False tally for each column.
missing_value = df.isnull()
for column in missing_value.columns:
    # One print call emits the same three lines: name, counts, blank separator.
    print(column, missing_value[column].value_counts(), '', sep='\n')

age
age
False    70000
Name: count, dtype: int64

gender
gender
False    70000
Name: count, dtype: int64

height
height
False    70000
Name: count, dtype: int64

weight
weight
False    70000
Name: count, dtype: int64

cholesterol
cholesterol
False    70000
Name: count, dtype: int64

gluc
gluc
False    70000
Name: count, dtype: int64

smoke
smoke
False    70000
Name: count, dtype: int64

alco
alco
False    70000
Name: count, dtype: int64

active
active
False    70000
Name: count, dtype: int64



In [791]:
# Print the per-column dtype, non-null count and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   cholesterol  70000 non-null  int64  
 5   gluc         70000 non-null  int64  
 6   smoke        70000 non-null  int64  
 7   alco         70000 non-null  int64  
 8   active       70000 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 4.8 MB


# K-Means Clustering

In [792]:
# Min-max scale every column so that its smallest value maps to 0 and its
# largest to 1, keeping all features on a comparable scale for the distance
# computation.
column_min = df.min()
column_range = df.max() - column_min
normalisasi = (df - column_min) / column_range
df = normalisasi
df

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active
0,0.600000,1.0,0.579487,0.273684,0.0,0.0,0.0,0.0,1.0
1,0.742857,0.0,0.517949,0.394737,1.0,0.0,0.0,0.0,1.0
2,0.628571,0.0,0.564103,0.284211,1.0,0.0,0.0,0.0,0.0
3,0.542857,1.0,0.584615,0.378947,0.0,0.0,0.0,0.0,1.0
4,0.514286,0.0,0.517949,0.242105,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
69995,0.657143,1.0,0.579487,0.347368,0.0,0.0,1.0,0.0,1.0
69996,0.914286,0.0,0.528205,0.610526,0.5,0.5,0.0,0.0,1.0
69997,0.657143,1.0,0.656410,0.500000,1.0,0.0,0.0,1.0,0.0
69998,0.914286,0.0,0.553846,0.326316,0.0,0.5,0.0,0.0,0.0


In [793]:
# Feature columns used for clustering. The order matters: position j in this
# list is matched with position j of every centroid vector.
fitur = ['age','gender','height','weight','cholesterol','gluc','smoke','alco','active']

def inisialisasi_centroid(df, k, random_state=0):
    """Pick ``k`` rows of ``df`` to serve as the starting centroids.

    Args:
        df: DataFrame to sample from; every column of a sampled row becomes
            one coordinate of the centroid.
        k: Number of rows (centroids) to draw.
        random_state: Seed forwarded to ``DataFrame.sample`` so the draw is
            reproducible.

    Returns:
        A dict mapping the 1-based cluster labels ``1..k`` to the sampled
        rows, each stored as a plain list of values.
    """
    sampled_rows = df.sample(n=k, random_state=random_state).values.tolist()

    # Labels start at 1 rather than 0.
    return dict(enumerate(sampled_rows, start=1))

#### Melakukan Percobaan Menggunakan Centroid Acak

In [794]:
# Trial run: draw two starting centroids from the normalised data and print them.
k = 2
centro = inisialisasi_centroid(df, k)
print(centro)

{1: [0.34285714285714286, 0.0, 0.5641025641025641, 0.26842105263157895, 0.0, 0.0, 0.0, 0.0, 1.0], 2: [0.7142857142857143, 0.0, 0.558974358974359, 0.23157894736842105, 0.0, 0.0, 0.0, 0.0, 1.0]}


In [795]:
def jarak_euclid(df, fitur, centroid2):
    """Return a copy of ``df`` with one Euclidean-distance column per centroid.

    Args:
        df: DataFrame holding at least the columns named in ``fitur``.
        fitur: Ordered list of feature column names; ``fitur[j]`` is compared
            with coordinate ``j`` of each centroid.
        centroid2: Dict mapping a cluster label to its coordinate sequence.
            Coordinates beyond ``len(fitur)`` are ignored.

    Returns:
        A new DataFrame equal to ``df`` plus, for every label ``i``, a column
        named ``str(i)`` with the distance from each row to centroid ``i``.
        An existing column of that name is overwritten in the copy. The input
        frame itself is left untouched.
    """
    # Work on a copy so that distance columns never leak into the caller's
    # frame (and from there into later centroid sampling).
    result = df.copy()
    for label, centroid in centroid2.items():
        squared_diff_sum = np.sum(
            [(df[feature] - centroid[j]) ** 2 for j, feature in enumerate(fitur)],
            axis=0,
        )
        result[str(label)] = np.sqrt(squared_diff_sum)
    return result

In [796]:
# Add one distance column per trial centroid and display the result.
df_euc = jarak_euclid(df, fitur,centro)
df_euc

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active,1,2
0,0.600000,1.0,0.579487,0.273684,0.0,0.0,0.0,0.0,1.0,1.032660,1.007599
1,0.742857,0.0,0.517949,0.394737,1.0,0.0,0.0,0.0,1.0,1.085397,1.014455
2,0.628571,0.0,0.564103,0.284211,1.0,0.0,0.0,0.0,0.0,1.442873,1.417795
3,0.542857,1.0,0.584615,0.378947,0.0,0.0,0.0,0.0,1.0,1.025981,1.025555
4,0.514286,0.0,0.517949,0.242105,0.0,0.0,0.0,0.0,0.0,1.015978,1.020683
...,...,...,...,...,...,...,...,...,...,...,...
69995,0.657143,1.0,0.579487,0.347368,0.0,0.0,1.0,0.0,1.0,1.450946,1.420244
69996,0.914286,0.0,0.528205,0.610526,0.5,0.5,0.0,0.0,1.0,0.972037,0.827374
69997,0.657143,1.0,0.656410,0.500000,1.0,0.0,0.0,1.0,0.0,2.039835,2.021091
69998,0.914286,0.0,0.553846,0.326316,0.0,0.5,0.0,0.0,0.0,1.256976,1.139737


In [797]:
def cluster_baru(df, centroid2):
    """Assign every row to the centroid with the smallest distance.

    Args:
        df: DataFrame that already contains one distance column per centroid,
            named ``str(label)`` for each key of ``centroid2``.
        centroid2: Dict keyed by integer cluster label; only the keys are used.

    Returns:
        A new DataFrame with the same columns as ``df`` and a ``Cluster``
        column placed last, holding the integer label of the nearest
        centroid. Ties go to the first of the tied columns. The input frame
        is not modified.
    """
    distance_columns = [str(label) for label in centroid2]

    # idxmin yields the winning column name; column names are the labels as
    # strings, so cast back to int.
    labels = df[distance_columns].idxmin(axis=1).astype('int')

    # Drop any previous assignment, then append the fresh one as last column.
    other_columns = [col for col in df.columns if col != 'Cluster']
    return df[other_columns].assign(Cluster=labels)

In [798]:
# Label each row with its nearest trial centroid and display the result.
df_euc = cluster_baru(df_euc, centro)
df_euc

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active,1,2,Cluster
0,0.600000,1.0,0.579487,0.273684,0.0,0.0,0.0,0.0,1.0,1.032660,1.007599,2
1,0.742857,0.0,0.517949,0.394737,1.0,0.0,0.0,0.0,1.0,1.085397,1.014455,2
2,0.628571,0.0,0.564103,0.284211,1.0,0.0,0.0,0.0,0.0,1.442873,1.417795,2
3,0.542857,1.0,0.584615,0.378947,0.0,0.0,0.0,0.0,1.0,1.025981,1.025555,2
4,0.514286,0.0,0.517949,0.242105,0.0,0.0,0.0,0.0,0.0,1.015978,1.020683,1
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.657143,1.0,0.579487,0.347368,0.0,0.0,1.0,0.0,1.0,1.450946,1.420244,2
69996,0.914286,0.0,0.528205,0.610526,0.5,0.5,0.0,0.0,1.0,0.972037,0.827374,2
69997,0.657143,1.0,0.656410,0.500000,1.0,0.0,0.0,1.0,0.0,2.039835,2.021091,2
69998,0.914286,0.0,0.553846,0.326316,0.0,0.5,0.0,0.0,0.0,1.256976,1.139737,2


In [799]:
def centroid_baru(df, fitur, centroid2):
    """Move each centroid to the mean of the rows currently assigned to it.

    Args:
        df: DataFrame with the feature columns and a ``Cluster`` column of
            labels.
        fitur: Ordered list of feature column names; ``fitur[j]`` updates
            coordinate ``j`` of each centroid.
        centroid2: Dict mapping cluster label to a mutable coordinate list.

    Returns:
        The same ``centroid2`` dict, updated in place. Coordinates beyond
        ``len(fitur)`` are left as they were. A cluster with no assigned rows
        keeps its previous centroid instead of becoming NaN, which would
        never compare equal in a centroid-equality convergence test.
    """
    for label, centroid in centroid2.items():
        # Filter the members once per cluster rather than once per feature.
        members = df[df['Cluster'] == label]
        if members.empty:
            continue
        for j, feature in enumerate(fitur):
            centroid[j] = np.mean(members[feature])
    return centroid2

In [800]:
def clusterisasi(df, fitur, k):
    """Run the K-Means loop until the centroids stop changing.

    Args:
        df: DataFrame to cluster; a deep copy is used as the working frame.
        fitur: Ordered list of feature column names used for distances and
            centroid updates.
        k: Number of clusters.

    Returns:
        A tuple ``(df_euc, centroid2)``: the working frame with distance and
        ``Cluster`` columns, and the final centroid dict.
    """
    centroid2 = inisialisasi_centroid(df, k)
    df_euc = copy.deepcopy(df)

    # First pass: assign rows to the sampled centroids, then move the centroids.
    df_euc = cluster_baru(jarak_euclid(df_euc, fitur, centroid2), centroid2)
    centroid2 = centroid_baru(df_euc, fitur, centroid2)

    converged = False
    while not converged:
        # Snapshot first: the update step may modify the centroid lists in place.
        previous_centroids = copy.deepcopy(centroid2)
        df_euc = cluster_baru(jarak_euclid(df_euc, fitur, centroid2), centroid2)
        centroid2 = centroid_baru(df_euc, fitur, centroid2)

        # Stop once a full pass leaves every centroid exactly where it was.
        converged = previous_centroids == centroid2

    return df_euc, centroid2

# Tes Clusterisasi K = 5

In [801]:
# Run the full clustering loop with five clusters, keeping both the labelled
# frame and the final centroids, then display the frame.
k = 5
df_euc, centroid2 = clusterisasi(df, fitur, k)

df_euc

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active,1,2,3,4,5,Cluster
0,0.600000,1.0,0.579487,0.273684,0.0,0.0,0.0,0.0,1.0,1.090576,1.007256,1.400301,0.182785,1.229342,4
1,0.742857,0.0,0.517949,0.394737,1.0,0.0,0.0,0.0,1.0,1.665910,0.948167,0.612526,1.337567,1.369901,3
2,0.628571,0.0,0.564103,0.284211,1.0,0.0,0.0,0.0,0.0,1.859798,1.375368,1.090861,1.668963,0.934699,5
3,0.542857,1.0,0.584615,0.378947,0.0,0.0,0.0,0.0,1.0,1.091855,1.013328,1.405533,0.199992,1.233630,4
4,0.514286,0.0,0.517949,0.242105,0.0,0.0,0.0,0.0,0.0,1.691958,1.018742,1.429194,1.436957,0.392011,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0.657143,1.0,0.579487,0.347368,0.0,0.0,1.0,0.0,1.0,0.424415,1.407639,1.704271,1.010825,1.573624,1
69996,0.914286,0.0,0.528205,0.610526,0.5,0.5,0.0,0.0,1.0,1.575355,0.738686,0.534870,1.209806,1.247297,3
69997,0.657143,1.0,0.656410,0.500000,1.0,0.0,0.0,1.0,0.0,1.691200,1.972417,1.699163,1.644531,1.496663,5
69998,0.914286,0.0,0.553846,0.326316,0.0,0.5,0.0,0.0,0.0,1.744803,1.127880,1.292283,1.505551,0.594705,5
