In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [51]:
# Load the raw sheet and discard the columns that are not needed.
df = pd.read_excel('cardio_train.xlsx').drop(columns=['CLASS cardio', 'id'])

# Convert the 'age' column from a day count to whole years (floor division by 365).
df = df.assign(age=lambda frame: frame['age'] // 365)
df

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active
0,50,2,168,62.0,1,1,0,0,1
1,55,1,156,85.0,3,1,0,0,1
2,51,1,165,64.0,3,1,0,0,0
3,48,2,169,82.0,1,1,0,0,1
4,47,1,156,56.0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
69995,52,2,168,76.0,1,1,1,0,1
69996,61,1,158,126.0,2,2,0,0,1
69997,52,2,183,105.0,3,1,0,1,0
69998,61,1,163,72.0,1,2,0,0,0


In [52]:
# Build a boolean mask of null cells, then print the True/False tally of
# that mask for each column in turn.
missing_value = df.isnull()
for column, null_flags in missing_value.items():
    print(column)
    print(null_flags.value_counts())
    print('')

age
age
False    70000
Name: count, dtype: int64

gender
gender
False    70000
Name: count, dtype: int64

height
height
False    70000
Name: count, dtype: int64

weight
weight
False    70000
Name: count, dtype: int64

cholesterol
cholesterol
False    70000
Name: count, dtype: int64

gluc
gluc
False    70000
Name: count, dtype: int64

smoke
smoke
False    70000
Name: count, dtype: int64

alco
alco
False    70000
Name: count, dtype: int64

active
active
False    70000
Name: count, dtype: int64



In [53]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   cholesterol  70000 non-null  int64  
 5   gluc         70000 non-null  int64  
 6   smoke        70000 non-null  int64  
 7   alco         70000 non-null  int64  
 8   active       70000 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 4.8 MB


In [54]:
# Display the prepared frame (the notebook renders the last expression of a cell).
df

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active
0,50,2,168,62.0,1,1,0,0,1
1,55,1,156,85.0,3,1,0,0,1
2,51,1,165,64.0,3,1,0,0,0
3,48,2,169,82.0,1,1,0,0,1
4,47,1,156,56.0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
69995,52,2,168,76.0,1,1,1,0,1
69996,61,1,158,126.0,2,2,0,0,1
69997,52,2,183,105.0,3,1,0,1,0
69998,61,1,163,72.0,1,2,0,0,0


In [74]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    a, b : array-like
        Numeric sequences of the same length (lists, tuples or NumPy arrays).

    Returns
    -------
    numpy.float64
        Square root of the sum of squared element-wise differences.
    """
    # Coerce to float arrays first: plain lists do not support ``a - b``, and
    # unsigned / narrow integer dtypes would wrap around on subtraction or
    # squaring and silently yield a wrong distance.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))

# Assign every data point to its nearest centroid
def assign_clusters(df, centroids):
    """Label each row of ``df`` with the index of its closest centroid.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        One observation per row; every column must be numeric.
    centroids : sequence of array-like
        Centroid coordinates, each with one value per column of ``df``.

    Returns
    -------
    numpy.ndarray
        One integer label per row. Ties go to the lowest centroid index.

    Raises
    ------
    ValueError
        If ``centroids`` is empty.
    """
    points = np.asarray(df, dtype=float)
    centers = np.asarray(centroids, dtype=float)

    # One vectorised pass per centroid instead of a Python loop over every
    # row: distances[j, i] is the Euclidean distance from row i to centroid j.
    distances = np.array(
        [np.sqrt(np.sum((points - center) ** 2, axis=1)) for center in centers]
    )
    return np.argmin(distances, axis=0)

# Recompute the centroids from the current assignment
def update_centroids(df, clusters, k):
    """Return a list of ``k`` centroids derived from the current labels.

    For each label ``0 .. k-1`` the centroid is the column-wise mean of the
    rows of ``df`` carrying that label. A label with no rows gets one row of
    ``df`` drawn at random instead.
    """
    def _centroid_for(label):
        members = df[clusters == label]
        if len(members) == 0:
            # Empty cluster: re-seed it from a randomly drawn data point.
            return df.sample(n=1).values[0]
        return members.mean(axis=0).values

    return [_centroid_for(label) for label in range(k)]

# Main k-means routine
def kmeans(df, k, max_iter=300, random_state=None):
    """Cluster the rows of ``df`` into ``k`` groups with iterative k-means.

    Starting from ``k`` rows of ``df`` drawn at random, the routine alternates
    between recomputing the centroids and reassigning every row to its nearest
    centroid until no row changes cluster or ``max_iter`` rounds have run.

    Parameters
    ----------
    df : pandas.DataFrame
        One observation per row; every column is used as a feature.
    k : int
        Number of clusters.
    max_iter : int, default 300
        Upper bound on update/reassign rounds, so the loop always terminates
        even if the assignment keeps oscillating.
    random_state : optional
        Passed to ``DataFrame.sample`` for the initial centroid draw made
        here; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    centroids : numpy.ndarray
        The centroids that the returned labels were assigned against.
    clusters : numpy.ndarray
        Cluster index for every row of ``df``.
    """
    # Initialise the centroids from k randomly chosen rows.
    centroids = df.sample(n=k, random_state=random_state).values
    clusters = assign_clusters(df, centroids)

    for _ in range(max_iter):
        # Keep the freshly updated centroids, so that on convergence the
        # returned centroids describe the returned clusters rather than
        # lagging one iteration behind.
        centroids = update_centroids(df, clusters, k)
        new_clusters = assign_clusters(df, centroids)

        # Stop once no data point has moved to a different cluster.
        if np.array_equal(clusters, new_clusters):
            break

        clusters = new_clusters

    # Always hand back an array, whichever path produced the centroids.
    return np.asarray(centroids), clusters

In [75]:
k = 2

# Cluster on the feature columns only. This cell writes a 'Cluster' column
# back into df, so on a re-run that column must be excluded; otherwise the
# previous labels would be fed to k-means as an extra feature.
features = df.drop(columns='Cluster', errors='ignore')
centroids, clusters = kmeans(features, k)

df['Cluster'] = clusters
df

Unnamed: 0,age,gender,height,weight,cholesterol,gluc,smoke,alco,active,Cluster
0,0.080009,-0.457443,-0.408506,-0.071506,0.673733,1.000000,-0.393408,-0.308407,-0.215722,0
1,-0.165893,-0.241569,-0.242700,-0.296622,-0.200942,-0.215722,-0.180937,-0.158463,1.000000,0
2,0.080009,-0.457443,-0.408506,-0.071506,0.673733,1.000000,-0.393408,-0.308407,-0.215722,0
3,1.000000,-0.397047,-0.442184,-0.148597,0.164350,0.080009,-0.407774,-0.332725,-0.165893,0
4,1.000000,-0.397047,-0.442184,-0.148597,0.164350,0.080009,-0.407774,-0.332725,-0.165893,0
...,...,...,...,...,...,...,...,...,...,...
69995,-0.165893,-0.241569,-0.242700,-0.296622,-0.200942,-0.215722,-0.180937,-0.158463,1.000000,0
69996,-0.148597,0.057151,0.309378,1.000000,-0.039662,-0.071506,-0.164921,-0.175420,-0.296622,1
69997,-0.165893,-0.241569,-0.242700,-0.296622,-0.200942,-0.215722,-0.180937,-0.158463,1.000000,0
69998,-0.407774,0.438983,0.208264,-0.164921,-0.396114,-0.393408,1.000000,0.467702,-0.180937,1
