In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
# Read the pre-cleaned dataset from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data/clean_data.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,male,2,own,unknown,little,1169,6,radio/TV
1,22,female,2,own,little,moderate,5951,48,radio/TV
2,49,male,1,own,little,unknown,2096,12,education
3,45,male,2,free,little,little,7882,42,furniture/equipment
4,53,male,2,free,little,little,4870,24,car


In [4]:
# Build and fit the clustering pipeline: column-wise preprocessing followed by
# K-Means with three clusters.

# Fit on the feature columns only. A later cell adds a 'Cluster' column to `df`;
# dropping it here (when present) keeps this cell idempotent on re-run instead
# of letting stale labels reach K-Means through remainder='passthrough'.
fitur = df.drop(columns=['Cluster'], errors='ignore')

kategorik = fitur.select_dtypes(include=['object']).columns
# Number of distinct values per categorical column, computed once and reused.
jumlah_unik = fitur[kategorik].nunique()

kolom_numerik = ['Credit amount']
# Categorical columns with at most 4 distinct values are one-hot encoded;
# those with more than 4 are ordinal encoded.
kolom_onehot = kategorik[(jumlah_unik <= 4).to_numpy()]
kolom_ordinal = kategorik[(jumlah_unik > 4).to_numpy()]

# NOTE(review): only 'Credit amount' is standardized. Any other numeric column
# (the data preview shows Age, Job and Duration) is passed through unscaled, so
# it enters the K-Means distance on its raw scale -- confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), kolom_numerik),  # zero mean, unit variance
        ('ordinal', OrdinalEncoder(dtype=int), kolom_ordinal),  # integer codes
        ('onehot', OneHotEncoder(dtype=int, handle_unknown='ignore'), kolom_onehot)  # unseen categories -> all zeros
    ], remainder='passthrough')

# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# across scikit-learn releases, which would otherwise make the fitted clusters
# depend on the installed version (and emit a FutureWarning on some versions).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', kmeans)
])

model.fit(fitur)

# Prediksi Data Baru

In [5]:
# Attach the cluster label K-Means assigned to each training row as a new
# column, then preview the result.
label_cluster = kmeans.labels_
df['Cluster'] = label_cluster
df.head(n=5)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Cluster
0,67,male,2,own,unknown,little,1169,6,radio/TV,2
1,22,female,2,own,little,moderate,5951,48,radio/TV,0
2,49,male,1,own,little,unknown,2096,12,education,2
3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,53,male,2,free,little,little,4870,24,car,2


In [6]:
# Profile each cluster by the mean of Age, Credit amount and Duration.
# The columns are selected with a list (double brackets): selecting several
# GroupBy columns with a bare tuple of keys was deprecated and is rejected by
# pandas >= 2.0.
df.groupby('Cluster')[['Age', 'Credit amount', 'Duration']].mean()

  df.groupby('Cluster')['Age', 'Credit amount', 'Duration'].mean()


Unnamed: 0_level_0,Age,Credit amount,Duration
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,33.755,6369.395,40.46
1,29.620141,2408.609541,15.95053
2,51.410256,2709.854701,16.166667


In [7]:
# Three new example rows, written one tuple per row, in the column order
# listed in kolom_fitur.
kolom_fitur = [
    'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
    'Checking account', 'Credit amount', 'Duration', 'Purpose',
]
baris_baru = [
    (69, 'male', 2, 'own', 'unknown', 'little', 1200, 6, 'radio/TV'),
    (20, 'male', 2, 'own', 'little', 'moderate', 5951, 48, 'radio/TV'),
    (34, 'male', 1, 'own', 'little', 'unknown', 2096, 12, 'education'),
]
data_baru = pd.DataFrame(baris_baru, columns=kolom_fitur)

# Use the fitted pipeline (preprocessing + K-Means) to predict each row's cluster.
cluster_prediksi = model.predict(data_baru)

print(cluster_prediksi)

[2 0 1]
