In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
import seaborn as sns
import matplotlib.pyplot as plt

### Membuka Dataset iris dan tennis

In [2]:
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

In [3]:
tennis = pd.read_csv('tennis.csv')
X_tennis = tennis.drop(['play'], axis=1)
y_tennis = tennis['play']

### LabelEncoder untuk data tennis menjadi tipe integer

In [4]:
X_tennis

Unnamed: 0,outlook,temp,humidity,windy
0,sunny,hot,high,False
1,sunny,hot,high,True
2,overcast,hot,high,False
3,rainy,mild,high,False
4,rainy,cool,normal,False
5,rainy,cool,normal,True
6,overcast,cool,normal,True
7,sunny,mild,high,False
8,sunny,cool,normal,False
9,rainy,mild,normal,False


In [5]:
outlook = []
temp = []
humidity = []
wind = []
for i in range(len(X_tennis)):
    if(X_tennis['outlook'][i] == 'rainy'):
        outlook.append(0)
    elif(X_tennis['outlook'][i] == 'overcast'):
        outlook.append(1)
    elif(X_tennis['outlook'][i] == 'sunny'):
        outlook.append(2)
    if(X_tennis['temp'][i] == 'cool'):
        temp.append(0)
    elif(X_tennis['temp'][i] == 'mild'):
        temp.append(1)
    elif(X_tennis['temp'][i] == 'hot'):
        temp.append(2)
    if(X_tennis['humidity'][i] == 'normal'):
        humidity.append(0)
    elif(X_tennis['humidity'][i] == 'high'):
        humidity.append(1)
    if(not X_tennis['windy'][i]):
        wind.append(0)
    elif(X_tennis['windy'][i]):
        wind.append(1)
        
X_tennis['outlook'] = outlook
X_tennis['temp'] = temp
X_tennis['humidity'] = humidity
X_tennis['windy'] = wind

In [6]:
X_tennis

Unnamed: 0,outlook,temp,humidity,windy
0,2,2,1,0
1,2,2,1,1
2,1,2,1,0
3,0,1,1,0
4,0,0,0,0
5,0,0,0,1
6,1,0,0,1
7,2,1,1,0
8,2,0,0,0
9,0,1,0,0


### Eksplorasi scikit learn Kmeans, Agglomerative Clustering, DBSCAN, Gaussian Mixtures

In [7]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

#### KMeans
Parameter pada KMeans: <br><br>
n_clusters = jumlah cluster ingin dicari<br>
init<br>
n_init<br>
max_iter<br>
tol<br>
precompute_distances<br>
verbose<br>
random_state<br>
copy_x<br>
n_jobs<br>
algorithm<br>

In [8]:
k_means_iris = KMeans(n_clusters=3, random_state=42)
k_means_iris.fit(X_iris)
print('pusat cluster kMeans iris :')
print(k_means_iris.cluster_centers_)

k_means_tennis = KMeans(n_clusters=2, random_state=42)
k_means_tennis.fit(X_tennis)
print('pusat cluster kMeans tennis :')
print(k_means_tennis.cluster_centers_)

pusat cluster kMeans iris :
[[5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.418      1.464      0.244     ]
 [6.85       3.07368421 5.74210526 2.07105263]]
pusat cluster kMeans tennis :
[[1.625      1.375      0.625      0.375     ]
 [0.16666667 0.5        0.33333333 0.5       ]]


#### Agglomerative Clustering
Parameter pada Agglomerative Clustering: <br><br>
n_clusters = jumlah cluster yang ingin dicari<br>
affinity <br>
memory <br>
connectivity <br>
compute_full_tree <br>
linkage <br>
pooling_func <br>

In [9]:
agglo_iris = AgglomerativeClustering(n_clusters=3, affinity='manhattan', linkage='complete')
agglo_iris.fit(X_iris)

agglo_tennis = AgglomerativeClustering(n_clusters=2, affinity='manhattan', linkage='complete')
agglo_tennis.fit(X_tennis)

agglo_iris.labels_
agglo_tennis.labels_

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1], dtype=int64)

#### DBSCAN
Parameter pada DBSCAN: <br><br>
eps = jarak ketetanggaan maksimal dua sampel <br>
min_samples <br>
metric <br>
metric_params <br>
algorithm <br>
leaf_size <br>
p <br>
n_jobs <br>

In [10]:
dbscan_iris = DBSCAN(eps=1, metric='manhattan')
dbscan_iris.fit(X_iris)

dbscan_tennis = DBSCAN(eps=1, metric='manhattan')
dbscan_tennis.fit(X_tennis)

dbscan_iris.labels_
dbscan_tennis.labels_

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
      dtype=int64)

#### Gaussian Mixture
Parameter pada Gaussian Mixture: <br><br>
n_components = jumlah komponen mixture <br>
covariance_type <br>
tol <br>
reg_covar <br>
max_iter <br>
n_init <br>
init_params <br>
weights_init <br>
means_init <br>
precisions_init <br>
random_state <br>
warm_start <br>
verbose <br>
verbose_interval <br>

In [11]:
gaussian_mix_iris = GaussianMixture(n_components=3)
gaussian_mix_iris.fit(X_iris)

gaussian_mix_tennis = GaussianMixture(n_components=2)
gaussian_mix_tennis.fit(X_tennis)

gaussian_mix_iris.weights_
gaussian_mix_tennis.weights_

array([0.49983298, 0.50016702])

### Eksplorasi k-medoids (PAM), Graph-based clustering (MST), grid clustering (BASE)

In [12]:
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils import read_sample
from pyclustering.utils import timedcall
from mst_clustering import MSTClustering
from pyclustering.cluster.bang import bang, bang_visualizer

#### KMedoids

In [13]:
kmedoids_iris = kmedoids(X_iris, [1, 90, 120], ccore=False)
kmedoids_iris.process()

X_tennis_array = X_tennis.values
kmedoids_tennis = kmedoids(X_tennis_array, [0, 2], ccore=False)
kmedoids_tennis.process()

In [14]:
X_tennis_array

array([[2, 2, 1, 0],
       [2, 2, 1, 1],
       [1, 2, 1, 0],
       [0, 1, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 1],
       [2, 1, 1, 0],
       [2, 0, 0, 0],
       [0, 1, 0, 0],
       [2, 1, 0, 1],
       [1, 1, 1, 1],
       [1, 2, 0, 0],
       [0, 1, 1, 1]], dtype=int64)

In [15]:
kmedoids_iris.get_clusters()

[[7,
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49],
 [55,
  51,
  53,
  54,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  101,
  106,
  113,
  119,
  121,
  123,
  126,
  127,
  133,
  138,
  142,
  149],
 [112,
  50,
  52,
  76,
  77,
  86,
  100,
  102,
  103,
  104,
  105,
  107,
  108,
  109,
  110,
  111,
  114,
  115,
  116,
  117,
  118,
  120,
  122,
  124,
  125,
  128,
  129,
  130,
  131,
  132,
  134,
  135,
  136,
  137,
  139,
  140,
  141,
  143,
  144,
  145,
  146,
  147,
  148]]

#### MST

In [16]:
mst_iris = MSTClustering(cutoff_scale=2)
mst_iris.fit(X_iris)

mst_tennis = MSTClustering(cutoff_scale=2)
mst_tennis.fit(X_tennis)

mst_iris.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#### BANG

In [17]:
levels = 11

bang_iris = bang(X_iris, levels)
bang_iris.process()


bang_tennis = bang(X_tennis_array, levels)
bang_tennis.process()

  return self.__get_amount_points() / self.__spatial_block.get_volume()
  return self.__get_amount_points() / self.__spatial_block.get_volume()


In [18]:
clusters = bang_iris.get_clusters()
noise = bang_iris.get_noise()
directory = bang_iris.get_directory()
dendrogram = bang_iris.get_dendrogram()
print(clusters)

[[0, 149]]


In [None]:
bang_visualizer.show_blocks(directory)
bang_visualizer.show_dendrogram(dendrogram)
bang_visualizer.show_clusters(X_iris, clusters, noise)

  density_scale = bang_visualizer.__maximum_density_alpha * block.get_density() / density_scale
