In [24]:
from collections import defaultdict
# from datetime import datetime

import numpy as np
from matplotlib import pyplot as plt
import random
import math
# import time
# from sklearn import decomposition
import pandas as pd

In [25]:
color_dict = defaultdict()


def init_color_dict(number_of_clusters):
    color_dict[0] = '#448aff'
    color_dict[1] = '#ec407a'
    color_dict[2] = '#00e676'
    color_dict[3] = '#ff6f00'
    color_dict[4] = '#ef9a9a'
    color_dict[5] = '#964B00'
    for i in range(6, number_of_clusters):
        color_dict[i] = np.random.rand(1, 3)

In [26]:
# print dataset
def print_dataset(dataset):
    print(np.asarray(dataset))

In [27]:
# print plot 2d dataset
def plot_2d_dataset(dataset, markers=None, labels=None):
    # x, y = np.asarray(dataset).T
    if labels is None:
        labels = []
    if markers is None:
        markers = []

    for i in range(len(dataset)):
        point = dataset[i]
        plt.scatter(point[0], point[1], c=color_dict[labels[i]])

    for marker in markers:
        plt.scatter(marker[0], marker[1], s=200, marker='*')
    plt.show()
    # plt.savefig(str(time.time()) + '.png')
    # plt.close()

In [28]:
# plot data final
def plot_2d_with_markers(clusters, dataset, membership):
    labels = get_cluster_labels(membership)
    plot_2d_dataset(dataset, markers=clusters, labels=labels)

In [29]:
# return a n * c membership function, generate random fungsi keanggotaan
# n = number of data
# c = number of clusters
def generate_random_membership_function(n, c):
    membership = np.random.rand(n, c)
    # print(membership)
    summation = [sum(center) for center in membership]
    normalized = []
    for i in range(len(membership)):
        tmp = []
        for d in membership[i]:
            tmp.append(d / summation[i])
        normalized.append(tmp)
    return normalized

In [30]:
# calculate weighted average of data points, menghitung dengan bobot, langkah 4
# for each cluster i calculate sigma[k from 1 to n]((U_ik)^m * DATA_k)/sigma[k from 1 to n]((U_ik)^m)
# m is fuzziness parameter
def update_cluster_centers(dataset, membership_matrix, m):
    number_of_clusters = len(membership_matrix[0])
    cluster_centers = []
    for i in range(number_of_clusters):
        u_ik = list(zip(*membership_matrix))[i]
        u_ik_m = [x ** m for x in u_ik]
        sigma_u_ik_m = sum(u_ik_m)  # i is fixed
        weighted_data = []
        for k in range(len(dataset)):
            weighted_vector = []
            for f in range(len(dataset[k])):  # iterate over features
                weighted_vector.append(u_ik_m[k] * dataset[k][f])
            weighted_data.append(weighted_vector)
        sigma_data_u_ik_m = [sum(x) for x in list(zip(*weighted_data))]
        cluster_centers.append([sigma_data_u_ik_m[d]/sigma_u_ik_m for d in range(len(sigma_data_u_ik_m))])
        # print("Cek", cluster_centers)
    return cluster_centers

In [31]:
# euclidean distance
def euclidean_distance(p, q):
    summation = 0
    for i in range(len(p)):
        summation += (p[i] - q[i]) ** 2
    return math.sqrt(summation)

In [32]:
# langkah 6
def update_membership_matrix(dataset, clusters, m):
    membership_matrix = []
    fuzzy_power = float(2 / (m-1))
    n = len(dataset)
    c = len(clusters)
    for i in range(n):
        denom = sum([(1/euclidean_distance(dataset[i], clusters[x])) ** fuzzy_power for x in range(c)])
        membership = []
        for j in range(c):
            num = (1/euclidean_distance(dataset[i], clusters[j])) ** fuzzy_power
            membership.append(num/denom)
        membership_matrix.append(membership)
    return membership_matrix

In [33]:
def get_cluster_labels(membership_matrix):
    res = []
    for membership in membership_matrix:
        max_index = membership.index(max(membership))
        res.append(max_index)
    return res

In [34]:
def fkm(cluster_no, iterations, dataset, m, error):
    c = cluster_no
    n = len(dataset)
    vdata = np.mean(np.var(dataset, 0))
    membership = generate_random_membership_function(n, c)

    clusters = []
    iterasi = 0
    for i in range(iterations):
        centers_old = clusters.copy()
        clusters = update_cluster_centers(dataset, membership, m)
        membership = update_membership_matrix(dataset, clusters, m)
        iterasi += 1
        if (np.sum(centers_old) - np.sum(clusters)) ** 2 < error * vdata:
            break
#     print("Jumlah iterasi ", iterasi)
    return clusters, membership

In [35]:
data = pd.read_csv("Mall_Customers.csv")

In [36]:
data.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [37]:
# drop kolom yang tidak diperlukan
data = data.drop(columns=['CustomerID'])

# jalankan proses one-hote encoding dengan pd.get_dummies(data)
data = pd.get_dummies(data)

data

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender_Female,Gender_Male
0,19,15,39,0,1
1,21,15,81,0,1
2,20,16,6,1,0
3,23,16,77,1,0
4,31,17,40,1,0
...,...,...,...,...,...
195,35,120,79,1,0
196,45,126,28,1,0
197,32,126,74,0,1
198,32,137,18,0,1


In [38]:
train_data = np.array(data)

number_of_clusters = 6
iterations = 3000
m = 2
error = 0.0001


init_color_dict(number_of_clusters)  # init colors for plotting
# start_time = time.time()
cluster_centers, final_memberships = fkm(number_of_clusters, iterations, train_data, m, error)  # run FCM
# runKmeans = round((time.time() - start_time), 3)
# plot_2d_with_markers(cluster_centers, train_data, final_memberships)  # plot final result

final_labels = get_cluster_labels(final_memberships)  # get labels

# print("Running time : %s seconds" % runKmeans)
print(final_labels)
# print("cluster centers :")


[1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 3, 4, 1, 4, 1, 4, 3, 2, 2, 2, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 2, 2, 3, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 5, 2, 5, 0, 5, 0, 5, 0, 5, 2, 5, 0, 5, 0, 5, 0, 5, 0, 5, 2, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5, 0, 5]


In [39]:
print("cluster centers :")
print_dataset(cluster_centers)

cluster centers :
[[42.2275956  87.08705086 17.44534865  0.43886908  0.56113092]
 [46.86049763 28.87017229 18.58811799  0.64788861  0.35211139]
 [26.39292342 58.78665793 49.91669759  0.5983245   0.4016755 ]
 [54.66258995 54.85967312 48.86457672  0.538638    0.461362  ]
 [25.24351904 26.08526301 77.63535975  0.58066626  0.41933374]
 [32.60636718 84.08269857 83.12839372  0.58856964  0.41143036]]


In [40]:
# Converted the data to a dataframe and create a cluster Map
cluster_map = pd.DataFrame(data)

# Add a column 'cluster' and to assign the labels
cluster_map['Label'] = final_labels
cluster_map

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Gender_Female,Gender_Male,Label
0,19,15,39,0,1,1
1,21,15,81,0,1,4
2,20,16,6,1,0,1
3,23,16,77,1,0,4
4,31,17,40,1,0,1
...,...,...,...,...,...,...
195,35,120,79,1,0,5
196,45,126,28,1,0,0
197,32,126,74,0,1,5
198,32,137,18,0,1,0


In [43]:
data.to_csv('Iris_fuzzyclustered.csv')