In [27]:
from collections import defaultdict
from datetime import datetime

import numpy as np
from matplotlib import pyplot as plt
import random
import math
import time
from sklearn import decomposition
import pandas as pd

In [28]:
color_dict = defaultdict()


def init_color_dict(number_of_clusters):
    color_dict[0] = '#448aff'
    color_dict[1] = '#ec407a'
    color_dict[2] = '#00e676'
    color_dict[3] = '#ff6f00'
    color_dict[4] = '#ef9a9a'
    for i in range(5, number_of_clusters):
        color_dict[i] = np.random.rand(1, 3)

In [29]:
# print dataset
def print_dataset(dataset):
    print(np.asarray(dataset))

In [30]:
# print plot 2d dataset
def plot_2d_dataset(dataset, markers=None, labels=None):
    x, y = np.asarray(dataset).T
    if labels is None:
        labels = []
    if markers is None:
        markers = []

    for i in range(len(dataset)):
        point = dataset[i]
        plt.scatter(point[0], point[1], c=color_dict[labels[i]])

    for marker in markers:
        plt.scatter(marker[0], marker[1], s=200, marker='*')
    plt.show()
    plt.savefig(str(time.time()) + '.png')
    plt.close()

In [31]:
# plot data final
def plot_2d_with_markers(clusters, dataset, membership):
    labels = get_cluster_labels(membership)
    plot_2d_dataset(dataset, markers=clusters, labels=labels)

In [32]:
# return a n * c membership function, generate random fungsi keanggotaan
# n = number of data
# c = number of clusters
def generate_random_membership_function(n, c):
    membership = np.random.rand(n, c)
    # print(membership)
    summation = [sum(center) for center in membership]
    normalized = []
    for i in range(len(membership)):
        tmp = []
        for d in membership[i]:
            tmp.append(d / summation[i])
        normalized.append(tmp)
    return normalized

In [33]:
# calculate weighted average of data points, menghitung dengan bobot, langkah 4
# for each cluster i calculate sigma[k from 1 to n]((U_ik)^m * DATA_k)/sigma[k from 1 to n]((U_ik)^m)
# m is fuzziness parameter
def update_cluster_centers(dataset, membership_matrix, m):
    number_of_clusters = len(membership_matrix[0])
    cluster_centers = []
    for i in range(number_of_clusters):
        u_ik = list(zip(*membership_matrix))[i]
        u_ik_m = [x ** m for x in u_ik]
        sigma_u_ik_m = sum(u_ik_m)  # i is fixed
        weighted_data = []
        for k in range(len(dataset)):
            weighted_vector = []
            for f in range(len(dataset[k])):  # iterate over features
                weighted_vector.append(u_ik_m[k] * dataset[k][f])
            weighted_data.append(weighted_vector)
        sigma_data_u_ik_m = [sum(x) for x in list(zip(*weighted_data))]
        cluster_centers.append([sigma_data_u_ik_m[d]/sigma_u_ik_m for d in range(len(sigma_data_u_ik_m))])
        print("Cek", cluster_centers)
    return cluster_centers

In [34]:
# euclidean distance
def euclidean_distance(p, q):
    summation = 0
    for i in range(len(p)):
        summation += (p[i] - q[i]) ** 2
    return math.sqrt(summation)

In [35]:
# langkah 6
def update_membership_matrix(dataset, clusters, m):
    membership_matrix = []
    fuzzy_power = float(2 / (m-1))
    n = len(dataset)
    c = len(clusters)
    for i in range(n):
        denom = sum([(1/euclidean_distance(dataset[i], clusters[x])) ** fuzzy_power for x in range(c)])
        membership = []
        for j in range(c):
            num = (1/euclidean_distance(dataset[i], clusters[j])) ** fuzzy_power
            membership.append(num/denom)
        membership_matrix.append(membership)
    return membership_matrix

In [36]:
def get_cluster_labels(membership_matrix):
    res = []
    for membership in membership_matrix:
        max_index = membership.index(max(membership))
        res.append(max_index)
    return res

In [37]:
def fkm(cluster_no, iterations, dataset, m, error):
    c = cluster_no
    n = len(dataset)
    vdata = np.mean(np.var(dataset, 0))
    membership = generate_random_membership_function(n, c)

    clusters = []
    iterasi = 0
    for i in range(iterations):
        centers_old = clusters.copy()
        clusters = update_cluster_centers(dataset, membership, m)
        membership = update_membership_matrix(dataset, clusters, m)
        iterasi += 1
        if (np.sum(centers_old) - np.sum(clusters)) ** 2 < error * vdata:
            break
        print("Jumlah iterasi ", iterasi)
    return clusters, membership

In [38]:
data = pd.read_csv("IrisOri.csv")

In [39]:
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,1,5.1,3.5,1.4,0.2
1,2,4.9,3.0,1.4,0.2
2,3,4.7,3.2,1.3,0.2
3,4,4.6,3.1,1.5,0.2
4,5,5.0,3.6,1.4,0.2


In [40]:
train_data = np.array(data)

number_of_clusters = 3
iterations = 3000
m = 2
error = 0.0001


init_color_dict(number_of_clusters)  # init colors for plotting
start_time = time.time()
cluster_centers, final_memberships = fkm(number_of_clusters, iterations, train_data, m, error)  # run FCM
runKmeans = round((time.time() - start_time), 3)
plot_2d_with_markers(cluster_centers, train_data, final_memberships)  # plot final result

final_labels = get_cluster_labels(final_memberships)  # get labels

print("Running time : %s seconds" % runKmeans)
print(final_labels)
print("cluster centers :")


Cek [[76.835279255593, 5.879046520306814, 3.0255697847338, 3.838002175550708, 1.2136118735071364]]
Cek [[76.835279255593, 5.879046520306814, 3.0255697847338, 3.838002175550708, 1.2136118735071364], [69.56307190287127, 5.7979288747539925, 3.1073033455461085, 3.625940456951894, 1.183882623094091]]
Cek [[76.835279255593, 5.879046520306814, 3.0255697847338, 3.838002175550708, 1.2136118735071364], [69.56307190287127, 5.7979288747539925, 3.1073033455461085, 3.625940456951894, 1.183882623094091], [79.42492645579325, 5.859043670583211, 3.0609347379197196, 3.798051296106067, 1.1982630380675359]]
Jumlah iterasi  1
Cek [[80.93173979843237, 6.000207972156578, 3.05019298508945, 3.9998575635645, 1.2902513371643647]]
Cek [[80.93173979843237, 6.000207972156578, 3.05019298508945, 3.9998575635645, 1.2902513371643647], [59.79965107117144, 5.677008077508299, 3.027573696495462, 3.395637800609777, 1.0259759648502325]]
Cek [[80.93173979843237, 6.000207972156578, 3.05019298508945, 3.9998575635645, 1.290251337

ValueError: too many values to unpack (expected 2)

In [41]:
print("cluster centers :")
print_dataset(cluster_centers)

cluster centers :
[[ 75.82815207   5.93587329   2.78977775   4.24064052   1.32289197]
 [ 23.67553271   5.05453375   3.4202198    1.53758339   0.27095966]
 [127.61903065   6.59000023   2.97424357   5.50392642   1.9996166 ]]


In [42]:
# Converted the data to a dataframe and create a cluster Map
cluster_map = pd.DataFrame(data)

# Add a column 'cluster' and to assign the labels
cluster_map['Species'] = final_labels
cluster_map

NameError: name 'final_labels' is not defined

In [43]:
data.to_csv('Iris_fuzzyclusteredp.csv')