In [8]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn.cluster import KMeans

In [3]:
def my_kmeans(datasets, k, init_centroids=None, max_iterations = 10, isPlot=False):
  N, f = datasets.shape
  
  # plot initial centroids which are selected randomly
  if init_centroids == None:
    datasets1 = np.array(datasets)
    np.random.shuffle(datasets1)
    init_centroids = datasets1[0:k, :]
  else:
    init_centroids = np.array(init_centroids)
  
  centroids = np.array(init_centroids)

  if isPlot:
    plt.title("Initial centroids")
    plt.scatter(datasets[:, 0], datasets[:, 1], s = 20)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker="*", color = "red", s = 200)
    plt.show()
    
  labels = np.zeros(N, dtype='int')

  for iter_no in range(max_iterations): # iterates for maximum number of iterations
    # assigning closest centeroid to each data point
  
    distances = distance.cdist(datasets, centroids)
    labels = distances.argmin(axis=1)

    # finding the new centroids by taking the mean value
    if isPlot:
      plt.title("result after iteration " + str(iter_no))
      plt.scatter(datasets[:, 0], datasets[:, 1], c=labels, s=20)
      plt.scatter(centroids[:, 0], centroids[:, 1], marker = "*", s = 200, color = "red")
    
    new_centroids = np.zeros((k, f))
    for cluster_no in range(k):
      points = datasets[np.where(labels==cluster_no)]
      new_centroids[cluster_no] = np.mean(points, axis = 0)

    # scatter plot of each iteration
    if isPlot:
      plt.scatter(new_centroids[:, 0], new_centroids[:, 1], marker = "o", s = 100, color = "blue")
      plt.show()
    
    # calculating difference between previous centroids and new centroids
    diff = (new_centroids - centroids).sum()

    centroids = new_centroids

    # if there is no change in the previous centroids and new centroids then loop will terminated
    if diff == 0:
      break
    
  return labels, init_centroids, centroids

In [4]:
def bisecting_kmeans(datasets, k):
    clusters = {}
    sse_values = {}

    # Initialize the clusters with all data points
    clusters[0] = datasets
    sse_values[0] = np.inf

    for i in range(1, k):
        max_sse_cluster = None
        max_sse_value = -np.inf

        # Find the cluster with the highest SSE
        for j in range(i):
            sse = np.sum((clusters[j] - np.mean(clusters[j], axis=0)) ** 2)
            if sse > max_sse_value:
                max_sse_value = sse
                max_sse_cluster = j

        # Bisect the cluster with the highest SSE
        cluster_to_bisect = clusters[max_sse_cluster]
        labels, _, new_centroids = my_kmeans(cluster_to_bisect, 2, max_iterations=100)

        # Update the clusters and SSE values
        clusters[max_sse_cluster] = cluster_to_bisect[np.where(labels == 0)]
        clusters[i] = cluster_to_bisect[np.where(labels == 1)]
        sse_values[max_sse_cluster] = np.sum((clusters[max_sse_cluster] - np.mean(clusters[max_sse_cluster], axis=0)) ** 2)
        sse_values[i] = np.sum((clusters[i] - np.mean(clusters[i], axis=0)) ** 2)

    # Find the final centroids and cluster labels
    # centroids = np.zeros((k, datasets.shape[1]))
    # for i in range(k):
    #     centroids[i] = np.mean(clusters[i], axis=0)

    labels=[0]*len(datasets)
    for i in range(0,k):
      for c in clusters[i]:
        idx=np.where(datasets==c)[0][0]
        labels[idx]=i
    return labels

In [5]:
def kmeans_pp(datasets, k):
    N, f = datasets.shape
    
    # randomly select the first centroid
    centroids = [datasets[np.random.choice(N)]]
    
    # repeat for k-1 more centroids
    for i in range(k-1):
        # calculate distances from the last selected centroid
        distances = distance.cdist(datasets, [centroids[-1]])
        
        # find the minimum distance for each point
        min_distances = np.min(distances, axis=1)
        
        # select the next centroid with probability proportional to square distance
        probs = min_distances**2 / np.sum(min_distances**2)
        idx = np.random.choice(N, p=probs)
        centroids.append(datasets[idx])
    
    labels,initial_centroids,centroids=my_kmeans(datasets,k,centroids)
    # clusters=[]
    # for i in range(k):
    #   c=datasets[np.where(labels==i)]
    #   clusters.append(c)
    return labels

In [6]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import numpy as np

def plot(datasets, labels):
    #colour pattern
    colorscale = ['white','red','yellow','green','orange','violet','blue']

    # Define the layout for the plot
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0),
        scene=dict(
            xaxis=dict(title='X'),
            yaxis=dict(title='Y'),
            zaxis=dict(title='Z'),
            aspectmode='data',
            aspectratio=dict(x=1, y=1, z=1),
            bgcolor='black',
           
            camera=dict(
                up=dict(x=0, y=0, z=1),
            ),
        ),
        paper_bgcolor='black',
    )

    #figure 1
    trace = go.Scatter3d(
        x=datasets[:, 0],
        y=datasets[:, 1],
        z=datasets[:, 2],
        mode='markers',
        marker=dict(
            color="white",
            size=3,
            opacity=1,
            
        ),
    )

    #figure 2
    trace1 = go.Scatter3d(
        x=datasets[:,0],
        y=datasets[:,1],
        z=datasets[:,2],
        mode='markers',
        marker=dict(
            color=labels,
            colorscale=colorscale,
            size=3,
            opacity=1,
            
        ),
    )

    #figure 3
    # dataset=datasets[np.where(labels!=-1)]
    # label=labels[np.where(labels!=-1)]
    # colorscale=colorscale[1:]
    # trace2 = go.Scatter3d(
    #     x=dataset[:,0],
    #     y=dataset[:,1],
    #     z=dataset[:,2],
    #     mode='markers',
    #     marker=dict(
    #         color=label,
    #         colorscale=colorscale,
    #         size=1,
    #         opacity=1,
            
    #     ),
    # )
    fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'scatter3d'}]*2]*1)
    fig.update_layout(layout)

    fig.add_trace(trace, row=1, col=1)
    fig.add_trace(trace1, row=1, col=2)
    # fig.add_trace(trace2, row=1, col=3)
    
    for template in ["plotly", "plotly_white", "plotly_dark", "ggplot2", "seaborn", "simple_white", "none"]:
        fig.update_layout(template=template)
    fig.show()    
    return fig






In [9]:
import os
path="../ds3.csv"

# fig, ax=plt.subplots(1, 3,figsize=(25,60))
# print(f"{'FileName':<30}{'Bisecting K-Mean SSE':>30}{'new bisecting SSE':^30}")
for i in range(0,1):
  df = pd.read_csv(path, sep="\t", header=None)
  X, labels = df.iloc[:, :-1], df.iloc[:, -1]
  if(X.shape[1]==0):
    df = pd.read_csv(path, sep=",", header=None)
    X, labels = df.iloc[:, :-1], df.iloc[:, -1]
  if(X.shape[1]==0):
    df = pd.read_csv(path, sep=" ", header=None)
    X, labels = df.iloc[:, :-1], df.iloc[:, -1]
  # print(X.shape)
  X, actual_labels = np.array(X), np.array(labels)
  K=len(set(actual_labels))

  bisecting_labels = bisecting_kmeans(X,K)
  kmeans_pp_labels = kmeans_pp(X,K)
  kmeans_labels=my_kmeans(X,K)
  print(kmeans_labels)
  bisecting_fig=plot(X,bisecting_labels)
  kmeans_pp_fig=plot(X,kmeans_pp_labels)
  kmeans_fig=plot(X,kmeans_labels)
  bisecting_fig.write_html('cluster/'+path[3:6]+'bisecting.html')
  kmeans_fig.write_html('cluster/'+path[3:6]+'kmeans.html')
  kmeans_pp_fig.write_html('cluster/'+path[3:6]+'kmeansPP.html')





KMeans(n_clusters=3, random_state=0)


ValueError: 
    Invalid value of type 'sklearn.cluster._kmeans.KMeans' received for the 'color' property of scatter3d.marker
        Received value: KMeans(n_clusters=3, random_state=0)

    The 'color' property is a color and may be specified as:
      - A hex string (e.g. '#ff0000')
      - An rgb/rgba string (e.g. 'rgb(255,0,0)')
      - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
      - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
      - A named CSS color:
            aliceblue, antiquewhite, aqua, aquamarine, azure,
            beige, bisque, black, blanchedalmond, blue,
            blueviolet, brown, burlywood, cadetblue,
            chartreuse, chocolate, coral, cornflowerblue,
            cornsilk, crimson, cyan, darkblue, darkcyan,
            darkgoldenrod, darkgray, darkgrey, darkgreen,
            darkkhaki, darkmagenta, darkolivegreen, darkorange,
            darkorchid, darkred, darksalmon, darkseagreen,
            darkslateblue, darkslategray, darkslategrey,
            darkturquoise, darkviolet, deeppink, deepskyblue,
            dimgray, dimgrey, dodgerblue, firebrick,
            floralwhite, forestgreen, fuchsia, gainsboro,
            ghostwhite, gold, goldenrod, gray, grey, green,
            greenyellow, honeydew, hotpink, indianred, indigo,
            ivory, khaki, lavender, lavenderblush, lawngreen,
            lemonchiffon, lightblue, lightcoral, lightcyan,
            lightgoldenrodyellow, lightgray, lightgrey,
            lightgreen, lightpink, lightsalmon, lightseagreen,
            lightskyblue, lightslategray, lightslategrey,
            lightsteelblue, lightyellow, lime, limegreen,
            linen, magenta, maroon, mediumaquamarine,
            mediumblue, mediumorchid, mediumpurple,
            mediumseagreen, mediumslateblue, mediumspringgreen,
            mediumturquoise, mediumvioletred, midnightblue,
            mintcream, mistyrose, moccasin, navajowhite, navy,
            oldlace, olive, olivedrab, orange, orangered,
            orchid, palegoldenrod, palegreen, paleturquoise,
            palevioletred, papayawhip, peachpuff, peru, pink,
            plum, powderblue, purple, red, rosybrown,
            royalblue, rebeccapurple, saddlebrown, salmon,
            sandybrown, seagreen, seashell, sienna, silver,
            skyblue, slateblue, slategray, slategrey, snow,
            springgreen, steelblue, tan, teal, thistle, tomato,
            turquoise, violet, wheat, white, whitesmoke,
            yellow, yellowgreen
      - A number that will be interpreted as a color
        according to scatter3d.marker.colorscale
      - A list or array of any of the above