In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from IPython.display import display

from statistics import mean
import functools

## utils

### classes

In [93]:
from typing import Any
from collections.abc import Iterable


class Cluster:
    def __init__(self, cluster_list=[]):
        self.__cluster_list = cluster_list
        self.__center = self.computeCentroid()

    def __setattr__(self, name: str, value: Any) -> None:
        

        if name == '_Cluster__cluster_list':
            if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
                value = [x for x in value]
            self.__dict__[name] = value
            self.__center = self.computeCentroid() # -- automatically reruns computeCentroid when cluster_list is updated

        else:
            self.__dict__[name] = value

    def __str__(self):
        return f'Cluster: {self.cluster_list};\n\tCentroid: {self.center}'
    def __repr__(self):
        return f'Cluster(cluster_list={self.cluster_list})'
    
    @property
    def cluster_list(self):
        return self.__cluster_list
    @cluster_list.setter
    def cluster_list(self, value):
        self.__cluster_list = value
    @cluster_list.deleter
    def cluster_list(self):
        del self.__cluster_list

    @property
    def center(self):
        return self.__center
    # @center.setter shouldnt be allowed

    

    def __iter__(self):
        return iter(self.cluster_list)

    def __add__(self, other):
        if type(other) != Cluster:
            other = Cluster([other])
        return Cluster(self.cluster_list + other.cluster_list)
    
    def __sub__(self, other):
        if type(other) != Cluster :
            other = Cluster([other])
            # print('other:', other) #debug
        return Cluster([x for x in self if x not in other]) # set difference
    
    def __len__(self):
        return len(self.cluster_list)
    
    def __getitem__(self, index):
        return self.cluster_list[index]
    

    def computeCentroid(self, algo='mean'):
        if len(self.__cluster_list) == 0:
            return None
        
        if len(self.__cluster_list) == 1: 
            if isinstance(self.__cluster_list[0], Iterable) and not isinstance(self.__cluster_list[0], (str, bytes)):
                return list(self.__cluster_list[0])
            else:
                return float(self.__cluster_list[0]) #there's only one element in the cluster, so the centroid is the element itself
        # if each item is a primitive type, i wanna find the mean; if each is an nd array, i wanna return an nd array

        centroid = None
        for item in self.__cluster_list:
            if isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
                transposed = zip(*self.__cluster_list)
                centroid = [float(mean(coordinate)) for coordinate in transposed]
            else:
                centroid = float(mean(self.__cluster_list))
        return centroid
    

        

    
c1=Cluster()
c2=Cluster([1,25,6])
c1+=c2

cl1 = Cluster([[1,2,3,4,5]])
cl1.computeCentroid()

test = Cluster([(1,2,3)])
print(test)
# print(c1)

Cluster: [(1, 2, 3)];
	Centroid: [1, 2, 3]


In [28]:

vectors=[1,4,7]
mean(vectors) #works

vectors = [(1,2,3), (4,5,6), (7,8,9)] #works
vectors = [np.array([10., 12.]), np.array([10., 14.]), np.array([5., 7.]), np.array([10. , 12.7])]
transposed = zip(*vectors)
transposed
# for coord in transposed:
#     for element in coord:
#         float(element)
[float(mean(coordinate)) for coordinate in transposed]

[8.75, 11.425]

In [80]:
data[0]

array([10. , 12.7])

In [82]:
cluster_4= Cluster (data[4])
print(cluster_4)

Cluster: [np.float64(5.0), np.float64(7.0)];
	Centroid: 6.0


In [36]:
cl1 = Cluster([np.array([10., 12.]), np.array([10., 14.]), np.array([5., 7.]), np.array([10. , 12.7])])
print(cl1)

Cluster: [array([10., 12.]), array([10., 14.]), array([5., 7.]), array([10. , 12.7])];
	Centroid: [8.75, 11.425]


In [37]:
c4=Cluster((9,1))
c4

Cluster(cluster_list=[9, 1])

In [94]:
# CLUSTER: Cluster: [array([5., 7.])];
# 	Centroid: [5. 7.]
cluster_5= Cluster ([np.array([5., 7.])])
print(cluster_5.center)

[np.float64(5.0), np.float64(7.0)]


### datasets

In [10]:
data=np.array([
    [10, 12.7],
    [10, 12],
    [10, 14],
    [9, 14],
    [5,7]]
)
x=data[:,0]; y=data[:,1]
n=len(x)
display(pd.DataFrame(data, columns=['x', 'y']))

Unnamed: 0,x,y
0,10.0,12.7
1,10.0,12.0
2,10.0,14.0
3,9.0,14.0
4,5.0,7.0


### distances

In [7]:
def validate_dimensions(func):
    @functools.wraps(func) # --->preserves teh function signature
    def wrapper(point1, point2, *args, **kwargs):
        # -- convert from scalar to vector (1D array)
        if isinstance(point1, (float, int)):
            point1 = [point1]
        if isinstance(point2, (float, int)):
            point2 = [point2]

        # -- check if the two points have the same number of dimensions (same domain)
        if len(point1) != len(point2):
            raise ValueError(f'The two points should have the same number of dimensions (as in # of cols); {len(point1)} != {len(point2)}')

        return func(point1, point2, *args, **kwargs)
    return wrapper

@validate_dimensions
def manhattan_distance(point1, point2):
    '''
    Computes and returns the Manhattan distance between two points x=(x1, x2... xm) and y=(y1, y2,... ym)
    '''

    # -- convert from scalar to vector (1D array)
    if isinstance(point1, float) or isinstance(point1, int):
        point1 = [point1]
    if isinstance(point2, float) or isinstance(point2, int):
        point2 = [point2]

    if len(point1) != len(point2):
        raise ValueError(f'The two points should have the same number of dimensions (as in # of cols); {len(point1)} != {len(point2)}')
    
    distance = sum(abs(point1[i] - point2[i]) for i in range(len(point1)))
    return float(round(distance,5))

def cluster_distance(c1,c2, func=manhattan_distance, method='centroid'):
    '''
    Computes the distance between two clusters c1 and c2 using the specified method.
    The method can be one of 'single', 'complete', 'average', 'centroid'

    In kmeans clustering, the method is by default centroid (force it in the function)
    '''

    c1=Cluster(c1)
    c2=Cluster(c2)
        
    # print(f'c1: {c1}; \nc2: {c2}') #debug
    if method == 'single':
        return min(func(x1,x2) for x1 in c1 for x2 in c2)
    
    elif method == 'complete':
        return max(func(x1,x2) for x1 in c1 for x2 in c2)
    
    elif method == 'average':
        return sum(func(x1,x2) for x1 in c1 for x2 in c2) / (len(c1) * len(c2))
    
    elif method == 'centroid':
        centroid_1 = c1.center
        centroid_2 = c2.center

        return func(centroid_1, centroid_2)
    


In [8]:
manhattan_distance((10,12.7), (10,12)) #works

c1=Cluster([1,2,4,0])
c2=Cluster([3]) #these are indices of the data points

data_c1 = data[c1]
data_c2 = data[3].reshape(1,data[3].shape[0]) # bcs its one item, should have 1 row

cluster_distance(data_c1, data_c2)

# data_c2.shape

NameError: name 'data' is not defined

In [None]:
display(data_c1)
display(np.array(data_c2))
clust_1 = Cluster(data_c1); clust_2 = Cluster(data_c2)
print(clust_2)

array([[10. , 12. ],
       [10. , 14. ],
       [ 5. ,  7. ],
       [10. , 12.7]])

array([[ 9., 14.]])

Cluster: [array([ 9., 14.])];
	Centroid: [ 9. 14.]


In [None]:
cluster_distance([10,12.7], [10,12])

0.35

In [None]:
c1 = np.array([[1, 2, 3],
               [4, 5, 6],
               [7, 8, 9]])

# mean_c1 = np.mean(c1, axis=0)
# print(mean_c1)

data_c1=Cluster(data_c1)
mean_data_c1 = np.mean(data_c1, axis=0)
mean_data_c1

array([ 8.75 , 11.425])

### algo

#### args and vars

In [13]:
df=pd.DataFrame(data, columns=['Nicotine (cg)', 'Tar(mg)'], index=['Peter Stuyvesant', 'Gitanes','Malboro','Lucky Strike', 'Light Delight'])
display(df)

k=2; n=len(data)
distance=cluster_distance
max_iter=100

cache_distance =[]
cache_means = []

Unnamed: 0,Nicotine (cg),Tar(mg)
Peter Stuyvesant,10.0,12.7
Gitanes,10.0,12.0
Malboro,10.0,14.0
Lucky Strike,9.0,14.0
Light Delight,5.0,7.0


In [14]:
# 1st step: assign each data point to a cluster, we will set means based on data points
M1=np.array(df.loc['Gitanes'])
M2=np.array(df.loc['Lucky Strike'])

means=[M1,M2]; cache_means.append(means)
c1=Cluster()
c2=Cluster()

In [15]:
#2nd step: 
# -- compute the distances between each data point and the two means
distances = np.zeros((n,k))
for i in range(n):
    for j in range(k):
        distances[i,j] = distance(data[i], means[j])
distance_df = pd.DataFrame(distances, index=df.index, columns=['Cluster 1', 'Cluster 2'])
cache_distance.append(distance_df)
display(distance_df)

Unnamed: 0,Cluster 1,Cluster 2
Peter Stuyvesant,0.35,0.15
Gitanes,0.0,0.5
Malboro,1.0,0.5
Lucky Strike,0.5,0.0
Light Delight,5.0,5.5


In [16]:
# -- assign each data point to the cluster with the closest mean
index_c1=Cluster(); index_c2=Cluster()
c1=Cluster(); c2=Cluster()

for i in range(n):
    if distances[i,0] < distances[i,1]:
        c1+=data[i]
        index_c1+=i
    else:
        c2+=data[i]
        index_c2+=i
print(c1)

Cluster: [array([10., 12.]), array([5., 7.])];
	Centroid: [np.float64(7.5), np.float64(9.5)]


In [None]:
# 3rd step: compute the new means

M1=c1.center
M2=c2.center
means=[M1,M2]


# need to generalize it to k 
convergence = (cache_means[-1][0]==means[0] and cache_means[-1][1]==means[1])
convergence

Unnamed: 0,Nicotine (cg),Tar(mg)
Cluster 1,7.5,9.5
Cluster 2,9.666667,13.566667


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Unnamed: 0,Nicotine (cg),Tar(mg)
Peter Stuyvesant,10.0,12.7
Gitanes,10.0,12.0
Malboro,10.0,14.0
Lucky Strike,9.0,14.0
Light Delight,5.0,7.0


#### convergence generalized formula
make sure to do this before appending means

In [None]:
convergence=all([cache_means[-1][i]==means[i] for i in range(k)])


True

#### loop

In [57]:
df=pd.DataFrame(data, columns=['Nicotine (cg)', 'Tar(mg)'], index=['Peter Stuyvesant', 'Gitanes','Malboro','Lucky Strike', 'Light Delight'])
display(df)

k=2; n=len(data)
distance=cluster_distance
max_iter=100
verbose=True

cache_distance =[]
cache_means = []

Unnamed: 0,Nicotine (cg),Tar(mg)
Peter Stuyvesant,10.0,12.7
Gitanes,10.0,12.0
Malboro,10.0,14.0
Lucky Strike,9.0,14.0
Light Delight,5.0,7.0


In [None]:
################################## 1. Assign means ##################################

M1=list(np.array(df.loc['Gitanes'],dtype=float))
M2=list(np.array(df.loc['Lucky Strike'],dtype=float))
means=[M1,M2]

# -- assign k means randomly

# assign_means(random=True)

for mu in means:
    for _ in range(len(mu)):
        mu[_] = float(mu[_])



cache_means.append(means)
if verbose:
    print(f'\t-------------- Initial means: --------------\n')
    display(pd.DataFrame(means, columns=[df.columns],index=[f'Cluster {_}' for _ in range(1,k+1)]))

print('CACHE MEANS:', cache_means)

	-------------- Initial means: --------------



Unnamed: 0,Nicotine (cg),Tar(mg)
Cluster 1,10.0,12.0
Cluster 2,9.0,14.0


CACHE MEANS: [[[10.0, 12.0], [9.0, 14.0]]]


In [None]:
sss=[np.array([5., 7.]), [1,2]]

for mu in sss:
    for _ in range(len(mu)):
        mu[_] = float(mu[_])
    mu=

sss[0]

np.float64(5.0)

#### main

In [112]:
np.random.choice([1,2,3,4,5], size=2, replace=False)

array([1, 4])

In [None]:
def assign_means(data, k, seed=42):
    means = []
    np.random.seed(seed)

    indices= np.random.choice(range(len(data)), size=k, replace=False) #avoids dups
    for _ in range(k):
        mu=[]
        for __ in range(data.shape[1]):
            mu.append(float(data[indices[_], __])) #fixes the freakinggggggggg format
        means.append(mu)
    return means

assign_means(data, 2)

[[10.0, 12.0], [5.0, 7.0]]

In [None]:
df=pd.DataFrame(data, columns=['Nicotine (cg)', 'Tar(mg)'], index=['Peter Stuyvesant', 'Gitanes','Malboro','Lucky Strike', 'Light Delight'])
display(df)

k=2; n=len(data)
distance=cluster_distance
max_iter=100
verbose=True

cache_distance =[]
cache_means = []

################################## 1. Assign means ##################################

# -- assign k means randomly

# assign_means(random=True)
M1=list(np.array(df.loc['Gitanes'],dtype=float))
M2=list(np.array(df.loc['Lucky Strike'],dtype=float))
means=[M1,M2]

# --

for mu in means:
    for _ in range(len(mu)):
        mu[_] = float(mu[_])

means=[list(mu) for mu in means]

cache_means.append(means)
if verbose:
    print(f'\t-------------- Initial means: --------------\n')
    display(pd.DataFrame(means, columns=[df.columns],index=[f'Cluster {_}' for _ in range(1,k+1)]))


# -- loop prep
iter_count=0; convergence=False

while (not convergence and iter_count<max_iter):
    # ---- counting the iterations ----
    iter_count+=1
    if verbose:
        print(f'Iteration: {iter_count}')
    # ---------------------------------

    ################################## 2. Assign each data point to a cluster ##################################
    ################################## 2.a Compute the distances between each data point and the means ##################################

    distances = np.zeros((n,k))
    
    for i in range(n):
        for j in range(k):
            distances[i,j] = distance(data[i], means[j])
    distance_df = pd.DataFrame(distances, index=df.index, columns=[f'Cluster {i+1}' for i in range(k)])
    
    cache_distance.append(distance_df)
    if verbose:
        print('\t\t--- Distances between points and means ---')
        display(distance_df)

    ################################## 2.b Assign each data point to the cluster with the closest mean ##################################

    cluster_set=[Cluster() for _ in range(k)]
    index_cluster_set=[Cluster() for _ in range(k)]

    for i in range(n):
        min_index=np.argmin(distances[i])
        cluster_set[min_index]+=data[i]
        index_cluster_set[min_index]+=i

    ################################## 3. Compute the new means of the clusters ##################################

    means=[]
    for i in range(k):
        means.append(cluster_set[i].center)
        print('CLUSTER:',cluster_set[i])

    if verbose:
        print('\t\t--- Newly assigned means ---')
        display(pd.DataFrame(means, columns=[df.columns],index=[f'Cluster {_}' for _ in range(1,k+1)]))
    print('MEANS:',means)
    cache_means.append(means)

    # --------- checks if the means have changed -------

    if iter_count>1:
        convergence=[cache_means[-1][_]==cache_means[-2][_] for _ in range(k)]
        convergence=all(convergence)

    if verbose:
        print(f'Convergence state: {convergence}\n\n')

    ################################## 4. repeat until stopping criterion is met ##################################
    print(not convergence, iter_count<max_iter)

Unnamed: 0,Nicotine (cg),Tar(mg)
Peter Stuyvesant,10.0,12.7
Gitanes,10.0,12.0
Malboro,10.0,14.0
Lucky Strike,9.0,14.0
Light Delight,5.0,7.0


	-------------- Initial means: --------------



Unnamed: 0,Nicotine (cg),Tar(mg)
Cluster 1,10.0,12.0
Cluster 2,9.0,14.0


Iteration: 1
		--- Distances between points and means ---


Unnamed: 0,Cluster 1,Cluster 2
Peter Stuyvesant,0.35,0.15
Gitanes,0.0,0.5
Malboro,1.0,0.5
Lucky Strike,0.5,0.0
Light Delight,5.0,5.5


CLUSTER: Cluster: [array([10., 12.]), array([5., 7.])];
	Centroid: [7.5, 9.5]
CLUSTER: Cluster: [array([10. , 12.7]), array([10., 14.]), array([ 9., 14.])];
	Centroid: [9.666666666666666, 13.566666666666666]
		--- Newly assigned means ---


Unnamed: 0,Nicotine (cg),Tar(mg)
Cluster 1,7.5,9.5
Cluster 2,9.666667,13.566667


MEANS: [[7.5, 9.5], [9.666666666666666, 13.566666666666666]]
Convergence state: False


True True
Iteration: 2
		--- Distances between points and means ---


Unnamed: 0,Cluster 1,Cluster 2
Peter Stuyvesant,2.85,0.26667
Gitanes,2.5,0.61667
Malboro,3.5,0.38333
Lucky Strike,3.0,0.11667
Light Delight,2.5,5.61667


CLUSTER: Cluster: [array([5., 7.])];
	Centroid: [np.float64(5.0), np.float64(7.0)]
CLUSTER: Cluster: [array([10. , 12.7]), array([10., 12.]), array([10., 14.]), array([ 9., 14.])];
	Centroid: [9.75, 13.175]
		--- Newly assigned means ---


Unnamed: 0,Nicotine (cg),Tar(mg)
Cluster 1,5.0,7.0
Cluster 2,9.75,13.175


MEANS: [[np.float64(5.0), np.float64(7.0)], [9.75, 13.175]]
Convergence state: False


True True
Iteration: 3
		--- Distances between points and means ---


Unnamed: 0,Cluster 1,Cluster 2
Peter Stuyvesant,5.35,0.1125
Gitanes,5.0,0.4625
Malboro,6.0,0.5375
Lucky Strike,5.5,0.0375
Light Delight,0.0,5.4625


CLUSTER: Cluster: [array([5., 7.])];
	Centroid: [np.float64(5.0), np.float64(7.0)]
CLUSTER: Cluster: [array([10. , 12.7]), array([10., 12.]), array([10., 14.]), array([ 9., 14.])];
	Centroid: [9.75, 13.175]
		--- Newly assigned means ---


Unnamed: 0,Nicotine (cg),Tar(mg)
Cluster 1,5.0,7.0
Cluster 2,9.75,13.175


MEANS: [[np.float64(5.0), np.float64(7.0)], [9.75, 13.175]]
Convergence state: True


False True


In [41]:
cache_distance
cache_means

[[array([10., 12.]), array([ 9., 14.])],
 [[7.5, 9.5], [9.666666666666666, 13.566666666666666]],
 [array([5., 7.]), [9.75, 13.175]]]

In [None]:
l=[]
l.app