In [25]:
from IPython.display import display, Markdown
import math
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import accuracy_score

Task 1

In [2]:
data = pd.read_csv('Data/data.csv', header=None)
labels = pd.read_csv('Data/label.csv', header=None)

In [70]:
class KMeans:
    def __init__(self, k=3, similarity='euclidean', max_iters=100):
        self.k = k
        self.similarity = similarity
        self.max_iters = max_iters
        self.centroids = None
        self.sse = None

    def euclidean_distance(self, point, centroid):
        return np.sqrt(np.sum((point - centroid)**2, axis=1))

    def sse_calc(self, centroids, data, labels):
        print(data)
        error_sum = 0.0
        for i, dat in enumerate(data):
            print(i)
            print(dat)
            error_sum += np.sum((centroids[labels[i]] - dat)**2)
        return error_sum
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        # Initialize centroids randomly
        centroid_indices = np.random.choice(n_samples, self.k, replace=False)
        self.centroids = X.iloc[centroid_indices].values
        
        for _ in range(self.max_iters):
            # Assign labels based on closest centroid
            if self.similarity == 'euclidean':
                distances = np.array([self.euclidean_distance(X, centroid) for centroid in self.centroids])
            elif self.similarity == 'cosine':
                distances = np.array([1 - cosine_similarity(X, centroid.reshape(1, -1)) for centroid in self.centroids])
            elif self.similarity == 'jaccard':
                distances = np.array([1 - np.array([jaccard_score(X[i], centroid) for i in range(n_samples)]) for centroid in self.centroids])
            
            labels = np.argmin(distances, axis=0)
            
            # Update centroids
            new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(self.k)])
            
            # Check for convergence
            if np.allclose(self.centroids, new_centroids):
                break
            
            self.centroids = new_centroids
        
        self.labels = labels
        
        self.sse = self.sse_calc(self.centroids, X, y)

    def predict(self, X):
        if self.similarity == 'euclidean':
            distances = np.array([math.dist(X, centroid) for centroid in self.centroids])
        elif self.similarity == 'cosine':
            distances = np.array([1 - cosine_similarity(X, centroid.reshape(1, -1)) for centroid in self.centroids])
        elif self.similarity == 'jaccard':
            distances = np.array([1 - np.array([jaccard_score(X, centroid) for centroid in self.centroids])])
            
        return np.argmin(distances, axis=0)

# Example usage:
# Assuming X is your feature matrix and y is your target variable
# X.shape = (n_samples, n_features)
# y.shape = (n_samples,)

# # Instantiate KMeans
# kmeans = KMeans(k=len(np.unique(y)), similarity='euclidean')

# # Fit the model
# kmeans.fit(X, y)

# # Get cluster labels
# cluster_labels = kmeans.labels

# # Predict cluster for new data
# new_data_cluster = kmeans.predict(new_data)


In [71]:
euclidean = KMeans(k=len(np.unique(labels)), similarity='euclidean')
euclidean.fit(data, labels)

      0    1    2    3    4    5    6    7    8    9    ...  774  775  776  \
0       0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
1       0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
2       0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
3       0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4       0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
9995    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
9996    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
9997    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
9998    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
9999    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   

      777  778  779  780  781  782  783  
0       0    0    0  

KeyError: 1

In [48]:
sse = euclidean.sse
sse

KeyError: 1

In [41]:
# euclidean = KMeans(k=len(np.unique(labels)), similarity='euclidean')
# euclidean.fit(data)

# Get cluster labels
# cluster_labels = euclidean.labels

# Predict cluster for new data
# new_data_cluster = euclidean.predict(new_data)

cosine = KMeans(k=len(np.unique(labels)), similarity='cosine')

# Fit the model
cosine.fit(data, labels)

# # Get cluster labels
# cluster_labels = kmeans.labels

# # Predict cluster for new data
# new_data_cluster = kmeans.predict(new_data)

TypeError: KMeans.fit() takes 2 positional arguments but 3 were given

In [43]:
# def euclidean_distance(self, data_point, centroids):
#         return np.sqrt(np.sum((centroids - data_point)**2, axis=1))
sse = cosine.sum_of_squared_errors_calc(cosine.centroids, data, labels)
sse

AttributeError: 'KMeans' object has no attribute 'centroids'

In [76]:
class KMeans:
    def __init__(self, n_clusters, similarity='euclidean', max_iter=500, tol=1e-4):
        self.n_clusters = n_clusters
        self.similarity = similarity
        self.max_iter = max_iter
        self.tol = tol
        self.centroids = None
    
    def fit(self, X):
        # Initialize centroids randomly
        idx = np.random.choice(len(X), self.n_clusters, replace=False)
        # self.centroids = X[idx]
        self.centroids = X.iloc[idx].values
        
        for _ in range(self.max_iter):
            # Assign each data point to the nearest centroid
            if self.similarity == 'euclidean':
                distances = pairwise_distances(X, self.centroids, metric='euclidean')
            elif self.similarity == 'cosine':
                distances = 1 - cosine_similarity(X, self.centroids)
            elif self.similarity == 'jaccard':
                distances = 1 - self.generalized_jaccard_similarity(X, self.centroids)
            
            labels = np.argmin(distances, axis=1)
            
            # Update centroids
            new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(self.n_clusters)])
            
            # Check convergence
            if np.allclose(self.centroids, new_centroids, atol=self.tol):
                break
                
            self.centroids = new_centroids
        
        return labels
    
    def generalized_jaccard_similarity(self, X, centroids):
        intersection = np.minimum(X, centroids).sum(axis=1)
        union = np.maximum(X, centroids).sum(axis=1)
        return intersection / union

# Function to calculate SSE
def calculate_sse(X, labels, centroids):
    sse = 0
    for i in range(len(centroids)):
        sse += np.sum((X[labels == i] - centroids[i])**2)
    return sse

# Function to assign cluster labels based on majority vote
def assign_labels(labels, y):
    cluster_labels = []
    for cluster in range(len(np.unique(labels))):
        cluster_indices = np.where(labels == cluster)[0]
        cluster_y = y[cluster_indices]
        unique, counts = np.unique(cluster_y, return_counts=True)
        majority_label = unique[np.argmax(counts)]
        cluster_labels.append(majority_label)
    return cluster_labels

# Function to compute accuracy
def compute_accuracy(labels, y, cluster_labels):
    pred_labels = [cluster_labels[label] for label in labels]
    accuracy = np.mean(pred_labels == y)
    return accuracy

# Function to run K-means with different similarity measures
def run_kmeans(X, y, K):
    kmeans_euclidean = KMeans(n_clusters=K, similarity='euclidean')
    labels_euclidean = kmeans_euclidean.fit(X)
    sse_euclidean = calculate_sse(X, labels_euclidean, kmeans_euclidean.centroids)
    cluster_labels_euclidean = assign_labels(labels_euclidean, y)
    accuracy_euclidean = compute_accuracy(labels_euclidean, y, cluster_labels_euclidean)
    
    kmeans_cosine = KMeans(n_clusters=K, similarity='cosine')
    labels_cosine = kmeans_cosine.fit(X)
    sse_cosine = calculate_sse(X, labels_cosine, kmeans_cosine.centroids)
    cluster_labels_cosine = assign_labels(labels_cosine, y)
    accuracy_cosine = compute_accuracy(labels_cosine, y, cluster_labels_cosine)
    
    kmeans_jaccard = KMeans(n_clusters=K, similarity='jaccard')
    labels_jaccard = kmeans_jaccard.fit(X)
    sse_jaccard = calculate_sse(X, labels_jaccard, kmeans_jaccard.centroids)
    cluster_labels_jaccard = assign_labels(labels_jaccard, y)
    accuracy_jaccard = compute_accuracy(labels_jaccard, y, cluster_labels_jaccard)
    
    return sse_euclidean, sse_cosine, sse_jaccard, accuracy_euclidean, accuracy_cosine, accuracy_jaccard

# Example usage:
# Assuming X is your feature matrix and y is your target labels
# sse_euclidean, sse_cosine, sse_jaccard, accuracy_euclidean, accuracy_cosine, accuracy_jaccard = run_kmeans(X, y, K)


In [77]:
run_kmeans(data, labels, K=len(np.unique(labels)))

  return reduction(axis=axis, out=out, **passkwargs)


KeyError: "None of [Index([   2,    5,   14,   29,   31,   37,   39,   40,   43,   44,\n       ...\n       9931, 9946, 9950, 9955, 9956, 9959, 9969, 9978, 9984, 9994],\n      dtype='int64', length=1614)] are in the [columns]"