In [1]:
# Q1, Q2, Q3
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]

    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25318199493.86592, 'Accuracy': 0.58995899589959, 'Iterations': 99}, 'Cosine': {'SSE': 684.8750267842214, 'Accuracy': 0.6233623362336234, 'Iterations': 53}, 'Jaccard': {'SSE': 3663.1980166419507, 'Accuracy': 0.6042604260426042, 'Iterations': 22}}


In [2]:
# Q4 - when there is no change in centroid position
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25499585994.10855, 'Accuracy': 0.6455645564556456, 'Iterations': 42}, 'Cosine': {'SSE': 681.8360508421141, 'Accuracy': 0.6142614261426143, 'Iterations': 81}, 'Jaccard': {'SSE': 3660.8631390293767, 'Accuracy': 0.6052605260526053, 'Iterations': 75}}


In [3]:
# Q4 - when the SSE value increases in the next iteration
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25407571216.548393, 'Accuracy': 0.5957595759575958, 'Iterations': 99}, 'Cosine': {'SSE': 682.1528797027994, 'Accuracy': 0.6178617861786179, 'Iterations': 41}, 'Jaccard': {'SSE': 3673.452051251304, 'Accuracy': 0.6222622262226223, 'Iterations': 21}}


In [4]:
# Q4 - when the maximum preset value (e.g., 100) of iteration is complete
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25321890482.604424, 'Accuracy': 0.6003600360036003, 'Iterations': 99}, 'Cosine': {'SSE': 691.8594824814824, 'Accuracy': 0.6074607460746074, 'Iterations': 99}, 'Jaccard': {'SSE': 3659.69264389171, 'Accuracy': 0.6039603960396039, 'Iterations': 99}}
