<a href="https://colab.research.google.com/github/sowad748/Data-Mining-Assignments-/blob/main/Assignment_3_HeartFailure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Task 1: Load Dataset
Upload the dataset and read it into a pandas DataFrame**

In [None]:
import io

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from google.colab import files
from sklearn.metrics import pairwise_distances


In [None]:
# Upload the CSV through the Colab file picker.
# Colab saves a re-uploaded file under a new name (e.g. "name (6).csv") when a
# file with the same name already exists, so reading a hard-coded path would
# silently load an older copy. Parse the bytes returned for *this* upload instead.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please upload heart_failure_clinical_records.csv")
df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))


Saving heart_failure_clinical_records.csv to heart_failure_clinical_records (6).csv


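**Task 2: Extract Relevant Columns
Select three feature columns (`age`, `creatinine_phosphokinase`, `ejection_fraction`) as the clustering dimensions and keep `DEATH_EVENT` as the ground-truth label**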

In [None]:
# Feature matrix used for clustering; column order is age, CPK, ejection fraction
data = df.loc[:, ['age', 'creatinine_phosphokinase', 'ejection_fraction']].to_numpy()

# Ground-truth labels, kept aside for visualizing and evaluating the clusters
ground_truth = df.loc[:, 'DEATH_EVENT'].to_numpy()

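**Task 3: Initialize Clusters
Define helpers that pick K random data points as the initial centroids and compute Euclidean distance; the number of clusters K is set from the unique values in 'DEATH_EVENT' when the algorithm is run in Task 5**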

In [None]:
def initialize_centroids_simple(data, dimension, k):
    """Pick ``k`` distinct rows of ``data`` at random to serve as starting centroids.

    Args:
        data: Array of samples; rows are selected along the first axis.
        dimension: Unused; kept so existing callers continue to work.
        k: Number of centroids to draw (sampled without replacement).

    Returns:
        The ``k`` selected rows of ``data``.
    """
    # Sampling without replacement guarantees k different row positions
    chosen_rows = np.random.choice(len(data), size=k, replace=False)
    return data[chosen_rows]


def get_euclidean_distance(p1, p2):
    """Return the Euclidean (L2) norm of the difference ``p1 - p2``."""
    offset = p1 - p2
    return np.linalg.norm(offset)


**Task 4: Implement K-Means Algorithm
Compute distances, update cluster affiliations, recompute centroids, and terminate based on the defined criteria**

In [None]:
def kmeans(data, k, max_iters=100, tol=1e-5):
    """Cluster the rows of ``data`` into ``k`` groups with iterative K-Means.

    Starting from ``k`` randomly chosen rows, each iteration assigns every row
    to its nearest centroid (Euclidean distance), evaluates the objective ``J``
    (mean squared distance of the rows to their assigned centroid) and moves
    each centroid to the mean of its members.

    Args:
        data: 2-D array with one sample per row.
        k: Number of clusters.
        max_iters: Upper bound on the number of iterations.
        tol: Relative tolerance; iteration stops once ``|prev_J - J| <= tol * J``.

    Returns:
        A tuple ``(cluster_affiliation, centroids)``: the index of the nearest
        returned centroid for each row, and the ``k`` centroids themselves.
    """
    centroids = initialize_centroids_simple(data, data.shape[1], k)
    prev_J = float('inf')

    def assign_to_nearest(current_centroids):
        # Broadcasting the extra axis yields one distance per (row, centroid) pair
        distances = np.linalg.norm(data[:, np.newaxis] - current_centroids, axis=2)
        return np.argmin(distances, axis=1)

    for _ in range(max_iters):
        # Compute distances and assign clusters
        cluster_affiliation = assign_to_nearest(centroids)

        # Compute new centroids safely: an empty cluster keeps its old centroid
        # instead of becoming NaN from the mean of zero rows
        new_centroids = np.array([
            data[cluster_affiliation == i].mean(axis=0) if np.any(cluster_affiliation == i) else centroids[i]
            for i in range(k)
        ])

        # Clustering objective J for the current centroids and assignment
        J = np.sum((data - centroids[cluster_affiliation])**2) / data.shape[0]

        # Termination check: stop when J no longer changes noticeably
        if abs(prev_J - J) <= tol * J:
            break

        centroids = new_centroids
        prev_J = J

    # Assign against the centroids that are actually returned. When the loop
    # runs out of iterations the centroids were updated after the last
    # assignment, and with max_iters == 0 no assignment was made at all.
    cluster_affiliation = assign_to_nearest(centroids)

    return cluster_affiliation, centroids

**Task 5: Run K-Means Algorithm
Execute the K-Means function and obtain final clusters**

In [None]:
# Number of clusters (K) set to number of unique classes in 'DEATH_EVENT'
# Number of clusters (K) set to number of unique classes in 'DEATH_EVENT'
K = df['DEATH_EVENT'].nunique()

# Seed NumPy's global RNG so the random centroid initialization, and with it
# the clustering, plot and evaluation below, is reproducible on a fresh run
np.random.seed(42)

# Run K-Means
cluster_affiliation, final_centroids = kmeans(data, K)

**Task 6: Visualize Clusters
Create a 3D scatter plot to visualize cluster assignments and ground truth labels**

In [None]:
def plot_clusters(data, cluster_affiliation, ground_truth):
    """Show an interactive 3D scatter with color = cluster and marker shape = class.

    One trace is added per (cluster, ground-truth label) pair so that every
    combination gets its own legend entry.

    Args:
        data: 2-D array; columns 0, 1 and 2 are plotted on the x, y and z axes.
        cluster_affiliation: Array with the cluster index of each row of ``data``.
        ground_truth: Array with the ground-truth label of each row of ``data``.
    """
    # Define symbol map for different ground truth labels. Only symbols that
    # Scatter3d accepts are used ("triangle-up" is rejected for 3D traces).
    symbol_map = {0: "circle", 1: "square", 2: "diamond"}
    color_map = {0: "blue", 1: "red", 2: "yellow"}  # Cluster colors

    # Fallbacks for clusters/labels outside the maps above, chosen by position
    # so that a fourth cluster or label no longer raises KeyError
    fallback_symbols = ["circle", "square", "diamond", "cross", "x"]
    fallback_colors = ["blue", "red", "yellow", "green", "orange", "purple", "cyan", "magenta"]

    fig = go.Figure()

    for cluster_pos, cluster in enumerate(np.unique(cluster_affiliation)):
        color = color_map.get(cluster, fallback_colors[cluster_pos % len(fallback_colors)])

        for label_pos, label in enumerate(np.unique(ground_truth)):
            symbol = symbol_map.get(label, fallback_symbols[label_pos % len(fallback_symbols)])
            indices = np.where((cluster_affiliation == cluster) & (ground_truth == label))[0]

            fig.add_trace(go.Scatter3d(
                x=data[indices, 0],
                y=data[indices, 1],
                z=data[indices, 2],
                mode='markers',
                marker=dict(
                    size=8,
                    color=color,  # Assign color based on cluster
                    symbol=symbol,  # Assign shape based on ground truth
                    opacity=0.8
                ),
                name=f'Cluster {cluster}, Class {label}'  # Legend label
            ))

    # Update layout for aesthetics and clarity
    fig.update_layout(
        title="K-Means Clustering with Ground Truth",
        scene=dict(
            xaxis_title="Age",
            yaxis_title="Creatinine Phosphokinase",
            zaxis_title="Ejection Fraction",
            bgcolor="rgb(30, 30, 30)"  # Dark background
        ),
        template="plotly_dark",
        legend=dict(
            title="Legend", font=dict(size=12), bgcolor="rgba(0,0,0,0.5)"
        )
    )

    fig.show()

# Render the clustering result against the ground-truth labels
plot_clusters(data=data, cluster_affiliation=cluster_affiliation, ground_truth=ground_truth)




def count_misclassified(class_labels, cluster_affiliation):
    """Count the ground-truth classes whose points fall into more than one cluster.

    Note that this counts classes, not individual points: a class contributes 1
    as soon as its members are spread over at least two different clusters.

    Args:
        class_labels: Array with the ground-truth label of each sample.
        cluster_affiliation: Array with the cluster index of each sample,
            aligned with ``class_labels``.

    Returns:
        The number of distinct labels in ``class_labels`` whose samples were
        assigned to two or more clusters.
    """
    return sum(
        # np.unique gives the distinct clusters this class was assigned to
        np.unique(cluster_affiliation[class_labels == label]).size > 1
        for label in np.unique(class_labels)
    )

# Evaluate the clustering against the ground-truth labels and report the count
misclassified_clusters = count_misclassified(
    class_labels=ground_truth,
    cluster_affiliation=cluster_affiliation,
)
print(f"Misclassified clusters: {misclassified_clusters}")

Misclassified clusters: 2
