In [45]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier      # Import Decision Tree Classifier
from sklearn.cluster import KMeans                   # Import K-Means Clustering
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics                          # Import scikit-learn metrics module for accuracy calculation

In [46]:
# Function to read a CSV file
def readCsv(fileName,colList):
  """Load a CSV file that has a label (header) row, renaming its columns.

  Parameters
  ----------
  fileName : str or path-like
      Path of the CSV file to read.
  colList : list of str
      Column names to assign, in file order. They replace the labels
      found in the file's first row.

  Returns
  -------
  pandas.DataFrame
      One row per data line of the file (the label row is not included),
      with column dtypes inferred by pandas from the data values.
  """
  # header=0 tells pandas the first row holds labels, and `names` overrides
  # them. Reading with header=None and dropping row 0 afterwards would parse
  # the label row as data and force every column to object (string) dtype.
  dataSet = pd.read_csv(fileName, header=0, names=colList)
  return dataSet

In [82]:
def cluter_infer(kmeans,testData,centroids,centroids_quantized,featureList):
  """Assign one sample to a cluster three ways, print and return the results.

  The sample is assigned by (a) nearest original centroid, (b) nearest
  quantized centroid and (c) ``kmeans.predict``; the distances and the three
  assignments are printed so they can be compared side by side.

  Parameters
  ----------
  kmeans : object
      Fitted model exposing ``predict``; it is called with a one-row
      DataFrame whose columns are ``featureList``.
  testData : sequence
      Feature values of a single sample, one per entry of ``featureList``.
  centroids : numpy.ndarray
      Cluster centres, one row per cluster.
  centroids_quantized : numpy.ndarray
      Same layout as ``centroids``, holding the quantized centres.
  featureList : list of str
      Column names for the one-row DataFrame built from ``testData``.

  Returns
  -------
  tuple of (int, int, int)
      ``(model label, nearest centroid index, nearest quantized centroid
      index)``. Previously nothing was returned, so existing callers that
      ignore the result are unaffected.
  """
  # One-row frame with named columns; presumably this keeps the column names
  # consistent with what the model saw when it was fitted.
  testData_df = pd.DataFrame([testData], columns=featureList)
  sample = testData_df.values

  # Euclidean distance from the sample to each centroid (one value per row)
  distances = np.linalg.norm(centroids - sample, axis=1)
  print("Euclidian Distance non-quantized : ",distances)

  # Same distance measure against the quantized centroids
  distances_quantized = np.linalg.norm(centroids_quantized - sample, axis=1)
  print("Euclidian Distance quantized     : ",distances_quantized)

  # The closest centroid is the one at minimum distance
  closest_centroid           = np.argmin(distances)
  closest_centroid_quantized = np.argmin(distances_quantized)

  # Reference assignment from the model's own predict function
  predict = kmeans.predict(testData_df)

  print("Cluster Predicition              : ",predict,closest_centroid,closest_centroid_quantized)

  # Plain ints so callers can compare the three assignments directly
  return int(predict[0]), int(closest_centroid), int(closest_centroid_quantized)

In [84]:
# Function for K-Means Clustering
def kMeansClusteringModel(dataSet,featureList,nClusters=2,testSamples=None):
  """Fit K-Means on the selected features, print the centroids, probe samples.

  Parameters
  ----------
  dataSet : pandas.DataFrame
      Data containing at least the columns named in ``featureList``.
  featureList : list of str
      Columns used as clustering features.
  nClusters : int, optional
      Number of clusters to fit (default 2, the value previously hard-coded).
      The silhouette score needs at least 2 distinct labels.
  testSamples : list of sequences, optional
      Samples to run through ``cluter_infer``; each must supply one value per
      entry of ``featureList``. Defaults to the two 13-value samples that were
      previously hard-coded, so they only fit a 13-feature ``featureList``.

  Returns
  -------
  float
      Silhouette score of the fitted clustering on the training features.
  """
  if testSamples is None:
    # Default probes kept from the original notebook (13 values each)
    testSamples = [
      [56, 1, 1, 134, 290, 0, 1, 122, 1, 0, 0, 0, 2],
      [58, 1, 2, 140, 211, 1, 0, 165, 0, 0, 2, 0, 2],
    ]

  # Select the feature columns (the label column is not part of featureList).
  # Coerce to float explicitly so the fit does not depend on implicit
  # conversion when the columns arrive as strings.
  X = dataSet[featureList].astype(float)

  # Clustering; random_state is fixed so repeated runs give the same centroids
  kmeans = KMeans(n_clusters=nClusters, random_state=0)
  kmeans.fit(X)

  # Cluster label of every training row, used for the distinction score
  labels = kmeans.predict(X)
  score = metrics.silhouette_score(X, labels)

  # Get the centroids of the clusters
  centroids = kmeans.cluster_centers_
  print("\nCentroids non-quantized")
  print(centroids)

  # Quantizing to integer (round to nearest, then cast)
  centroids_quantized = np.round(centroids).astype(int)
  print("\nCentroids quantized")
  print(centroids_quantized)

  # Compare original vs. quantized centroid assignment for every probe sample
  for idx, sample in enumerate(testSamples):
    print("\nCluster Infering for testData" + str(idx))
    cluter_infer(kmeans,sample,centroids,centroids_quantized,featureList)

  return score

In [85]:
# Path of the input CSV, relative to the notebook's working directory
fileName = "./heart.csv"

# Column names applied to the CSV, in file order; "target" is the label column
colList = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]

# Every column except the label is used as a clustering feature
featureList = [name for name in colList if name != "target"]

# Load the data set
dataSet = readCsv(fileName, colList)

# Label column, kept separately from the features
target = dataSet["target"]

# Fit K-Means on the features; the returned silhouette score is stored but
# not displayed (the print below is left disabled, as in the original cell)
kmSilhouetteScore = kMeansClusteringModel(dataSet, featureList)
# print("\nK-Means Model Score          :", kmSilhouetteScore)


Centroids non-quantized
[[5.30223285e+01 7.48006380e-01 1.02073365e+00 1.29711324e+02
  2.14159490e+02 1.46730463e-01 6.02870813e-01 1.51523126e+02
  2.95055821e-01 1.00430622e+00 1.39872408e+00 6.77830941e-01
  2.26634769e+00]
 [5.66582915e+01 6.13065327e-01 8.19095477e-01 1.34605528e+02
  2.96160804e+02 1.53266332e-01 4.14572864e-01 1.45319095e+02
  4.02010050e-01 1.17738693e+00 1.36432161e+00 8.74371859e-01
  2.41457286e+00]]

Centroids quantized
[[ 53   1   1 130 214   0   1 152   0   1   1   1   2]
 [ 57   1   1 135 296   0   0 145   0   1   1   1   2]]

Cluster Infering for testData0
Euclidian Distance non-quantized :  [81.57752648 24.24099821]
Euclidian Distance quantized     :  [81.88406438 23.91652149]
Cluster Predicition              :  [1] 1 1

Cluster Infering for testData1
Euclidian Distance non-quantized :  [18.06499882 87.61234203]
Euclidian Distance quantized     :  [17.57839583 87.49857142]
Cluster Predicition              :  [0] 0 0


# New Section