# A Naive K-Means Implementation with NumPy and pandas

In [187]:
import pandas as pd
import numpy as np

In [176]:
class NaiveKmeans:
    """A naive implementation of the K-means clustering algorithm.

    ``k`` rows sampled from the data serve as the initial centroids.
    ``predict`` then alternates between assigning every row to its nearest
    centroid (Euclidean distance) and moving each centroid to the mean of
    the rows assigned to it, until the centroids stop changing or
    ``max_iterations`` is reached.

    Cluster labels are the strings "1" .. "k"; label ``str(i + 1)`` always
    refers to ``centroid_coordinates[i]``.

    Attributes:
        df: Working DataFrame. After ``predict`` it holds the original
            columns plus a "clusters" label column.
        k: Requested number of clusters.
        max_iterations: Upper bound on centroid updates per ``predict``.
        random_state: Seed forwarded to ``DataFrame.sample`` for the
            initial centroids (``None`` means unseeded).
        df_columns: Feature columns of the input data.
        centroids: Current centroids as a DataFrame.
        centroid_coordinates: Current centroids as a list of lists.
        n_iterations: Number of updates performed by the last ``predict``.
    """

    def __init__(self, df, k, max_iterations=100, random_state=None):
        """Store the settings and draw the initial centroids.

        Args:
            df: DataFrame whose columns are all numeric.
            k: Number of clusters (at least 1, at most ``len(df)``).
            max_iterations: Maximum number of centroid updates.
            random_state: Optional seed for reproducible initial centroids.

        Raises:
            TypeError: If any column is not numeric.
            ValueError: If ``k`` is smaller than 1 (or larger than the
                number of rows, raised by ``DataFrame.sample``).
        """
        self.df = df
        self.k = k
        self.max_iterations = max_iterations
        self.random_state = random_state
        self.df_columns = self.df.columns
        self.n_iterations = 0
        self.centroids, self.centroid_coordinates = self.get_centroids(self.df, self.k)

    def get_centroids(self, df, k):
        """Pick ``k`` random rows of ``df`` as the initial centroids.

        Returns:
            A tuple ``(centroids, centroid_coordinates)``: the sampled rows
            as a DataFrame and the same values as a list of lists.

        Raises:
            TypeError: If any column of ``df`` is not numeric.
            ValueError: If ``k`` is smaller than 1.
        """
        # Check the dtype itself rather than comparing against "object":
        # categorical, datetime and dedicated string dtypes are not numeric
        # either and would only fail later inside the distance arithmetic.
        for col, dtype in df.dtypes.items():
            if not pd.api.types.is_numeric_dtype(dtype):
                raise TypeError(
                    f"All columns must have numeric datatypes. {col} has dtype {dtype}."
                )
        if k < 1:
            raise ValueError("k must be at least 1.")
        centroids = df.sample(k, random_state=self.random_state)
        centroid_coordinates = centroids.values.tolist()
        return centroids, centroid_coordinates

    def calculate_distances(self):
        """Append one Euclidean-distance column per centroid to ``self.df``.

        Column ``distances_<i+1>`` holds each row's distance to
        ``centroid_coordinates[i]``.

        Returns:
            The list of distance column names that were added.
        """
        features = self.df[self.df_columns]
        distances = {}
        for i, coordinates in enumerate(self.centroid_coordinates):
            squared_diff = (features - coordinates) ** 2
            distances[f"distances_{i+1}"] = np.sqrt(squared_diff.sum(axis=1))
        names = list(distances)
        self.df = pd.concat([self.df, pd.DataFrame(distances)], axis=1)
        return names

    def calculate_clusters(self):
        """Label every row with the number of its nearest centroid.

        Writes the labels (strings "1" .. "k") to ``self.df["clusters"]``.

        Returns:
            The distance column names, so the caller can drop them again.
        """
        distance_cols = self.calculate_distances()
        nearest = self.df[distance_cols].idxmin(axis=1)
        # "distances_3" -> "3"
        self.df["clusters"] = nearest.str.split("_").str[-1]
        return distance_cols

    def adjust_centroids(self):
        """Recompute each centroid as the mean of the rows assigned to it.

        Centroids are rounded to 2 decimals; the exact comparison in
        ``predict`` relies on that rounding to detect convergence.

        Returns:
            A tuple ``(new_centroids, new_coords)``: a DataFrame indexed by
            cluster label "1" .. "k" and the same values as a list of lists.
        """
        distance_cols = self.calculate_clusters()
        self.df = self.df.drop(distance_cols, axis=1)

        # Labels in numeric order. groupby sorts the string labels
        # lexicographically ("1", "10", "2", ...), which for k >= 10 would
        # shuffle the centroid order on every iteration and make the
        # convergence check in predict() fail forever.
        labels = pd.Index(
            [str(i + 1) for i in range(len(self.centroid_coordinates))],
            name="clusters",
        )
        previous = pd.DataFrame(
            self.centroid_coordinates, index=labels, columns=self.df_columns
        )
        cluster_means = self.df.groupby("clusters").mean()
        # A cluster that received no rows has no mean; keep its previous
        # position instead of silently reducing the number of clusters.
        new_centroids = cluster_means.reindex(labels).fillna(previous).round(2)
        new_coords = new_centroids.values.tolist()
        return new_centroids, new_coords

    def predict(self):
        """Update the centroids until they stop changing.

        Stops early once an update leaves the centroid coordinates
        unchanged, otherwise after ``max_iterations`` updates. The number
        of updates performed is stored in ``n_iterations``.

        Returns:
            ``self.df`` with the "clusters" label column (the column is
            absent if ``max_iterations`` is 0 and no update ran).
        """
        iterations_run = 0  # stays 0 when max_iterations <= 0
        for iterations_run in range(1, self.max_iterations + 1):
            previous_coordinates = self.centroid_coordinates
            self.centroids, self.centroid_coordinates = self.adjust_centroids()
            if previous_coordinates == self.centroid_coordinates:
                break
        self.n_iterations = iterations_run
        return self.df

## Test the Algorithm with Data

In [177]:
# Load the mall-customers data, give the two awkwardly named columns
# identifier-friendly names, and keep only the three clustering features.
variables = ["Age", "Annual_Income", "Spending_Score"]
df = (
    pd.read_csv("./mall_customers.csv")
    .rename(
        columns={
            "Annual Income (k$)": "Annual_Income",
            "Spending Score (1-100)": "Spending_Score",
        }
    )[variables]
)
df.head(2)

Unnamed: 0,Age,Annual_Income,Spending_Score
0,19,15,39
1,21,15,81


In [184]:
# Instantiate the algorithm with k=2 clusters; the initial centroids are
# two rows sampled at random from df, shown below.
kmeans = NaiveKmeans(df, 2)
kmeans.centroids

Unnamed: 0,Age,Annual_Income,Spending_Score
88,34,58,60
154,47,78,16


In [185]:
# Predict clusters: run the centroid-update loop and get back the data
# with a "clusters" label column added.
clusters = kmeans.predict()
clusters.head()

Unnamed: 0,Age,Annual_Income,Spending_Score,clusters
0,19,15,39,1
1,21,15,81,1
2,20,16,6,2
3,23,16,77,1
4,31,17,40,1


In [186]:
# Final centroids: per-cluster feature means (rounded to 2 decimals)
# from the last update performed by predict().
final_centroids = kmeans.centroids
final_centroids

Unnamed: 0_level_0,Age,Annual_Income,Spending_Score
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,37.6,56.93,62.06
2,42.32,70.64,17.3
