In [63]:
import pandas as pd
import numpy as np

## KMeans from scratch and testing on [Wine](https://archive.ics.uci.edu/ml/datasets/Wine) dataset

In [64]:
# Column names applied to the 14 comma-separated fields of wine.data.
# NOTE(review): these names look shifted by one position. The `df.head()`
# output shows the "Alcohol" column holding small integers (1, 1, 1, ...) and
# "Unknown" holding values such as 1065, and later cells treat "Alcohol" as the
# label column. So "Alcohol" appears to hold the class label and each following
# name appears to describe the *next* field. Confirm against the UCI
# description before relying on individual feature names.
columns = [
    "Alcohol",
    "Malic acid",
    "Ash",
    "Alcalinity of ash",
    "Magnesium",
    "Total phenols",
    "Flavanoids",
    "Nonflavanoid phenols",
    "Proanthocyanins",
    "Color intensity",
    "Hue",
    "OD280/OD315 of diluted wines",
    "Proline",
    "Unknown"
]

# Read the data directly from the UCI repository (requires network access).
# Passing `names=` supplies the header, so the first line of the file is
# parsed as data rather than as column names.
dataset_link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df = pd.read_csv(dataset_link, names=columns)

In [65]:
# Preview the first five rows to sanity-check how the fields were parsed.
df.head()

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,Unknown
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [66]:
def kmeans(data, k, iterations=100, random_state=None):
    """Cluster the rows of `data` with Lloyd's k-means algorithm.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table, one row per observation. It is NOT modified;
        a copy is returned.
    k : int
        Number of clusters (must be >= 1).
    iterations : int, default 100
        Maximum number of assignment/update rounds (must be >= 1).
    random_state : int or None, default None
        Seed for the centroid initialisation. ``None`` draws from NumPy's
        global random state, so ``np.random.seed(...)`` still controls the run.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` with an extra integer column ``cluster`` holding the
        1-based cluster index of every row.

    Raises
    ------
    ValueError
        If `k` or `iterations` is smaller than 1, or `data` has no rows.

    Notes
    -----
    Prints ``Converged at iteration <i>`` when two consecutive rounds yield the
    same assignment, otherwise ``Did not converge. Try running again.``.
    A centroid that attracts no points keeps its previous position, so fewer
    than `k` distinct labels may appear in the result.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')
    if iterations < 1:
        raise ValueError('iterations must be a positive integer')

    # Work on a plain float array so the caller's frame is never touched.
    points = data.values.astype(float)
    n, f = points.shape
    if n == 0:
        raise ValueError('data must contain at least one row')

    # Initial centroids: k points drawn uniformly inside the per-feature
    # bounding box of the data, i.e. x = random * (x_max - x_min) + x_min
    # with random in [0, 1).
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    feature_min_val = points.min(axis=0)
    feature_max_val = points.max(axis=0)
    centroids = rng.rand(k, f) * (feature_max_val - feature_min_val) + feature_min_val

    cluster_mapping = None
    converged = False

    for itr in range(iterations):
        prev_cluster_mapping = cluster_mapping

        # Squared Euclidean distance of every point to every centroid.
        # Broadcasting gives (k, n, f); summing over features and transposing
        # yields an n x k matrix.
        dist = np.sum((points - centroids.reshape(k, 1, f)) ** 2, axis=2).T

        # Each point joins the cluster of its nearest centroid.
        cluster_mapping = np.argmin(dist, axis=1)

        # Move each centroid to the mean of its members. An empty cluster
        # keeps its old centroid so that `centroids` always stays (k, f).
        for c in range(k):
            members = points[cluster_mapping == c]
            if len(members) > 0:
                centroids[c] = members.mean(axis=0)

        # Stop as soon as the assignment repeats the previous round's.
        if prev_cluster_mapping is not None and np.array_equal(prev_cluster_mapping, cluster_mapping):
            print('Converged at iteration {}'.format(itr))
            converged = True
            break

    if not converged:
        print('Did not converge. Try running again.')

    result = data.copy()
    result['cluster'] = cluster_mapping + 1  # Indexing with 1
    return result

In [67]:
%%time
# Cluster into 3 groups using every column except "Alcohol", which this
# notebook treats as the label and therefore keeps out of the features.
kmdf = kmeans(df.drop(['Alcohol'], axis=1), 3)

Converged at iteration 14
CPU times: user 109 ms, sys: 3.58 ms, total: 113 ms
Wall time: 110 ms


In [68]:
# Original label column from dataset: re-attach "Alcohol" (dropped before
# clustering) so each row carries both its label and its assigned cluster.
kmdf['Alcohol'] = df['Alcohol']

In [69]:
# Confusion matrix (sort of, exact Alcohol number and cluster number will not necessarily match)
# Rows are the original labels, columns are the k-means clusters, and each cell
# is the number of rows with that (label, cluster) pair. "Unknown" is only used
# as a column to count; pairs that never occur are filled with 0.
kmdf.groupby(['Alcohol', 'cluster']).count()['Unknown'].unstack().fillna(0).astype(int)

cluster,1,2,3
Alcohol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,27,31,1
2,0,7,64
3,0,11,37


**Summary**
1. KMeans is *generally* able to place wines labelled '2' (the label stored in the `Alcohol` column) almost entirely in a single cluster
2. The other two wine types are not separated that well

----------------

## Kmeans on [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset

In [70]:
# Load the Iris data directly from the UCI repository (requires network access).
iris_data_link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# NOTE: this rebinds `columns`, which earlier held the wine column names.
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']

# `names=` supplies the header, so the first line of the file is read as data.
irdf = pd.read_csv(iris_data_link, names=columns)

In [71]:
# Preview the first five rows to sanity-check how the fields were parsed.
irdf.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [72]:
%%time
# Cluster into 3 groups on the four measurement columns; the "class" label is
# dropped so it cannot influence the clustering.
km_irdf = kmeans(irdf.drop(['class'], axis=1), 3)

Converged at iteration 4
CPU times: user 47.9 ms, sys: 3.98 ms, total: 51.9 ms
Wall time: 46.2 ms


In [73]:
# Re-attach the "class" label (dropped before clustering) so each row carries
# both its species and its assigned cluster.
km_irdf['class'] = irdf['class']

# Confusion matrix
# Rows are the species, columns are the k-means clusters, and each cell is the
# number of rows with that (class, cluster) pair. "sepal_length" is only used
# as a column to count; pairs that never occur are filled with 0.
km_irdf.groupby(['class', 'cluster']).count()['sepal_length'].unstack().fillna(0).astype(int)

cluster,1,2,3
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,0,50,0
Iris-versicolor,3,0,47
Iris-virginica,36,0,14


**Summary**
1. KMeans is able to cluster Iris-setosa perfectly
2. KMeans does an almost perfect job of separating Iris-versicolor
3. Iris-virginica is not separated as well as the other two classes
