In [1]:
import numpy as np 
import random 
import pandas as pd
import math

from tqdm import tqdm_notebook as tqdm

# Load the MNIST training CSV (no header row); every column is parsed as float.
digits = pd.read_csv('mnist_train.csv', header=None, dtype=float)

# Pick 10 distinct rows as initial cluster centers, dropping column 0.
# Sampling without replacement guarantees that no row is chosen twice, so two
# centers can never start out as the very same sample.
random_centers = digits.iloc[np.random.choice(len(digits), size=10, replace=False), 1:]
# random_centers = digits.iloc[[88, 205, 390, 254, 222, 437, 431, 436, 352, 285], 1:]

# Work on the first 10,000 rows only, again without column 0.
X = digits.iloc[:10000, 1:]

In [42]:
# Quick shape check of the sampled initial centers (rows, columns).
random_centers.shape

(10, 784)

In [44]:
# Shape of a single center: the first row of the underlying array.
random_centers.values[0].shape

(784,)

In [45]:
# Number of clusters used by the scratch cells that follow.
n_clusters = 10

In [52]:
# Keep the first 784 columns of X as a plain NumPy array.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is harmless (the previous `X.iloc[...]` form failed once X was already an array).
X = np.asarray(X)[:, :784]

In [104]:
# Seed the centers with n_clusters distinct rows of X. Sampling without
# replacement avoids two identical starting centers, which would leave one
# cluster empty on the first assignment.
centers = X[np.random.choice(len(X), size=n_clusters, replace=False), :]

In [74]:
# Update step: each new center is the column-wise mean of the rows of X whose
# entry in cluster_numbers equals k.
# NOTE(review): `cluster_numbers` is not assigned in any visible cell, so this
# line relies on kernel state from elsewhere — confirm before re-running.
# NOTE(review): if X is an ndarray, a cluster with no members gives an empty
# slice and np.mean returns NaN for that center.
centers = [np.mean(X[cluster_numbers==k], axis=0) for k in range(n_clusters)]

In [105]:
# Assign every row of X to its nearest center (result holds indices into `centers`).
# NOTE(review): `_find_closest_centers` is defined in a later cell of this file;
# that cell has to be executed before this one.
c = _find_closest_centers(X, centers)

(10000,)

In [103]:
def _find_closest_centers(X, centers):
    distances = [np.mean((X - center)**2, axis=1) for center in centers]
    distances = np.array(distances)
    closest_centers = np.argmin(distances, axis=0)
    return closest_centers

In [108]:
# Scratch check: np.square squares each element.
np.square([1,2,3])

array([1, 4, 9])

In [130]:
class Kmeans:
    """Minimal k-means clustering (alternating assignment and mean update).

    Parameters
    ----------
    n_clusters : int, default 10
        Number of clusters, and therefore centers, to fit.
    n_iter : int, default 100
        Maximum number of assignment/update rounds run by ``fit``.

    Attributes
    ----------
    centers : numpy.ndarray of shape (n_clusters, n_features), or None
        Centers learned by ``fit``; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_clusters=10, n_iter=100):
        self.n_clusters = n_clusters
        self.centers = None
        self.n_iter = n_iter

    def _find_closest_centers(self, X, centers):
        """Return, for every row of ``X``, the index of its nearest center.

        Nearness is the mean squared difference between a row and a center.
        Ties resolve to the lowest center index.
        """
        # One row of distances per center -> shape (n_centers, n_samples).
        distances = np.array(
            [np.mean(np.square(X - center), axis=1) for center in centers]
        )
        return np.argmin(distances, axis=0)

    def fit(self, X):
        """Learn ``n_clusters`` centers from ``X`` and return the row assignments.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; converted to a float ndarray.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Index of the nearest fitted center for every row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, or ``n_clusters`` is not between 1 and the
            number of rows of ``X``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-D (n_samples, n_features)")
        n_samples = len(X)
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                "n_clusters must be between 1 and the number of samples "
                "(got n_clusters=%r, n_samples=%r)" % (self.n_clusters, n_samples)
            )

        # Distinct seed rows: sampling with replacement could pick the same
        # row twice, which leaves one of the twin clusters empty.
        seed_rows = np.random.choice(n_samples, size=self.n_clusters, replace=False)
        centers = X[seed_rows]  # fancy indexing already returns a fresh copy

        cluster_numbers = None
        for _ in tqdm(range(self.n_iter)):
            new_numbers = self._find_closest_centers(X, centers)
            # Unchanged assignments mean unchanged centers: further rounds
            # would only repeat the same work.
            if cluster_numbers is not None and np.array_equal(new_numbers, cluster_numbers):
                break
            cluster_numbers = new_numbers

            for k in range(self.n_clusters):
                members = X[cluster_numbers == k]
                # Keep the previous center for an empty cluster. The mean of
                # an empty slice is NaN, and a NaN distance wins every argmin,
                # which would pull all samples into that one cluster.
                if len(members):
                    centers[k] = members.mean(axis=0)

        self.centers = centers
        return self.predict(X)

    def predict(self, X):
        """Return the index of the nearest fitted center for every row of ``X``.

        Raises ``AssertionError`` when called before ``fit``.
        """
        assert self.centers is not None, "You should call 'fit' first"
        return self._find_closest_centers(X, self.centers)

In [136]:
# Model with the default number of clusters and n_iter=1000.
km = Kmeans(n_iter=1000)

In [137]:
# Fit on X; the displayed value is the cluster index assigned to each row of X.
km.fit(X)




array([6, 5, 4, ..., 3, 4, 8])

In [139]:
# Nearest fitted center for every row of digits, using all columns but the first.
p = km.predict(digits.iloc[:, 1:])

In [207]:
# Among the first 100 rows, count those whose column-0 value equals the value
# that d maps their predicted cluster to.
# NOTE(review): `d` is built in a later cell of this file; that cell must run first.
np.sum(digits.iloc[:100, 0] == [d[k] for k in p[:100]])

67

In [155]:
# Frame holding column 0 of `digits` under the name 'y'.
results = pd.DataFrame({'y': digits.iloc[:, 0]})

In [156]:
# Add the predicted cluster index of each row as column 'p'.
results['p'] = p

In [175]:
# Most frequent 'y' value within each predicted cluster 'p'.
# NOTE(review): Series.mode() returns every tied value, so a cluster with a tie
# contributes more than one row to g — confirm that consumers of g allow for it.
g = results.groupby(by='p').y.apply(lambda x: x.mode())

In [200]:
# Map each cluster id to the most frequent 'y' value among its members.
# Keyed by the actual cluster id and reduced to the first mode, so a tie inside
# one cluster cannot shift the entries of the clusters after it, and the number
# of clusters is not assumed to be 10.
d = {
    int(cluster): labels.mode().iloc[0]
    for cluster, labels in results.groupby(by='p').y
}

In [201]:
# Display the cluster -> most-frequent-'y' mapping.
d

{0: 0.0,
 1: 3.0,
 2: 6.0,
 3: 1.0,
 4: 4.0,
 5: 0.0,
 6: 8.0,
 7: 1.0,
 8: 7.0,
 9: 2.0}

In [163]:
# Column 'y' of results as a Series.
s = results['y']

In [172]:
# Most frequent value(s) of s.
s.mode()

0    1.0
dtype: float64

In [151]:
# NOTE(review): leftover scratch cell — the recorded output is a KeyError.
# Candidate for removal.
c.groupby

KeyError: 0

In [147]:
# NOTE(review): leftover scratch cell — groupby() is called without a key and
# the recorded output is the resulting TypeError. Candidate for removal.
c.groupby()

TypeError: You have to supply one of 'by' and 'level'

In [135]:
# Peek at the first ten values of column 0.
digits.iloc[:10, 0]

0    5.0
1    0.0
2    4.0
3    1.0
4    9.0
5    2.0
6    1.0
7    3.0
8    1.0
9    4.0
Name: 0, dtype: float64

In [None]:
%%time
centers = random_centers.values

for i in tqdm(range(500)):
    distances = [((X.iloc[:, :784] - centers[k, :])**2).mean(axis=1) for k in range(len(centers))]
    distances = np.array(distances)
    X['N'] = [np.argmin(distances[:, i]) for i in range(len(X))]
    centers = X.groupby(by='N').mean().values    

In [31]:
# confusion_matrix[i][j] counts the samples of X whose column-0 value in
# `digits` is i and whose assigned cluster 'N' is j.
# The labels are first restricted to the rows of X: a mask built from all of
# `digits` is longer than X, and pandas has to reindex it (with a warning).
true_digits = digits.loc[X.index, 0]
confusion_matrix = [
    # One row per digit 0..9, one column per cluster.
    [int(((true_digits == i) & (X['N'] == j)).sum()) for j in range(len(centers))]
    for i in range(10)
]
confusion_matrix

  This is separate from the ipykernel package so we can avoid doing imports until


[[2, 0, 66, 1, 784, 1, 44, 50, 39, 14],
 [12, 2, 3, 1051, 0, 2, 54, 2, 1, 0],
 [12, 2, 23, 147, 8, 652, 52, 55, 26, 14],
 [46, 7, 268, 54, 6, 22, 33, 582, 9, 5],
 [248, 193, 0, 14, 0, 0, 123, 1, 14, 387],
 [71, 6, 174, 12, 9, 1, 326, 225, 19, 20],
 [1, 0, 4, 47, 14, 4, 65, 5, 776, 98],
 [358, 514, 0, 50, 1, 3, 35, 2, 0, 107],
 [38, 18, 451, 73, 4, 7, 135, 183, 12, 23],
 [366, 311, 2, 10, 6, 0, 46, 13, 2, 222]]