In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chart_studio.plotly as py
from numpy.linalg import norm
from csv import reader
from math import sqrt
from sklearn.preprocessing import StandardScaler
import random

In [107]:
# Load the raw iris table with pandas purely for a quick visual check; the
# k-means pipeline further down re-reads the same file through getdata().
irises = pd.read_csv("iris.csv")
irises.head(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [108]:
# NOTE(review): this import sits in the middle of the notebook; consider
# moving it next to the other imports in the first cell.
import plotly.express as px
# This is Plotly's bundled iris sample, not the iris.csv read above
# (its columns are named sepal_length, ... rather than sepal.length, ...).
df = px.data.iris()
# 3-D scatter of three of the measurements, coloured by species.
fig = px.scatter_3d(df, x='sepal_length', y='sepal_width', z='petal_width',
              color='species')
fig.show()

In [119]:
def getdata(filename):
    """Read the first four numeric fields of a CSV file into a 150x4 array.

    The first line of the file is treated as a header and skipped. Blank
    lines are ignored and do not consume a slot. At most 150 data rows are
    stored; any further rows are left unread. Slots that are never filled
    stay at zero.

    Parameters:
        filename: path of the CSV file to read.

    Returns:
        A float ``numpy`` array of shape ``(150, 4)``.

    Raises:
        ValueError: if one of the first four fields is not a number.
        IndexError: if a non-blank data row has fewer than four fields.
    """
    table = np.zeros((150, 4))

    with open(filename, "r") as handle:
        records = reader(handle)
        next(records, None)  # drop the header line (if there is one)
        filled = 0
        for record in records:
            if filled == 150:
                # The table is full; ignore whatever else is in the file.
                break
            if not record:
                continue
            table[filled, :] = [float(record[col]) for col in range(4)]
            filled += 1
    return table

def normalize(dataset):
    """Min-max scale every value of ``dataset`` into [0, 1], in place.

    A single global minimum and maximum (taken over the whole array, not
    per column) are used, so relative differences between columns are kept.
    The array may have any shape; it is overwritten and also returned so
    the call can be chained.

    Parameters:
        dataset: non-empty numeric ``numpy`` array.

    Returns:
        The same array object, now holding the scaled values. If every
        value is identical the result is all zeros (instead of 0/0 = NaN).

    Raises:
        ValueError: if ``dataset`` is empty (raised by ``ndarray.min``).
    """
    lowest = dataset.min()
    span = dataset.max() - lowest
    if span == 0:
        # Constant data carries no spread to rescale; avoid dividing by zero.
        dataset[...] = 0
        return dataset
    # Whole-array assignment keeps the in-place contract callers rely on.
    dataset[...] = (dataset - lowest) / span
    return dataset
    

def mean(dataset):
    """Return the arithmetic mean of every column of a 2-D array.

    Parameters:
        dataset: 2-D ``numpy`` array with at least one row.

    Returns:
        1-D float array holding one mean per column.
    """
    n_cols = len(dataset[0])
    # After transposing, each "row" is one original column.
    by_column = dataset.transpose()
    return np.array(
        [np.sum(by_column[col]) / float(len(by_column[col])) for col in range(n_cols)]
    )
        
# calculate column standard deviations
def stdev(dataset, means):
    """Return the sample standard deviation (divisor n - 1) of every column.

    Parameters:
        dataset: 2-D array-like of numbers with at least one row.
        means: per-column means, indexable by column position.

    Returns:
        1-D float ``numpy`` array with one standard deviation per column.
    """
    n_cols = len(dataset[0])
    squared_totals = np.zeros(n_cols)
    for col in range(n_cols):
        # Sum of squared deviations from the column mean, row by row.
        squared_totals[col] = sum(
            pow(record[col] - means[col], 2) for record in dataset
        )
    dof = float(len(dataset) - 1)
    return np.array([sqrt(total / dof) for total in squared_totals])

def standardize_dataset(dataset, means, stdevs):
    """Convert every value to a z-score, overwriting ``dataset`` in place.

    Each entry becomes ``(value - means[col]) / stdevs[col]`` for its
    column. Nothing is returned.

    Parameters:
        dataset: mutable rows of numbers (2-D array or list of lists).
        means: per-column means.
        stdevs: per-column standard deviations.
    """
    for record in dataset:
        for col, value in enumerate(record):
            record[col] = (value - means[col]) / stdevs[col]

def normalize_and_standartize(filename, dsformat="pandas"):
    """Load a CSV file, min-max scale it, then standardize each column.

    The pipeline is getdata() -> normalize() -> z-score per column using
    the column means and sample standard deviations.

    Parameters:
        filename: path of the CSV file handed to getdata().
        dsformat: ``"pandas"`` (default) to get a DataFrame, ``"numpy"``
            to get the underlying array.

    Returns:
        The standardized data in the requested container.

    Raises:
        ValueError: if ``dsformat`` is neither ``"numpy"`` nor ``"pandas"``.
    """
    if dsformat not in ("numpy", "pandas"):
        # Fail before doing any work instead of silently returning None.
        raise ValueError(
            "dsformat must be 'numpy' or 'pandas', got %r" % (dsformat,)
        )

    dataset = normalize(getdata(filename))

    means = mean(dataset)
    # The result has to be bound to the same name that is read below; the
    # earlier spelling mismatch only worked when a stale global existed.
    stdevs = stdev(dataset, means)

    # Reuse the shared helper; it rewrites `dataset` in place.
    standardize_dataset(dataset, means, stdevs)

    if dsformat == "numpy":
        return dataset
    return pd.DataFrame(data=dataset)
        

# Default dsformat="pandas": the result is a DataFrame, so the slice below
# shows the first ten standardized rows.
dataset = normalize_and_standartize("iris.csv")
print(dataset[:10], "\n...")

# numpy_data = getdata("iris.csv")
# df = pd.DataFrame(data=numpy_data)
# X_std = StandardScaler().fit_transform(df)

# print("This is what sklearn does for standardization - \
#       \n StandardScaler divides by the population std (ddof=0) while stdev() above divides by n-1, so my values are slightly smaller; it does not matter for K-means")
# print(X_std[:10], "\n...")

          0         1         2         3
0 -0.897674  1.015602 -1.335752 -1.311052
1 -1.139200 -0.131539 -1.335752 -1.311052
2 -1.380727  0.327318 -1.392399 -1.311052
3 -1.501490  0.097889 -1.279104 -1.311052
4 -1.018437  1.245030 -1.335752 -1.311052
5 -0.535384  1.933315 -1.165809 -1.048667
6 -1.501490  0.786174 -1.335752 -1.179859
7 -1.018437  0.786174 -1.279104 -1.311052
8 -1.743017 -0.360967 -1.335752 -1.311052
9 -1.139200  0.097889 -1.279104 -1.442245 
...


In [162]:
# Maps a cluster index (as a string) to an iris variety name.
# NOTE(review): the pairing presumably relies on cluster k being seeded from
# the k-th block of rows in iris.csv and on the file being ordered by
# variety - confirm against the data.
CLASSES = {
    str(index): variety
    for index, variety in enumerate(("Setosa", "Versicolor", "Virginica"))
}

In [122]:
def init_centers(dataset, n_clusters=3):
    """Pick one random starting center from each consecutive block of rows.

    The rows are split into ``n_clusters`` equally sized consecutive
    blocks (for 150 rows and 3 clusters: rows 0-49, 50-99 and 100-149) and
    one row is drawn uniformly from each block. Rows left over when the
    row count is not divisible by ``n_clusters`` are never drawn.

    Parameters:
        dataset: 2-D ``numpy`` array, one sample per row.
        n_clusters: number of centers to draw (default 3).

    Returns:
        Array of shape ``(n_clusters, n_features)`` with the chosen rows.

    Raises:
        ValueError: if ``n_clusters`` is below 1 or exceeds the row count.
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")
    block = len(dataset) // n_clusters
    if block == 0:
        raise ValueError("dataset has fewer rows than n_clusters")

    # randrange() excludes the upper bound. randint() includes it, which
    # could select the first row of the next block or, for the last block,
    # an index one past the end of the data.
    picks = [
        random.randrange(k * block, (k + 1) * block) for k in range(n_clusters)
    ]
    return np.array([dataset[row] for row in picks])

# Re-run the preprocessing as a NumPy array; init_centers() indexes rows by
# integer position.
dataset = normalize_and_standartize("iris.csv", dsformat="numpy")
# No random seed is set in the visible cells, so the printed centers will
# presumably differ from run to run.
centers = init_centers(dataset)
print(centers)

[[-1.50149039  0.09788935 -1.27910398 -1.31105215]
 [-0.41462067 -1.27867961  0.13708732  0.13206729]
 [ 1.75911877 -0.36096697  1.43998331  0.78803068]]


In [125]:
def compute_distance(dataset, centers):
    """
    Compute the squared Euclidean distance from each point to every center.

    The number of clusters is taken from ``centers`` rather than being
    fixed at three. A single 1-D sample is accepted and treated as a
    one-row dataset.

    Parameters:
        dataset: array-like of shape ``(n_samples, n_features)``, or one
            sample of shape ``(n_features,)``.
        centers: array-like of shape ``(n_clusters, n_features)``.

    Returns:
        Float array of shape ``(n_samples, n_clusters)``; entry ``[i, k]``
        is the squared distance from sample ``i`` to center ``k``.
    """
    dataset = np.atleast_2d(np.asarray(dataset))
    centers = np.atleast_2d(np.asarray(centers))
    n_clusters = centers.shape[0]

    distance = np.zeros((dataset.shape[0], n_clusters))
    for k in range(n_clusters):
        row_norm = norm(dataset - centers[k, :], axis=1)
        distance[:, k] = np.square(row_norm)
    return distance

# Squared distance of every sample to each of the initial centers; only the
# first ten rows are printed.
distance = compute_distance(dataset, centers)
print(distance[:10], "\n...")

[[ 1.20999983  9.74891679 21.06434249]
 [ 0.18710021  6.09279623 20.56374497]
 [ 0.08005688  7.93451137 22.76090743]
 [ 0.          7.16441942 22.64170564]
 [ 1.55248138 10.98555367 22.40489702]
 [ 4.38382948 13.42316179 20.69208042]
 [ 0.494156    9.33531195 23.52480024]
 [ 0.70707593  8.71640565 20.8303335 ]
 [ 0.27209318  6.85868159 24.37580811]
 [ 0.14846549  6.90401446 20.97836815]] 
...


In [148]:
def find_closest(distance):
    """Return, for every row, the column index holding the smallest value.

    Parameters:
        distance: 2-D array whose entry ``[i, k]`` is the distance from
            sample ``i`` to cluster ``k``.

    Returns:
        1-D integer array of cluster indices, one per row. Ties resolve to
        the lowest index.
    """
    nearest = np.argmin(distance, axis=1)
    return nearest

# Index of the nearest center for every sample.
closest = find_closest(distance)
print(closest)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 1 2 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]


In [146]:
def compute_centers(dataset, closest, n_clusters=3):
    """Recompute each center as the mean of the samples assigned to it.

    Parameters:
        dataset: array of shape ``(n_samples, n_features)``.
        closest: 1-D array of cluster indices, one per sample.
        n_clusters: number of clusters to produce (default 3, the value
            that used to be hard-coded).

    Returns:
        Array of shape ``(n_clusters, n_features)``. A cluster without any
        assigned sample yields a row of NaN (NumPy warns about the mean of
        an empty slice); callers that iterate should handle that case.
    """
    cluster_means = []
    for label in range(n_clusters):
        members = dataset[closest == label, :]
        cluster_means.append(np.mean(members, axis=0))
    return np.array(cluster_means)

# One update step: move every center to the mean of its assigned samples.
centers = compute_centers(dataset, closest)
print(centers)

[[-0.9394007   0.82712241 -1.21663873 -1.16929645]
 [-0.07022162 -0.82832063  0.35318762  0.30456137]
 [ 1.18155515  0.04801366  0.9991168   1.00193178]]


In [165]:
# Fresh standardized copy of the data as a NumPy array for the full fit below.
dataset = normalize_and_standartize("iris.csv", dsformat="numpy")
def fit(dataset, max_iter=50):
    """Run k-means on ``dataset`` and return the final cluster centers.

    Starting from init_centers(), the assign/update steps are repeated
    until the centers stop changing or ``max_iter`` iterations have run.
    The last assignment vector is printed before returning.

    Parameters:
        dataset: array of shape ``(n_samples, n_features)``.
        max_iter: upper bound on the number of iterations (default 50).

    Returns:
        Array with one row per cluster center. The centers are returned
        even when the iteration limit is reached without convergence
        (previously that path fell through and returned None).
    """
    centers = init_centers(dataset)
    closest = None
    for _ in range(max_iter):
        distance = compute_distance(dataset, centers)
        closest = find_closest(distance)
        updated = compute_centers(dataset, closest)

        # A cluster that lost all its samples comes back as NaN. Keep its
        # previous position so it can still attract points and so the
        # equality test below is not defeated by NaN != NaN.
        emptied = np.isnan(updated).any(axis=1)
        updated[emptied] = centers[emptied]

        converged = np.array_equal(updated, centers)
        centers = updated
        if converged:
            break

    if closest is not None:
        print(closest)
    return centers
        
# Train on the whole dataset; fit() also prints the final cluster assignment.
centers = fit(dataset)

def predict(new, centers):
    """Assign each sample in ``new`` to its nearest center.

    Parameters:
        new: 2-D array of samples, one per row.
        centers: array of cluster centers, one per row.

    Returns:
        1-D array with the index of the closest center for every sample.
    """
    return find_closest(compute_distance(new, centers))

# Build a (1, 4) array holding one hand-written sample; the values are
# presumably already on the standardized scale of `dataset` - TODO confirm.
new_vector = np.zeros((1,4))
new = [-1.52, 0.1, -1.29, -1.32]
for i in range(4):
    new_vector[0][i] = new[i]
    
print(new_vector)

# predict() returns an array of cluster indices; take the single entry and
# look up its string form in CLASSES.
print(CLASSES[str(predict(new_vector, centers)[0])])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]
[[-1.52  0.1  -1.29 -1.32]]
Setosa
