## Mount drive


In [182]:
dataset_size = 150
# dataset_size = 15000000

try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    print('Note: using Google CoLab')
    COLAB = True
except:
    print('Note: not using Google Colab')
    COLAB = False

if COLAB:
    root_path = "/content/drive/My Drive/UPT/Master/Big Data/Dataset1"
else:
    root_path = "./data/"

Mounted at /content/drive
Note: using Google CoLab


## Process data



In [183]:
import numpy as np
import pandas as pd
import os

np.random.seed(50)

# read dataset
dirname = os.path.join(root_path, f'Iris-{str(dataset_size)}.txt')
df = pd.read_csv(dirname, header=None)
print(df)

# init cluster number
K = 3

       0    1    2    3               4
0    5.1  3.5  1.4  0.2     Iris-setosa
1    4.9  3.0  1.4  0.2     Iris-setosa
2    4.7  3.2  1.3  0.2     Iris-setosa
3    4.6  3.1  1.5  0.2     Iris-setosa
4    5.0  3.6  1.4  0.2     Iris-setosa
..   ...  ...  ...  ...             ...
145  6.7  3.0  5.2  2.3  Iris-virginica
146  6.3  2.5  5.0  1.9  Iris-virginica
147  6.5  3.0  5.2  2.0  Iris-virginica
148  6.2  3.4  5.4  2.3  Iris-virginica
149  5.9  3.0  5.1  1.8  Iris-virginica

[150 rows x 5 columns]


## K-Means

In [184]:
def euclidean_distance(a, b):
  # print('euclidean_distance')
  # print(a)
  # print(b)
  return np.sqrt(np.sum( [((a[i] - b[i])**2) for i in range(4)]  ))
euclidean_distance([1,1,1,0],[0,0,0,1])

2.0

In [185]:
def get_random_centroids():
  return [ np.random.uniform(0,8,[4]).tolist() for i in range(K) ]
  # return {
  #     i+1: np.random.uniform(0,8,[4]).tolist() for i in range(K) }
      # 3 points between 0 and 8
      # i+1: np.random.rand(3,1)*8
# print(get_random_centroids())

In [186]:
def closest_centroid(row, centroids):
  # distance of the current row to each centroid
  distances = [euclidean_distance(row, point) for point in centroids]
  closest_index = np.argmin(distances)
  return closest_index

def create_clusters(centroids):
  # Assign the samples to the closest centroids to create clusters
  clusters = [[] for _ in range(K)]
  for index, row in df.iterrows():
  # for idx, sample in enumerate(df):
    sample = np.delete(row.to_numpy(),-1) # convert to np array & trim last column
    centroid_idx = closest_centroid(sample, centroids)
    # print(centroid_idx)
    clusters[centroid_idx-1].append(index)

  return clusters

In [187]:
def get_centroids(clusters):
  # assign mean value of clusters to centroids
  centroids = np.zeros((K, 4))
  df_np=np.delete(df.to_numpy(),-1, 1)
  for cluster_idx, cluster in enumerate(clusters):
      # print(cluster)
      # print(df_np[cluster])
      if not df_np[cluster].any():
        centroids[cluster_idx] = np.random.uniform(0,8,[4,])
      else:
        cluster_mean = np.mean(df_np[cluster], axis=0)
        print(cluster_mean)
        centroids[cluster_idx] = cluster_mean
  return centroids

def is_converged(centroids_old, centroids):
  # distances between each old and new centroids, fol all centroids
  distances = [euclidean_distance(centroids_old[i], centroids[i]) for i in range(K)]
  print('Distances')
  print(distances)
  return sum(distances) < 1

In [190]:
max_iters = 10000
centroids = get_random_centroids()
print(centroids)
i=0
# Optimize clusters
for _ in range(max_iters):
    # Assign samples to closest centroids (create clusters)
    clusters = create_clusters(centroids)
    print(clusters)
    # Calculate new centroids from the clusters
    centroids_old = centroids
    centroids = get_centroids(clusters)
    print(centroids_old)
    print(centroids)
    # check if clusters have changed
    i=i+1
    if i==5000:
      break
    if is_converged(centroids_old, centroids):
        break

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[6.853846153846153 3.0769230769230766 5.715384615384615 2.053846153846153]
[[6.85384615 3.07692308 5.71538462 2.05384615]
 [5.006      3.418      1.464      0.244     ]
 [5.88360656 2.74098361 4.38852459 1.43442623]]
[[5.006      3.418      1.464      0.244     ]
 [5.88360656 2.74098361 4.38852459 1.43442623]
 [6.85384615 3.07692308 5.71538462 2.05384615]]
Distances
[4.988054041834384, 3.346416464356445, 1.7884235436555784]
[[51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149], [50, 52, 77, 100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1

## Test Accuracy

In [202]:
print(clusters)
total = 0
for cluster in clusters:
  # 100-150,0-50,50-100
  setosa = len([i for i in cluster if i <= dataset_size/3])
  versicolor = len([i for i in cluster if i > dataset_size/3 and i <= dataset_size/3*2])
  virginica = len([i for i in cluster if i <= dataset_size and i > dataset_size/3*2])
  count = max([setosa, versicolor, virginica])
  total = total + count
  print(f'{count} out of {dataset_size/3}')
print(f'Accuracy: {total/dataset_size} ,{total} out of {dataset_size}')


[[50, 52, 77, 100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 137, 139, 140, 141, 143, 144, 145, 147, 148], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], [51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 113, 114, 119, 121, 123, 126, 127, 133, 138, 142, 146, 149]]
35 out of 50.0
50 out of 50.0
47 out of 50.0
Accuracy: 0.88 ,132 out of 150
