In [29]:
import os
import pandas as pd

import copy, math
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('multi_classification_train.csv')


X_train = df.iloc[1:4000,1:-2].values
y_train = df.iloc[1:4000,-1].values

# K-Means clustering

def find_closest_centroids(X, centroids):

  # Set K
  K = centroids.shape[0]

  # You need to return the following variables correctly
  idx = np.zeros(X.shape[0], dtype=int)
  for i in range(X.shape[0]):
     #Array to hold distance between X[i] and each centroids[j]
     distance = []
     for j in range(centroids.shape[0]):
      norm_ij = np.linalg.norm(X[i] - centroids[j, :X.shape[1]])  # Your code to calculate the norm between (X[i] - centroids[j])
      distance.append(norm_ij)

     idx[i] = np.argmin(distance) # Your code here to calculate index of minimum value in distance

  return idx

def compute_centroids(X,idx,K):
  # Useful variables
  m, n = X.shape
  # You need to return the following variables correctly
  centroids = np.zeros((K, n))

    ### START CODE HERE ###
  for k in range(K):

    points = X[idx==k] # Your code here to get a list of all data points in X assigned to centroid k
    centroids[k] =  np.mean(points, axis = 0) # Your code here to compute the mean of the points assigne

  return centroids


def compute_cost_kmeans(X, idx, K, centroids):
  cost=0
  m = X.shape[0]
  for i in range(m):
    cost += np.linalg.norm(X[i] - centroids[idx[i]])
  cost = cost/m
  return cost


def kMeans_init_centroids(X, K):

  m, n = X.shape
  centroids = np.zeros((K, n))
  # Randomly reorder the indices of examples
  randidx = np.random.permutation(m)
  # Take the first K examples as centroids
  centroids = X[randidx[:K]]

  return centroids


def k_means(X, initial_centroids, max_iters):

  # Initialize values
  m, n = X.shape
  K = initial_centroids.shape[0]
  centroids = initial_centroids
  previous_centroids = centroids
  idx = np.zeros(m)

  #  Run K-Means
  for i in range(max_iters):

    # For each example in X, assign it to the closest centroid
    idx = find_closest_centroids(X, centroids)

    # Given the memberships, compute new centroids
    centroids = compute_centroids(X, idx, K)

  return centroids, idx

def run_kmeans(X,K, max_iters):

    for i in range(100):
        centroids_i = []
        J_history = []
        centroids_f = []
        centroids = kMeans_init_centroids(X, K)
        for j in range(max_iters):
           idx = find_closest_centroids(X, centroids)
           centroids = compute_centroids(X, idx, K)
           centroids_i.append(centroids)
           if j== max_iters - 1 :
             centroids_f.append(centroids_i[j])


        J_history.append(compute_cost_kmeans(X, idx, K, centroids_f[-1]))
    min_cost_index = np.argmin(J_history)
    centroids_final.append(centroids_f[min_cost_index])


    return centroids_final



centroids_final= run_kmeans(X_train, 5,10)








In [30]:
# calculating predictions
idx = np.zeros((X_train.shape[0],len(centroids_final)))
for i in range(len(centroids_final)):
  idx[:,i] =  find_closest_centroids(X_train, centroids_final[i])



In [31]:
# calculating accuracy

count= np.zeros(len(centroids_final))

for i in range(len(centroids_final)):
    idx = find_closest_centroids(X_train, centroids_final[i])
    for j in range(X_train.shape[0]):
        if idx[j] == y_train[j]:  # Directly compare cluster assignment to true label
            count[i] += 1

print(count)

best_centroid_index = np.argmax(count)
best_centroids = centroids_final[best_centroid_index]
print(best_centroids)

[1022.  883.  457.  415.  473. 1247.  886.  914. 1232. 1219. 1371.  552.
 1310.  792.  798.  159.  292. 1111.  808.  424.]
[[ -65.8640204   -62.76205429   87.36919131  230.12232927   80.17401119
    78.53852578   59.82645192   55.14555539  -74.83418286   23.52072325
    27.62111922   27.85802633   99.00361325 -160.88233809  -31.46641445
    -6.36857303  -34.50121484  -35.91730576  -92.62769895]
 [ -66.30658433   -2.77512729   83.74554469 -123.40369338   82.743717
    79.82961559   36.94267064  -50.33763547  -53.48451657   -2.07075978
    44.26522357   76.29624945   52.79489705    9.90911882 -116.36278454
    84.13953198  -65.4052994   -35.78607405  -95.05108342]
 [ -74.49751469  -32.02859084   87.38582639  146.03836804   83.84462711
    77.14352328  -21.43320709   23.78854966 -163.39365971   89.56782032
    34.89949195   49.03978744   64.41682895   11.77975875 -137.70446345
    -9.50395432  -57.24285105  -29.26278941  -95.1534837 ]
 [ -75.98226874  -55.35125242   86.91812321   60.89538

In [32]:
# predictions of the training  set

df_test = pd.read_csv("multi_classification_test.csv")
X_test = df_test.iloc[1:,1:-2].values

X_test_adjusted = X_test[:, :best_centroids.shape[1]]
y_pred = find_closest_centroids(X_test_adjusted, best_centroids)

print(y_pred)


[1 1 2 ... 2 3 2]
