In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the country-level CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../input/unsupervised-learning-on-country-data/Country-data.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Pull the two clustering features out of the frame as a float matrix
# (one row per country, columns: income, gdpp).
X = np.array(df.loc[:, ['income', 'gdpp']], dtype=float)
X.shape

# Mean Normalization

In [None]:
# Centre every feature column of X on zero (in place) and print the mean that
# remains afterwards, which should be numerically close to 0.
n = X.shape[1]
for column in X.T:  # each row of X.T is a view onto one column of X
    column -= np.mean(column)
    print(np.mean(column))

# Initializing Centroids

In [None]:
# Number of clusters and the (samples, features) shape of the data.
k = 3
(m, n) = X.shape
# Draw integer starting positions but store them as floats: compute_centroids
# writes the cluster means back into `mu` in place, and an integer array would
# silently truncate every mean to a whole number.
mu = np.random.randint(1, 10, (k, n)).astype(float)
print(mu)

# Finding Closest Centroids

In [None]:
def find_closest_centroids(X, mu):
    """Assign every sample to its nearest centroid.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        One sample per row.
    mu : ndarray of shape (k, n)
        One centroid per row.

    Returns
    -------
    ndarray of shape (m, 1)
        Float array holding, for each sample, the row index in `mu` of the
        centroid with the smallest squared Euclidean distance. Ties go to the
        lowest index.
    """
    n_samples = X.shape[0]
    n_centroids = mu.shape[0]
    c = np.zeros([n_samples, 1])
    for row in range(n_samples):
        # Squared distance from this sample to each centroid, in centroid order.
        sq_dists = [
            float(np.sum((X[row, :] - mu[centre, :]) ** 2))
            for centre in range(n_centroids)
        ]
        c[row, 0] = sq_dists.index(min(sq_dists))
    return c

In [None]:
# Assign every sample to its nearest initial centroid and show the first five
# assignments as a sanity check.
c = find_closest_centroids(X, mu)
print(c[:5])

# Compute Centroids

In [None]:
def compute_centroids(X, c, mu):
    """Move each centroid to the mean of the samples assigned to it.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        One sample per row.
    c : ndarray of shape (m, 1)
        Cluster index of every sample (as returned by find_closest_centroids).
    mu : ndarray of shape (k, n)
        Current centroids. Updated **in place**. Use a float dtype: the means
        are assigned into this array, so an integer array truncates them.

    Returns
    -------
    ndarray of shape (k, n)
        The same `mu` object, after the update.

    Notes
    -----
    A centroid with no assigned samples keeps its previous position; taking
    the mean of an empty selection would otherwise produce NaN.
    """
    k = mu.shape[0]
    labels = np.asarray(c)[:, 0]
    for i in range(k):
        members = labels == i
        if not members.any():
            # Empty cluster: nothing to average, leave this centroid untouched.
            continue
        mu[i, :] = X[members].mean(axis=0)
    return mu

In [None]:
# Recompute the centroids from the current assignment; `mu` is updated in place
# and the returned array is displayed as the cell output.
compute_centroids(X, c, mu)

# Optimization Objective / Cost Function

In [None]:
def cost_function(X, mu, c):
    """Average squared distance between each sample and its assigned centroid.

    Parameters
    ----------
    X : ndarray of shape (m, n)
        One sample per row.
    mu : ndarray of shape (k, n)
        One centroid per row.
    c : ndarray of shape (m, 1)
        Index into `mu` of the centroid assigned to each sample.

    Returns
    -------
    float
        Sum of squared sample-to-centroid distances divided by m.
    """
    n_samples = X.shape[0]
    total = 0
    for row, point in enumerate(X):
        assigned = int(c[row, 0])
        offset = point - mu[assigned, :]
        total += np.sum(offset ** 2)
    return total / n_samples

# K-Means Clustering

## Iterations showing how the clusters are improved by the K-Means algorithm after every iteration

In [None]:
# Run K-Means on X, plotting the cluster assignment after every iteration.
k = 3
max_iters = 25
(m, n) = X.shape
np.random.seed(0)
# Float centroids: compute_centroids writes the cluster means into `mu` in
# place, and an integer array would truncate them.
mu = np.random.randint(1, 10, (k, n)).astype(float)
color = ['r', 'g', 'b']
mark = ['+', 'o', '*']
prev_idx = None
for iteration in range(max_iters):
    idx = find_closest_centroids(X, mu)
    centroids = compute_centroids(X, idx, mu)
    plt.figure(figsize = (12, 8))
    for cluster in range(k):
        members = idx[:, 0] == cluster
        plt.scatter(X[members, 0], X[members, 1], c = color[cluster], marker = mark[cluster], s = 100)
    plt.xlabel("Income")
    plt.ylabel("GDP")
    plt.title(f"K-Means iteration {iteration + 1}")
    plt.show()
    # Once an iteration reproduces the previous assignment the algorithm has
    # converged: further iterations would only redraw the same plot.
    if prev_idx is not None and (idx == prev_idx).all():
        break
    prev_idx = idx

**As can be observed, the last two plots are exactly the same, which shows that the K-Means algorithm has converged on its final clusters for the given data.**

## Final K-Means Plot

In [None]:
# Draw the final assignment held in `idx`: one scatter call per cluster, each
# with its own colour and marker.
plt.figure(figsize = (12, 8))
color = ['r', 'g', 'b']
mark = ['+', 'o', '*']
for cluster in range(k):
    # Row numbers of the samples that ended up in this cluster.
    points = [row for row in range(m) if idx[row, 0] == cluster]
    plt.scatter(
        X[points, 0],
        X[points, 1],
        c = color[cluster],
        marker = mark[cluster],
        s = 100,
    )
plt.xlabel("Income")
plt.ylabel("GDP")

In [None]:
# Cost of the final clustering: mean squared distance of each sample to its
# assigned centroid.
cost_function(X, centroids, idx)

# K-Means by Monte-Carlo Method

In [None]:
# Monte-Carlo search: try many random centroid initialisations, take a single
# assignment + centroid update for each, and keep the assignment with the
# lowest cost.
k = 3
(m, n) = X.shape
n_trials = 100
indexes = []
costs = []
for trial in range(n_trials):
    # Float centroids: compute_centroids writes the cluster means into `mu` in
    # place, and an integer array would truncate them.
    mu = np.random.randint(1, 10, (k, n)).astype(float)
    idx = find_closest_centroids(X, mu)
    # Skip initialisations that leave any of the k clusters without samples.
    if len(set(idx[:, 0].tolist())) < k:
        continue
    centroids = compute_centroids(X, idx, mu)
    J = cost_function(X, centroids, idx)
    costs.append(J)
    indexes.append(idx)

if not costs:
    raise ValueError(
        f"all {n_trials} random initialisations left at least one of the {k} clusters empty"
    )
i_min = costs.index(min(costs))
best_clusters = indexes[i_min]
print(f"minimum cost: {costs[i_min]}")

plt.figure(figsize = (12, 8))
color = ['r', 'g', 'b']
mark = ['+', 'o', '*']
for cluster in range(k):
    members = best_clusters[:, 0] == cluster
    plt.scatter(X[members, 0], X[members, 1], c = color[cluster], marker = mark[cluster], s = 100)
plt.xlabel("Income")
plt.ylabel("GDP")

**It can easily be observed that even 100 random initializations do not give a better result. The iterative method of finding the best positions for the cluster centroids still gives a better output with just 5 iterations.**

Hence our implementation of K-Means Clustering from Scratch works just fine.

# Principal Component Analysis (PCA)

In [None]:
# Numeric feature matrix: every column except the country name.
raw_features = np.array(df.drop(['country'], axis = 1), dtype = float)
# Centre each column on zero. The PCA step below estimates the covariance as
# features.T @ features / m, which is only a covariance matrix when every
# column has zero mean.
# NOTE(review): the columns are not rescaled to comparable variance, so the
# largest-valued columns will dominate the components; consider dividing by
# the column standard deviation if that is not intended.
features = raw_features - raw_features.mean(axis = 0)
print(features.shape)

In [None]:
# n x n matrix of averaged column products, features.T @ features / m. This
# equals the covariance matrix when the columns of `features` have zero mean.
m = len(features)
sigma = features.T.dot(features) / m
print(sigma.shape)

In [None]:
# Singular value decomposition of sigma: the columns of `u` are the candidate
# projection directions and `s` holds the singular values in descending order.
u, s, v = np.linalg.svd(sigma)
print(u.shape, s.shape, v.shape)

In [None]:
# Fraction of the total variance retained by the first i components, for every
# i from 1 up to the number of singular values.
dim = range(1, len(s) + 1)
# Named `retained` so that the `v` returned by the SVD is not overwritten.
retained = np.cumsum(s) / np.sum(s)
variance = [retained[i - 1] for i in dim]
print(variance)

***To choose a suitable dimension for PCA, the variance retained should be greater than 0.99, which we already get with 2 dimensions for the columns of this data. Hence we can reduce the data to 2D.***

In [None]:
# Keep the first d columns of u as the projection basis.
d = 2
u_reduce = u[:, :d]
print(u_reduce.shape)

Getting reduced dimension data with all training examples & the new features

In [None]:
# Project every sample onto the d retained directions: one row per sample,
# one column per retained component.
z = features.dot(u_reduce)
print(z.shape)

## Applying K-Means Clustering

In [None]:
# Run K-Means on the PCA-reduced data z, plotting the assignment after every
# iteration.
max_iters = 5
k = 3
(m, n) = z.shape
np.random.seed(0)
# NOTE(review): randn places the starting centroids on a unit scale around the
# origin. If z lives on a much larger scale this can leave clusters empty;
# consider starting from k randomly chosen rows of z instead. TODO confirm.
mu = np.random.randn(k, n)
color = ['r', 'g', 'b']
mark = ['+', 'o', '*']
for iteration in range(max_iters):
    idx = find_closest_centroids(z, mu)
    centroids = compute_centroids(z, idx, mu)
    plt.figure(figsize = (12, 8))
    for cluster in range(k):
        members = idx[:, 0] == cluster
        plt.scatter(z[members, 0], z[members, 1], c = color[cluster], marker = mark[cluster], s = 100)
    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.title(f"K-Means on PCA-reduced data: iteration {iteration + 1}")
    plt.show()

Well, no wonder the results are not good: PCA is normally applied only when we do not get the desired results from the data as it is. If we apply PCA to every problem unnecessarily, the results can end up worse. Still, the goal of this implementation was to understand how PCA works and how it is implemented, and that has been achieved.