In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA


In [None]:
# Load the conversion-tracking CSV into a DataFrame.
df = pd.read_csv("../input/clicks-conversion-tracking/KAG_conversion_data.csv")
# Preview the first rows.
df.head()

In [None]:
# List the distinct values found in the `age` column.
print(df.age.unique())

In [None]:
# Feature matrix used for clustering: the columns at positions 6 and 8 of df,
# as a 2-column NumPy array.
# NOTE(review): the positional indices are opaque - confirm which columns these are.
x=df.iloc[:, [6, 8]].values
# Peek at the first five rows.
x[0:5]

In [None]:
# Single-column (n, 1) array taken from the column at position 1 of df.
# It supplies the colour values of the first scatter plot.
y=df.iloc[:,[1]].values
y

In [None]:
# Scatter the two selected feature columns on a 3-D axes, coloured by `y`.
# Only x/y coordinates are passed, so every point lies on the default z=0 plane.
fig = plt.figure(1, figsize=(8, 6))
# Create the axes through the figure: an Axes3D built directly from `fig` is
# not attached to it on current Matplotlib, which leaves the figure blank.
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
X_reduced = x
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y.ravel(),
           cmap="rainbow", edgecolor='k', s=100)
# `x` holds df columns 6 and 8 untouched (no PCA is applied), so label the axes
# with the real column names rather than "principal component".
x_name, y_name = (str(name) for name in df.columns[[6, 8]])
ax.set_title("{} vs. {}".format(x_name, y_name))
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
# `ax.xaxis` replaces the removed `ax.w_xaxis` alias (same for y and z).
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
plt.show()


In [None]:
def init_medoids(X, k, random_state=1):
    """Pick ``k`` distinct rows of ``X`` to serve as the initial medoids.

    Parameters
    ----------
    X : ndarray of shape (m, d)
        Data points, one per row.
    k : int
        Number of medoids to draw; must not exceed ``len(X)``.
    random_state : int, optional
        Seed for a private random generator. The default of 1 reproduces the
        draw that seeding the global generator with 1 would give.

    Returns
    -------
    ndarray of shape (k, d)
        A copy of the selected rows.

    Raises
    ------
    ValueError
        If ``k`` is larger than ``len(X)`` (sampling is without replacement).
    """
    # A private generator keeps the draw reproducible without resetting the
    # global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(random_state)
    samples = rng.choice(len(X), size=k, replace=False)
    return X[samples, :]


In [None]:
# Draw three starting medoids from the rows of x.
medoids_initial = init_medoids(x, 3)


In [None]:
def compute_d_p(X, medoids, p):
    """Return the p-th power of the p-norm distance from each point to each medoid.

    Parameters
    ----------
    X : ndarray of shape (m, d)
        Data points, one per row.
    medoids : ndarray of shape (k, d) or (d,)
        Medoid coordinates; a flat vector is treated as a single medoid.
    p : norm order passed to ``np.linalg.norm``.

    Returns
    -------
    ndarray of shape (m, k)
        Entry ``[i, j]`` is ``||X[i] - medoids[j]||_p ** p``.
    """
    # Promote a lone medoid given as a flat vector to a one-row matrix.
    if medoids.ndim == 1:
        medoids = medoids[np.newaxis, :]

    n_points, n_medoids = len(X), len(medoids)
    distances = np.empty((n_points, n_medoids))

    for row in range(n_points):
        # Broadcasting subtracts this point from every medoid at once.
        offsets = X[row, :] - medoids
        distances[row, :] = np.linalg.norm(offsets, ord=p, axis=1) ** p

    return distances


In [None]:
# Distances (p = 2, raised to the p-th power) from every row of x to each initial medoid.
S = compute_d_p(x, medoids_initial, 2)


In [None]:
def assign_labels(S):
    """Assign each point to its closest medoid.

    ``S`` is an (m, k) matrix of point-to-medoid distances; the result is a
    length-m array holding, for every row, the column index of the smallest
    entry (the first such column on ties).
    """
    nearest = np.asarray(S).argmin(axis=1)
    return nearest


In [None]:
# For every row, the index of the closest medoid according to S.
labels = assign_labels(S)


In [None]:
def update_medoids(X, medoids, p):
    """Move each medoid to the best-placed point of its current cluster.

    Every point of ``X`` is assigned to its nearest medoid. Then, for each
    non-empty cluster, the member with the smallest total p-th-power distance
    to the other members of that cluster becomes the medoid, provided it is
    strictly better than the current one.

    Parameters
    ----------
    X : ndarray of shape (m, d)
        Data points, one per row.
    medoids : ndarray of shape (k, d)
        Current medoids. The array is not modified.
    p : norm order forwarded to ``compute_d_p``.

    Returns
    -------
    ndarray of shape (k, d)
        New array of updated medoids; medoids of empty clusters are unchanged.
    """
    # Use the function's own X argument (not a notebook-level global) so the
    # function works on whatever data it is given.
    labels = assign_labels(compute_d_p(X, medoids, p))

    # Copy so the caller's array is left untouched.
    out_medoids = medoids.copy()

    for i in np.unique(labels):
        cluster_points = X[labels == i]

        # Cost of keeping the current medoid, measured over the cluster only.
        current_cost = np.sum(compute_d_p(cluster_points, medoids[i], p))

        # pairwise[r, c] = distance from member r to candidate c, so each
        # column sum is the cost of promoting that member to medoid.
        # This needs O(c^2) memory for a cluster of c points.
        pairwise = compute_d_p(cluster_points, cluster_points, p)
        candidate_costs = pairwise.sum(axis=0)
        best = np.argmin(candidate_costs)

        # A strict comparison keeps the current medoid on ties.
        if candidate_costs[best] < current_cost:
            out_medoids[i] = cluster_points[best]

    return out_medoids


In [None]:
def has_converged(old_medoids, medoids):
    """Return True when both collections hold the same set of medoid rows.

    Rows are compared as tuples, so neither row order nor repeated rows matter.
    """
    previous = {tuple(row) for row in old_medoids}
    current = {tuple(row) for row in medoids}
    return previous == current


In [None]:
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
    """Cluster the rows of ``X`` around ``k`` medoids.

    Medoids are updated repeatedly until the set of medoids stops changing or
    ``max_steps`` updates have been made.

    Parameters
    ----------
    X : ndarray of shape (m, d)
        Data points, one per row.
    k : int
        Number of clusters; only used when ``starting_medoids`` is None.
    p : norm order forwarded to the distance computation.
    starting_medoids : ndarray of shape (k, d), optional
        Initial medoids. When omitted they are drawn by ``init_medoids``.
        The array passed in is not modified.
    max_steps : number, optional
        Upper bound on the number of update steps (unbounded by default).

    Returns
    -------
    (medoids, labels) : tuple
        ``medoids`` is the final (k, d) array; ``labels`` holds, for every row
        of ``X``, the index of its nearest medoid in the returned array.
    """
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        # Private copy: the update step must never write into the caller's array.
        medoids = np.array(starting_medoids, copy=True)

    converged = False
    i = 1
    while (not converged) and (i <= max_steps):
        old_medoids = medoids.copy()
        # update_medoids performs its own point-to-medoid assignment.
        medoids = update_medoids(X, medoids, p)
        converged = has_converged(old_medoids, medoids)
        i += 1

    # Assign labels against the medoids actually returned, so the two results
    # agree even when the loop stops on max_steps (or never runs).
    labels = assign_labels(compute_d_p(X, medoids, p))
    return (medoids, labels)


In [None]:
# Cluster x into 3 groups with p = 2; kmedoids returns (medoids, labels).
results = kmedoids(x, k=3, p=2)
# Keep the medoids and store each row's cluster label on the DataFrame.
final_medoids, df['clusters'] = results


In [None]:
def mark_matches(a, b, exact=False, n_labels=3):
    """Flag the positions where two label arrays agree.

    Cluster labels are arbitrary, so two clusterings can be identical while
    using different label numbers. Unless ``exact`` is set, every relabelling
    (permutation) of ``b``'s labels is tried and the one that agrees with ``a``
    in the most positions is used. The unpermuted comparison wins ties.

    Parameters
    ----------
    a, b : array-like of identical shape
        Labels drawn from ``0 .. n_labels - 1``.
    exact : bool, optional
        When true, compare the labels as given, with no relabelling.
    n_labels : int, optional
        Number of distinct labels allowed. All ``n_labels!`` permutations are
        tried, so keep this small.

    Returns
    -------
    ndarray of bool, same shape as the inputs
        True where ``a`` matches (the possibly relabelled) ``b``.

    Raises
    ------
    AssertionError
        If the shapes differ or a label lies outside ``0 .. n_labels - 1``.
    """
    from itertools import permutations

    a_int = np.asarray(a).astype(dtype=int)
    b_int = np.asarray(b).astype(dtype=int)
    assert a_int.shape == b_int.shape
    assert ((a_int >= 0) & (a_int < n_labels)).all()
    assert ((b_int >= 0) & (b_int < n_labels)).all()

    exact_matches = (a_int == b_int)
    if exact:
        return exact_matches

    # Inverting the exact matches is only a valid relabelling when there are
    # two labels; with more, every permutation must be checked explicitly.
    best_matches = exact_matches
    best_count = np.sum(exact_matches)
    for perm in permutations(range(n_labels)):
        # perm[j] is the label that b's label j is renamed to.
        candidate = (a_int == np.asarray(perm, dtype=int)[b_int])
        count = np.sum(candidate)
        if count > best_count:
            best_matches, best_count = candidate, count
    return best_matches


In [None]:
def count_matches(a, b, exact=False):
    """Count the positions that ``mark_matches`` flags as agreeing.

    ``a``, ``b`` and ``exact`` are forwarded unchanged to ``mark_matches``;
    the result is the sum of the boolean mask it returns.
    """
    return np.sum(mark_matches(a, b, exact=exact))


In [None]:
# Compare the labels in `labels` with the cluster labels stored on the DataFrame.
# NOTE(review): `labels` comes from the assignment to the initial medoids, so
# this counts agreement between the first assignment and the final clustering.
n_matches = count_matches(labels, df['clusters'])
match_pct = 100.0 * n_matches / len(labels)
print(n_matches, "matches out of", len(df), "data points",
      "(~ {:.1f}%)".format(match_pct))


In [None]:
# 3-D view of the k-medoids result: one colour per cluster plus the final medoids.
# Only x/y coordinates are passed, so every point lies on the default z=0 plane.
fig = plt.figure(1, figsize=(6, 5))
# Create the axes through the figure: an Axes3D built directly from `fig` is
# not attached to it on current Matplotlib, which leaves the figure blank.
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
X_reduced = x
# Colour by the labels kmedoids returned (stored in df['clusters']). The
# notebook-level `labels` only holds the assignment to the initial medoids
# and would not correspond to the final medoids drawn below.
cluster_labels = df['clusters'].values
for cluster_id, colour in enumerate(("red", "orange", "green")):
    members = x[cluster_labels == cluster_id]
    # No cmap here: it is ignored when `c` is a literal colour.
    ax.scatter(members[:, 0], members[:, 1], c=colour, edgecolor='k', s=100)
ax.scatter(final_medoids[:, 0], final_medoids[:,1], s = 100,
            c = 'purple', label = 'Centroids')
ax.set_title("K-medoids")
# `x` holds df columns 6 and 8 untouched (no PCA is applied), so label the axes
# with the real column names rather than "principal component".
x_name, y_name = (str(name) for name in df.columns[[6, 8]])
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
# `ax.xaxis` replaces the removed `ax.w_xaxis` alias (same for y and z).
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
# Show the label given to the medoid markers.
ax.legend()

plt.show()


In [None]:
# Show the per-row labels held in `labels`.
print(labels)

In [None]:
# 2-D view of the k-medoids result: one colour per cluster plus the final medoids.
# Colour by the labels kmedoids returned (stored in df['clusters']). The
# notebook-level `labels` only holds the assignment to the initial medoids.
cluster_labels = df['clusters'].values
# NOTE(review): the legend texts are hard-coded and not derived from the
# clustering - confirm they really describe clusters 0, 1 and 2.
cluster_styles = [('red', '916'), ('orange', '936'), ('green', '1178')]
for cluster_id, (colour, legend_text) in enumerate(cluster_styles):
    members = x[cluster_labels == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=legend_text)

#Plotting the centroids of the clusters
plt.scatter(final_medoids[:, 0], final_medoids[:,1], s = 100, c = 'blue', label = 'Centroids')

plt.legend()
plt.show()