# Lab 3 — clustering

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle
import sklearn.cluster as cl
import pandas as pd

# Import bokeh
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
output_notebook()
# Categorical colors
from bokeh.palettes import Dark2_8

%matplotlib inline
plt.style.use("ggplot")

In [2]:
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably the
# SparkContext injected by the PySpark kernel; confirm before running elsewhere.
# Reads the text file at this absolute path and applies json.loads to each element
# of the resulting RDD.
movie_data = sc.textFile('/ix/ml-20m/movies.txt').map(json.loads)

## 3.11

In [3]:
# Load the pickled object stored in tagsPCA.pickle; later cells call .keys() and
# .values() on it, so it is used as a mapping.
# NOTE(review): pickle.load can execute arbitrary code from the file — only open
# pickles from a trusted source.
with open("tagsPCA.pickle", "rb") as file:
    tagsPCA_p = pickle.load(file, encoding="utf-8")

In [4]:
# The mapping's values as a list (presumably one PCA coordinate vector per tag —
# TODO confirm); the order matches tagsPCA_p.keys().
tagsPCA = list(tagsPCA_p.values())
# Cluster counts explored in the sections below.
ks = [2, 3, 4, 5]

In [5]:
def k_means(k, data, random_state=None):
    """Fit a scikit-learn ``KMeans`` model with ``k`` clusters to ``data``.

    Parameters
    ----------
    k : int
        Number of clusters.
    data : array-like
        Samples to cluster, passed unchanged to ``KMeans.fit``.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default (``None``) keeps the
        previous unseeded behaviour; pass an integer to make the clustering
        reproducible across runs.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted estimator (cluster assignments are in ``labels_``).
    """
    kmeans = cl.KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(data)
    return kmeans

In [6]:
def create_cluster(kmeans, names, dims, data=None):
    """Build a Bokeh scatter plot of clustered points projected on two dimensions.

    Parameters
    ----------
    kmeans : object
        Fitted clustering model exposing ``labels_`` (one integer label per
        point, in the same order as ``data``).
    names : iterable
        Label shown in the hover tooltip for each point, in the same order
        as ``data``.
    dims : sequence of two ints
        Indices of the coordinates used for the x and y axes.
    data : sequence, optional
        The points to plot. Defaults to the notebook-level ``tagsPCA`` so
        existing three-argument calls keep working.

    Returns
    -------
    bokeh figure
        The figure, not yet shown.
    """
    if data is None:
        # Backward-compatible fallback: the original always plotted the global.
        data = tagsPCA

    x_dim, y_dim = dims[0], dims[1]
    n_colors = len(Dark2_8)
    source = ColumnDataSource(
        data={
            "x": [point[x_dim] for point in data],
            "y": [point[y_dim] for point in data],
            "name": list(names),
            # Wrap around the palette so more clusters than colors does not
            # raise an IndexError.
            "color": [Dark2_8[label % n_colors] for label in kmeans.labels_],
        })

    hover = HoverTool(
        tooltips=[
            ("Name", "@name"),
        ])
    tools = [hover, ResetTool(), PanTool(), WheelZoomTool(), SaveTool()]
    # NOTE(review): plot_width/plot_height are kept as in the original; newer
    # Bokeh releases may expect width/height instead — confirm against the
    # installed version.
    p = figure(plot_width=960, plot_height=360, tools=tools, title="Mouse over the dots")
    p.circle("x", "y", source=source, size=20, color="color", alpha=0.5)
    return p

In [7]:
def visualize_cluster(cluster_plot):
    """Display ``cluster_plot`` in the notebook output.

    The plot is handed to Bokeh's ``show`` with ``notebook_handle=True``;
    the handle that ``show`` returns is discarded, so this function
    returns ``None``.
    """
    show(
        cluster_plot,
        notebook_handle=True,
    )

In [8]:
# tagsPCA_p.keys()

In [9]:
# would run on my machine but I don't know what your policy is regarding this
# so please do not complain about the huge amount of cells
# show(vplot(plots))

# gridplot is unfortunately not available on the cluster
# show(gridplot(plots))

### Principal Direction Cluster separation

#### Using 1st and 2nd principal component
PD: 1, 2

In [10]:
# One plot per cluster count in `ks`, projected on the 1st and 2nd principal
# directions (replaces four copy-pasted cells that differed only in `k`).
prin_dir = (0, 1)
for k in ks:
    visualize_cluster(create_cluster(k_means(k, tagsPCA), tagsPCA_p.keys(), prin_dir))

#### Using 1st and 4th principal component
PD: 1, 4

In [14]:
# One plot per cluster count in `ks`, projected on the 1st and 4th principal
# directions (replaces four copy-pasted cells that differed only in `k`).
prin_dir = (0, 3)
for k in ks:
    visualize_cluster(create_cluster(k_means(k, tagsPCA), tagsPCA_p.keys(), prin_dir))

#### Using 1st and 5th principal component
PD: 1, 5

In [18]:
# One plot per cluster count in `ks`, projected on the 1st and 5th principal
# directions (replaces four copy-pasted cells that differed only in `k`).
prin_dir = (0, 4)
for k in ks:
    visualize_cluster(create_cluster(k_means(k, tagsPCA), tagsPCA_p.keys(), prin_dir))

#### Using 2nd and 4th principal component
PD: 2, 4

In [22]:
# One plot per cluster count in `ks`, projected on the 2nd and 4th principal
# directions (replaces four copy-pasted cells that differed only in `k`).
prin_dir = (1, 3)
for k in ks:
    visualize_cluster(create_cluster(k_means(k, tagsPCA), tagsPCA_p.keys(), prin_dir))

#### Using 2nd and 5th principal component
PD: 2, 5

In [46]:
# One plot per cluster count in `ks`, projected on the 2nd and 5th principal
# directions (replaces four copy-pasted cells that differed only in `k`).
prin_dir = (1, 4)
for k in ks:
    visualize_cluster(create_cluster(k_means(k, tagsPCA), tagsPCA_p.keys(), prin_dir))

The other combinations of principal directions are not shown because they already fail to separate the clusters properly at k=2.

#### Which principal directions separate the clusters well?
> Several pairs of principal directions separate the clusters quite well; not all of them are shown above.
>
> We like the projection onto PDs 1 and 5 best.
>
> After that, the next-best choice is probably either PDs 1 and 4 or PDs 2 and 5.

## 3.12

In [30]:
# Reduce each movie record to a (movieId, genres) pair and collect the pairs
# into a local {movieId: genres} mapping.
movie_dict = movie_data.map(lambda x: (x['movieId'], x['genres'])).collectAsMap()

In [31]:
def J_distance(a, b):
    """Return the Jaccard distance between two collections, treated as sets.

    The distance is ``1 - |A ∩ B| / |A ∪ B|``: 0.0 for identical sets and
    1.0 for disjoint ones. Duplicates and ordering in the inputs are ignored.

    Parameters
    ----------
    a, b : iterable of hashable items

    Returns
    -------
    float
        Distance in ``[0.0, 1.0]``. Two empty inputs are treated as identical
        and give 0.0 (the original raised ``ZeroDivisionError`` here).
    """
    set1 = set(a)
    set2 = set(b)
    union = set1 | set2
    if not union:
        # Both inputs are empty: the ratio is 0/0, so define the distance as 0.
        return 0.0
    return 1 - (float(len(set1 & set2)) / len(union))

In [32]:
def k_medioids(k, points, threshold, maxsteps=1500, distance=None):
    """Cluster ``points`` into ``k`` groups with a k-medoids iteration.

    Each step assigns every point to its nearest medoid, then replaces each
    medoid by the cluster member with the smallest summed distance to the
    other members. Iteration stops once the medoids move by less than
    ``threshold`` or after ``maxsteps`` steps.

    Parameters
    ----------
    k : int
        Number of clusters.
    points : array-like
        The samples, one per row; converted with ``np.asarray``.
    threshold : float
        Convergence tolerance on the change of the medoid coordinates.
    maxsteps : int, optional
        Maximum number of assignment/update steps.
    distance : callable, optional
        ``distance(p, q) -> float`` for two rows of ``points``. Defaults to
        the notebook's ``J_distance``.

    Returns
    -------
    numpy.ndarray
        Integer cluster label (``0 .. k-1``) for every point. If
        ``maxsteps <= 0`` no step is run and all labels are 0.

    Notes
    -----
    The start medoids are drawn with ``np.random``; seed it beforehand for
    reproducible results.
    """
    if distance is None:
        distance = J_distance

    points = np.asarray(points)
    n_points = len(points)

    # Draw distinct start medoids whenever there are enough points, so two
    # clusters never start from the same point. Sampling with replacement is
    # kept only for k > n_points, where distinct medoids are impossible.
    start = np.random.choice(n_points, size=k, replace=k > n_points)
    medioids = points[start]

    labels = np.zeros(n_points, dtype=np.intp)
    for _ in range(maxsteps):
        # dists[i, j] = distance between point i and medoid j.
        dists = np.zeros((n_points, k))
        for i in range(n_points):
            for j in range(k):
                dists[i, j] = distance(points[i], medioids[j])

        # Nearest medoid per point. argmin also works for k == 1, where the
        # former argpartition(..., kth=1) call was out of bounds.
        labels = np.argmin(dists, axis=1)

        # Start from the current medoids so an empty cluster keeps its medoid
        # instead of collapsing to the all-zero vector.
        medioids2 = medioids.copy()
        for i in range(k):
            members = points[labels == i]
            if len(members) == 0:
                continue
            sums = np.zeros(len(members))
            for j, x in enumerate(members):
                for y in members:
                    sums[j] += distance(x, y)
            medioids2[i] = members[np.argmin(sums)]

        # Compare in float so boolean / unsigned inputs subtract safely.
        diff = medioids2.astype(float) - medioids.astype(float)
        dist = np.max(np.linalg.norm(diff, axis=0))

        if dist < threshold:
            return labels
        medioids = medioids2
    return labels

In [33]:
# Load the pickled collection from most-rated.pickle and tabulate it with the
# columns movieId and title.
# NOTE(review): pickle.load can execute arbitrary code from the file — only open
# pickles from a trusted source.
with open("most-rated.pickle", "rb") as f:
    movies = pickle.load(f, encoding="utf-8")
most_rated_df = pd.DataFrame(list(movies), columns=['movieId', 'title'])

In [34]:
# Show the first four rows as a quick check of the loaded table.
most_rated_df.head(4)

Unnamed: 0,movieId,title
0,296,Pulp Fiction (1994)
1,356,Forrest Gump (1994)
2,318,"Shawshank Redemption, The (1994)"
3,593,"Silence of the Lambs, The (1991)"


In [35]:
# Initialize the matrix with 0's
# Shape: one row per row of most_rated_df, one column per entry of movie_dict.
matKmed = np.zeros((most_rated_df.shape[0], len(movie_dict)))

# fill matrix
# NOTE(review): no fill step follows in the visible notebook, so matKmed stays
# all zeros unless it is populated elsewhere — confirm before clustering on it.

In [36]:
# Cluster the rows of matKmed into 2 groups; 0.00005 is the convergence
# threshold passed to k_medioids. The result holds one label per row.
cluster = k_medioids(2,matKmed, 0.00005)