# Lab 3 — clustering
**Internet Analytics - Lab 3**

---

**Group:** *K*

**Names:**

* *Robin Lang*
* *Kim Lan Phan Hoang*

In [97]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pickle
import random

import sklearn.cluster as scl

%matplotlib inline
plt.style.use("ggplot")

## Exercise 3.11

### Cluster the data using the k-means clustering algorithm and visualize it using an interactive bokeh plot.
source: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.k_means.html

In [4]:
# Load the dictionary that was pickled in exercise 3.2.
with open("tag_dict.txt", "rb") as tag_file:
    tag_dict_load = pickle.load(tag_file)

In [12]:
# Build a matrix from element [1] of every item yielded by iterating tag_dict_load.
# NOTE(review): tag_dict_load is indexed like a mapping in a later cell, so iterating
# it here would yield keys and entry[1] would index into each key — confirm intent.
tags_matrix = np.matrix([entry[1] for entry in tag_dict_load])

In [18]:
# Collect the value stored under every key of tag_dict_load, in iteration order.
tag_dict_matrix = [tag_dict_load[key] for key in tag_dict_load]

In [98]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
# Configure bokeh to display its plots inside the notebook.
output_notebook()

def visualize(km, index, M, pd1=0, pd2=1, size=960):
    """Show an interactive bokeh scatter plot of the rows of ``M``, colored by cluster.

    Every row of ``M`` is drawn at (row · M[pd1], row · M[pd2]); hovering a
    point shows its cluster label.

    Parameters
    ----------
    km : sequence
        Clustering result whose element ``km[1]`` holds one cluster label per
        row of ``M`` (the notebook passes the return value of ``scl.k_means``).
    index : int
        Divisor turning a cluster label into an HSV hue (``label / index``);
        the notebook passes the number of clusters.
    M : array-like
        Rows to plot; also the source of the two reference rows.
    pd1, pd2 : int
        Row indices of ``M`` used as the x and y projection vectors.
    size : int
        Width and height of the figure, in pixels.
    """
    from colorsys import hsv_to_rgb
    from matplotlib.colors import rgb2hex

    # Coordinates: dot product of each row with the two chosen reference rows.
    xs = [np.dot(row, M[pd1]) for row in M]
    ys = [np.dot(row, M[pd2]) for row in M]

    labels = km[1]

    def hue_to_hex(label):
        # label / index is used as the HSV hue, at full saturation and value
        # (label 0 therefore maps to red).
        return rgb2hex(hsv_to_rgb(label / index, 1, 1))

    source = ColumnDataSource(
        data=dict(
            x=xs,
            y=ys,
            name=list(labels),
            color=[hue_to_hex(label) for label in labels],
        )
    )

    hover = HoverTool(tooltips=[("Name", "@name")])
    p = figure(
        plot_width=size,
        plot_height=size,
        tools=[hover, ResetTool(), PanTool(), WheelZoomTool(), SaveTool()],
        title="clusters from k-means algo",
    )
    p.circle("x", "y", source=source, size=20, color="color")
    show(p)

In [84]:
# Run k-means on the transposed tag matrix for k = 2..5 and plot each result.
tag_matrix_t = np.array(tag_dict_matrix).T
for n_clusters in range(2, 6):
    # A fixed random_state makes the clustering (and therefore the plots and the
    # conclusions drawn from them) reproducible across kernel restarts.
    km = scl.k_means(tag_matrix_t, n_clusters, random_state=0)
    visualize(km, n_clusters, tag_matrix_t)

### Which principal directions separate the clusters well?

In [100]:
# Plot one fixed 3-cluster solution under every pair (i, j), i < j, of the first
# five projection rows, to see which pairs separate the clusters best.
n_clusters = 3
n_directions = 5

# A fixed random_state keeps the clustering identical between runs, so the
# small plots below stay comparable with the written conclusion.
km2 = scl.k_means(tag_matrix_t, n_clusters, random_state=0)
for i in range(n_directions):
    for j in range(i + 1, n_directions):
        print(i, j)
        visualize(km2, n_clusters, tag_matrix_t, pd1=i, pd2=j, size=240)

0 1


0 2


0 3


0 4


1 2


1 3


1 4


2 3


2 4


3 4


The first 2 principal directions separate the clusters very well.

## Exercise 3.12

### Create a dict that maps movie IDs to set of genres from the data in movies.txt

In [89]:
# Read the movies file line by line, parse each line as JSON and collect the
# parsed records into a local list.
# NOTE(review): `sc` is not defined anywhere in this notebook section — presumably
# a SparkContext provided by the kernel; confirm before a fresh run.
movies_rdd = sc.textFile("/ix/ml-20m/movies.txt")
movies = movies_rdd.map(json.loads).collect()

In [90]:
# Map each movie's "movieId" to the value of its "genres" field.
idToGenre = {movie["movieId"]: movie["genres"] for movie in movies}

### Implement the k-medoids algorithm with the Jaccard distance.

In [143]:
def intersect(a, b):
    """Return the distinct elements present in both ``a`` and ``b``, as a list."""
    left, right = set(a), set(b)
    return list(left & right)
    
def union(a,b):
    """Return the set of distinct elements present in ``a`` or in ``b``."""
    merged = set()
    merged.update(a, b)
    return merged

def jaccard_index(a,b):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two collections.

    Parameters
    ----------
    a, b : iterable of hashable
        The collections to compare; duplicates are ignored.

    Returns
    -------
    float
        A value in [0, 1]. Two empty collections are treated as identical and
        give 1.0 instead of raising ZeroDivisionError.
    """
    set_a, set_b = set(a), set(b)
    combined = set_a | set_b
    if not combined:
        # Both inputs are empty: the ratio is 0/0, so fall back to "identical".
        return 1.0
    return len(set_a & set_b) / len(combined)

def jaccard_distance(a,b):
    """Return the Jaccard distance between ``a`` and ``b``: one minus their Jaccard index."""
    similarity = jaccard_index(a, b)
    return 1 - similarity

In [154]:
def _assign_to_medoids(points, medoids, distance):
    """Group ``points`` by their nearest medoid; ties go to the lowest medoid index."""
    clusters = [[] for _ in medoids]
    for point in points:
        nearest = min(range(len(medoids)), key=lambda i: distance(medoids[i], point))
        clusters[nearest].append(point)
    return clusters


def _best_medoid(cluster, distance):
    """Return the member of ``cluster`` with the smallest summed distance to all members."""
    return min(cluster, key=lambda x: sum(distance(x, y) for y in cluster))


def kmedioids(points, k, distance=None, max_iter=100):
    """Cluster ``points`` into ``k`` groups with the k-medoids algorithm.

    The algorithm alternates between assigning every point to its nearest
    medoid and replacing each medoid by the cluster member that minimizes the
    summed distance to the rest of its cluster, until the medoids stop changing.

    Parameters
    ----------
    points : sequence
        The items to cluster. With the default distance each item must be
        indexable, with a movie id (a key of ``idToGenre``) at position 0.
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(points)``.
    distance : callable, optional
        ``distance(a, b)`` returning a number for two points. Defaults to the
        Jaccard distance between the ``idToGenre`` entries of the two points.
    max_iter : int, optional
        Upper bound on the number of medoid updates, as a guard against
        oscillating between equally good configurations.

    Returns
    -------
    tuple
        ``(medioids, clusters)``: the list of ``k`` medoids and a list of ``k``
        lists, where ``clusters[i]`` holds the points assigned to
        ``medioids[i]``. Every point appears in exactly one cluster.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of points.
    """
    if distance is None:
        def distance(a, b):
            return jaccard_distance(idToGenre[a[0]], idToGenre[b[0]])

    if k < 1 or k > len(points):
        raise ValueError("k must be between 1 and the number of points")

    # Sample without replacement so the initial medoids are k distinct positions.
    medioids = random.sample(points, k)
    # Clusters are rebuilt from scratch on every assignment, never accumulated.
    clusters = _assign_to_medoids(points, medioids, distance)

    for _ in range(max_iter):
        # An empty cluster keeps its previous medoid instead of becoming invalid.
        updated = [
            _best_medoid(cluster, distance) if cluster else current
            for cluster, current in zip(clusters, medioids)
        ]
        if updated == medioids:
            # The steps are deterministic, so unchanged medoids mean convergence.
            break
        medioids = updated
        clusters = _assign_to_medoids(points, medioids, distance)

    return (medioids, clusters)

### Cluster the set of movies in the file most-rated.pickle, using k = 2

In [177]:
import pickle

# Load the pickled list of most-rated movies from disk.
with open("most-rated.pickle", "rb") as pickle_file:
    most_rated = pickle.load(pickle_file, encoding="utf-8")

In [156]:
# kmedioids picks its initial medoids with the `random` module; seed it first so
# the two clusters (and the plot and interpretation below) are reproducible.
random.seed(0)
kmed = kmedioids(most_rated, 2)

### Find a good way to visualize the results of the clustering. How do you interpret the two clusters?

In [167]:
# Peek at the first point stored in the first cluster.
kmed[1][0][0]

(318, 'Shawshank Redemption, The (1994)')

In [166]:
# Load the relevance array that was saved to disk in NumPy's .npy format.
relevance_file_name = "relevance_array.npy"
relevance_array = np.load(relevance_file_name)

In [171]:
# Length of the first row of the transposed relevance array.
len(relevance_array.T[0])

1128

We visualize the clusters by projecting them onto the first and fifth principal directions, as they seem to be quite close to being orthogonal (as discussed in the dimred exercise 3.2).

In [176]:
# Plot the k-medoids clusters: each movie is projected onto the first (index 0)
# and fifth (index 4) rows of the transposed tag matrix and colored by cluster.
from matplotlib.colors import rgb2hex
from colorsys import hsv_to_rgb

projected_x = []  # projection on the first principal direction (index 0)
projected_y = []  # projection on the fifth principal direction (index 4)
cluster = []
names = []

# Loop-invariant: build the transposed matrices once instead of once per movie.
directions = np.array(tag_dict_matrix).T
movie_vectors = relevance_array.T
n_clusters = len(kmed[1])

for cluster_id, members in enumerate(kmed[1]):
    for member in members:
        # member is (movie id, title).
        # NOTE(review): assumes row (movie id - 1) of relevance_array.T belongs
        # to that movie — confirm against how relevance_array was built.
        vector = movie_vectors[member[0] - 1]
        projected_x.append(np.dot(vector, directions[0]))
        projected_y.append(np.dot(vector, directions[4]))
        cluster.append(cluster_id)
        names.append(member[1])


def clusterColor(val):
    """Map a cluster id to a hex color by using val / n_clusters as the HSV hue.

    With two clusters, cluster 0 gets hue 0 (red) and cluster 1 gets hue 0.5.
    """
    return rgb2hex(hsv_to_rgb(val / n_clusters, 1, 1))


source = ColumnDataSource(
    data={
        "x": projected_x,
        "y": projected_y,
        "name": list(names),
        "color": [clusterColor(x) for x in cluster],
    })

hover = HoverTool(
    tooltips=[
        ("Name", "@name"),
    ])
tools = [hover, ResetTool(), PanTool(), WheelZoomTool(), SaveTool()]
p = figure(plot_width=960, plot_height=960, tools=tools, title="2 k-medioids clusters")
p.circle("x", "y", source=source, size=20, color="color")
show(p)

The points of the two clusters are heavily intermixed; however, the red cluster appears more elongated, whereas the blue cluster is a little more spread out. Overall, both clusters seem to contain fairly similar (and similarly mixed) data, with the blue cluster having a larger variance than the red one.