# Lab 3 — clustering

In [None]:
import numpy as np
import sklearn.cluster as cl
from scipy.spatial import distance

import json
import pickle
import random as rand
from copy import deepcopy
from collections import Counter

import matplotlib.pyplot as plt

from bokeh.layouts import column, row
from bokeh.plotting import figure, output_notebook,show, ColumnDataSource
from bokeh.models import CustomJS, ColumnDataSource, Slider, HoverTool
from bokeh.charts import Bar
from bokeh.palettes import small_palettes

# Notebook-wide display setup: render matplotlib figures inline, use the
# "ggplot" matplotlib style, and send bokeh output to the notebook.
%matplotlib inline
plt.style.use("ggplot")
output_notebook()

## 3.11 Clustering tags

In [None]:
# Data computed in exercise 3.2.
# np.load(...).item() unwraps the single Python object stored in the .npy file;
# the rest of this cell treats it as a dict (it calls .values() and .items()),
# presumably mapping a tag name to its PCA coordinates — confirm against 3.2.
# NOTE(review): recent NumPy releases refuse to load pickled objects unless
# allow_pickle=True is passed — check the NumPy version in use.
tag2pca = np.load("tag2pca.npy").item()

# Runs k-means on the tag coordinates and hands back the fitted centres
def clusters(k):
    """Fit a k-means model with `k` clusters on every vector in `tag2pca`.

    Returns the fitted model's `cluster_centers_`. No random state is
    passed to KMeans, so the centres may differ from one run to the next.
    """
    points = list(tag2pca.values())
    return cl.KMeans(k).fit(points).cluster_centers_

# Data behind the cluster-centre glyphs:
#   x, y      -> coordinates of the centres currently displayed
#   clusters  -> one array of centres per value of k (index 0 is k = 2)
data = dict(
        x=[],
        y=[],
        clusters=[],
    )
# We run k-means for k = 2..6 and store the centres of every run.
# That is one run more than the "Number of clusters" slider exposes (2..5):
# per the original author's note, bokeh needs len(x) = len(y) = len(clusters)
# to draw all points, so 'clusters' gets five entries, matching the largest
# number of centres (5) that x and y ever hold.
for k in range(2,7):
    data['clusters'].append(clusters(k))
    
# We initialize the displayed centres with the k = 2 run,
# projected on the first two principal directions (indices 0 and 1)
for coords in data['clusters'][0]:
    data['x'].append(coords[0])
    data['y'].append(coords[1])
    
# Wrapped in a ColumnDataSource so the slider callback can update the plot
source = ColumnDataSource(data=data)

colors = small_palettes['Set1'][5]
# The points of the data set:
#   x, y        -> coordinates currently displayed
#   dataCoords  -> full coordinate vector of each point (used to re-project)
#   colors      -> palette colour of the nearest initial (k = 2) centre
dataPoints = dict(
        x=[],
        y=[],
        dataCoords=[],
        colors=[],
    )

# Centres of the initial (k = 2) clustering, one row per centre.
initialCenters = np.asarray(data['clusters'][0])

# We initialize the coords with the
# first two principal directions
for name, coords in tag2pca.items():
    dataPoints['x'].append(coords[0])
    dataPoints['y'].append(coords[1])
    dataPoints['dataCoords'].append(coords)
    # Euclidean distance from this point to every initial centre.
    # Computed with numpy rather than `distance.euclidean`, because the name
    # `distance` is rebound to the Jaccard function in section 3.12, which
    # would break this cell when it is re-run afterwards.
    idx = int(np.argmin(np.linalg.norm(initialCenters - np.asarray(coords), axis=1)))
    # Record the colour so that every column has one entry per point
    # (previously `idx` was discarded and the 'colors' column stayed empty).
    dataPoints['colors'].append(colors[idx])

sourcePoints=ColumnDataSource(data=dataPoints)

# 600x600 bokeh figure that holds both glyph layers
p = figure(plot_width=600, plot_height=600, title="k-means clustering")

# We draw all the data points: small, translucent, no outline,
# read from the 'x'/'y' columns of sourcePoints
p.circle('x', 'y', size=5,legend="data", source=sourcePoints, fill_alpha=0.2, line_color=None)

# We draw the cluster centres on top: larger circles,
# read from the 'x'/'y' columns of source
p.circle('x', 'y', size=20,legend="cluster", source=source)

# Callback function when any of the sliders move.
# This function is never called from Python: it is handed to
# CustomJS.from_py_func right after its definition, so it has to stay within
# what that converter accepts (hence only plain comments here, no docstring).
# `slider`, `sliderX` and `sliderY` are not parameters; they are attached to
# the converted callback through callback.args once the sliders exist.
def callback(clusters=source, sourcePoints=sourcePoints, window=None):

    # Current slider values: number of clusters and the two 1-based
    # principal directions to display
    k = slider.get('value')
    x = sliderX.get('value')
    y = sliderY.get('value')
    # We ensure that the two directions are not the same:
    # y moves to the next direction, wrapping from 5 back to 1,
    # and the Y slider is updated to show the new value
    if x == y:
        y = (x%5)+1
        sliderY['value'] = y
        
    # Rebuild the displayed cluster centres
    data =clusters.get('data')
    data['x'] = []
    data['y'] = []
    # We iterate over the centres of the run with k clusters
    # (index k-2 because the stored runs start at k = 2)
    for coords in data['clusters'][k-2]:
        # We add the centre in the directions given by the sliders
        # (slider values are 1-based, coordinates are 0-based)
        data['x'].append(coords[x-1])
        data['y'].append(coords[y-1])
    clusters.trigger('change')
    
    # Re-project every data point on the same two directions
    data = sourcePoints.get('data')
    data['x'] = []
    data['y'] = []
    for coords in data['dataCoords']:
        # We add the data in the directions given by the sliders
        data['x'].append(coords[x-1])
        data['y'].append(coords[y-1])
        
    sourcePoints.trigger('change')

# Convert the Python function into a CustomJS callback.
# Note that this rebinds the name `callback` to the CustomJS object.
callback = CustomJS.from_py_func(callback)

# Each slider triggers the same callback and is also registered in
# callback.args so the callback body can read its value by name.

# Number of clusters shown: 2..5
slider = Slider(start=2, end=5, value=2, step=1, title="Number of clusters",callback=callback)
callback.args['slider'] = slider

# Principal direction on the x axis: 1..5, starts at 1
sliderX = Slider(start=1, end=5, value=1, step=1, title="X principal direction", callback=callback)
callback.args['sliderX'] = sliderX

# Principal direction on the y axis: 1..5, starts at 2
sliderY = Slider(start=1, end=5, value=2, step=1, title="Y principal direction", callback=callback)
callback.args['sliderY'] = sliderY

# Plot on the left, the three sliders stacked on the right
layout = row(p,column(slider, sliderX, sliderY))
show(layout)

The pairs of principal directions that separate the clusters well are (1, 5) and (3, 4).

## 3.12 Clustering movies

In [11]:
def distance(A, B):
    """Jaccard distance between two sets: 1 - |A ∩ B| / |A ∪ B|.

    Parameters
    ----------
    A : set or frozenset
    B : set, frozenset or any iterable accepted by set.union/intersection

    Returns
    -------
    float
        0.0 for identical sets, 1.0 for disjoint sets. Two empty sets are
        treated as identical and give 0.0 instead of dividing by zero.

    Note: defining this function rebinds the name `distance`, which the
    imports cell also uses for `scipy.spatial.distance`.
    """
    union_size = len(A.union(B))
    if union_size == 0:
        # Both sets are empty: the Jaccard index is undefined (0/0).
        # They are equal, so the distance is defined as 0.
        return 0.0
    jaccard_index = len(A.intersection(B)) / union_size
    return 1 - jaccard_index

In [118]:
# Scratch check of random.randint: both bounds are inclusive, so this is 0 or 1.
rand.randint(0,1)

1

In [24]:
def compareListOfSets(list1, list2):
    """Tell whether two lists contain the same sets.

    The position of a set inside its list and duplicate sets are ignored.
    Each element is converted to a frozenset before comparing, so the result
    does not depend on the iteration order of the sets (converting to tuples,
    as done previously, could report two equal sets as different).

    Parameters
    ----------
    list1, list2 : iterable of iterables of hashable items

    Returns
    -------
    bool
    """
    return {frozenset(x) for x in list1} == {frozenset(x) for x in list2}
# Xin is a list of lists (each inner list is one set of items)
def k_medioids(Xin, k, maxSteps=200):
    """Cluster a collection of sets with the k-medoids algorithm.

    Relies on the notebook-level `distance` (distance between two sets) and
    `compareListOfSets` (convergence check).

    Parameters
    ----------
    Xin : iterable of iterables of hashable items
        Every inner iterable becomes a frozenset; duplicates collapse into a
        single point.
    k : int
        Number of medoids. random.sample raises ValueError when k exceeds
        the number of distinct points.
    maxSteps : int
        Upper bound on the number of assign/update rounds.

    Returns
    -------
    list of set
        The k medoids. The starting medoids are drawn at random, so the
        result can change between runs unless the `random` module is seeded.
    """
    # random.sample needs a sequence: sampling straight from a set is
    # deprecated since Python 3.9 and is an error from Python 3.11.
    X = list(set(map(frozenset, Xin)))
    m = rand.sample(X, k)
    for step in range(maxSteps):
        # frozensets are immutable, so a shallow copy is enough
        mPrev = list(m)
        # Start every round from empty clusters. Keeping the clusters of the
        # previous round would leave a point in a cluster it no longer
        # belongs to once its nearest medoid changes.
        C = [set() for _ in range(k)]
        # Assignment step: each point joins the cluster of its nearest medoid
        for x in X:
            idx = np.argmin([distance(x, y) for y in m])
            C[idx].add(x)
        # Update step: the new medoid of a cluster is the member with the
        # smallest total distance to the other members. An empty cluster
        # keeps its current medoid.
        for i in range(k):
            mini = float("inf")
            for x in C[i]:
                res = np.sum([distance(x, y) for y in C[i]])
                if res < mini:
                    mini = res
                    m[i] = x
        # Converged: the medoids did not change during this round
        if compareListOfSets(m, mPrev):
            break
    return list(map(set, m))

In [26]:
# Quick check on a toy input with k = 2. The starting medoids are drawn at
# random, so the displayed result can change between runs.
k_medioids([[1,2],[1,3],[1,2,3],[4,5,7],[4,5]],2)

[{1, 3}, {1, 2}]

In [13]:
# Reading an object from disk.
# NOTE(security): pickle.load can run arbitrary code while loading; only use
# it on files that come from a trusted source.
with open("most-rated.pickle", "rb") as f:
    movies = pickle.load(f, encoding="utf-8")
# Each entry of `movies` unpacks into (id, movie); we keep only the ids.
# A comprehension is used so that no loop variable is left behind at notebook
# scope (the previous loop rebound the builtin name `id`).
movieIds = [movieId for movieId, _ in movies]

In [14]:
# Read the movies file line by line and parse each line as JSON.
# `sc` is not defined in the visible cells — presumably the SparkContext
# provided by the notebook kernel; verify before a fresh run.
# NOTE: this rebinds `data`, which held the bokeh cluster dict in section 3.11.
data = sc.textFile("/ix/ml-20m/movies.txt").map(json.loads)

In [15]:
# Membership is tested once per record; a set lookup avoids scanning the
# whole id list every time.
mostRatedIds = set(movieIds)

# movieId -> list of genres, restricted to the most-rated movies
moviesGenre = (
    data
    .filter(lambda movie: movie['movieId'] in mostRatedIds)
    .map(lambda movie: (movie['movieId'], movie['genres']))
    .collectAsMap()
)

In [28]:
# Two medoids over the genre sets of the selected movies, with at most 1000
# rounds. No random seed is set in the visible cells, so the medoids can
# change between runs.
medoids = k_medioids(list(moviesGenre.values()),2,1000)

In [29]:
def clusterMovies(movies, medoids):
    """Pool the genres of the movies that are closest to each medoid.

    Parameters
    ----------
    movies : dict
        Maps a movie id to its list of genres.
    medoids : list of set
        Genre sets used as cluster representatives.

    Returns
    -------
    list of list
        One list per medoid, holding the genres (with repetitions) of every
        movie whose genre set is nearest to that medoid according to the
        notebook-level `distance`.
    """
    pooled = [[] for _ in range(len(medoids))]
    for _, genres in movies.items():
        genreSet = set(genres)
        nearest = np.argmin([distance(genreSet, medoid) for medoid in medoids])
        pooled[nearest].extend(genres)
    return pooled

# Genres of every cluster, sorted alphabetically (repetitions are kept).
# NOTE: this rebinds `clusters`, which names the k-means helper in section 3.11.
clusters = [sorted(x) for x in clusterMovies(moviesGenre,medoids)]

# Every genre that appears in at least one cluster
allGenres = set().union(*clusters)


def _genreBar(clusterGenres, title):
    """Build the genre-occurrence bar chart of one cluster.

    Genres that are absent from the cluster are added with a count of 0, so
    every chart lists the same genres. The chart data stays local to this
    helper, so the notebook-level `data` is not overwritten.
    """
    occurences = Counter(clusterGenres)
    occurences.update({genre: 0 for genre in allGenres.difference(occurences)})
    chartData = dict(
        genre = list(occurences.keys()),
        occ = list(occurences.values()),
    )
    return Bar(chartData, label='genre',values='occ', legend=None, color='genre', plot_width=450, title=title,ylabel='Occurences',palette=small_palettes['Set1'][9])


# One chart per cluster ('Cluster 1', 'Cluster 2', ...), shown side by side.
# This works for any number of medoids, not only two.
bars = [_genreBar(genres, 'Cluster %d' % (i + 1)) for i, genres in enumerate(clusters)]
show(row(*bars))

In [176]:
# Show the last two lines of the movies file stored on HDFS.
!hdfs dfs -cat /ix/ml-20m/movies.txt | tail -n 2

{"genres": ["(no genres listed)"], "movieId": 131260, "title": "Rentun Ruusu (2001)"}
{"genres": ["Adventure", "Fantasy", "Horror"], "movieId": 131262, "title": "Innocence (2014)"}
