In [1]:
import matplotlib.pyplot as plt
import numpy as np
from moviepy.editor import *
from PIL import Image
import os
import time
#from annoy import AnnoyIndex
import random
from scipy.cluster.vq import vq, kmeans, kmeans2, whiten
import matplotlib.pyplot as plt
import pickle
#from nltk.cluster.kmeans import KMeansClusterer
#import nltk



def video_to_hash(path, dimensionality = 3, image_size = 7):

    #clip = VideoFileClip("videos/0HH3N4OZUXUB.mp4")
    clip = VideoFileClip(path)

    hashes = np.array([])
    for t in np.linspace(0, clip.duration, dimensionality):
        im = Image.fromarray(clip.get_frame(t), "RGB")
        im = im.resize((image_size, image_size), Image.ANTIALIAS)
        red, green, blue = im.split()

        pixels_r = list(red.getdata())
        pixels_g = list(green.getdata())
        pixels_b = list(blue.getdata())

        avg_r = sum(pixels_r)/len(pixels_r)
        avg_g = sum(pixels_g)/len(pixels_g)
        avg_b = sum(pixels_b)/len(pixels_b)

        bits_r = "".join(map(lambda pixel_r: '1' if pixel_r < avg_r else '0', pixels_r))
        bits_g = "".join(map(lambda pixel_g: '1' if pixel_g < avg_g else '0', pixels_g))
        bits_b = "".join(map(lambda pixel_b: '1' if pixel_b < avg_b else '0', pixels_b))

        hashes = np.append(hashes, np.array([int(x) for x in bits_r]))
        hashes = np.append(hashes, np.array([int(x) for x in bits_g]))
        hashes = np.append(hashes, np.array([int(x) for x in bits_b]))
        
    return np.array([hashes])



In [64]:

# SETTINGS
dimensionality = 5 # dimensionality of points PER COLOUR CHANNEL (x3)
image_size = 8 # size of image in pixels - e.g. 8^2 = 64 bit hash
directory = "../videos/"
num_all_points = len(os.listdir(directory))
num_points = 9700
dict_file_names = {}
k = 970

t1 = time.time()

points = np.zeros(shape=(num_points,3*image_size**2*dimensionality)).astype(int)
i = 0
for file in os.listdir(directory):
    if i > num_points-1: break
    
    filename = os.fsdecode(file)
    
    newpoint = video_to_hash(os.path.join(directory, filename), dimensionality, image_size).astype(int)
    points[i,:] = newpoint.astype(int)
    dict_file_names[tuple(newpoint.flatten())] = filename

    i += 1;



pickle_out = open("dict.pickle","wb")
pickle.dump(dict_file_names, pickle_out)
pickle_out.close()

np.savetxt("points", points)
t2 = time.time()
print("Execution Time: " + str(t2-t1)) # 4056.1573548316956 s


Execution Time: 5013.627940893173


In [23]:

t1 = time.time()

data = np.loadtxt("points")
pickle_in = open("dict.pickle","rb")
names = pickle.load(pickle_in)
k = 970

In [24]:
# Whiten data
whitened = whiten(data)
# Find clusters in the data
%time codebook, label = kmeans2(whitened, k, minit='points', missing='warn')

CPU times: user 22.5 s, sys: 3.52 s, total: 26 s
Wall time: 6.76 s


In [25]:
# Reorder clustering data structure
clusters = dict()

for i in range(len(label)):
    if label[i] not in clusters:
        clusters[label[i]] = []
    clusters[label[i]].append(
        tuple([
                names[tuple(data[i,:].flatten())][:-4],
                tuple(data[i,:]),
                i
            ])
    )
    #clusters[label[i]].append(i)


In [26]:
# Check integrity
c = 0
for k in clusters.values():
    c += len(k)
c

9700

In [27]:
# Reorder data structure and check integrity
clusters = [[np.mean(np.array([p[1] for p in c]),0), list(set(c))] for c in clusters.values()]

c = 0
for k in clusters:
    c += len(k[1])
c


9700

In [28]:
# Postprocessing, Loyd's Algorithm

clusters.sort(key = lambda x: len(x[1]), reverse=True)

for i in range(len(clusters)-1):
    while len(clusters[i][1]) > 10 :
        c = clusters[i][1].pop()
        min_d = 1000000000000
        new_cluster = i
        for k in range(i, len(clusters)):
            if len(clusters[k][1]) < 10:
                d = np.linalg.norm(np.array(clusters[k][0]) - np.array(c[1]))
                if min_d > d:
                    new_cluster = k
                    min_d = d
        m = np.mean(np.array([p[1] for p in clusters[k][1]]))
        clusters[new_cluster][0] = m
        clusters[new_cluster][1].append(c)

# Check integrity
c = 0
for k in clusters:
    c += len(k[1])
c

9700

In [29]:
# Compute adjusted_rand_score

clusters = [list([y[0] for y in x[1]]) for x in clusters]
clusters = [set(c) for c in clusters]

import adjusted_rand_index as ari

ari.rand_index(clusters)


0.5209300267041163

In [30]:

t2 = time.time()

print("Clustering time: "+str(t2-t1))


Clustering time: 127.76400947570801


In [31]:
clusters

[{'09FT68DJGFVF',
  '5QZPKXEWIE9V',
  '7EV0W2NNFK2E',
  'LBJYWYITWASM',
  'LPGHETFMMYCH',
  'O42ESLPSDEYR',
  'O9QTZU1IGVMB',
  'OFAF8Y08BPTX',
  'OVPDF3VWG53A',
  'VW52OMCQ4ZTA'},
 {'05ORLRFR4WNS',
  '48HTNJPP6NSW',
  'E1HCESGJBK4R',
  'FBCKMNN8IK2D',
  'J5YJQS41C8RZ',
  'M240UHC9W428',
  'ML89ZCA9GZ10',
  'SUK5BRIIAHYA',
  'VIX93FKCOI74',
  'YA3KWUJRI978'},
 {'6ZELLX2RTND0',
  'BDO5UKE3LR8R',
  'CWWYKDJT9DBR',
  'G71XRH5J10LG',
  'G7G4FTBU34Y4',
  'I6GO5DZ4Y79W',
  'JNY91S9Y77P3',
  'PF5JE0FOM6GR',
  'W5OZ5728TV8B',
  'Z1WT1PBDRX69'},
 {'18NHLF6QN6U5',
  '9PMTDA7HUJWC',
  'EVGNR5BWSNV9',
  'G8TPPPETZO3T',
  'H6J5GA4H6GID',
  'HOPT26I7GN0N',
  'HPBDX1MXGLU8',
  'II0QGC2NB62L',
  'OEC2RZNYN5FD',
  'TC7Z6ISQ7T59'},
 {'0WXAM3B0TO4C',
  '7VHWIF24N8BR',
  '93KA5YZ40V4L',
  'GPDNMR5LFQCH',
  'HE7FKGV4952E',
  'L3KL66O5UMUZ',
  'MVYNBYQZ0JID',
  'PPV7UMHTAA1W',
  'PY2ILSPGKSHP',
  'SW10BZWNCNQC'},
 {'2JHORLENHCYN',
  '7OTKAAO5OYCZ',
  '906MH4W0YCYJ',
  'AWV4XDBZBNXV',
  'GDG81137X3C4',
  'HQ

In [None]:
# Not used for anything

def hamming_distance(x, y):
    x = x.astype(int)
    y = y.astype(int)
    return sum([(bin(x[i] ^ y[i])).count('1') for i in range(dimensionality*3)])


In [None]:
# Not used for anything

data = np.loadtxt("points")

t = AnnoyIndex(dimensionality*3, metric = "euclidean")  # Length of item vector that will be indexed

for i in range(10):#np.size(data, 0)):
    #print(data[i,:].flatten())
    t.add_item(i, data[i,:].flatten())

t.build(10) # 10 trees
t.save('test.ann')
#print(t.get_nns_by_item(0, num_all_points))
#print(t.get_n_items())