In [None]:
import matplotlib.pyplot as plt
import numpy as np
from moviepy.editor import *
from PIL import Image
import os
import time
#from annoy import AnnoyIndex
import random
from scipy.cluster.vq import vq, kmeans, whiten
import matplotlib.pyplot as plt
import pickle
from nltk.cluster.kmeans import KMeansClusterer
import nltk



def video_to_hash(path, dimensionality = 3, image_size = 7):
    """Compute a binary average-hash fingerprint for a video file.

    ``dimensionality`` timestamps are spaced evenly from 0 to the clip's
    duration (both ends included). The frame at each timestamp is shrunk to
    ``image_size`` x ``image_size`` pixels and, for each of the red, green
    and blue channels, every pixel becomes 1 if it is strictly below that
    channel's mean value and 0 otherwise.

    Args:
        path: Location of the video file, passed to ``VideoFileClip``.
        dimensionality: Number of frames sampled from the clip.
        image_size: Side length, in pixels, of the downscaled frame.

    Returns:
        A float array of shape ``(1, 3 * image_size**2 * dimensionality)``
        holding 0.0/1.0 values, ordered frame by frame and, within a frame,
        red bits, then green bits, then blue bits.
    """
    clip = VideoFileClip(path)
    try:
        channel_bits = []
        # NOTE(review): the last timestamp equals clip.duration exactly; confirm
        # the reader can always decode a frame at the very end of the clip.
        for t in np.linspace(0, clip.duration, dimensionality):
            im = Image.fromarray(clip.get_frame(t), "RGB")
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            im = im.resize((image_size, image_size), Image.LANCZOS)

            # Shape (image_size, image_size, 3); ravel() walks each channel in
            # the same row-major order as Image.getdata().
            pixels = np.asarray(im)
            for channel in range(3):
                values = pixels[:, :, channel].ravel()
                channel_bits.append(values < values.mean())
    finally:
        # Release the underlying reader even if a frame fails to decode.
        clip.close()

    # Build the result once instead of growing it with repeated np.append.
    if channel_bits:
        hashes = np.concatenate(channel_bits).astype(float)
    else:
        hashes = np.array([])
    return np.array([hashes])



In [None]:

# SETTINGS
dimensionality = 5 # number of frames sampled per video; each frame yields 3 colour channels
image_size = 8 # side length of the downscaled frame in pixels - e.g. 8^2 = 64 bits per channel
directory = "../videos/"
num_points = 20 # maximum number of videos hashed in this run

# List the directory once, sorted, so the processed subset is the same on every run
# (os.listdir returns entries in arbitrary order).
video_files = sorted(os.listdir(directory))
num_all_points = len(video_files)
dict_file_names = {}


t1 = time.time()

# Size the array by the number of files actually processed, so a directory with
# fewer than num_points entries does not leave all-zero rows in the saved data.
selected_files = video_files[:num_points]
hash_length = 3*image_size**2*dimensionality
points = np.zeros(shape=(len(selected_files), hash_length)).astype(int)
for i, filename in enumerate(selected_files):
    newpoint = video_to_hash(os.path.join(directory, filename), dimensionality, image_size).astype(int)
    flat_point = newpoint.flatten()
    points[i,:] = flat_point
    # Keyed by the hash itself: two videos with identical hashes share one entry
    # (the later file name wins). splitext drops the extension whatever its length.
    dict_file_names[tuple(flat_point)] = os.path.splitext(filename)[0]


# Persist the hash -> file name mapping; the context manager closes the file
# even if pickling fails.
with open("dict.pickle","wb") as pickle_out:
    pickle.dump(dict_file_names, pickle_out)

np.savetxt("points", points)
t2 = time.time()
print("Execution Time: " + str(t2-t1))


In [None]:
# Load the hashes (one row per video) and the hash -> file name mapping
# written by the hashing cell. ndmin=2 keeps a single-row file two-dimensional.
data = np.loadtxt("points", ndmin=2)
# NOTE: pickle.load can execute arbitrary code; only load a dict.pickle that
# this notebook produced itself.
with open("dict.pickle","rb") as pickle_in:
    names = pickle.load(pickle_in)

k = 5

# Whiten data (rescale each column by its standard deviation)
whitened = whiten(data)
# Find up to k clusters in the whitened data
codebook, distortion = kmeans(whitened, k)

# The centroids live in whitened space, so every point must be compared in that
# same space; measuring against the raw `data` rows gives meaningless distances.
# vq also copes with kmeans returning fewer than k centroids.
labels, distances = vq(whitened, codebook)

clusters = [[] for _ in range(k)]
for i, (label, distance) in enumerate(zip(labels, distances)):
    print(label, distance)
    # The stored names already have their extension removed, so use them as-is.
    clusters[label].append(names[tuple(data[i,:].flatten())])

clusters





In [None]:

def hamming_distance(x, y):
    """Count the bits that differ between two integer vectors.

    Both inputs are converted to integer arrays and flattened, so the
    ``(1, N)`` arrays returned by ``video_to_hash`` are accepted directly.
    Every element pair is XOR-ed and the set bits of the results are summed;
    for 0/1 vectors this is the number of positions where they disagree.

    Args:
        x: Array-like of numbers, cast to ``int``.
        y: Array-like of numbers with the same number of elements as ``x``.

    Returns:
        The total number of differing bits as a Python ``int``.

    Raises:
        ValueError: If ``x`` and ``y`` do not hold the same number of elements.
    """
    x = np.asarray(x).astype(int).ravel()
    y = np.asarray(y).astype(int).ravel()
    if x.shape != y.shape:
        raise ValueError(
            "hamming_distance needs vectors of equal length, got "
            + str(x.size) + " and " + str(y.size)
        )
    # Compare the whole vectors rather than a prefix whose length comes from a
    # notebook-level global.
    return sum(bin(v).count('1') for v in np.bitwise_xor(x, y).tolist())


In [None]:
# Imported here because the import at the top of the notebook is commented out;
# without it this cell fails with a NameError on AnnoyIndex.
from annoy import AnnoyIndex

# One row per hashed video; ndmin=2 keeps a single-row file two-dimensional.
data = np.loadtxt("points", ndmin=2)

# The index width must equal the length of the stored vectors, so take it from
# the data instead of recomputing it from notebook settings.
vector_length = data.shape[1]
t = AnnoyIndex(vector_length, metric = "euclidean")  # Length of item vector that will be indexed

# Only the first few rows are indexed; the limit is capped by the number of
# rows available so a small data set does not raise an IndexError.
max_indexed_items = 10
for i in range(min(max_indexed_items, data.shape[0])):
    t.add_item(i, data[i,:].flatten())

t.build(10) # 10 trees
t.save('test.ann')
#print(t.get_nns_by_item(0, num_all_points))
#print(t.get_n_items())