In [2]:
import argparse
import glob
import math
import os
import shutil
import threading
import warnings
from multiprocessing import cpu_count
from multiprocessing.dummy import Pool as ThreadPool

import numpy as np
from PIL import Image
from tqdm import tqdm

In [3]:
# Silence every warning for the rest of the session so the progress bar
# output is not interleaved with library warnings.
warnings.simplefilter('ignore')

# Remove Pillow's pixel-count ceiling so very large images can be opened.
# NOTE(review): this turns off Pillow's decompression-bomb guard; only
# appropriate for trusted image folders.
Image.MAX_IMAGE_PIXELS = None

In [5]:
class K_means:
  """Group images into ``k`` clusters by comparing coarse pixel histograms.

  Each image is shrunk to at most ``resample`` x ``resample`` pixels and
  described by a histogram of its pixel values, expressed as a percentage of
  the thumbnail's pixel count; the original width and height are appended
  when ``size`` is true. Images start in a round-robin cluster assignment and
  are then repeatedly moved to the cluster whose mean vector is closest
  (Manhattan distance) until no image changes cluster.

  Attributes:
    k: number of clusters.
    cluster: cluster index (0-based) of each successfully read image.
    data: feature vector of each successfully read image.
    end: path of each successfully read image (parallel to ``data``).
    i: rolling counter used by ``read_image`` to hand out indices.
    size: whether the original image dimensions are part of the features.
    resample: maximum thumbnail edge length used before the histogram.
    progress: optional object with an ``update(n)`` method (e.g. a tqdm bar).
  """

  def __init__(self, k=3, size=False, resample=32, progress=None):
    """Create an empty clusterer.

    Args:
      k: number of clusters; must be at least 1.
      size: if true, append the original width and height to each vector.
      resample: thumbnail bounding-box edge, in pixels.
      progress: optional progress reporter exposing ``update(n)``. When
        omitted, a module-level ``pbar`` is used if one exists, which is how
        the notebook's driver cell supplies its tqdm bar.

    Raises:
      ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
      raise ValueError("k must be a positive integer")
    self.k = k
    self.cluster = []
    self.data = []
    self.end = []
    self.i = 0
    self.size = size
    self.resample = resample
    self.progress = progress
    # read_image runs concurrently in a thread pool; the lock keeps the
    # rolling counter consistent.
    self._lock = threading.Lock()

  def manhattan_distance(self,x1,x2):
    """Return the L1 distance: the sum of absolute coordinate differences."""
    return sum((abs(float(x1[i]) - float(x2[i])) for i in range(len(x1))), 0.0)

  def euclidian_distance(self,x1,x2):
    """Return the L2 distance: the square root of the summed squared differences."""
    squares = sum(((float(x1[i]) - float(x2[i])) ** 2 for i in range(len(x1))), 0.0)
    return math.sqrt(squares)

  def _tick(self):
    """Advance the progress reporter by one step, if one is available."""
    bar = self.progress
    if bar is None:
      # Backward compatibility: the notebook defines a global tqdm bar
      # named ``pbar`` instead of passing one in.
      bar = globals().get("pbar")
    if bar is not None:
      bar.update(1)

  def read_image(self,im):
    """Load one image and turn it into a feature vector.

    Args:
      im: path of the image file.

    Returns:
      ``[index, vector, path]`` on success, where ``index`` is a rolling
      value in ``range(k)``; ``[None, None, None]`` if the image could not
      be read or processed (the error is printed, not raised).
    """
    try:
      # The context manager closes the underlying file even on failure.
      with Image.open(im) as img:
        osize = img.size
        img.thumbnail((self.resample,self.resample))
        npixels = float(img.size[0]*img.size[1])
        # NumPy's default binning is used (10 equal-width bins spanning the
        # image's own min..max per the NumPy docs), so bins are relative to
        # each image rather than to a fixed 0..255 range.
        counts = np.histogram(np.asarray(img))[0]
      v = [float(p)/npixels*100 for p in counts]
      if self.size :
        v += [osize[0], osize[1]]
    except Exception as e:
      print("Error reading ",im,e)
      return [None, None, None]
    # Take and advance the counter atomically so concurrent readers can
    # never observe or return an index outside range(k).
    with self._lock:
      i = self.i % self.k
      self.i = i + 1
    self._tick()
    return [i, v, im]

  def generate_k_means(self):
    """Compute the mean feature vector of every cluster.

    Returns:
      A list of ``k`` vectors. A cluster with no members gets a vector of
      ``inf`` so it is never the nearest centroid. If no data has been
      loaded, ``k`` empty vectors are returned.
    """
    if not self.data:
      return [[] for _ in range(self.k)]
    ndim = len(self.data[0])
    sums = [[0.0] * ndim for _ in range(self.k)]
    counts = [0] * self.k
    # Single pass over the data instead of one scan per cluster/dimension.
    for label, vector in zip(self.cluster, self.data):
      if 0 <= label < self.k:
        counts[label] += 1
        row = sums[label]
        for d in range(ndim):
          row[d] += vector[d]
    return [
      [s / float(n) for s in row] if n else [float('inf')] * ndim
      for row, n in zip(sums, counts)
    ]

  def generate_k_clusters(self,folder):
    """Read every image in parallel and build the initial clustering.

    Args:
      folder: iterable of image paths.

    Populates ``data``, ``end`` and ``cluster``. Unreadable images are
    dropped. The initial assignment is round-robin over the successfully
    read images in input order, so it does not depend on thread timing.
    """
    pool = ThreadPool(cpu_count())
    try:
      result = pool.map(self.read_image, folder)
    finally:
      pool.close()
      pool.join()
    loaded = [r for r in result if r[1] is not None]
    self.data = [r[1] for r in loaded]
    self.end = [r[2] for r in loaded]
    self.cluster = [n % self.k for n in range(len(loaded))]

  def rearrange_clusters(self, max_iter=None):
    """Reassign images to their nearest centroid until nothing changes.

    Args:
      max_iter: optional upper bound on the number of passes. ``None``
        (default) iterates until the assignment is stable. Centroids are
        means while distances are L1, so convergence is not strictly
        guaranteed; pass a bound to be safe on large inputs.
    """
    passes = 0
    isover = False
    while not isover and (max_iter is None or passes < max_iter):
      isover = True
      passes += 1
      m = self.generate_k_means()
      for x in range(len(self.cluster)):
        dist = [self.manhattan_distance(self.data[x], m[a]) for a in range(self.k)]
        # index(min(...)) keeps the lowest cluster number on ties.
        _mindist = dist.index(min(dist))
        if self.cluster[x] != _mindist :
          self.cluster[x] = _mindist
          isover = False

In [30]:
# Run configuration: source folder, number of clusters, thumbnail size,
# whether image dimensions are part of the features, and whether files are
# moved (True) or copied (False) into the cluster folders.
args = {"folder":"/media/ruben/Data Drive/test","kmeans":80,"resample":128,"size":False,"move":False}
types = ('*.jpg', '*.JPG', '*.png', '*.jpeg')

folder = args["folder"]
if not folder.endswith("/") :
    folder+="/"

# Collect the images, pattern by pattern, each group sorted by path.
imagePaths = []
for files in types :
    imagePaths.extend(sorted(glob.glob(folder+files)))
# On case-insensitive filesystems '*.jpg' and '*.JPG' match the same files;
# drop duplicates while keeping the first occurrence order.
imagePaths = list(dict.fromkeys(imagePaths))
nimages = len(imagePaths)

# Zero-padding width for the cluster folder names (folders run 1..kmeans).
# Counting digits avoids int(math.log(k, 10)), which is off by one at exact
# powers of ten because of floating-point rounding (e.g. k = 1000).
nfolders = len(str(args["kmeans"]))

if nimages <= 0 :
    print("No images found!")
if args["resample"] < 16 or args["resample"] > 256 :
    print("-r should be a value between 16 and 256")

# Progress bar advanced by K_means.read_image once per successfully read image.
pbar = tqdm(total=nimages)




  0%|          | 0/622 [00:00<?, ?it/s][A[A[A

In [31]:
# Build the clusterer from the run configuration.
k = K_means(
    k=args["kmeans"],
    size=args["size"],
    resample=args["resample"],
)
# Read every image and create the initial assignment ...
k.generate_k_clusters(imagePaths)
# ... then iterate until no image changes cluster.
k.rearrange_clusters()




  0%|          | 3/622 [00:00<04:44,  2.18it/s][A[A[A


  1%|▏         | 8/622 [00:00<02:29,  4.12it/s][A[A[A


  1%|▏         | 8/622 [00:00<02:29,  4.12it/s][A[A[A


  1%|▏         | 9/622 [00:00<02:28,  4.12it/s][A[A[A


  2%|▏         | 12/622 [00:00<02:28,  4.12it/s][A[A[A


  3%|▎         | 16/622 [00:00<02:27,  4.12it/s][A[A[A


  4%|▍         | 24/622 [00:00<01:44,  5.73it/s][A[A[A


  4%|▍         | 27/622 [00:00<00:54, 10.85it/s][A[A[A


  5%|▍         | 31/622 [00:00<00:54, 10.85it/s][A[A[A


  7%|▋         | 41/622 [00:00<00:23, 24.57it/s][A[A[A


  7%|▋         | 45/622 [00:00<00:12, 45.70it/s][A[A[A


  7%|▋         | 46/622 [00:00<00:12, 45.70it/s][A[A[A


  8%|▊         | 48/622 [00:00<00:12, 45.70it/s][A[A[A


  8%|▊         | 50/622 [00:00<00:12, 45.70it/s][A[A[A


  9%|▊         | 53/622 [00:01<00:12, 45.70it/s][A[A[A


 10%|█         | 64/622 [00:01<00:11, 47.18it/s][A[A[A


 11%|█▏        | 70/622 [00:01<00:11, 47.

In [32]:
# Preview the first 50 (path, cluster) pairs. Slicing instead of indexing
# keeps this cell from raising IndexError when fewer than 50 images loaded.
for path, label in zip(k.end[:50], k.cluster[:50]):
    print(path, label)

/media/ruben/Data Drive/test/00a52ce4-aa38-11ea-a49c-b0359fc72c2e.jpg 42
/media/ruben/Data Drive/test/00b16690-aa39-11ea-a6d7-b0359fc72c2e.jpg 53
/media/ruben/Data Drive/test/00b6930a-aa39-11ea-bae2-b0359fc72c2e.jpg 39
/media/ruben/Data Drive/test/00bb988b-aa39-11ea-a9c1-b0359fc72c2e.jpg 66
/media/ruben/Data Drive/test/00c973f6-aa39-11ea-924f-b0359fc72c2e.jpg 54
/media/ruben/Data Drive/test/00d8162c-aa38-11ea-9e61-b0359fc72c2e.jpg 45
/media/ruben/Data Drive/test/00dc9f06-aa39-11ea-a97d-b0359fc72c2e.jpg 27
/media/ruben/Data Drive/test/00de4b8c-aa39-11ea-bb3d-b0359fc72c2e.jpg 39
/media/ruben/Data Drive/test/00dea13a-aa38-11ea-b3a8-b0359fc72c2e.jpg 33
/media/ruben/Data Drive/test/00e30c00-aa38-11ea-b571-b0359fc72c2e.jpg 34
/media/ruben/Data Drive/test/00e4b62e-aa39-11ea-80af-b0359fc72c2e.jpg 15
/media/ruben/Data Drive/test/00f882ca-aa39-11ea-a7a4-b0359fc72c2e.jpg 74
/media/ruben/Data Drive/test/00f90e54-aa38-11ea-9691-b0359fc72c2e.jpg 25
/media/ruben/Data Drive/test/00f994af-aa39-11ea-a99

In [None]:
# Create one zero-padded, 1-based folder per cluster inside the source folder.
for i in range(k.k):
    try:
        os.makedirs(os.path.join(folder, str(i+1).zfill(nfolders)))
    except FileExistsError:
        # Only an existing folder is expected here; any other failure
        # (permissions, read-only disk, ...) should surface instead of being
        # reported as "already exists".
        print("Folder already exists")

# Copy by default; move only when explicitly requested.
action = shutil.move if args["move"] else shutil.copy

for path, label in zip(k.end, k.cluster):
    # The trailing separator (empty last component) makes the destination an
    # explicit directory, so a missing folder fails instead of creating a file.
    action(path, os.path.join(folder, str(label+1).zfill(nfolders), ""))
