# Descritores de Imagens via Histograma de Cor

Exemplos baseados em https://github.com/xn2333/OpenCV/blob/master/Seminar_Image_Processing_in_Python.ipynb







# Importando Bibliotecas

In [94]:
import numpy as np
import pandas as pd
import cv2 as cv 
from google.colab.patches import cv2_imshow # for image display
from skimage import io
from PIL import Image 
import matplotlib.pylab as plt

# Vamos construir nosso dataset

In [95]:
# Mount Google Drive at /content/drive; the data paths used in this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [108]:
# percentage of rows in images_to_post.csv that feature each member
"""
loona = [
      'heejin',
      'hyunjin',
      'haseul',
      'yeojin',
      'vivi',
      'kimlip',
      'jinsoul',
      'choerry',
      'yves',
      'chuu',
      'gowon',
      'oliviahye'
  ]

deukae = [
    "jiu",
    "sua",
    "siyeon",
    "handong",
    "yoohyeon",
    "dami",
    "gahyeon"
]

images = pd.read_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/images_to_post.csv')

for l in loona:
    print('{0}: {1}'.format(l,(100 * len(images[images['loona'].str.contains(l)].index) / len(images.index))))

print()

for d in deukae:
    print('{0}: {1}'.format(d,(100 * len(images[images['deukae'].str.contains(d)].index) / len(images.index))))   
"""

heejin: 9.090909090909092
hyunjin: 9.090909090909092
haseul: 8.041958041958042
yeojin: 8.041958041958042
vivi: 7.6923076923076925
kimlip: 8.041958041958042
jinsoul: 8.041958041958042
choerry: 9.090909090909092
yves: 8.391608391608392
chuu: 6.993006993006993
gowon: 8.041958041958042
oliviahye: 9.44055944055944

jiu: 13.636363636363637
sua: 16.433566433566433
siyeon: 14.335664335664335
handong: 15.384615384615385
yoohyeon: 11.888111888111888
dami: 15.034965034965035
gahyeon: 13.286713286713287


In [97]:
def load_zero():
  """Build a brand-new dataset from the images in both group folders.

  Every file that OpenCV manages to read is recorded; files for which
  ``cv.imread`` returns ``None`` are skipped.

  Returns:
    A dict of parallel lists keyed by "group", "filename", "image" and
    "post". Every entry starts with ``post`` set to ``False``.
  """
  import os

  root = '/content/drive/MyDrive/hourlyloonacatcher/'
  dataset = {"group": [], "filename": [], "image": [], "post": []}

  # Same traversal order as before: the loona folder first, then deukae.
  for group_name in ('loona', 'deukae'):
    folder = root + group_name + '/'
    for entry in os.listdir(folder):
      pixels = cv.imread(folder + entry)
      if pixels is None:
        continue
      dataset["group"].append(group_name)
      dataset["filename"].append(entry)
      dataset["image"].append(pixels)
      dataset["post"].append(False)

  return dataset

def load_flying():
  """Reload the saved dataset table and attach the image data to it.

  The saved table (df_dataset.csv) supplies "group", "filename" and "post".
  Images are then loaded so that index ``i`` of every list in the returned
  dict refers to the same file:

  1. each saved row is re-read from ``<path>/<group>/<filename>``, keeping its
     saved ``post`` flag;
  2. files present in the group folders but absent from the saved table are
     appended afterwards with ``post`` set to ``False``.

  The previous implementation appended images in ``os.listdir`` order while
  the other columns stayed in CSV order, so the "image" list could be out of
  step with (or a different length from) "filename"/"post".

  NOTE(review): a saved row whose image can no longer be read is dropped from
  the returned dataset (and therefore from the table saved later), because
  every row needs an image for feature extraction. Confirm this is acceptable.

  Returns:
    A dict of parallel lists keyed by "group", "filename", "image", "post".
  """
  import os

  path = '/content/drive/MyDrive/hourlyloonacatcher/'

  saved = pd.read_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/df_dataset.csv').drop(columns='Unnamed: 0').reset_index().drop(columns='index').to_dict(orient='list')

  dataset = {"group": [], "filename": [], "image": [], "post": []}

  def add_entry(group, filename, post):
    # Append one row to all four lists at once so they can never drift apart.
    img = cv.imread(path + group + '/' + filename)
    if img is None:
      return
    dataset["group"].append(group)
    dataset["filename"].append(filename)
    dataset["image"].append(img)
    dataset["post"].append(post)

  # 1) Rows already known from the previous run, in their saved order.
  for group, filename, post in zip(saved["group"], saved["filename"], saved["post"]):
    add_entry(group, filename, post)

  # 2) Files added to the folders since the table was saved. A set gives
  #    constant-time membership tests; the group is part of the key so equal
  #    filenames in different folders are not confused.
  known = set(zip(saved["group"], saved["filename"]))
  for group in ('loona', 'deukae'):
    for filename in os.listdir(path + group + '/'):
      if (group, filename) not in known:
        add_entry(group, filename, False)

  return dataset

# Extraindo características do dataset usando Histograma de Cor

In [98]:
def prepareX(dataset):
  """Turn every image into one feature row of normalised colour histograms.

  For each image in ``dataset["image"]`` a 256-bin histogram is computed per
  channel with ``cv.calcHist``. Per channel, each image's histogram is scaled
  to unit Euclidean length, and the three channels are concatenated as
  red, green, blue.

  Fix: the previous version concatenated ``(X_r, X_g, X_g)``, so the blue
  histogram was never used and green was counted twice.

  Args:
    dataset: dict with an "image" entry holding the images to describe.

  Returns:
    A 2-D array with one row per image and 3 * 256 columns.
  """
  # Channel index -> label, same mapping as the original ('b','g','r') tuple.
  color = ('b','g','r')

  per_channel = {col: [] for col in color}
  for image in dataset["image"]:
    for i, col in enumerate(color):
      per_channel[col].append(cv.calcHist([image],[i],None,[256],[0,256]))

  def unit_length(hists):
    # hists is stacked as (n_images, 256, 1); divide each image's histogram
    # by its own Euclidean norm.
    stacked = np.array(hists)
    length = np.sqrt((stacked**2).sum(axis=1))[:,None]
    return stacked / length

  X_r = unit_length(per_channel['r'])
  X_g = unit_length(per_channel['g'])
  X_b = unit_length(per_channel['b'])

  X = np.concatenate((X_r,X_g,X_b),axis=1)

  # Drop the trailing singleton axis left over from calcHist's column vectors.
  X = X.reshape(X.shape[0],X.shape[1])

  return X

# Agrupamento de Imagens

In [99]:
from sklearn.cluster import KMeans
import numpy as np

def do_kmeans(X, n_clusters=100):
  """Fit a KMeans model with ``n_clusters`` clusters on ``X`` and return it."""
  model = KMeans(n_clusters=n_clusters)
  model.fit(X)
  return model

## Escolher a imagem

In [100]:
def get_probs(loona_prob=None, deukae_prob=None):
  """Return the member names of both groups with their selection weights.

  Fix: the weights used to default to list literals in the signature. Those
  lists are created once and shared by every call, and ``update_probs``
  modifies weight lists in place, so a later ``get_probs()`` returned weights
  already changed by an earlier run. Fresh lists are now built per call.

  Args:
    loona_prob: optional list of weights for the loona members; when omitted,
      a new uniform list (1/12 each) is created.
    deukae_prob: optional list of weights for the deukae members; when
      omitted, a new uniform list (1/7 each) is created.

  Returns:
    ``(loona, loona_prob, deukae, deukae_prob)``. Lists passed in by the
    caller are returned as the same objects.
  """
  loona = [
      'heejin',
      'hyunjin',
      'haseul',
      'yeojin',
      'vivi',
      'kimlip',
      'jinsoul',
      'choerry',
      'yves',
      'chuu',
      'gowon',
      'oliviahye'
  ]

  deukae = [
      "jiu",
      "sua",
      "siyeon",
      "handong",
      "yoohyeon",
      "dami",
      "gahyeon"
  ]

  if loona_prob is None:
    loona_prob = [1.0 / len(loona)] * len(loona)
  if deukae_prob is None:
    deukae_prob = [1.0 / len(deukae)] * len(deukae)

  return loona, loona_prob, deukae, deukae_prob

In [101]:
def update_probs(loona_prob, deukae_prob, loona, deukae, loona_choice, deukae_choice):
  """Adjust both weight lists in place after a pair of members was drawn.

  A member that appears in the corresponding choice list has its weight reset
  to the base value (1/12 for loona, 1/7 for deukae); every other member's
  weight grows by that same base value.

  Returns:
    The same two list objects that were passed in, ``(loona_prob, deukae_prob)``.
  """
  def _refresh(weights, names, chosen, base):
    for pos, member in enumerate(names):
      if member in chosen:
        weights[pos] = base
      else:
        weights[pos] = weights[pos] + base

  _refresh(loona_prob, loona, loona_choice, 1.0/12)
  _refresh(deukae_prob, deukae, deukae_choice, 1.0/7)

  return loona_prob, deukae_prob

In [102]:
import random 
from sklearn.metrics import silhouette_samples

# Number of KMeans clusters used by first_run() and flying_run() below.
n_clusters = 50

def _rank_clusters(X, cluster_labels):
  """Return the cluster ids ordered by ascending mean silhouette value."""
  sample_silhouette_values = silhouette_samples(X, cluster_labels)

  means_lst = [
      sample_silhouette_values[cluster_labels == label].mean()
      for label in range(n_clusters)
  ]

  # Sorting (mean, id) tuples: equal means fall back to the cluster id.
  return [x for _, x in sorted(zip(means_lst, range(n_clusters)))]


def _find_unposted(dataset, cluster_labels, cluster, member):
  """Return the first unposted image id in `cluster` whose filename contains `member`, or -1."""
  for image_id, label in enumerate(cluster_labels):
    if dataset["post"][image_id]:
      continue
    if label == cluster and member in dataset["filename"][image_id]:
      print(image_id)
      return image_id
  return -1


def _draw_pair(loona, loona_prob, deukae, deukae_prob):
  """Draw one member per group by weight, then update the weights for that draw."""
  loona_choice = random.choices(loona, weights=loona_prob, k=1)
  deukae_choice = random.choices(deukae, weights=deukae_prob, k=1)
  loona_prob, deukae_prob = update_probs(loona_prob, deukae_prob, loona, deukae, loona_choice, deukae_choice)
  return loona_choice, deukae_choice, loona_prob, deukae_prob


def _queue_pairs(dataset, cluster_labels, clusterList, images_to_post,
                 loona, loona_prob, deukae, deukae_prob, n_posts):
  """Try `n_posts` times to append one (loona, deukae) filename pair to `images_to_post`.

  Marks the chosen images as posted in `dataset` and returns the final
  `(loona_prob, deukae_prob)` weights.
  """
  loona_choice, deukae_choice, loona_prob, deukae_prob = _draw_pair(loona, loona_prob, deukae, deukae_prob)

  for _ in range(n_posts):
    print(loona_choice[0], deukae_choice[0])
    imageLoona, imageDeukae = -1, -1

    # Walk the clusters in ranked order looking for an image of each member.
    for c in clusterList:
      # NOTE(review): a match found in an earlier cluster is kept when the
      # current cluster has none, so the two images of a pair can come from
      # different clusters. Preserved from the original; confirm it is intended.
      found = _find_unposted(dataset, cluster_labels, c, loona_choice[0])
      if found != -1:
        imageLoona = found

      found = _find_unposted(dataset, cluster_labels, c, deukae_choice[0])
      if found != -1:
        imageDeukae = found

      if (imageLoona != -1) and (imageDeukae != -1):
        dataset["post"][imageLoona] = True
        dataset["post"][imageDeukae] = True
        break

    if (imageLoona != -1) and (imageDeukae != -1):
      images_to_post['loona'].append(dataset["filename"][imageLoona])
      images_to_post['deukae'].append(dataset["filename"][imageDeukae])
    else:
      print("No more image combinations for the pair")

    # Fix: always draw a new pair for the next attempt. Previously a new pair
    # was drawn only after a success, so once a pair had no match every
    # remaining iteration repeated the same failing search.
    loona_choice, deukae_choice, loona_prob, deukae_prob = _draw_pair(loona, loona_prob, deukae, deukae_prob)

  return loona_prob, deukae_prob


def _save_state(dataset, images_to_post, loona, loona_prob, deukae, deukae_prob):
  """Write the dataset table, the post queue and both weight tables to CSV."""
  # The pixel data is dropped; only group/filename/post are saved.
  del dataset['image']
  df_dataset = pd.DataFrame(dataset)

  df_images_to_post = pd.DataFrame(images_to_post)

  df_loona = pd.DataFrame({'name': loona, 'prob': loona_prob})
  df_deukae = pd.DataFrame({'name': deukae, 'prob': deukae_prob})

  df_dataset.to_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/df_dataset.csv')
  df_images_to_post.to_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/images_to_post.csv')
  df_loona.to_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/df_loona.csv')
  df_deukae.to_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/df_deukae.csv')


def first_run(n_posts=192):
  """Generate the first batch of image pairs when nothing has been saved yet.

  Loads every image with ``load_zero``, clusters the histogram features,
  starts from the weights returned by ``get_probs()`` and an empty post
  queue, then saves all state to the CSV files.

  Args:
    n_posts: number of pairing attempts (defaults to the original 192).
  """
  dataset = load_zero()
  X = prepareX(dataset)
  kmeans = do_kmeans(X, n_clusters)
  cluster_labels = kmeans.labels_

  clusterList = _rank_clusters(X, cluster_labels)

  images_to_post = {'loona': [], 'deukae': []}

  loona, loona_prob, deukae, deukae_prob = get_probs()

  loona_prob, deukae_prob = _queue_pairs(
      dataset, cluster_labels, clusterList, images_to_post,
      loona, loona_prob, deukae, deukae_prob, n_posts)

  _save_state(dataset, images_to_post, loona, loona_prob, deukae, deukae_prob)


def flying_run(n_posts=192):
  """Append a new batch of image pairs on top of previously saved state.

  Loads the dataset with ``load_flying``, clusters the histogram features,
  restores the post queue and member weights from CSV, queues more pairs and
  saves all state again.

  Args:
    n_posts: number of pairing attempts (defaults to the original 192).
  """
  dataset = load_flying()
  X = prepareX(dataset)
  kmeans = do_kmeans(X, n_clusters)
  cluster_labels = kmeans.labels_

  clusterList = _rank_clusters(X, cluster_labels)

  # NOTE(review): the queue is read from .../hourlyloonacatcher/ but written
  # to .../hourlyloonacatcher_dfs/ by _save_state. Path kept as in the
  # original; confirm that reading from a different folder is intended.
  images_to_post = pd.read_csv('/content/drive/MyDrive/hourlyloonacatcher/images_to_post.csv').drop(columns='Unnamed: 0').reset_index().drop(columns='index').to_dict(orient='list')
  df_loona = pd.read_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/df_loona.csv').drop(columns='Unnamed: 0').reset_index().drop(columns='index')
  loona, loona_prob = list(df_loona['name']), list(df_loona['prob'])
  df_deukae = pd.read_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/df_deukae.csv').drop(columns='Unnamed: 0').reset_index().drop(columns='index')
  deukae, deukae_prob = list(df_deukae['name']), list(df_deukae['prob'])

  loona_prob, deukae_prob = _queue_pairs(
      dataset, cluster_labels, clusterList, images_to_post,
      loona, loona_prob, deukae, deukae_prob, n_posts)

  _save_state(dataset, images_to_post, loona, loona_prob, deukae, deukae_prob)


In [103]:
# Bootstrap: use when no image pairs were generated previously.
#first_run()

# Incremental: use when image pairs are already being posted.
flying_run()

heejin jiu
1080
3088
kimlip siyeon
1791
2307
haseul handong
4849
2410
1344
4902
jinsoul siyeon
2
4680
yves gahyeon
2958
276
hyunjin jiu
1243
3145
gowon sua
690
2098
oliviahye dami
827
2866
vivi gahyeon
1588
3368
1663
2987
haseul siyeon
4708
2358
2211
2208
1314
2246
kimlip dami
1799
2896
heejin jiu
1112
5388
chuu handong
3997
4849
hyunjin siyeon
1255
4708
yeojin yoohyeon
1400
2569
gowon gahyeon
743
655
3009
choerry dami
169
5120
haseul sua
2173
2099
2119
2095
2046
1305
2065
kimlip jiu
3480
1768
5359
yves dami
5125
369
5150
oliviahye yoohyeon
4123
2582
hyunjin handong
1265
4871
oliviahye dami
4124
5125
jinsoul gahyeon
62
26
1899
5277
heejin jiu
4229
4289
5416
yeojin dami
1470
5189
oliviahye sua
2173
941
2099
gowon siyeon
743
4731
haseul sua
2173
4507
2119
2095
2046
2144
4521
1346
3201
chuu yoohyeon
4035
2705
heejin dami
4229
5241
hyunjin handong
1146
2410
choerry gahyeon
236
71
94
87
3013
yves jiu
5445
279
3067
gowon sua
766
2173
vivi dami
1588
5264
heejin siyeon
4309
4786
chuu dami
3994

In [106]:
# Reload the saved dataset table, dropping the 'Unnamed: 0' column and resetting the index.
df = pd.read_csv('/content/drive/MyDrive/hourlyloonacatcher_dfs/df_dataset.csv').drop(columns='Unnamed: 0').reset_index().drop(columns='index')

In [107]:
# Count the rows whose 'post' value is True.
len(df[df['post'] == True])

768