In [None]:
import tensorflow.keras as keras # tf is a low model, every little detail, keras is high level api that they wrapped in tf2 
from tensorflow.keras.preprocessing import image # this is dependant on a lib pip install pillow 
from tensorflow.keras.applications.imagenet_utils import preprocess_input
from tensorflow.keras.models import Model

In [None]:
# Load VGG16 with the weights it learned on ImageNet, keeping the fully
# connected classifier head (include_top=True) so the "fc2" layer is present.
model = keras.applications.VGG16(include_top=True, weights='imagenet')
model.summary()

# Build a second model that shares VGG16's input but stops at "fc2", so it
# returns that layer's activations instead of the final class predictions.
fc2_activations = model.get_layer("fc2").output
feat_extractor = Model(inputs=model.input, outputs=fc2_activations)
feat_extractor.summary()

In [None]:
import glob 
import random 

# Gather the portrait JPEGs. This is a plain, non-recursive glob: "*" matches
# any file name, so only .jpg files sitting directly inside ./Portraits are
# returned (sub-folders are not searched).
portrait_pattern = './Portraits/*.jpg'
image_files = glob.glob(portrait_pattern)

In [None]:
image_files[:10]  # preview a few paths rather than dumping the whole list into the output

In [None]:
# Number of image paths matched by the glob.
len(image_files)

In [None]:
# Shuffle the path list in place so that a prefix of it is a random sample.
# No seed is set here, so the resulting order differs from run to run.
random.shuffle(image_files)

In [None]:
# Cap the working set: keep only the first 670 paths of the list.
max_images = 670
image_files = image_files[:max_images]

In [None]:
# Number of image paths still in the list at this point.
len(image_files)

In [None]:
image_files[:10]  # a few leading paths are enough to see that the order is random

In [None]:
import numpy as np
# One feature vector per image, appended in the same order as image_files.
features = []

# Height/width the network expects, read from the model itself. It does not
# change between images, so it is computed once outside the loop.
target_size = model.input_shape[1:3]

for i, image_path in enumerate(image_files):
    if i % 10 == 0:
        print(f"analyzed {i} out of {len(image_files)}")

    img = image.load_img(image_path, target_size=target_size)
    x = image.img_to_array(img)    # PIL image -> numpy array
    x = np.expand_dims(x, axis=0)  # add a leading batch axis (batch of one)
    x = preprocess_input(x)

    # verbose=0 silences Keras' own progress bar, which would otherwise be
    # printed once per image and bury the progress message above.
    feat = feat_extractor.predict(x, verbose=0)[0]  # [0]: the only item in the batch
    features.append(feat)
    

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans # this is sensitive to distance, so standardscaler normalizes it to distance 

# Standardise every feature column to zero mean and unit variance so that no
# single dimension dominates the distance computations done later.
ss = StandardScaler()
ss.fit(features)
scaled = ss.transform(features)
scaled[0]  # first standardised vector: centred on 0, but not limited to [-1, 1]

In [None]:
# Partition the standardised features into 20 clusters. No random_state is
# passed, so the cluster assignments can differ between runs.
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters)
clusters = kmeans.fit(scaled).labels_  # cluster index assigned to each image
clusters

In [None]:
from sklearn.neighbors import NearestNeighbors

# For each KMeans centre, look up the 5 images whose feature vectors lie
# nearest to it. Cosine distance is used because the features are still
# high-dimensional here: no dimensionality reduction has been applied yet.
neighbors = NearestNeighbors(n_neighbors=5, metric='cosine')
neighbors.fit(scaled)
_, closest = neighbors.kneighbors(kmeans.cluster_centers_)  # (distances, indices)



In [None]:
closest # one row per cluster centre; each row lists the indices (into image_files) of that centre's 5 nearest images

In [None]:
# NOTE(review): IPython also uses `_` for "output of the last cell"; a named variable would be less fragile -- confirm this still shows the distances.
_ # the distances returned by kneighbors(); `_` is the conventional name for a value that is otherwise not used

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

img_width = 200   # every thumbnail is scaled to this width, keeping its aspect ratio
row_margin = 20   # vertical gap left below each cluster row, in pixels

# Pass 1: build the thumbnails, one row per cluster centre. Doing this before
# the canvas exists lets the canvas be sized from the real row heights; with a
# fixed per-row height, rows of tall thumbnails push the later rows past the
# bottom edge of the canvas, where they are cut off.
thumb_rows = []
for cluster_row in closest:
    thumbs = []
    for col_num in cluster_row:
        with Image.open(image_files[col_num]) as img:
            img_ar = img.width / img.height
            thumb_height = max(1, int(img_width / img_ar))  # never ask for a 0 px height
            # Image.LANCZOS is the filter that used to be reachable as
            # Image.ANTIALIAS as well; Pillow 10 removed that alias.
            thumbs.append(img.resize((img_width, thumb_height), Image.LANCZOS))
    thumb_rows.append(thumbs)

# Each row is as tall as its tallest thumbnail.
row_heights = [max(thumb.height for thumb in thumbs) for thumbs in thumb_rows]

grid_image = Image.new(
    'RGB',
    (closest.shape[1] * img_width, sum(row_heights) + row_margin * len(row_heights)),
    (0, 0, 0),
)

# Pass 2: paste each row left to right, stacking the rows top to bottom.
ypos = 0
for thumbs, row_height in zip(thumb_rows, row_heights):
    xpos = 0
    for thumb in thumbs:
        grid_image.paste(thumb, (xpos, ypos))
        xpos += thumb.width
    ypos += row_height + row_margin

plt.figure(figsize=(32, 24))
plt.imshow(grid_image)

In [None]:
# Dimensionality reduction
# We have a very large number of features, i.e. a high-dimensional space, and we want to reduce it to a lower-dimensional problem (for example, going from 3D to 2D).

In [None]:
# Dimensionality reduction always loses some information.

In [None]:
# t-SNE does a really good job of preserving spatial relationships (the next cell uses UMAP for the reduction).


In [None]:
import umap.umap_ as umap
# Reduce the standardised features to a low-dimensional embedding with UMAP,
# using the library's default settings.
reducer = umap.UMAP()
embedding = reducer.fit_transform(scaled)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale each embedding axis to the [0, 1] range so the coordinates can be
# used as fractions of a canvas size.
scaler = MinMaxScaler()
embedding_scaled = scaler.fit_transform(embedding)

In [None]:
width = 4000    # canvas width in pixels
height = 3000   # canvas height in pixels
max_dim = 200   # longest allowed side of a pasted tile, in pixels

# Scatter every image onto one large canvas at its (rescaled) embedding position.
full_image = Image.new('RGBA', (width, height), (0, 0, 0, 255))
for image_path, pos in zip(image_files, embedding_scaled):
    x, y = pos[0], pos[1]
    with Image.open(image_path) as source:
        # Shrink factor that brings the longer side down to max_dim; the
        # leading 1 means images already smaller than max_dim are not enlarged.
        rs = max(1, source.width / max_dim, source.height / max_dim)
        tile_size = (max(1, int(source.width / rs)), max(1, int(source.height / rs)))
        # Image.LANCZOS is the filter that used to be reachable as
        # Image.ANTIALIAS as well; Pillow 10 removed that alias.
        tile = source.resize(tile_size, Image.LANCZOS)
    # Offsets are scaled by (canvas size - max_dim) so a tile placed at
    # coordinate 1.0 still fits inside the canvas.
    full_image.paste(
        tile,
        (int((width - max_dim) * x), int((height - max_dim) * y)),
        mask=tile.convert('RGBA'),
    )

plt.figure(figsize=(32, 24))
plt.imshow(full_image)

In [None]:
import rasterfairy

In [None]:
# Grid dimensions (columns, rows) that the embedding points are snapped onto.
nx = 23
ny = 25
# NOTE(review): nx * ny = 575 cells, while up to 670 images are kept earlier in
# the notebook -- confirm the grid is large enough for the number of points.
grid_shape = (nx, ny)
grid_assignment = rasterfairy.transformPointCloud2D(embedding, target=grid_shape)

In [None]:
tile_width = 100    # width of one grid cell, in pixels
tile_height = 100   # height of one grid cell, in pixels

full_width = tile_width * nx
full_height = tile_height * ny
aspect_ratio = tile_width / tile_height  # computed but not used in this cell

grid_image = Image.new('RGB', (full_width, full_height), (0, 0, 0, 255))

# Each entry of grid_assignment[0] is unpacked as a (column, row) grid position
# and paired with the image at the same index in image_files.
for image_path, grid_pos in zip(image_files, grid_assignment[0]):
    idx_x, idx_y = grid_pos
    x, y = tile_width * idx_x, tile_height * idx_y
    with Image.open(image_path) as source:
        tile_ar = source.width / source.height
        # The thumbnail is 80% of the cell width; its height follows from the
        # image's aspect ratio (at least 1 px so resize never gets a 0 size).
        # NOTE(review): when tile_ar < 0.8 the height exceeds tile_height and
        # the thumbnail spills into the cell below -- confirm that is intended.
        tile_size = (int(0.8 * tile_width), max(1, int(0.8 * tile_height / tile_ar)))
        # Image.LANCZOS is the filter that used to be reachable as
        # Image.ANTIALIAS as well; Pillow 10 removed that alias.
        tile = source.resize(tile_size, Image.LANCZOS)
    grid_image.paste(tile, (int(x), int(y)))

plt.figure(figsize=(16, 12))
plt.imshow(grid_image)

In [None]:
# One JSON-serialisable record per image: its file name, its grid cell and its
# rescaled embedding coordinates.
lookup = []

# The loop variable is deliberately not called `image`: that name is the Keras
# image-preprocessing module imported at the top of the notebook.
for image_path, grid_pos, cluster_pos in zip(image_files, grid_assignment[0], embedding_scaled):
    lookup.append({
        'filename': image_path.replace('./', ""),
        # NOTE(review): 'grid_post' looks like a typo for 'grid_pos'; the key
        # is left unchanged in case a consumer of the JSON already reads it.
        'grid_post': grid_pos.tolist(),    # numpy row -> plain list
        'cluster_pos': cluster_pos.tolist()
    })

In [None]:
import json 
# Write the lookup records to disk as JSON.
with open('image_umap_position.json', 'w') as outfile:
    json.dump(lookup, outfile)