In [1]:
import pandas as pd
import numpy as np

#img load
import requests
from fake_useragent import UserAgent 
from io import BytesIO
from keras.preprocessing import image

#img show
import matplotlib.pyplot as plt
from PIL import Image
import random

#preprocessing
from keras.applications.vgg16 import preprocess_input

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# color distributions
import cv2
import imutils
import urllib.request as urllib2

In [2]:
def extract_features(img, model):
    # get the feature vector
    features = model.predict(img, use_multiprocessing=True)
    return features

def show_image(img):
    
    image = Image.open(img)
    imgplot = plt.imshow(image)
    plt.show()
    
# function that lets you view a cluster (based on identifier)        
def view_cluster(cluster):
    plt.figure(figsize = (25,25));
    
    # gets the list of filenames for a cluster
    imgs = groups[cluster]
    # only allow up to 30 images to be shown at a time
    if len(imgs) > 30:
        t = f"Clipping cluster number {cluster} size from {len(imgs)} to 30"
        imgs =  random.sample(img_list, 30)
    else:
        t = f"Cluster number {cluster}"
        
    # plot each image in the cluster
    plt.suptitle(t, fontsize=30)
    for index, img in enumerate(imgs):
        plt.subplot(10,10,index+1);

        img_a = np.array(img)
        plt.imshow(img_a)
        plt.axis('off')

import dataframes

In [3]:
img_link_df = pd.read_pickle('attractions_img_links_df.pkl')
att_loc_df = pd.read_pickle('attractions_loc_df.pkl')

download images and img array into list

In [4]:
ua = UserAgent()
headers = {'user-agent': ua.random}

end =  len(img_link_df)
img_list = []
img_preprocessed_list = []
attraction_img_list = []
image_color_preprocess_list = []
locations = []
urls = []

for x in range(0, end ):
    
    url_list = img_link_df.iloc[x]
#     print(url_list)
    att = img_link_df.iloc[[x]].index.values[0]
#     print(att)
    img_num = 1
    for url in url_list:

        try:
            #download img from url
            response = requests.get(url, headers = headers)
            image_io = BytesIO(response.content)
            img = image.load_img(image_io, target_size=(224, 224))
            img_list.append(img)
            
            # convert from 'PIL.Image.Image' to numpy array
            img_array = np.array(img)
            img_array_reshaped = img_array.reshape(1,224,224,3) #reshape(num_of_samples, dim 1, dim 2, channels)
            img_preprocessed_list.append(preprocess_input(img_array_reshaped)) #prepare image for model
            
            #find name of attraction and img num in attraction
            attraction_img_list.append(att+'_'+str(img_num))
            img_num = img_num + 1
            
            urls.append(url)
            
            if img_num % 5 == 0:
                ua = UserAgent()
                headers = {'user-agent': ua.random}
            
#             #preprocess for color
#             req = urllib.request.Request(url, headers=headers)
#             resp = urllib.request.urlopen(req)
#             image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
#             features = []
#             try:
#                 resp = urllib.request.urlopen(url)
#                 image = np.array(bytearray(resp.read()), dtype = np.uint8)
#                 image = cv2.imdecode(image, cv2.IMREAD_COLOR)
#                 image_color_preprocess_list.append(image)
#             except:
#                 print("xxxxxxx")
            
        except:
            continue

In [28]:
img_preprocessed_list[0].shape

(1, 224, 224, 3)

In [6]:
print(len(img_list), len(attraction_img_list), len(urls))

77846 77846 77846


load model

In [7]:
# load model
model = VGG16()
# remove the output layer
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

geat features of each image and load into dictionary

In [8]:
data = {}
for name,img in zip(attraction_img_list, img_preprocessed_list):
    data[name] = extract_features(img, model)
    
# get a list of just the features
feat = np.array(list(data.values()))
print(feat.shape)

# reshape so that there are all samples of 4096 vectors
feat = feat.reshape(-1,4096)
print(feat.shape)

(77846, 1, 4096)
(77846, 4096)


Dimensionality Reduction unsing PCA

In [9]:
pca = PCA(n_components=100, random_state=22)
pca.fit(feat)
x = pca.transform(feat)

KMeans clustering

In [10]:
k = 100
kmeans = KMeans(n_clusters=k,n_jobs=-1, random_state=22)
kmeans.fit(x)



KMeans(n_clusters=100, n_jobs=-1, random_state=22)

In [11]:
print(len(kmeans.labels_), len(img_list))

77846 77846


In [12]:
# holds the cluster id and the images { id: [images] }
groups = {}
for img, cluster in zip(img_list,kmeans.labels_):
    if cluster not in groups.keys():
        groups[cluster] = []
        groups[cluster].append(img)
    else:
        groups[cluster].append(img)

In [1]:
# for cluster in range(0,k):
#     view_cluster(cluster)

Find optimal k value

In [2]:
# # this is just incase you want to see which value for k might be the best 
# sse = []
# list_k = list(range(50, 300))

# for k in list_k:
#     km = KMeans(n_clusters=k, random_state=22, n_jobs=-1)
#     km.fit(x)
    
#     sse.append(km.inertia_)

# # Plot sse against k
# plt.figure(figsize=(6, 6))
# plt.plot(list_k, sse)
# plt.xlabel(r'Number of clusters *k*')
# plt.ylabel('Sum of squared distance');

pickle data

In [15]:
# However, while RGB values are simple to understand, the RGB color space fails to mimic how humans perceive color. 
# Instead, we are going to use the HSV color space which maps pixel intensities into a cylinder:
import time

def load_img_open_cv(url):
    ua = UserAgent()
    headers = ('user-agent', ua.random)
    
    request = urllib2.Request(url)
    request.add_header(headers[0], headers[1])
    resp = urllib2.urlopen(request)
#     resp = urllib.request.urlopen(url)

    image = np.array(bytearray(resp.read()), dtype = np.uint8)
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    
    return image
    
def describe( url, bins):
    try:
        image = load_img_open_cv(url)
    #     print(image)

        try:
            # convert the image to the HSV color space and initialize the features used to quantify the image
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
            features = []
        except:
            return None

        # grab the dimensions and compute the center of the image
        (h, w) = image.shape[:2]
        (cX, cY) = (int(w * 0.5), int(h * 0.5))

        # divide the image into four rectangles/segments (top-left, top-right, bottom-right, bottom-left)
        segments = [(0, cX, 0, cY), (cX, w, 0, cY), (cX, w, cY, h), (0, cX, cY, h)]

        # construct an elliptical mask representing the center of the image
        (axesX, axesY) = (int(w * 0.75) // 2, int(h * 0.75) // 2)
        ellipMask = np.zeros(image.shape[:2], dtype = "uint8")
        cv2.ellipse(ellipMask, (cX, cY), (axesX, axesY), 0, 0, 360, 255, -1)

        # loop over the segments
        for (startX, endX, startY, endY) in segments:

            # construct a mask for each corner of the image, subtracting the elliptical center from it
            cornerMask = np.zeros(image.shape[:2], dtype = "uint8")
            cv2.rectangle(cornerMask, (startX, startY), (endX, endY), 255, -1)
            cornerMask = cv2.subtract(cornerMask, ellipMask)

            # extract a color histogram from the image, then update the feature vector
            hist = histogram(image, cornerMask, bins)
            features.extend(hist)

        # extract a color histogram from the elliptical region and update the feature vector
        hist = histogram(image, ellipMask, bins)
        features.extend(hist)

        return features
    
    except:
        return None

def histogram(image, mask, bins):
    # extract a 3D color histogram from the masked region of the image, using the supplied number of bins per channel
    hist = cv2.calcHist([image], [0,1,2], mask, [bins,bins,bins],[0, 256, 0, 256, 0, 256])
    
    # normalize the histogram if we are using OpenCV 2.4
    if imutils.is_cv2():
        hist = cv2.normalize(hist).flatten()
        
    # otherwise handle for OpenCV 3+
    else:
        hist = cv2.normalize(hist, hist).flatten()

    return hist

def plot_color_hist(url):
    img = load_img_open_cv(url)
    
    color = ('b','g','r')
    for i,col in enumerate(color):
        histr = cv2.calcHist([img],[i],None,[256],[0,256])
        plt.plot(histr,color = col)
        plt.xlim([0,256])
    plt.show()


Get Color features for each image

In [26]:
bins = 7
color_data = {}
# idx = 0
for name,url in zip(attraction_img_list, urls):
    color_data[name] = describe(url, bins)
    
#     if idx%5==0:
#         ua = UserAgent()
#         headers = ('user-agent', ua.random)
#     idx = idx +1
#     if idx == 200:
#         break
    
# get a list of just the color features
color_feat = np.array(list(color_data.values()))
print(color_feat.shape)

# reshape so that there are all samples of 4096 vectors
# feat = feat.reshape(-1,4096)
# print(feat.shape)

(77846,)


  color_feat = np.array(list(color_data.values()))


Kmeans with color

In [29]:
k = 120
kmeans_color = KMeans(n_clusters=k,n_jobs=-1, random_state=22)
kmeans_color.fit(color_feat)

# holds the cluster id and the images { id: [images] }
color_groups = {}
for img, cluster in zip(img_list,kmeans_color.labels_):
    if cluster not in groups.keys():
        color_groups[cluster] = []
        color_groups[cluster].append(img)
    else:
        color_groups[cluster].append(img)
        


ValueError: setting an array element with a sequence.

In [None]:
for cluster in range(0,k):
    view_cluster(cluster)

In [23]:
# this is just incase you want to see which value for k might be the best 
sse = []
list_k = list(range(50, 300))

for k in list_k:
    km = KMeans(n_clusters=k, random_state=22, n_jobs=-1)
    km.fit(color_feat)
    
    sse.append(km.inertia_)

# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse)
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');











ValueError: n_samples=200 should be >= n_clusters=201.

In [19]:
# print(describe(urls[666], 7))

In [20]:
urls[0]


'https://dynamic-media-cdn.tripadvisor.com/media/photo-o/16/a6/36/da/photo0jpg.jpg?w=700&h=-1&s=1'

In [None]:
import cv2
import imutils
img_array = X_train[6679]
bins = [5,5,6]
color = cv2.COLOR_BGR2HSV
#BGR2RGB
#COLOR_BGR2HSV

def histogram(image, mask, bins):
    # extract a 3D color histogram from the masked region of the image, using the supplied number of bins per channel
    hist = cv2.calcHist([image], [0,1,2], mask, [bins[0],bins[1],bins[2]],[0, 180, 0, 256, 0, 256])
    
    # normalize the histogram if we are using OpenCV 2.4
    if imutils.is_cv2():
        hist = cv2.normalize(hist).flatten()
        
    # otherwise handle for OpenCV 3+
    else:
        hist = cv2.normalize(hist, hist).flatten()

    return hist

def plot_color_hist(image):
    
    color = ('b','g','r')
    for i,col in enumerate(color):
        histr = cv2.calcHist([image],[i],None,[256],[0,256])
        plt.plot(histr,color = col)
        plt.xlim([0,256])
        # plt.ylim([0,1000])
    plt.show()

def show_image(img):
    
    # image = Image.open(img)
    imgplot = plt.imshow(img)
    plt.show()
    
def get_color_description(img_array, bins, color):
    img = np.float32(img_array) * 255
    image = cv2.cvtColor(img, color)
    
    features = []
   
    # grab the dimensions and compute the center of the image
    (h, w) = image.shape[:2]
    (cX, cY) = (int(w * 0.5), int(h * 0.5))

    # divide the image into four rectangles/segments (top-left, top-right, bottom-right, bottom-left)
    segments = [(0, cX, 0, cY), (cX, w, 0, cY), (cX, w, cY, h), (0, cX, cY, h)]

    # construct an elliptical mask representing the center of the image
    (axesX, axesY) = (int(w * 0.75) // 2, int(h * 0.75) // 2)
    ellipMask = np.zeros(image.shape[:2], dtype = "uint8")
    cv2.ellipse(ellipMask, (cX, cY), (axesX, axesY), 0, 0, 360, 255, -1)

    # loop over the segments
    for (startX, endX, startY, endY) in segments:
        # construct a mask for each corner of the image, subtracting the elliptical center from it
        cornerMask = np.zeros(image.shape[:2], dtype = "uint8")
        cv2.rectangle(cornerMask, (startX, startY), (endX, endY), 255, -1)
        cornerMask = cv2.subtract(cornerMask, ellipMask)

        # extract a color histogram from the image, then update the feature vector
        hist = histogram(image, cornerMask,bins)
        features.extend(hist)

        # extract a color histogram from the elliptical region and update the feature vector
        hist = histogram(image, ellipMask, bins)
        features.extend(hist)