In [14]:
import pandas as pd
import numpy as np
from collections import defaultdict

#img load
from PIL import Image
import requests
from io import BytesIO
from fake_useragent import UserAgent 

#img preprocessing
from keras.preprocessing import image
from keras.applications import mobilenet_v2


#show img
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

#remove warnings
import tensorflow as tf
# tf.logging.set_verbosity(tf.logging.ERROR)
tf.get_logger().setLevel('ERROR')

# color distributions
import cv2
import imutils
import urllib.request
import random

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


Load the image-link and attraction-location DataFrames

In [3]:
# Load the pickled inputs from the working directory.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
# img_link_df is the table the rest of the notebook works from (see small_df).
img_link_df = pd.read_pickle('attractions_img_links_df.pkl')
# att_loc_df is not referenced again in the visible part of the notebook.
att_loc_df = pd.read_pickle('attractions_loc_df.pkl')

In [4]:
# Keep only the rows that have no missing values, so every remaining cell
# can be iterated over as an image URL by the functions below.
small_df = img_link_df.dropna()
# Summarise the remaining rows/columns as a sanity check.
small_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1233 entries, Barber Vintage Motorsports Museum to Antler Arches of Jackson
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1233 non-null   object
 1   1       1233 non-null   object
 2   2       1233 non-null   object
 3   3       1233 non-null   object
 4   4       1233 non-null   object
 5   5       1233 non-null   object
 6   6       1233 non-null   object
 7   7       1233 non-null   object
 8   8       1233 non-null   object
 9   9       1233 non-null   object
 10  10      1233 non-null   object
 11  11      1233 non-null   object
 12  12      1233 non-null   object
 13  13      1233 non-null   object
 14  14      1233 non-null   object
 15  15      1233 non-null   object
 16  16      1233 non-null   object
 17  17      1233 non-null   object
 18  18      1233 non-null   object
 19  19      1233 non-null   object
 20  20      1233 non-null   object
 21  21      1233 

In [5]:
def view_cluster(cluster):
    """Plot the images that belong to one cluster on a single figure.

    Reads the notebook-level ``groups`` mapping (cluster id -> list of images).
    At most 30 images are drawn; a larger cluster is randomly subsampled and
    the figure title says so.

    Parameters
    ----------
    cluster : hashable
        Key into ``groups`` identifying the cluster to show.

    Raises
    ------
    KeyError
        If ``cluster`` is not a key of ``groups``.
    """
    plt.figure(figsize=(25, 25))

    # gets the list of images for the cluster
    imgs = groups[cluster]
    # only allow up to 30 images to be shown at a time
    if len(imgs) > 30:
        t = f"Clipping cluster number {cluster} size from {len(imgs)} to 30"
        # Sample from this cluster's own images; sampling from the global
        # img_list would show images that belong to other clusters.
        imgs = random.sample(imgs, 30)
    else:
        t = f"Cluster number {cluster}"

    # plot each image in the cluster on a 10x10 grid of axes
    plt.suptitle(t, fontsize=30)
    for index, img in enumerate(imgs):
        plt.subplot(10, 10, index + 1)
        plt.imshow(np.array(img))
        plt.axis('off')
        
def dowload_image(url, headers=None):
    """Download an image and load it resized to 224x224.

    The misspelled name is kept so existing callers keep working.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    headers : dict, optional
        HTTP headers to send with the request. When omitted, the notebook-level
        ``headers`` variable is used if it exists (the previous behaviour);
        if it does not exist, the default headers of ``requests`` are sent
        instead of failing with a NameError.

    Returns
    -------
    The object returned by ``image.load_img`` for the downloaded bytes.
    """
    if headers is None:
        # Backward-compatible fallback to the global that callers used to rely on.
        headers = globals().get('headers')
    response = requests.get(url, headers=headers)
    return image.load_img(BytesIO(response.content), target_size=(224, 224))

In [6]:
def prepare_image(url, headers):
    """Fetch an image and convert it into a single-item MobileNetV2 input batch.

    Parameters
    ----------
    url : str
        Address of the image; passed to ``loadImage``.
    headers : dict
        HTTP headers; passed to ``loadImage``.

    Returns
    -------
    The result of ``mobilenet_v2.preprocess_input`` applied to the image array
    after a leading batch axis has been added.
    """
    pixels = image.img_to_array(loadImage(url, headers))
    # the model expects a batch, so add a leading axis of length one
    batch = np.expand_dims(pixels, axis=0)
    return mobilenet_v2.preprocess_input(batch)

def show_image(url):
    """Fetch the image at ``url`` (no custom headers) and display it with matplotlib."""
    raw_stream = requests.get(url, stream=True).raw
    plt.imshow(Image.open(raw_stream))
    plt.show()

def loadImage(URL, headers):
    """Download ``URL`` with the given HTTP headers and load it as a 224x224 image.

    Returns whatever ``image.load_img`` produces for the downloaded bytes.
    """
    payload = requests.get(URL, headers=headers).content
    return image.load_img(BytesIO(payload), target_size=(224, 224))

def get_predictions(url, headers, models):
    """Classify one image and return the decoded predictions as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the image to classify.
    headers : dict
        HTTP headers used for the download.
    models : object with a ``predict`` method
        The model used for prediction. This argument is now actually used;
        previously the function ignored it and read the global ``model``.

    Returns
    -------
    pandas.DataFrame
        Columns ``prediction`` and ``probability`` (second and third field of
        each decoded entry), one row per decoded prediction, indexed 0..n-1.
    """
    batch = prepare_image(url, headers)
    out = models.predict(batch)

    decoded = mobilenet_v2.decode_predictions(out)[0]
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    records = [{'prediction': entry[1], 'probability': entry[2]} for entry in decoded]
    return pd.DataFrame(records, columns=['prediction', 'probability'])

def get_img_predictions_dict(url, headers, models):
    """Classify one image and return its decoded predictions as a mapping.

    Parameters
    ----------
    url : str
        Address of the image to classify.
    headers : dict
        HTTP headers used for the download.
    models : object with a ``predict`` method
        The model used for prediction. This argument is now actually used;
        previously the function ignored it and read the global ``model``.

    Returns
    -------
    collections.defaultdict
        Maps the second field of each decoded entry (the label) to its third
        field (the score); missing labels default to 0.
    """
    batch = prepare_image(url, headers)
    out = models.predict(batch)

    preds_dict = defaultdict(int)
    for entry in mobilenet_v2.decode_predictions(out)[0]:
        preds_dict[entry[1]] = entry[2]

    return preds_dict

def get_predictions_dictionary(attraction_index, model):
    """Sum the prediction scores over every image of one attraction.

    Parameters
    ----------
    attraction_index : int
        Positional row index into the notebook-level ``small_df``; every value
        in that row is treated as an image URL.
    model : object with a ``predict`` method
        Passed through to ``get_predictions``.

    Returns
    -------
    collections.defaultdict
        Maps each predicted label to the sum of its scores across the
        attraction's images.
    """
    # One UserAgent instance is enough; `.random` yields a new string each time.
    ua = UserAgent()
    headers = {'user-agent': ua.random}

    url_list = small_df.iloc[attraction_index]
    preds_dict = defaultdict(int)

    # enumerate() supplies the counter; previously `idx` was never incremented,
    # so the user agent was rebuilt after every single image instead of every fifth.
    for idx, url in enumerate(url_list):
        df = get_predictions(url, headers, model)
        for _, row in df.iterrows():
            # access by column name rather than by position
            preds_dict[row['prediction']] += row['probability']
        if idx % 5 == 0:
            headers = {'user-agent': ua.random}

    return preds_dict

In [7]:
# RGB values are simple to understand, but the RGB color space does not mimic how humans perceive color.
# The color features below therefore use the HSV color space, which maps pixel intensities into a cylinder
# (hue is the angle, saturation the radius, and value the height):

def load_img_open_cv(url):
    """Download an image and decode it with OpenCV.

    Parameters
    ----------
    url : str
        Address of the image; fetched with ``urllib`` and no custom headers.

    Returns
    -------
    The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` for the downloaded
    bytes. NOTE(review): OpenCV documents an empty result (``None`` in Python)
    when the bytes cannot be decoded, so callers should check for it.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as resp:
        raw = resp.read()

    encoded = np.array(bytearray(raw), dtype=np.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    
def describe(url, bins):
    """Build a region-based HSV colour feature vector for the image at ``url``.

    The image is split into five regions (four corners with the centre ellipse
    removed, plus the centre ellipse itself); ``histogram`` is computed for
    each region and the results are concatenated.

    Parameters
    ----------
    url : str
        Address of the image; loaded through ``load_img_open_cv``.
    bins : int
        Number of histogram bins per channel, passed to ``histogram``.

    Returns
    -------
    list or None
        The concatenated histogram values, or ``None`` when the image could
        not be decoded or converted to HSV.
    """
    image = load_img_open_cv(url)

    # An undecodable download yields no image; report that explicitly instead
    # of relying on a bare `except` (which would also swallow KeyboardInterrupt).
    if image is None:
        return None

    try:
        # convert the image to the HSV color space
        image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    except cv2.error:
        return None

    # features used to quantify the image
    features = []

    # grab the dimensions and compute the center of the image
    (h, w) = image.shape[:2]
    (cX, cY) = (int(w * 0.5), int(h * 0.5))

    # divide the image into four rectangles/segments (top-left, top-right, bottom-right, bottom-left)
    segments = [(0, cX, 0, cY), (cX, w, 0, cY), (cX, w, cY, h), (0, cX, cY, h)]

    # construct an elliptical mask representing the center of the image
    (axesX, axesY) = (int(w * 0.75) // 2, int(h * 0.75) // 2)
    ellipMask = np.zeros(image.shape[:2], dtype="uint8")
    cv2.ellipse(ellipMask, (cX, cY), (axesX, axesY), 0, 0, 360, 255, -1)

    # loop over the segments
    for (startX, endX, startY, endY) in segments:

        # construct a mask for each corner of the image, subtracting the elliptical center from it
        cornerMask = np.zeros(image.shape[:2], dtype="uint8")
        cv2.rectangle(cornerMask, (startX, startY), (endX, endY), 255, -1)
        cornerMask = cv2.subtract(cornerMask, ellipMask)

        # extract a color histogram from the image, then update the feature vector
        hist = histogram(image, cornerMask, bins)
        features.extend(hist)

    # extract a color histogram from the elliptical region and update the feature vector
    hist = histogram(image, ellipMask, bins)
    features.extend(hist)

    return features

def histogram(image, mask, bins):
    """Return the flattened, normalised 3D colour histogram of a masked region.

    Parameters
    ----------
    image : array
        Three-channel image handed to ``cv2.calcHist``.
    mask : array
        Mask selecting the pixels that contribute to the histogram.
    bins : int
        Number of bins used for each of the three channels.

    Returns
    -------
    The normalised histogram, flattened to one dimension.
    """
    # NOTE(review): callers in this notebook pass HSV images; OpenCV's 8-bit hue
    # channel stops below 180, so the upper hue bins may stay empty with a
    # 0-256 range -- confirm whether [0, 180] was intended for channel 0.
    channel_ranges = [0, 256, 0, 256, 0, 256]
    raw_hist = cv2.calcHist([image], [0, 1, 2], mask, [bins, bins, bins], channel_ranges)

    if imutils.is_cv2():
        # OpenCV 2.4 signature: normalize returns a new array
        normalized = cv2.normalize(raw_hist)
    else:
        # OpenCV 3+ signature: an explicit destination array is required
        normalized = cv2.normalize(raw_hist, raw_hist)

    return normalized.flatten()

def plot_color_hist(url):
    """Plot one 256-bin intensity histogram per colour channel of the image at ``url``.

    The image is loaded with ``load_img_open_cv``; channels 0, 1 and 2 are
    drawn in blue, green and red respectively.
    """
    img = load_img_open_cv(url)

    for channel, line_color in enumerate(('b', 'g', 'r')):
        counts = cv2.calcHist([img], [channel], None, [256], [0, 256])
        plt.plot(counts, color=line_color)
        plt.xlim([0, 256])
    plt.show()


In [None]:
# Per-attraction and per-image results. The per-image lists are always appended
# to together, so position i refers to the same image in each of them.
attr_preds_list = []
img_preds_list = []
img_color_list = []
img_feats_list = []
img_attraction_list = []
img_list = []

attractions = small_df.index.values
# Number of attractions to process, capped so .iloc never runs past the frame.
end = min(50, len(small_df))  # use len(small_df) for a full run
bins = 12

# This is all we need to load and use the full pretrained model!
model = mobilenet_v2.MobileNetV2(weights='imagenet')

# One UserAgent instance is enough; `.random` yields a new string on each access.
ua = UserAgent()

img_num = 0
for x in range(0, end):
    headers = {'user-agent': ua.random}

    attr_preds_list.append(dict(get_predictions_dictionary(x, model)))

    url_list = small_df.iloc[x]
    for url in url_list:
        color_feats = describe(url, bins)

        # describe() returns None for an image it could not decode. Skip such
        # images entirely: a None entry in img_feats_list would make KMeans.fit
        # fail, and skipping in all lists keeps them aligned.
        if color_feats is not None:
            preds_dict = get_img_predictions_dict(url, headers, model)
            picture = dowload_image(url)

            img_feats_list.append(color_feats)
            img_preds_list.append(dict(preds_dict))
            img_attraction_list.append(attractions[x])
            img_list.append(picture)

        # rotate the user agent every five images
        if img_num % 5 == 0:
            headers = {'user-agent': ua.random}
        img_num = img_num + 1

# One row per processed attraction, labelled with the attraction name.
df_attr_preds = pd.DataFrame(attr_preds_list, index=attractions[0:end])

# One row per processed image (default integer index).
df_img_preds = pd.DataFrame(img_preds_list)
df_img_color = pd.DataFrame(img_feats_list)

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to form.
k = 50
# `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, where it raises a TypeError. random_state keeps the
# clustering reproducible.
kmeans = KMeans(n_clusters=k, random_state=22)
# Every entry of img_feats_list must be a feature vector of the same length.
kmeans.fit(img_feats_list)

In [None]:
# Map every cluster id to the list of images assigned to it: { id: [images] }.
# img_list and kmeans.labels_ are walked in parallel, so image i is filed
# under the label at position i.
groups = {}
for img, cluster in zip(img_list, kmeans.labels_):
    # setdefault creates the empty list the first time a cluster id is seen
    groups.setdefault(cluster, []).append(img)

In [None]:
# Draw one figure per cluster id, from 0 up to k - 1.
for cluster in range(k):
    view_cluster(cluster)

In [None]:
# Display the per-image prediction table built in the feature-extraction cell.
df_img_preds

In [None]:
# Display the per-image colour-feature table built in the feature-extraction cell.
df_img_color

In [None]:
# Pick a single sample image: the row at position 225 of small_df, entry 25 of that row.
url = small_df.iloc[225][25]
# Display it so the features and predictions below can be checked by eye.
show_image(url)

In [None]:
# Compute the colour feature vector of the sample image with 16 bins per channel
# (describe returns None if the image cannot be decoded).
feats = describe(url, 16)
# Plot the per-channel intensity histograms of the same image.
plot_color_hist(url)

In [None]:
# Decoded predictions for the sample image. Relies on `headers` and `model`
# left behind by the feature-extraction cell, so that cell must have run first.
get_predictions(url, headers, model)

In [None]:
# Display the colour feature vector computed for the sample image.
feats

In [None]:
# Display the URL of the sample image.
url