In [1]:
# Import modules
import pandas as pd
import urllib.request

In [2]:
def mysymlink(source, link_name):
    import os
    os_symlink = getattr(os, "symlink", None)
    if callable(os_symlink):
        os_symlink(source, link_name)
    else:
        import ctypes
        csl = ctypes.windll.kernel32.CreateSymbolicLinkW
        csl.argtypes = (ctypes.c_wchar_p, ctypes.c_wchar_p, ctypes.c_uint32)
        csl.restype = ctypes.c_ubyte
        flags = 1 if os.path.isdir(source) else 0
        if csl(link_name, source, flags) == 0:
            raise ctypes.WinError()

# Load data indexes

In [3]:
# Load data 
index = pd.read_csv('index.csv') # index [ID, URL]
test = pd.read_csv('test.csv') # test [ID, URL]
sample_submission = pd.read_csv('sample_submission.csv') # sample_submission [ID, similar_image_ID_array]

In [4]:
print(len(index))
print(len(sample_submission))
print(len(test))

1098461
117703
117703


# Download images

In [5]:
# Define image download function
def download_images(df = None, folder = None, file_extension = None):
    import os
    import urllib.request
    image_dict = df.set_index('id').T.to_dict('list')
    for image in image_dict.keys():
        image_id = image
        if os.path.isfile(image_id) == False:
            url = image_dict[image][0]
            image_file_path = folder + image_id + file_extension
            try:
                urllib.request.urlretrieve(url, image_file_path)
            except:
                print("url error:", url)
        else:
            pass

In [None]:
# Download sample of test data 
test_df = test.iloc[1000:10000]
download_images(df = test_df, folder = "image_downloads_test/", file_extension = ".jpg")

url error: https://lh3.googleusercontent.com/-h0rtva4WT5w/WPuX6QmLfQI/AAAAAAAAAkk/7tokxCFS-gI1I-Kx9FV1UNO-wX3Qw430gCOcB/s1600/
url error: https://lh3.googleusercontent.com/-WXJquvoG_lk/WLetwQl_R_I/AAAAAAAAEIY/Z6IfvA3dlX0NbtPGXPvSa5vjuDqKd07nQCOcB/s1600/
url error: https://lh3.googleusercontent.com/-dQEjJvcAr2g/WP_PVglj2bI/AAAAAAABScs/J7ryU2AuLqIiz3cPFaIVLky83_JGYovngCOcB/s1600/


In [None]:
# Download sample of index data 
index_df = index.iloc[1000:10000]
download_images(df = index_df, folder = "image_downloads_index/", file_extension = ".jpg")

# Image Cluster 

In [26]:
from scipy.spatial import distance
from scipy.cluster import hierarchy
import numpy as np

import PIL.Image, os, shutil
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

from imagecluster import common as co

pj = os.path.join


def get_model():
    """Keras Model of the VGG16 network, with the output layer set to the
    second-to-last fully connected layer 'fc2' of shape (4096,)."""
    # base_model.summary():
    #     ....
    #     block5_conv4 (Conv2D)        (None, 15, 15, 512)       2359808
    #     _________________________________________________________________
    #     block5_pool (MaxPooling2D)   (None, 7, 7, 512)         0
    #     _________________________________________________________________
    #     flatten (Flatten)            (None, 25088)             0
    #     _________________________________________________________________
    #     fc1 (Dense)                  (None, 4096)              102764544
    #     _________________________________________________________________
    #     fc2 (Dense)                  (None, 4096)              16781312
    #     _________________________________________________________________
    #     predictions (Dense)          (None, 1000)              4097000
    #
    base_model = VGG16(weights='imagenet', include_top=True)
    model = Model(inputs=base_model.input,
                  outputs=base_model.get_layer('fc2').output)
    return model


def fingerprint(fn, model, size):
    try:
        """Load image from file `fn`, resize to `size` and run through `model`
        (keras.models.Model).

        Parameters
        ----------
        fn : str
            filename
        model : keras.models.Model instance
        size : tuple
            input image size (width, height), must match `model`, e.g. (224,224)

        Returns
        -------
        fingerprint : 1d array
        """
        print(fn)

        # keras.preprocessing.image.load_img() uses img.rezize(shape) with the
        # default interpolation of PIL.Image.resize() which is pretty bad (see
        # imagecluster/play/pil_resample_methods.py). Given that we are restricted
        # to small inputs of 224x224 by the VGG network, we should do our best to
        # keep as much information from the original image as possible. This is a
        # gut feeling, untested. But given that model.predict() is 10x slower than
        # PIL image loading and resizing .. who cares.
        #
        # (224, 224, 3)
        ##img = image.load_img(fn, target_size=size)
        img = PIL.Image.open(fn).resize(size, 2)

        # (224, 224, {3,1})
        arr3d = image.img_to_array(img)

        # (224, 224, 1) -> (224, 224, 3)
        #
        # Simple hack to convert a grayscale image to fake RGB by replication of
        # the image data to all 3 channels.
        #
        # Deep learning models may have learned color-specific filters, but the
        # assumption is that structural image features (edges etc) contibute more to
        # the image representation than color, such that this hack makes it possible
        # to process gray-scale images with nets trained on color images (like
        # VGG16).
        if arr3d.shape[2] == 1:
            arr3d = arr3d.repeat(3, axis=2)

        # (1, 224, 224, 3)
        arr4d = np.expand_dims(arr3d, axis=0)

        # (1, 224, 224, 3)
        arr4d_pp = preprocess_input(arr4d)
        return model.predict(arr4d_pp)[0,:]
    except:
        print(fn, "IMAGE DID NOT FIT ARRAY SHAPE, SKIPPED")

# Cannot use multiprocessing (only tensorflow backend tested):
# TypeError: can't pickle _thread.lock objects
# The error doesn't come from functools.partial since those objects are
# pickable since python3. The reason is the keras.model.Model, which is not
# pickable. However keras with tensorflow backend runs multi-threaded
# (model.predict()), so we don't need that. I guess it will scale better if we
# parallelize over images than to run a muti-threaded tensorflow on each image,
# but OK. On low core counts (2-4), it won't matter.
#
##def _worker(fn, model, size):
##    print(fn)
##    return fn, fingerprint(fn, model, size)
##
##def fingerprints(files, model, size=(224,224)):
##    worker = functools.partial(_worker,
##                               model=model,
##                               size=size)
##    pool = multiprocessing.Pool(multiprocessing.cpu_count())
##    return dict(pool.map(worker, files))


def fingerprints(files, model, size=(224,224)):
    """Calculate fingerprints for all `files`.

    Parameters
    ----------
    files : sequence
        image filenames
    model, size : see :func:`fingerprint`

    Returns
    -------
    fingerprints : dict
        {filename1: array([...]),
         filename2: array([...]),
         ...
         }
    """
    output = dict((fn, fingerprint(fn, model, size)) for fn in files)
    print(output)
    return output


def cluster(fps, sim=0.5, method='average', metric='euclidean'):
    try:
        """Hierarchical clustering of images based on image fingerprints.

        Parameters
        ----------
        fps: dict
            output of :func:`fingerprints`
        sim : float 0..1
            similarity index
        method : see scipy.hierarchy.linkage(), all except 'centroid' produce
            pretty much the same result
        metric : see scipy.hierarchy.linkage(), make sure to use 'euclidean' in
            case of method='centroid', 'median' or 'ward'

        Returns
        -------
        clusters : nested list
            [[filename1, filename5],                    # cluster 1
             [filename23],                              # cluster 2
             [filename48, filename2, filename42, ...],  # cluster 3
             ...
             ]
        """
        assert 0 <= sim <= 1, "sim not 0..1"
        # array(list(...)): 2d array
        #   [[... fingerprint of image1 (4096,) ...],
        #    [... fingerprint of image2 (4096,) ...],
        #    ...
        #    ]
        try:
            dfps = distance.pdist(np.array(list(fps.values())), metric)
            files = list(fps.keys())
        except:
            print("C1")
        # hierarchical/agglomerative clustering (Z = linkage matrix, construct
        # dendrogram)
        
        Z = hierarchy.linkage(dfps, method=method, metric=metric)
        # cut dendrogram, extract clusters
        cut = hierarchy.fcluster(Z, t=dfps.max()*(1.0-sim), criterion='distance')
        cluster_dct = dict((ii,[]) for ii in np.unique(cut))
        for iimg,iclus in enumerate(cut):
            cluster_dct[iclus].append(files[iimg])
        return list(cluster_dct.values())
    except:
        print("CLUSTER ISSUE")


def make_links(clusters, cluster_dr):
    # group all clusters (cluster = list_of_files) of equal size together
    # {number_of_files1: [[list_of_files], [list_of_files],...],
    #  number_of_files2: [[list_of_files],...],
    # }
    cdct_multi = {}
    for x in clusters:
        nn = len(x)
        if nn > 1:
            if not (nn in cdct_multi.keys()):
                cdct_multi[nn] = [x]
            else:
                cdct_multi[nn].append(x)

    #print("cluster dir: {}".format(cluster_dr))
    #print("items per cluster : number of such clusters")
    if os.path.exists(cluster_dr):
        shutil.rmtree(cluster_dr)
    
    dfout = pd.DataFrame({'cluster_n':[],'sub_cluster':[],'image':[]})
    for cluster_n, nn in enumerate(np.sort(list(cdct_multi.keys()))): # each cluster 
        cluster_list = cdct_multi[nn]
        print("{} : {}".format(nn, len(cluster_list)))
        for insidegroup, lst in enumerate(cluster_list): # each sub-cluster 
            dr = pj(cluster_dr,
                    'cluster_with_{}'.format(nn),
                    'cluster_{}'.format(insidegroup))
            for record, fn in enumerate(lst):  # each record 
                link = pj(dr, os.path.basename(fn))
                os.makedirs(os.path.dirname(link), exist_ok=True)
                dftemp = pd.DataFrame({'cluster_n':[cluster_n],'sub_cluster':[insidegroup],'image':[fn]})
                dfout = dfout.append(dftemp)
                #os.symlink(os.path.abspath(fn), link)
    return dfout

In [45]:
import os, re
import numpy as np
from imagecluster import imagecluster2 as ic
from imagecluster import common as co

pj = os.path.join
ic_base_dir = 'imagecluster'
imagedir = "image_downloads_test/"
sim = 0.49

dbfn = pj(imagedir, ic_base_dir, 'fingerprints.pk')
if not os.path.exists(dbfn):
    os.makedirs(os.path.dirname(dbfn), exist_ok=True)
    print("no fingerprints database {} found".format(dbfn))
    files = co.get_files(imagedir)
    model = get_model()
    print("running all images through NN model ...".format(dbfn))
    fps = fingerprints(files, model, size=(224,224))
    co.write_pk(fps, dbfn)
else:
    print("loading fingerprints database {} ...".format(dbfn))
    fps = co.read_pk(dbfn)
    fps2 = {}
    for x in fps:
        key = x
        value = fps[key]
        try:
            value.shape
            fps2[key] = value
        except:
            pass
print("clustering ...")
df = make_links(cluster(fps2, sim), pj(imagedir, ic_base_dir, 'clusters'))

loading fingerprints database image_downloads_test/imagecluster\fingerprints.pk ...
clustering ...
2 : 10
3 : 4
4 : 1
5 : 2
7 : 1
45 : 1
810 : 1


In [46]:
df['unique_cluster'] = df['cluster_n'].astype(str) + "." + df['sub_cluster'].astype(str)
df

Unnamed: 0,cluster_n,image,sub_cluster,unique_cluster
0,0.0,image_downloads_test/016679ee5cd1e4ce.jpg,0.0,0.0.0.0
0,0.0,image_downloads_test/0188f5ba544c317b.jpg,0.0,0.0.0.0
0,0.0,image_downloads_test/00942a43cdc2f6f1.jpg,1.0,0.0.1.0
0,0.0,image_downloads_test/01a851a97dda9140.jpg,1.0,0.0.1.0
0,0.0,image_downloads_test/00a902847134b54d.jpg,2.0,0.0.2.0
0,0.0,image_downloads_test/0169ff34f000e772.jpg,2.0,0.0.2.0
0,0.0,image_downloads_test/0014be4f4910bbe6.jpg,3.0,0.0.3.0
0,0.0,image_downloads_test/008cf75c83fa5067.jpg,3.0,0.0.3.0
0,0.0,image_downloads_test/001693c1ea808396.jpg,4.0,0.0.4.0
0,0.0,image_downloads_test/015c204ad3908890.jpg,4.0,0.0.4.0


In [8]:
#df['ben_real'] = df['image'].str.contains('ben')
#df_cluster_majority = df.groupby(['cluster_n']).agg(lambda x:x.value_counts().index[0]).reset_index()[['cluster_n','ben_real']]
#df_cluster_majority = df_cluster_majority.rename(columns={'ben_real':'cluster_majority'})
#df_final = df.merge(df_cluster_majority,on="cluster_n",how='left')

In [9]:
#df_final['result'] = np.where((df_final['ben_real'] == df_final['cluster_majority']), 1, 0)
#df_final.head()

Unnamed: 0,cluster_n,image,ben_real,cluster_majority,result
0,0.0,test_image_two_groups/taj23.jpg,False,False,1
1,0.0,test_image_two_groups/taj76.jpg,False,False,1
2,0.0,test_image_two_groups/taj26.jpeg,False,False,1
3,0.0,test_image_two_groups/taj46.jpg,False,False,1
4,1.0,test_image_two_groups/taj63.jpg,False,False,1


In [155]:
#df_final['result'].sum() / len(df_final)

0.96875