In [35]:
#### We will be using machine learning techniques for image similarity initially to get a benchmark solution before applying deep leaarning technique

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imutils import paths
import numpy as np
import argparse
import imutils
import os

In [2]:
def image_to_feature_vector(image, size=(32, 32)):
    # resize the image to a fixed size, then flatten the image into
    # a list of raw pixel intensities
    return cv2.resize(image, size).flatten()

In [3]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    # extract a 3D color histogram from the HSV color space using
    # the supplied number of `bins` per channel
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
        [0, 180, 0, 256, 0, 256])
    # handle normalizing the histogram if we are using OpenCV 2.4.X
    if imutils.is_cv2():
        hist = cv2.normalize(hist)
    # otherwise, perform "in place" normalization in OpenCV 3 (I
    # personally hate the way this is done
    else:
        cv2.normalize(hist, hist)
    # return the flattened histogram as the feature vector
    return hist.flatten()

In [4]:
# grab the list of images that we'll be describing
print("[INFO] describing images...")
imagePaths = list(paths.list_images('images/conflict/1954'))
# initialize the raw pixel intensities matrix, the features matrix,
# and labels list
rawImages = []
features = []
labels = []

[INFO] describing images...


In [5]:
imagePaths[:5]

['images/conflict/1954/4466335.png',
 'images/conflict/1954/4472340.png',
 'images/conflict/1954/3215127.png',
 'images/conflict/1954/4489694.png',
 'images/conflict/1954/3233692.png']

In [6]:
# loop over the input images
for (i, imagePath) in enumerate(imagePaths):
    # load the image and extract the class label (assuming that our
    # path as the format: /path/to/dataset/{class}.{image_num}.jpg
    image = cv2.imread(imagePath)
    label = imagePath.split(os.path.sep)[-1].split(".")[0]
    # extract raw pixel intensity "features", followed by a color
    # histogram to characterize the color distribution of the pixels
    # in the image
    pixels = image_to_feature_vector(image)
    hist = extract_color_histogram(image)
    # update the raw images, features, and labels matricies,
    # respectively
    rawImages.append(pixels)
    features.append(hist)
    labels.append(label)
    # show an update every 1,000 images
    if i > 0 and i % 1000 == 0:
        print("[INFO] processed {}/{}".format(i, len(imagePaths)))

[INFO] processed 1000/3084
[INFO] processed 2000/3084
[INFO] processed 3000/3084


In [7]:
labels[:5]

['4466335', '4472340', '3215127', '4489694', '3233692']

In [8]:
# show some information on the memory consumed by the raw images
# matrix and features matrix
rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)
print("[INFO] pixels matrix: {:.2f}MB".format(
    rawImages.nbytes / (1024 * 1000.0)))
print("[INFO] features matrix: {:.2f}MB".format(
        features.nbytes / (1024 * 1000.0)))

[INFO] pixels matrix: 9.25MB
[INFO] features matrix: 6.17MB


In [10]:
from keras.preprocessing import image
from keras.applications.xception import Xception
from keras.applications.xception import preprocess_input
import numpy as np
from sklearn.cluster import KMeans
import os, shutil, glob, os.path
from PIL import Image as pil_image
image.LOAD_TRUNCATED_IMAGES = True 
model = Xception(weights='imagenet', include_top=False)

In [11]:
# Variables
imdir = 'images/conflict/1954'
targetdir = "clusters_kmeans_xception_v2/"

# Loop over files and get features
filelist = glob.glob(os.path.join(imdir, '*.png'))
filelist.sort()
featurelist = []
for i, imagepath in enumerate(filelist):
    print("    Status: %s / %s" %(i, len(filelist)), end="\r")
    img = image.load_img(imagepath, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = np.array(model.predict(img_data))
    featurelist.append(features.flatten())

    Status: 3083 / 3084

In [13]:
number_clusters = 1028

# Clustering
kmeans = KMeans(n_clusters=number_clusters, random_state=0).fit(np.array(featurelist))

# Copy images renamed by cluster 
# Check if target dir exists
try:
    os.makedirs(targetdir)
except OSError:
    pass
# Copy with cluster name
print("\n")
for i, m in enumerate(kmeans.labels_):
    print("    Copy: %s / %s" %(i, len(kmeans.labels_)), end="\r")
    shutil.copy(filelist[i], targetdir + str(m) + "_" + str(i) + ".jpg")



    Copy: 3083 / 3084

In [12]:
targetdir

'clusters_kmeans_xception_v2/'

#### Running on edge detection algorithm generated images

In [14]:
# Variables
imdir = 'edge_detection'

# Loop over files and get features
filelist = glob.glob(os.path.join(imdir, '*.png'))
filelist.sort()
featurelist = []
for i, imagepath in enumerate(filelist):
    print("    Status: %s / %s" %(i, len(filelist)), end="\r")
    img = image.load_img(imagepath, target_size=(224, 224))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = np.array(model.predict(img_data))
    featurelist.append(features.flatten())

    Status: 107 / 108

In [15]:
from tqdm import tqdm
### Using elbow method to find optimal number of clusters
SSE = []
for cluster in tqdm([5,10,15,25,40]):
    print("Number of Clusters ",cluster)
    kmeans = KMeans(n_clusters=cluster,n_jobs=-1, random_state=0).fit(np.array(featurelist))
    print(kmeans.inertia_)
    SSE.append(kmeans.inertia_)

  0%|          | 0/5 [00:00<?, ?it/s]

Number of Clusters  5


 20%|██        | 1/5 [00:03<00:14,  3.65s/it]

838705.3125
Number of Clusters  10


 40%|████      | 2/5 [00:10<00:13,  4.63s/it]

710950.875
Number of Clusters  15


 60%|██████    | 3/5 [00:17<00:10,  5.36s/it]

613447.5625
Number of Clusters  25


 80%|████████  | 4/5 [00:31<00:07,  7.94s/it]

445354.78125
Number of Clusters  40


100%|██████████| 5/5 [00:48<00:00,  9.67s/it]

285928.3125





In [16]:
filelist

['edge_detection/10_horizontal.png',
 'edge_detection/10_vertical.png',
 'edge_detection/11_horizontal.png',
 'edge_detection/11_vertical.png',
 'edge_detection/12_horizontal.png',
 'edge_detection/12_vertical.png',
 'edge_detection/13_horizontal.png',
 'edge_detection/13_vertical.png',
 'edge_detection/14_horizontal.png',
 'edge_detection/14_vertical.png',
 'edge_detection/24_vertical.png',
 'edge_detection/25_horizontal.png',
 'edge_detection/25_vertical.png',
 'edge_detection/26_horizontal.png',
 'edge_detection/26_vertical.png',
 'edge_detection/27_horizontal.png',
 'edge_detection/27_vertical.png',
 'edge_detection/28_horizontal.png',
 'edge_detection/28_vertical.png',
 'edge_detection/29_horizontal.png',
 'edge_detection/29_vertical.png',
 'edge_detection/2_horizontal.png',
 'edge_detection/2_vertical.png',
 'edge_detection/30_horizontal.png',
 'edge_detection/30_vertical.png',
 'edge_detection/31_horizontal.png',
 'edge_detection/31_vertical.png',
 'edge_detection/32_horizontal.

In [9]:
from tqdm import tqdm
### Using elbow method to find optimal number of clusters
SSE = []
for cluster in tqdm([150,192,220,257,308,385,514,771,1000]):
    print("Number of CLusters ",cluster)
    kmeans = KMeans(n_clusters=cluster,n_jobs=-1, random_state=0).fit(np.array(featurelist))

    SSE.append(kmeans.inertia_)

  0%|          | 0/9 [00:00<?, ?it/s]

Number of CLusters  150


 11%|█         | 1/9 [28:17<3:46:20, 1697.55s/it]

Number of CLusters  192


 22%|██▏       | 2/9 [1:04:25<3:34:31, 1838.76s/it]

Number of CLusters  220


 33%|███▎      | 3/9 [1:49:42<3:30:13, 2102.21s/it]

Number of CLusters  257


 44%|████▍     | 4/9 [2:42:56<3:22:28, 2429.64s/it]

Number of CLusters  308


 56%|█████▌    | 5/9 [3:44:46<3:07:35, 2813.88s/it]

Number of CLusters  385


 67%|██████▋   | 6/9 [7:30:59<5:02:04, 6041.39s/it]

Number of CLusters  514


 78%|███████▊  | 7/9 [9:16:59<3:24:34, 6137.09s/it]

Number of CLusters  771


 89%|████████▉ | 8/9 [12:43:07<2:13:26, 8006.32s/it]

Number of CLusters  1000


100%|██████████| 9/9 [62:49:40<00:00, 25131.16s/it] 


In [10]:
SSE

[21199678.0,
 20359950.0,
 19800666.0,
 19160336.0,
 18264618.0,
 17041110.0,
 15093859.0,
 11968366.0,
 9573138.0]

### inertia actually calculates the sum of distances of all the points within a cluster from the centroid of that cluster

In [None]:
%%time

# converting the results into a dataframe and plotting them
frame = pd.DataFrame({'Cluster':[150,192,220,257,308,385,514,771,1000], 'SSE':SSE})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.savefig("xception.png")