# LSH Algorithm Improvement By Applying Bitmap Indexing

In [1]:
import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Optional, Tuple
import imagehash
from PIL import Image
import os, os.path
import cv2
from collections import Counter
import scipy as sp
import numpy as np # Import numpy library 
from sklearn.model_selection import StratifiedKFold #Import stratified kfold as we are using a 10fold cross validation approach
from skimage.feature import hog # Import Hog model to extract features
from sklearn.metrics import confusion_matrix # Import confusion matrix to evaluate the performance

In [2]:
imgs = []
y = []
file_size = []
k = 0
path = "./data/101_ObjectCategories" # Give the dataset path here

##  Data Preprocessing:
1. Load the images using cv2
2. Image resize
3. Feature extraction: BGR to Gray conversion 
4. Feature extraction: Histogram of Oriented Gradients(HOG)

In [3]:
folder = os.listdir(path) # from the given path get the file names such as accordion, airplanes etc..
for file in folder: # for every file name in the given path go inseide that directory and get the images
    subpath = os.path.join(path,file)  # Join the name of these files to the previous path 
    
    files = os.listdir(subpath) # Take these image names to a list called files
    j = 0
    for i in range(np.size(files)): # now we shall loop through these number of files
        
        im = cv2.imread(subpath+'/'+files[0+j]) # Read the images from this subpath
        
        imgs.append(im) # append all the read images to a list called imgs
        y.append(k) # generate a labe to every file and append it to labels list

        j += 1
        if (j == (np.size(files))):
            file_size.append(j)
   
    k += 1
     
y = np.array(y).tolist()
ix = []
for index, item in enumerate(imgs):
    if (np.size(item) == 1):
        ix.append(index)
        del imgs[index]
        
for index, item in enumerate(y):
    for v in range(np.size(ix)):
        if (index == ix[v]):
            del y[index]
        
y = np.array(y).astype(np.float64) 

# Function to convert an image from color to grayscale
def rgb2gray(rgb):
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY)
    return gray

def resize_(image):
    u = cv2.resize(image,(256,256))
    return u

def fd_hog(image):
    fd, hog_image = hog(image, orientations=8, pixels_per_cell=(64, 64),
                    cells_per_block=(1, 1), visualize=True)
    
    return fd

a=[]
for img in imgs:
    
    b=resize_(img)
    c=rgb2gray(b)   
    d=fd_hog(c)
    a.append(d)

a=np.array(a)

/Users/fudonghuang/anaconda3/lib/python3.7/site-packages/skimage/feature/_hog.py:150: skimage_deprecation: Default value of `block_norm`==`L1` is deprecated and will be changed to `L2-Hys` in v0.15. To supress this message specify explicitly the normalization method.
  skimage_deprecation)


## Split data
Split the data to training and validation data. We choose 70% for training and 30% for validation purposes.

In [4]:
# append 'label' and 'id' to the last two colunms
import pandas as pd
df = pd.DataFrame(a)
df['lable'] = y
id_ = np.arange(1,len(df)+1,1)
df['id'] = id_
X = df.values

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Using PySpark to retrieve similar images

In [6]:
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.conf import SparkConf
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.sql import SparkSession

In [7]:
spark = SparkSession.builder \
     .master("local") \
     .appName("Image Retrieval") \
     .config("spark.some.config.option", "some-value") \
     .getOrCreate()

In [8]:
Train = map(lambda x: (int(x[-1]),int(x[-2]),Vectors.dense(x[:-2])), X_train)
Train_df = spark.createDataFrame(Train,schema=['id','label',"features"])

In [9]:
Test = map(lambda x: (int(x[-1]),int(x[-2]),Vectors.dense(x[:-2])), X_test)
Test_df = spark.createDataFrame(Test,schema=['id','label',"features"])

In [10]:
brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength=0.6,numHashTables=10)
model = brp.fit(Train_df)

In [11]:
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(Train_df).show()

The hashed dataset where hashed values are stored in the column 'hashes':
+----+-----+--------------------+--------------------+
|  id|label|            features|              hashes|
+----+-----+--------------------+--------------------+
|7213|   88|[0.19036477451395...|[[0.0], [0.0], [-...|
|5040|   66|[0.14891864858699...|[[0.0], [0.0], [-...|
|4176|   57|[0.04904503626484...|[[0.0], [0.0], [0...|
|7303|   90|[0.21163697710324...|[[-1.0], [0.0], [...|
|4119|   56|[0.05745209141258...|[[0.0], [0.0], [-...|
|2504|   29|[0.19048783853448...|[[0.0], [0.0], [-...|
|2271|   26|[0.08755233061400...|[[0.0], [0.0], [-...|
|4928|   65|[0.20031064348601...|[[0.0], [0.0], [-...|
|7116|   88|[0.29044205775487...|[[0.0], [0.0], [-...|
|4296|   59|[0.18236395870895...|[[-1.0], [0.0], [...|
|5356|   72|[0.29078179220169...|[[0.0], [0.0], [-...|
|5209|   69|[0.05100303792818...|[[0.0], [0.0], [-...|
|5618|   73|[0.12795618347010...|[[0.0], [0.0], [-...|
|8501|   96|[0.20028948043936...|[[-1.0], [0.0

In [13]:
print("Approximately joining Train_df and Test_df on Euclidean distance smaller than 1:")
model.approxSimilarityJoin(Train_df, Test_df, 1.1, distCol="EuclideanDistance")\
    .select(col("datasetA.id").alias("Train_df"),
            col("datasetB.id").alias("Test_df"),
            col("EuclideanDistance")).show(30)

Approximately joining Train_df and Test_df on Euclidean distance smaller than 1:


KeyboardInterrupt: 

In [None]:
key = Vectors.dense(X_test[1][0:-2])

In [None]:
X_test[1][-2]

In [None]:
key

In [None]:
print("Approximately searching Train_df for 2 nearest neighbors of the key:")
result = model.approxNearestNeighbors(Train_df, key, 10)

In [None]:
result_id = result.select('id',).collect()
result_id[0].id

In [None]:
result.show()

In [None]:
# from matplotlib.pyplot import imshow
# imshow(imgs[4795])

In [None]:
# Acc_train = 0 
# for i in range(Test_df.count()):
#     Catg = X_test[i][-2]
#     key = Vectors.dense(X_test[i][0:-2])
#     result = model.approxNearestNeighbors(Train_df, key, 10)
#     temp = Counter([int(row['label']) for row in result.collect()])
#     if  not temp.get(Catg):
#         Acc_train += 0
#     else:
#          Acc_train += 1
#     if (i % 100 == 0):
#         print (i)

In [None]:
Acc_train = 0 
for i in range(0, 100):
    Catg = X_test[i][-2]
    key = Vectors.dense(X_test[i][0:-2])
    result = model.approxNearestNeighbors(Train_df, key, 10)
    temp = Counter([int(row['label']) for row in result.collect()])
    if  not temp.get(Catg):
        Acc_train += 0
    else:
         Acc_train += 1
    if (i % 100 == 0):
        print (i)

In [None]:
# Acc_train / 2744

In [None]:
Acc_train / 100