# LSH Algorithm Improvement By Applying Bitmap Indexing

In [1]:
! pip3 install progressbar2
import argparse
import sys
from os import listdir
from os.path import isfile, join
from typing import Dict, List, Optional, Tuple
import imagehash
from PIL import Image
import os, os.path
import cv2
from collections import Counter
import scipy as sp
import numpy as np # Import numpy library 
from skimage.feature import hog # Import Hog model to extract features
from sklearn.metrics import confusion_matrix # Import confusion matrix to evaluate the performance
import pandas as pd
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.conf import SparkConf
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split



In [2]:
imgs = []
y = []
file_size = []
k = 0
path = "./data/101_ObjectCategories" # Give the dataset path here

##  Data Preprocessing:
1. Load the images using cv2
2. Image resize
3. Feature extraction: BGR to Gray conversion 
4. Feature extraction: Histogram of Oriented Gradients(HOG)

In [3]:
folder = os.listdir(path) # from the given path get the file names such as accordion, airplanes etc..
for file in folder: # for every file name in the given path go inseide that directory and get the images
    subpath = os.path.join(path,file)  # Join the name of these files to the previous path 
    
    files = os.listdir(subpath) # Take these image names to a list called files
    j = 0
    for i in range(np.size(files)): # now we shall loop through these number of files
        
        im = cv2.imread(subpath+'/'+files[0+j]) # Read the images from this subpath
        
        imgs.append(im) # append all the read images to a list called imgs
        y.append(k) # generate a labe to every file and append it to labels list

        j += 1
        if (j == (np.size(files))):
            file_size.append(j)
   
    k += 1
     
y = np.array(y).tolist()
ix = []
for index, item in enumerate(imgs):
    if (np.size(item) == 1):
        ix.append(index)
        del imgs[index]
        
for index, item in enumerate(y):
    for v in range(np.size(ix)):
        if (index == ix[v]):
            del y[index]
        
y = np.array(y).astype(np.float64) 

# Function to convert an image from color to grayscale
def rgb2gray(rgb):
    gray = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY)
    return gray

def resize_(image):
    u = cv2.resize(image,(256,256))
    return u

def fd_hog(image):
    fd = hog(image, orientations=8, pixels_per_cell=(64, 64),
                        cells_per_block=(2, 2))
    
    return fd

In [4]:
a=[]
import progressbar
with progressbar.ProgressBar(max_value=len(imgs)) as bar:
    i=1
    for img in imgs:
        b=resize_(img)
        c=rgb2gray(b)   
        d=fd_hog(c)
        a.append(d)
        bar.update(i)
        i+=1

/Users/fudonghuang/anaconda3/lib/python3.7/site-packages/skimage/feature/_hog.py:150: skimage_deprecation: Default value of `block_norm`==`L1` is deprecated and will be changed to `L2-Hys` in v0.15. To supress this message specify explicitly the normalization method.
  skimage_deprecation)
100% (9176 of 9176) |####################| Elapsed Time: 0:04:05 Time:  0:04:05


In [5]:
print("HOG diamension: ")
len(a[0])

HOG diamension: 


288

In [6]:
def getBestPerformance(a, numOfTest, bucketLength,numHashTables,numOfNeighbor):
    df = pd.DataFrame(a)
    df['lable'] = y
    id_ = np.arange(1,len(df)+1,1)
    df['id'] = id_
    X = df.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    spark = SparkSession.builder \
     .master("local") \
     .appName("Image Retrieval") \
     .config("spark.some.config.option", "some-value") \
     .getOrCreate()

    Train = map(lambda x: (int(x[-1]),int(x[-2]),Vectors.dense(x[:-2])), X_train)
    Train_df = spark.createDataFrame(Train,schema=['id','label',"features"])
    Test = map(lambda x: (int(x[-1]),int(x[-2]),Vectors.dense(x[:-2])), X_test)
    Test_df = spark.createDataFrame(Test,schema=['id','label',"features"])

    brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", 
                                      bucketLength=bucketLength,numHashTables=numHashTables)
    
    
    model = brp.fit(Train_df)
    model.transform(Train_df)
    ## run test dataset 
    accuracy = 0
    with progressbar.ProgressBar(max_value = numOfTest) as bar:
        for i in range(0, numOfTest):
            Catg = X_test[i][-2]
            key = Vectors.dense(X_test[i][0:-2])
            result = model.approxNearestNeighbors(Train_df, key, numOfNeighbor)
            temp = Counter([int(row['label']) for row in result.collect()])
            if  Catg in temp:
                accuracy += temp.get(Catg)/ numOfNeighbor
            bar.update(i)
        accuracy /= numOfTest
    return accuracy

In [14]:
#set Param 
bucketLengthList = np.arange(20, 61, 5)
numHashTablesList = np.arange(0,125,10)
numHashTablesList[0] = 1
numOfNeighbor = [5]
numOfTest = 1000
print("Checking bucketLength Param:")
print(bucketLengthList)
print("Checking numHashTablesList Param:")
print(numHashTablesList)

Checking bucketLength Param:
[20 25 30 35 40 45 50 55 60]
Checking numHashTablesList Param:
[  1  10  20  30  40  50  60  70  80  90 100 110 120]


In [None]:
%%time
bucketLengthList_para=[]
numHashTablesList_para=[]
resList = []
for i in bucketLengthList:
    for j in numHashTablesList:
        for k in numOfNeighbor:
            result = getBestPerformance(a, numOfTest ,i, j, k)
            resList.append(result)
            resStr = "bucketLen:" + str(i) + "  #Hashtable:" + str(j) + "  #Neighbor:" + str(k) + "  Acc:" + str(result)
            print(resStr)
            bucketLengthList_para.append(i)
            numHashTablesList_para.append(j)

100% (1000 of 1000) |####################| Elapsed Time: 0:17:50 Time:  0:17:50


bucketLen:20  #Hashtable:1  #Neighbor:5  Acc:0.39259999999999917


100% (1000 of 1000) |####################| Elapsed Time: 0:15:14 Time:  0:15:14


bucketLen:20  #Hashtable:10  #Neighbor:5  Acc:0.41099999999999925


100% (1000 of 1000) |####################| Elapsed Time: 0:14:49 Time:  0:14:49


bucketLen:20  #Hashtable:20  #Neighbor:5  Acc:0.425399999999999


100% (1000 of 1000) |####################| Elapsed Time: 0:15:46 Time:  0:15:46


bucketLen:20  #Hashtable:30  #Neighbor:5  Acc:0.4125999999999991


100% (1000 of 1000) |####################| Elapsed Time: 0:19:06 Time:  0:19:06


bucketLen:20  #Hashtable:40  #Neighbor:5  Acc:0.4037999999999989


100% (1000 of 1000) |####################| Elapsed Time: 0:26:49 Time:  0:26:49


bucketLen:20  #Hashtable:50  #Neighbor:5  Acc:0.4129999999999993


100% (1000 of 1000) |####################| Elapsed Time: 0:31:00 Time:  0:31:00


bucketLen:20  #Hashtable:60  #Neighbor:5  Acc:0.41559999999999914


 49% (493 of 1000) |##########           | Elapsed Time: 0:12:14 ETA:   0:13:36

In [None]:
df_result = pd.DataFrame()
df_result['BucketLength'] = bucketLengthList_para
df_result['NumHashTables'] = numHashTablesList_para
df_result['Acc'] = resList
df_result = df_result.sort_values(by=['Acc'],ascending=False)
df_result.to_csv('./result.csv') #Chang the name every you wanna sava a file

In [None]:
df_result

# !!!Skip all the code below !!!

## Split data
Split the data to training and validation data. We choose 70% for training and 30% for validation purposes.

In [None]:
%%time
# append 'label' and 'id' to the last two colunms
df = pd.DataFrame(a)
df['lable'] = y
id_ = np.arange(1,len(df)+1,1)
df['id'] = id_
X = df.values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Using PySpark to retrieve similar images

In [None]:
spark = SparkSession.builder \
     .master("local") \
     .appName("Image Retrieval") \
     .config("spark.some.config.option", "some-value") \
     .getOrCreate()

In [None]:

Train = map(lambda x: (int(x[-1]),int(x[-2]),Vectors.dense(x[:-2])), X_train)
Train_df = spark.createDataFrame(Train,schema=['id','label',"features"])

In [None]:
Test = map(lambda x: (int(x[-1]),int(x[-2]),Vectors.dense(x[:-2])), X_test)
Test_df = spark.createDataFrame(Test,schema=['id','label',"features"])

In [None]:
Train_df.show(n = 2)

# !!!!! Skip以下代码直接运行最后一行 !!!!!

In [None]:
## skip以下代码直接运行最后一行

In [None]:
brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes",bucketLength=2,numHashTables=3)
model = brp.fit(Train_df)
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(Train_df).show()

In [None]:
key = Vectors.dense(X_test[0][0:-2])

In [None]:
key

In [None]:
X_test[0][-2]

In [None]:
print("Approximately searching Train_df for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(Train_df, key, 5).show()

In [None]:
# result_id = result.select('label',).collect()
# result_id[0].label

In [None]:
# print("Approximately joining Train_df and Test_df on Euclidean distance smaller than 1:")
# model.approxSimilarityJoin(Train_df, Test_df, 1.1, distCol="EuclideanDistance")\
#     .select(col("datasetA.id").alias("Train_df"),
#             col("datasetB.id").alias("Test_df"),
#             col("EuclideanDistance")).show(30)

In [None]:
accuracy = 0
numOfNeighbor = 5
numOfTest= 5
accList = []
with progressbar.ProgressBar(max_value=numOfTest) as bar:
    for i in range(0, numOfTest):
        Catg = X_test[i][-2]
        key = Vectors.dense(X_test[i][0:-2])
        result = model.approxNearestNeighbors(Train_df, key, numOfNeighbor)
        temp = Counter([int(row['label']) for row in result.collect()])
        if  Catg in temp:
            accuracy += temp.get(Catg)/ numOfNeighbor
            accList.append(temp.get(Catg)/ numOfNeighbor)
        else:
            accList.append(0)
        bar.update(i)
    accuracy /= numOfTest

In [None]:
accuracy

In [None]:
print(accList)

# 运行下面一行 

In [None]:
# from matplotlib.pyplot import imshow
# imshow(imgs[4795])