### md5

In [1]:
import hashlib
from matplotlib.pyplot import imread
from skimage.transform import resize
import cv2
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import time
import numpy as np
import pandas as pd

In [None]:
def file_hash(filepath, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    The file is hashed in fixed-size chunks so that large files do not have
    to fit in memory at once.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes to read per chunk (must be positive).

    Returns:
        The MD5 digest as a 32-character lowercase hex string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive number of bytes")
    # The original called a bare ``md5`` that was never imported; use the
    # function through the ``hashlib`` module that the notebook does import.
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel keeps reading until read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

In [None]:
import os
# Display the kernel's current working directory.
os.getcwd()

In [None]:
# Make the image folder the working directory; the following cells list and
# open files by bare name, relative to this directory.
# NOTE(review): hard-coded path. r'\\images' is two literal backslashes
# followed by "images" -- confirm this resolves on the machine running the notebook.
os.chdir(r'\\images')
# Display the working directory again to confirm the change.
os.getcwd()

In [None]:
# Snapshot the names of all entries in the working directory. Later cells
# index into file_list by position, so its order must not change.
file_list = os.listdir()
# Report how many entries were found.
print(len(file_list))

In [None]:
import hashlib, os


def find_duplicate_files(directory='.', chunk_size=1 << 20):
    """Find byte-identical files in ``directory`` by comparing MD5 digests.

    Args:
        directory: Folder whose direct entries are scanned (sub-folders and
            other non-file entries are skipped).
        chunk_size: Number of bytes read at a time while hashing.

    Returns:
        A tuple ``(names, first_seen, dups)`` where ``names`` is the directory
        listing the indices refer to, ``first_seen`` maps each digest to the
        index of the first file that produced it, and ``dups`` is a list of
        ``(duplicate_index, first_seen_index)`` pairs.
    """
    names = os.listdir(directory)
    first_seen = dict()
    dups = []
    for index, name in enumerate(names):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        # Hash in chunks so large files are never loaded into memory whole.
        digest = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                digest.update(chunk)
        filehash = digest.hexdigest()
        if filehash not in first_seen:
            first_seen[filehash] = index
        else:
            dups.append((index, first_seen[filehash]))
    return names, first_seen, dups


# The indices stored in `duplicates` are positions in `file_list`. Both are
# taken from the same directory listing here so they cannot drift apart
# (previously the indices came from a second, independent os.listdir call).
file_list, hash_keys, duplicates = find_duplicate_files('.')

In [None]:
# Display the collected (duplicate_index, first_seen_index) pairs.
duplicates

In [None]:
# Show up to the first 30 duplicate pairs side by side as a visual sanity
# check: first-seen file on the left, its duplicate on the right.
for dup_index, first_index in duplicates[:30]:
    try:
        # Load both images before drawing anything, so a file that cannot be
        # read never leaves a half-filled figure behind.
        first_img = imread(file_list[first_index])
        dup_img = imread(file_list[dup_index])
    except OSError as e:
        # Report the skipped pair instead of hiding it silently.
        print(f"Skipping pair ({dup_index}, {first_index}): {e}")
        continue

    fig, (ax_first, ax_dup) = plt.subplots(1, 2)
    for ax, img, idx in ((ax_first, first_img, first_index),
                         (ax_dup, dup_img, dup_index)):
        ax.imshow(img)
        ax.set_title(str(idx))
        ax.set_xticks([])
        ax.set_yticks([])
    plt.show()
    # Release the figure; up to 30 open figures would otherwise accumulate.
    plt.close(fig)

In [None]:
# Remove Duplicates
#
# Delete the later copy of every duplicate pair, keeping the first-seen file.
# A file that is already gone (for example because this cell was run twice)
# is skipped instead of aborting the loop with FileNotFoundError.
removed_count = 0
for dup_index, _first_index in duplicates:
    try:
        os.remove(file_list[dup_index])
    except FileNotFoundError:
        continue
    removed_count += 1
print(f"Removed {removed_count} duplicate file(s)")

There are four common types of perceptual image hashes:

1) Average Hash (aHash): This algorithm computes the average brightness of an image and then reduces the image to a binary hash by comparing each pixel's brightness to the average. It's simple and fast, but not very robust to scaling, rotation, or other image transformations.

2) Difference Hash (dHash): This algorithm works similarly to aHash, but instead of using the average brightness, it computes the difference between adjacent pixels. The resulting hash is more robust to certain types of image transformations, but less so to others.

3) Perceptual Hash (pHash): This algorithm uses a more complex approach based on the Discrete Cosine Transform (DCT) to compute a hash that is more tolerant of image transformations like scaling, compression, and brightness changes; it is not invariant to significant rotation. It's generally considered to be more robust than aHash or dHash, but also slower.

4) Wavelet Hash (wHash): This algorithm uses a multi-resolution wavelet decomposition to compute a hash that tolerates certain types of image transformations, such as scaling and brightness changes; like the other hashes here, it is not truly invariant to significant rotation or translation. It's more complex than aHash or dHash, but generally faster than pHash.

Each of these algorithms has its own strengths and weaknesses, and which one you choose will depend on the specific requirements of your application. In general, if you need a fast and simple hash for images that won't be heavily transformed, aHash or dHash may be sufficient. If you need a more robust hash that can handle a wider range of transformations, pHash or wHash may be more appropriate.

Let's try pHash for our use case:

### pHASH

In [2]:
from PIL import ImageFile
from PIL import Image 
# Pillow global setting; presumably lets image files with truncated data load
# instead of raising -- confirm against the Pillow documentation.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Pillow global setting; presumably raises the pixel-count limit that guards
# against oversized images to 1,000,000,000 pixels -- confirm against the Pillow documentation.
Image.MAX_IMAGE_PIXELS = 1000000000 

In [1]:
import pandas as pd
import requests
import io
import imagehash
from PIL import Image

# Maximum Hamming distance (number of differing hash bits) at which two
# perceptual hashes are still treated as the same image.
threshold = 5

# Seconds to wait on a single HTTP request; without a timeout one stalled
# server would hang the whole run.
REQUEST_TIMEOUT = 30


def fetch_phash(index, url, timeout=REQUEST_TIMEOUT):
    """Download the image at ``url`` and return its perceptual hash.

    Args:
        index: Row position of the URL; used only in error messages.
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``imagehash`` perceptual hash of the image, or ``None`` when the
        download fails, the server answers with a non-200 status, or the
        payload cannot be decoded as an image. Failures are printed, never
        raised, so one bad row cannot abort the scan.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Could not download image at index {index} ({url}) (status code: {response.status_code})")
            return None
        with Image.open(io.BytesIO(response.content)) as img:
            return imagehash.phash(img)
    except requests.exceptions.RequestException as e:
        print(f"Error: Could not download image at index {index} ({url}) ({str(e)})")
    except OSError as e:
        # Undecodable payloads surface from Pillow as OSError. This clause must
        # follow RequestException, which is itself an OSError subclass.
        print(f"Error: Could not decode image at index {index} ({url}) ({str(e)})")
    return None


duplicates = []       # (duplicate row, reference row) pairs
hash_keys = dict()    # hex pHash -> row of the first image seen with that hash
seen_hashes = []      # (hash object, row) for the same reference images
dup_frames = []       # one-row snapshots of the rows flagged as duplicates
df = pd.read_csv("data.csv")

# Create every annotation column up front so they exist in the output even
# when no duplicate is found (the comparison cell reads them unconditionally).
for column in ('duplicate_url', 'duplicate_ids', 'duplicate_category_name'):
    df[column] = ""

# enumerate() yields row positions; they double as labels for df.at because
# read_csv is called without index_col and so produces a default 0..n-1 index.
for index, url in enumerate(df['url']):
    phash = fetch_phash(index, url)
    if phash is None:
        continue
    phash_hex = str(phash)

    # Exact hash match first (cheap dict lookup) ...
    dup_index = hash_keys.get(phash_hex)
    if dup_index is None:
        # ... otherwise look for the closest reference image within the
        # threshold. Previously only string-identical hashes were compared,
        # so the threshold never had any effect, and the reference image was
        # downloaded a second time just to compute a distance of 0.
        closest = min(seen_hashes, key=lambda entry: phash - entry[0], default=None)
        if closest is not None and phash - closest[0] <= threshold:
            dup_index = closest[1]

    if dup_index is None:
        # New image: it becomes a reference for the rows that follow. Rows
        # flagged as duplicates are not registered as references themselves.
        hash_keys[phash_hex] = index
        seen_hashes.append((phash, index))
        continue

    duplicates.append((index, dup_index))
    # Snapshot the row before it is annotated; these copies are appended to
    # the output below, as the original cell did.
    dup_frames.append(df.iloc[[index]])

    # Point the duplicate row at the reference image it matches. This is
    # recorded even when both rows share the same URL.
    df.at[index, 'duplicate_url'] = df.at[dup_index, 'url']
    df.at[index, 'duplicate_ids'] = str(df.at[dup_index, 'item_id'])
    df.at[index, 'duplicate_category_name'] = df.at[dup_index, 'category_name']

# Build the duplicate frame once instead of growing it inside the loop.
dup_df = pd.concat(dup_frames) if dup_frames else df.iloc[:0].copy()

# concatenate original and duplicate dataframes
result_df = pd.concat([df, dup_df])

# print the duplicate images that belong to the different classes
for dup in duplicates:
    if df.at[dup[0], 'category_name'] != df.at[dup[1], 'category_name']:
        print(f"Duplicate images with item_id {df.at[dup[0], 'item_id']} and {df.at[dup[1], 'item_id']} belong to different classes ({df.at[dup[0], 'category_name']} and {df.at[dup[1], 'category_name']})")

result_df.to_csv("duplicate.csv", index=False)

In [2]:
import pandas as pd

# Load the duplicate report from duplicate.csv. The two category columns are
# read as text explicitly: a column that is entirely empty (or entirely
# numeric) would otherwise get a numeric dtype and break the .str calls below.
df = pd.read_csv(
    "duplicate.csv",
    dtype={'category_name': str, 'duplicate_category_name': str},
)

# strip leading and trailing white spaces and convert to lowercase
for column in ('category_name', 'duplicate_category_name'):
    df[column] = df[column].str.strip().str.lower()

# 'yes' where a row's category equals its duplicate's category, 'no' otherwise
# (rows without a recorded duplicate category compare unequal and get 'no').
same_category = df['category_name'] == df['duplicate_category_name']
df['comparison'] = same_category.map({True: 'yes', False: 'no'})

df