In [6]:
import pandas as pd
import numpy as np



from PIL import Image
import imagehash
import os


In [7]:
def calculate_hash(dir):
    """Generate Hash Values for all images in a directory

    Args:
        dir (str): Directory to search for images

    Returns:
        hash_values (list): List of hash values for all images in the directory
        image_ids (list): List of image ids for all images in the directory
    """

    hash_values = []
    image_ids = []

    for file in os.listdir(dir):

        path = os.path.join(dir, file)
        img = Image.open(path)

        hash = imagehash.average_hash(img)

        hash_values.append(hash)
        image_ids.append(file)

    return image_ids, hash_values

In [8]:
image_ids, hash_values = calculate_hash("C:\\Users\\sdass\\Pictures\\FreeVideoToJPGConverter\\1. Python for data mining (11-1-2022 12-59-17 PM)")

In [9]:
hash_values

[array([[False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False]]),
 array([[False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False],
        

In [10]:
def prepare_dataframe(image_ids, hash_values):

    # Create DataFrame with hash values and image ids
    df = pd.DataFrame(
        {
            "image_ids": image_ids, 
            "hash_values": hash_values, 
            }
    )

    # Create new columns in df with image_ids having hash difference value=0 
    for i in range(len(df.image_ids)):
        df[f"diff_{image_ids[i]}"] = 0

    return df

In [11]:
df = prepare_dataframe(image_ids, hash_values)

In [12]:
df

Unnamed: 0,image_ids,hash_values,diff_1. Python for data mining 1.jpg,diff_1. Python for data mining 2.jpg,diff_1. Python for data mining 3.jpg,diff_1. Python for data mining 4.jpg,diff_1. Python for data mining 5.jpg,diff_1. Python for data mining 6.jpg,diff_1. Python for data mining 7.jpg,diff_1. Python for data mining 8.jpg,diff_1. Python for data mining 9.jpg
0,1. Python for data mining 1.jpg,0000000000000000,0,0,0,0,0,0,0,0,0
1,1. Python for data mining 2.jpg,0000000000000000,0,0,0,0,0,0,0,0,0
2,1. Python for data mining 3.jpg,1e8e9e9e9ebf8e9e,0,0,0,0,0,0,0,0,0
3,1. Python for data mining 4.jpg,ffa393c7938387b2,0,0,0,0,0,0,0,0,0
4,1. Python for data mining 5.jpg,8e9f9f8e9e869e8e,0,0,0,0,0,0,0,0,0
5,1. Python for data mining 6.jpg,f8fcf8f8fef60000,0,0,0,0,0,0,0,0,0
6,1. Python for data mining 7.jpg,0e8e8edffefe02be,0,0,0,0,0,0,0,0,0
7,1. Python for data mining 8.jpg,9e9e9ebf9f9f0f86,0,0,0,0,0,0,0,0,0
8,1. Python for data mining 9.jpg,f8fcf8f8fef60000,0,0,0,0,0,0,0,0,0


In [13]:
def calculate_differences(df):

    # Obtain difference for every image_id one by one
    for i in range(len(df.hash_values)):
        differences = []

        for j in range(len(df.hash_values)):
            differences.append(df.hash_values[i] - df.hash_values[j])

        # Store the difference values for every image_id
        df.iloc[i, 2:] = differences

    return df

In [14]:
df = calculate_differences(df)

In [19]:
df

Unnamed: 0,image_ids,hash_values,diff_1. Python for data mining 1.jpg,diff_1. Python for data mining 2.jpg,diff_1. Python for data mining 3.jpg,diff_1. Python for data mining 4.jpg,diff_1. Python for data mining 5.jpg,diff_1. Python for data mining 6.jpg,diff_1. Python for data mining 7.jpg,diff_1. Python for data mining 8.jpg,diff_1. Python for data mining 9.jpg
0,1. Python for data mining 1.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
1,1. Python for data mining 2.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
2,1. Python for data mining 3.jpg,1e8e9e9e9ebf8e9e,39,39,0,27,12,31,12,10,31
3,1. Python for data mining 4.jpg,ffa393c7938387b2,36,36,27,0,25,38,31,25,38
4,1. Python for data mining 5.jpg,8e9f9f8e9e869e8e,37,37,12,25,0,33,20,14,33
5,1. Python for data mining 6.jpg,f8fcf8f8fef60000,34,34,31,38,33,0,27,29,0
6,1. Python for data mining 7.jpg,0e8e8edffefe02be,39,39,12,31,20,27,0,18,27
7,1. Python for data mining 8.jpg,9e9e9ebf9f9f0f86,41,41,10,25,14,29,18,0,29
8,1. Python for data mining 9.jpg,f8fcf8f8fef60000,34,34,31,38,33,0,27,29,0


In [16]:
# Solution 2
def remove_duplicates(df):
    # image = [range(0,6)]
    for i in range(len(df.image_ids)):
        # clean_df = df.
        clean_df = df.drop(df[df[f"diff_{df.image_ids[i]}"] < 15][1:].index)

    return clean_df

In [17]:
clean_df = remove_duplicates(df)

In [26]:
clean_df

Unnamed: 0,image_ids,hash_values,diff_1. Python for data mining 1.jpg,diff_1. Python for data mining 2.jpg,diff_1. Python for data mining 3.jpg,diff_1. Python for data mining 4.jpg,diff_1. Python for data mining 5.jpg,diff_1. Python for data mining 6.jpg,diff_1. Python for data mining 7.jpg,diff_1. Python for data mining 8.jpg,diff_1. Python for data mining 9.jpg
0,1. Python for data mining 1.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
1,1. Python for data mining 2.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
2,1. Python for data mining 3.jpg,1e8e9e9e9ebf8e9e,39,39,0,27,12,31,12,10,31
3,1. Python for data mining 4.jpg,ffa393c7938387b2,36,36,27,0,25,38,31,25,38
4,1. Python for data mining 5.jpg,8e9f9f8e9e869e8e,37,37,12,25,0,33,20,14,33
5,1. Python for data mining 6.jpg,f8fcf8f8fef60000,34,34,31,38,33,0,27,29,0
6,1. Python for data mining 7.jpg,0e8e8edffefe02be,39,39,12,31,20,27,0,18,27
7,1. Python for data mining 8.jpg,9e9e9ebf9f9f0f86,41,41,10,25,14,29,18,0,29


In [27]:
df

Unnamed: 0,image_ids,hash_values,diff_1. Python for data mining 1.jpg,diff_1. Python for data mining 2.jpg,diff_1. Python for data mining 3.jpg,diff_1. Python for data mining 4.jpg,diff_1. Python for data mining 5.jpg,diff_1. Python for data mining 6.jpg,diff_1. Python for data mining 7.jpg,diff_1. Python for data mining 8.jpg,diff_1. Python for data mining 9.jpg
0,1. Python for data mining 1.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
1,1. Python for data mining 2.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
2,1. Python for data mining 3.jpg,1e8e9e9e9ebf8e9e,39,39,0,27,12,31,12,10,31
3,1. Python for data mining 4.jpg,ffa393c7938387b2,36,36,27,0,25,38,31,25,38
4,1. Python for data mining 5.jpg,8e9f9f8e9e869e8e,37,37,12,25,0,33,20,14,33
5,1. Python for data mining 6.jpg,f8fcf8f8fef60000,34,34,31,38,33,0,27,29,0
6,1. Python for data mining 7.jpg,0e8e8edffefe02be,39,39,12,31,20,27,0,18,27
7,1. Python for data mining 8.jpg,9e9e9ebf9f9f0f86,41,41,10,25,14,29,18,0,29
8,1. Python for data mining 9.jpg,f8fcf8f8fef60000,34,34,31,38,33,0,27,29,0


In [20]:
# Solution 1: from stackoverflow
def remove_duplicates(df):
    mask = (df[df.columns[2:]] > 0) & (df[df.columns[2:]] < 15)
    return df[~(pd.DataFrame(mask).any(axis=1))].reset_index(drop=True)

In [24]:
df_rem = remove_duplicates(df)

In [25]:
df_rem

Unnamed: 0,image_ids,hash_values,diff_1. Python for data mining 1.jpg,diff_1. Python for data mining 2.jpg,diff_1. Python for data mining 3.jpg,diff_1. Python for data mining 4.jpg,diff_1. Python for data mining 5.jpg,diff_1. Python for data mining 6.jpg,diff_1. Python for data mining 7.jpg,diff_1. Python for data mining 8.jpg,diff_1. Python for data mining 9.jpg
0,1. Python for data mining 1.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
1,1. Python for data mining 2.jpg,0000000000000000,0,0,39,36,37,34,39,41,34
2,1. Python for data mining 4.jpg,ffa393c7938387b2,36,36,27,0,25,38,31,25,38
3,1. Python for data mining 6.jpg,f8fcf8f8fef60000,34,34,31,38,33,0,27,29,0
4,1. Python for data mining 9.jpg,f8fcf8f8fef60000,34,34,31,38,33,0,27,29,0


In [23]:
df['image_ids']

0    1. Python for data mining 1.jpg
1    1. Python for data mining 2.jpg
2    1. Python for data mining 3.jpg
3    1. Python for data mining 4.jpg
4    1. Python for data mining 5.jpg
5    1. Python for data mining 6.jpg
6    1. Python for data mining 7.jpg
7    1. Python for data mining 8.jpg
8    1. Python for data mining 9.jpg
Name: image_ids, dtype: object