### Idea
* Calculate phash matrices for images. Flatten them into vectors.
* Based on the Hamming distance between vectors, decide which images might have the wrong label. The assumption is that if a group of very similar images has different labels, then some of those images might be mislabeled. 
* Use a BK-tree to reduce time complexity (we can use this data structure since the Hamming distance is a metric).

A BK-tree can also be used with other metrics, such as the Levenshtein distance, Euclidean, L1, etc.

Generally, we call a function $d$ a metric if it satisfies the following conditions
$$d(x, y) \geq 0\\ 
  d(x, y) = 0 \iff x=y\\
  d(x, y) = d(y, x)\\
  d(x, y) \leq d(x, z) + d(z, y),
$$
for all $x, y$ and $z$ from the domain of the function $d$. 

Notice that cosine similarity can't be used in a BK-tree: it is a similarity rather than a distance, and the corresponding cosine distance ($1 - \cos$) doesn't satisfy the triangle inequality, so it is not a metric by definition.

In [None]:
import numpy as np 
import pandas as pd 
from tqdm.notebook import tqdm
from collections import deque
from operator import itemgetter


__all__ = ['hamming_distance', 'BKTree']

__version__ = '1.1'

# Sort key used by BKTree.find: orders (distance, item) tuples by distance only,
# so the items themselves are never compared.
_getitem0 = itemgetter(0)

### BKTree
### source https://github.com/benhoyt/pybktree

class BKTree(object):
    """BK-tree data structure that allows fast querying of matches that are
    "close" given a function to calculate a distance metric (e.g., Hamming
    distance or Levenshtein distance).
    Each node in the tree (including the root node) is a two-tuple of
    (item, children_dict), where children_dict is a dict whose keys are
    non-negative distances of the child to the current item and whose values
    are nodes.

    The doctest examples in the method docstrings assume a helper
    ``hamming_distance(x, y)`` that returns the number of differing bits of
    two integers, i.e. ``bin(x ^ y).count('1')``.
    """
    def __init__(self, distance_func, items=()):
        """Initialize a BKTree instance with given distance function
        (which takes two items as parameters and returns a non-negative
        distance integer). "items" is an optional iterable of items to add
        on initialization; it is only iterated, never modified.
        >>> tree = BKTree(hamming_distance)
        >>> list(tree)
        []
        >>> tree.distance_func is hamming_distance
        True
        >>> tree = BKTree(hamming_distance, [])
        >>> list(tree)
        []
        >>> tree = BKTree(hamming_distance, [0, 4, 5])
        >>> sorted(tree)
        [0, 4, 5]
        """
        self.distance_func = distance_func
        # Root node as (item, children_dict); None while the tree is empty.
        self.tree = None

        _add = self.add
        for item in items:
            _add(item)

    def add(self, item):
        """Add given item to this tree.

        The item is inserted by walking down from the root: at each node the
        distance to that node's item selects the child slot; the walk stops
        at the first free slot. Items equal to an existing one are stored
        again (as a child at distance 0), not deduplicated.
        >>> tree = BKTree(hamming_distance)
        >>> list(tree)
        []
        >>> tree.add(4)
        >>> sorted(tree)
        [4]
        >>> tree.add(15)
        >>> sorted(tree)
        [4, 15]
        """
        node = self.tree
        if node is None:
            self.tree = (item, {})
            return

        # Slight speed optimization -- avoid lookups inside the loop
        _distance_func = self.distance_func

        while True:
            parent, children = node
            distance = _distance_func(item, parent)
            node = children.get(distance)
            if node is None:
                # Free slot at this distance: attach the new leaf here.
                children[distance] = (item, {})
                break

    def find(self, item, n):
        """Find items in this tree whose distance is less than or equal to n
        from given item, and return list of (distance, item) tuples ordered by
        distance.

        Only children whose edge distance d satisfies
        ``distance - n <= d <= distance + n`` are visited; the rest of the
        tree is skipped. The result is sorted by distance only, so items at
        the same distance keep their traversal order.
        >>> tree = BKTree(hamming_distance)
        >>> tree.find(13, 1)
        []
        >>> tree.add(0)
        >>> tree.find(1, 1)
        [(1, 0)]
        >>> for item in [0, 4, 5, 14, 15]:
        ...     tree.add(item)
        >>> sorted(tree)
        [0, 0, 4, 5, 14, 15]
        >>> sorted(tree.find(13, 1))
        [(1, 5), (1, 15)]
        >>> sorted(tree.find(13, 2))
        [(1, 5), (1, 15), (2, 4), (2, 14)]
        >>> sorted(tree.find(0, 1000)) == [(hamming_distance(x, 0), x) for x in tree]
        True
        """
        if self.tree is None:
            return []

        candidates = deque([self.tree])
        found = []

        # Slight speed optimization -- avoid lookups inside the loop
        _candidates_popleft = candidates.popleft
        _candidates_extend = candidates.extend
        _found_append = found.append
        _distance_func = self.distance_func

        while candidates:
            candidate, children = _candidates_popleft()
            distance = _distance_func(candidate, item)
            if distance <= n:
                _found_append((distance, candidate))

            if children:
                # Window of edge distances that can still contain a match.
                lower = distance - n
                upper = distance + n
                _candidates_extend(c for d, c in children.items() if lower <= d <= upper)

        found.sort(key=_getitem0)
        return found

    def __iter__(self):
        """Return iterator over all items in this tree; items are yielded in
        arbitrary order.
        >>> tree = BKTree(hamming_distance)
        >>> list(tree)
        []
        >>> tree = BKTree(hamming_distance, [1, 2, 3, 4, 5])
        >>> sorted(tree)
        [1, 2, 3, 4, 5]
        """
        if self.tree is None:
            return

        candidates = deque([self.tree])

        # Slight speed optimization -- avoid lookups inside the loop
        _candidates_popleft = candidates.popleft
        _candidates_extend = candidates.extend

        while candidates:
            candidate, children = _candidates_popleft()
            yield candidate
            _candidates_extend(children.values())

    def __repr__(self):
        """Return a string representation of this BK-tree with a little bit of info.

        Distance callables without a ``__name__`` attribute (for example
        ``functools.partial`` objects) are shown using their own repr.
        >>> BKTree(hamming_distance)
        <BKTree using hamming_distance with no top-level nodes>
        >>> BKTree(hamming_distance, [0, 4, 8, 14, 15])
        <BKTree using hamming_distance with 3 top-level nodes>
        """
        func_name = getattr(self.distance_func, '__name__', None)
        if func_name is None:
            # Not every callable carries __name__; fall back instead of raising.
            func_name = repr(self.distance_func)

        return '<{} using {} with {} top-level nodes>'.format(
            self.__class__.__name__,
            func_name,
            len(self.tree[1]) if self.tree is not None else 'no',
        )

In [None]:
# Directory holding the shopee-product-matching input files, and the training CSV inside it.
data_dir = '../input/shopee-product-matching'
train_dir = f'{data_dir}/train.csv'

# Read the training table; the trailing expression previews its first rows
# when this runs as a notebook cell.
df_train = pd.read_csv(train_dir)
df_train.head()

In [None]:
def hex_to_hash(hexstr):
    """Convert a hexadecimal hash string into a flat vector of 0/1 integers.

    Modified from the ``hex_to_hash`` function of imagehash. Each hex digit
    contributes exactly four bits, most significant bit first, so the result
    always has ``4 * len(hexstr)`` elements regardless of the value encoded
    (leading zero bits are kept).

    Parameters
    ----------
    hexstr : str
        Hash written in hexadecimal, expected without a ``0x`` prefix.

    Returns
    -------
    numpy.ndarray
        1-D integer array of bits.

    Raises
    ------
    ValueError
        If ``hexstr`` is not a valid hexadecimal number (including ``''``).
    """
    # Pad to the full bit width of the string itself rather than to a rounded
    # square size, so hashes of equal length always yield vectors of equal length.
    n_bits = len(hexstr) * 4
    bit_string = '{:0>{width}b}'.format(int(hexstr, 16), width=n_bits)
    return np.array([int(bit) for bit in bit_string], dtype=int)

def simple_hamming(x1, x2, idx=0):
    """Count the positions at which element ``idx`` of two items differs.

    ``x1`` and ``x2`` are indexable records; only ``x1[idx]`` and ``x2[idx]``
    are compared (with ``!=``), and the number of non-zero entries of that
    comparison is returned.
    """
    left, right = x1[idx], x2[idx]
    mismatches = left != right
    return np.count_nonzero(mismatches)

# Expand every hexadecimal perceptual hash into a flat bit vector.
df_train['phash_vec'] = df_train['image_phash'].apply(hex_to_hash)

# One tree item per row, laid out as
# [phash_vec, label_group, title, image, posting_id];
# simple_hamming compares element 0 (the bit vector) by default.
item_columns = ['phash_vec', 'label_group', 'title', 'image', 'posting_id']
tree_items = list(df_train[item_columns].values)

tree = BKTree(simple_hamming, tree_items)

# consider using different threshold
hamming_distance = 1

suspicious_groups = []

# One tree lookup per row; the tree skips subtrees that cannot contain a match
# (the original author's estimate for the whole loop was O(n log n)).
for query in tqdm(tree_items):

    neighbor_list = tree.find(query, hamming_distance)

    # The query row is itself stored in the tree, so it is always returned at
    # distance 0; a single hit therefore means "no other neighbors".
    if len(neighbor_list) <= 1:
        continue

    # Each hit is (distance, row); element 1 of the row is label_group.
    neighbor_labels = [row[1] for _, row in neighbor_list]

    # if all items in the neighbor_list don't have the same label then we assume
    # that some samples have wrong label
    if len(set(neighbor_labels)) > 1:
        print('>', end='')
        suspicious_groups.append(neighbor_list)

In [None]:
# posting_id (element 4 of each stored row) of every member of every suspicious group.
sus_ids = [item[1][4] for group in suspicious_groups for item in group]

# The same posting can show up in several groups, so count distinct ids only.
n_sus = len(set(sus_ids))
share_pct = round(n_sus/len(df_train)*100,2)

print('Total number of suspicious samples: {} or in percent: {}%'.format(n_sus, share_pct))
print('Consider removing them from the training set')

In [None]:
# Sample: inspect the first suspicious group, i.e. the list of (distance, row)
# tuples returned by tree.find for one query.
suspicious_groups[0]

In [None]:
import cv2
import matplotlib.pyplot as plt
#plt.rcParams["figure.figsize"] = (10,10)

def image_viz(image_path, title, ax):
    """
    Function for visualization.
    Reads the image at ``image_path`` with OpenCV and draws it on the
    matplotlib axes ``ax``, using ``title`` as the axes title.

    Raises FileNotFoundError if the image cannot be read.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising, which would otherwise surface as an opaque cvtColor error.
    if img is None:
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    # OpenCV loads channels in BGR order; convert to RGB before plotting.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    ax.imshow(img)
    ax.set_title(title)
    
train_img_dir = '../input/shopee-product-matching/train_images'

# show random 15 suspicious groups

print('Very similar images that have different label')

n_cols=4

# Nothing can be sampled (or drawn) when no group was flagged.
n_groups_to_show = 15 if suspicious_groups else 0

for _ in range(n_groups_to_show):

    # Pick a group by random index. np.random.choice would first convert the
    # nested list of (distance, row) tuples into an array, which is rejected
    # unless that conversion happens to yield a 1-D array.
    sample = suspicious_groups[np.random.randint(len(suspicious_groups))]

    # Ceiling division: enough rows of n_cols axes to hold the whole group.
    n_rows = max(1, -(-len(sample) // n_cols))
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5*n_rows))
    labels = []

    for j, ax in enumerate(axes.flatten()):

        if j >= len(sample):
            # Grid cell with no image left to show: mark it with a red line.
            ax.plot(list(range(1000)), color = 'red')
            continue

        # Each entry is (distance, row); row[3] is the image file name and
        # row[1] the label_group.
        row = sample[j][1]
        img_path = f'{train_img_dir}/{row[3]}'
        img_label_group = row[1]
        labels.append(img_label_group)
        image_viz(img_path, img_label_group, ax)

    fig.suptitle("labels: "+"_".join(str(x) for x in set(labels)))
    plt.show()

References:
* https://www.kaggle.com/maksymshkliarevskyi/shopee-before-we-start-eda-phash-baseline
