In [None]:
!pip3 install ../input/datasketch/datasketch-1.5.3-py2.py3-none-any.whl

## Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import unidecode
import codecs
import cv2
import matplotlib.pyplot as plt
import re
import time
import spacy
from datasketch import MinHash, MinHashLSHForest
import gc

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(0)
# Load the spaCy pipeline 'en_core_web_lg'; the cosine-similarity code below
# calls this module-level `nlp` object to turn strings into documents.
nlp = spacy.load('en_core_web_lg')

## Helper functions

In [None]:
def cosine_similarity(string1, string2):
    """Return the spaCy similarity score between two strings.

    Both strings are processed with the module-level ``nlp`` pipeline and the
    resulting documents are compared with ``Doc.similarity`` (by default a
    cosine similarity over the documents' vectors).

    Args:
        string1: first text.
        string2: second text.

    Returns:
        The similarity score of the two documents as reported by spaCy.
    """
    d1 = nlp(string1)
    d2 = nlp(string2)
    # Compare the first document with the second one. Comparing a document
    # with itself would always yield the maximum score.
    return d1.similarity(d2)

In [None]:
def jaccard_similarity(l1, l2):
    """Return the Jaccard similarity of two token collections.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    distinct items in ``l1`` and ``l2``. Duplicated items therefore count
    once, both in the intersection and in the union.

    Args:
        l1: iterable of hashable items (e.g. a list of tokens).
        l2: iterable of hashable items.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when both inputs are
        empty, instead of raising ``ZeroDivisionError``.
    """
    set1, set2 = set(l1), set(l2)
    union = set1 | set2
    if not union:
        # Nothing to compare: define the similarity of two empty inputs as 0.
        return 0.0
    return len(set1 & set2) / len(union)

In [None]:
def plot_matched_images(images_path, posting_id):
    """Plot images in a grid, each titled with its posting id.

    Args:
        images_path: iterable of image file paths readable by ``cv2.imread``.
        posting_id: iterable of titles, paired positionally with
            ``images_path`` (extra items of the longer iterable are ignored).

    The grid is 10 columns wide. It has 10 rows for up to 100 images and
    grows by whole rows beyond that, so larger inputs no longer fail.
    An image that cannot be read leaves its cell empty (title only).
    """
    pairs = list(zip(images_path, posting_id))
    n_cols = 10
    # Ceiling division without importing math; never fewer than 10 rows so
    # the layout for <= 100 images is unchanged.
    n_rows = max(10, -(-len(pairs) // n_cols))

    plt.figure(figsize=(50, 50))
    for position, (path, title) in enumerate(pairs, start=1):
        plt.subplot(n_rows, n_cols, position)
        plt.title(title)
        plt.axis("off")
        bgr = cv2.imread(path)
        if bgr is None:
            # cv2.imread signals a missing/unreadable file by returning None
            # rather than raising; skip drawing instead of crashing in cvtColor.
            continue
        plt.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))

    if pairs:
        # Lay out the figure once, after all subplots exist, instead of
        # recomputing the layout on every iteration.
        plt.tight_layout()

In [None]:
def handle_consecutive_char(string):
    """Collapse each run of three or more identical characters to one.

    Runs of exactly two identical characters are left unchanged.
    """
    # One captured character followed by at least two more copies of it.
    long_run = re.compile(r'(.)\1+\1+')
    return long_run.sub(r'\1', string)

In [None]:
# Pre-compiled pattern matching a single digit. A raw string is used so the
# backslash is passed to the regex engine instead of being treated as an
# (invalid) Python string escape.
digit_check = re.compile(r'\d')
def check_alpha_num(token):
    """Return True if ``token`` contains at least one digit.

    Despite the name, only the presence of a digit is checked; the rest of
    the token is not inspected.
    """
    return bool(digit_check.search(token))

In [None]:
# Root folder of the input data; the CSV and image paths below are built from it.
source_path = '../input/shopee-product-matching'

In [None]:
# Read train.csv and test.csv from `source_path` into DataFrames.
train_df = pd.read_csv(f'{source_path}/train.csv')
test_df = pd.read_csv(f'{source_path}/test.csv')

In [None]:
# Register tqdm with pandas so that `progress_apply` is available.
tqdm.pandas()
# Add an `image_path` column: the value of `image` prefixed with the
# train/test image folder under `source_path`.
train_df['image_path'] = train_df['image'].progress_apply(lambda x: f"{source_path}/train_images/{x}")
test_df['image_path'] = test_df['image'].progress_apply(lambda x: f"{source_path}/test_images/{x}")

In [None]:
tqdm.pandas()


def _to_ascii_title(raw_title):
    """Decode backslash escapes in the title, then transliterate with unidecode."""
    return unidecode.unidecode(codecs.decode(raw_title, 'unicode_escape'))


def _clean_title(ascii_title):
    """Lower-case the title, keep only useful alphabetic tokens, and re-join them.

    Every character outside A-Z, a-z and 0-9 becomes a space. A token is
    dropped when it contains a digit or consists of one repeated character
    (which includes single-character tokens). Each remaining token is passed
    through ``handle_consecutive_char``.
    """
    spaced = re.sub('[^A-Za-z0-9]', ' ', ascii_title.lower().strip())
    kept = []
    # str.split() never yields empty or whitespace-padded tokens, so no
    # per-token stripping is needed.
    for token in spaced.split():
        if check_alpha_num(token):
            continue
        if token == len(token) * token[0]:
            continue
        kept.append(handle_consecutive_char(token))
    return ' '.join(kept)


train_df['unicode_handled_title'] = train_df['title'].progress_apply(_to_ascii_title)
train_df['clean_title'] = train_df['unicode_handled_title'].progress_apply(_clean_title)

## LSH 

* Setting number of permutations
* Setting number of recommendations to return  
* Setting the `depth` of the LSH Forest (passed to `MinHashLSHForest` as `l`, the number of prefix trees)
* Preparing shingles
* MinHashing all the shingles
* Preparing MinHashForest of MinHash
* Indexing forest
* Querying forest
* Re-ranking the returned candidates by Jaccard similarity or cosine similarity (post-processing)

Reference: http://ekzhu.com/datasketch/

In [None]:
class LSH:
    """Approximate title matching with MinHash signatures and an LSH Forest.

    Intended call order: ``minhash_data()``, then ``prepare_forest()``, then
    ``query_forest()`` as often as needed. Rows of ``dataframe`` are indexed
    in the forest by their position, so queries map back with ``iloc``.

    Args:
        permutations: ``num_perm`` used for every MinHash signature and for
            the forest.
        number_of_recommendations: number of candidates requested from the
            forest per query, before re-ranking.
        depth: forwarded to ``MinHashLSHForest`` as ``l`` (datasketch
            documents ``l`` as the number of prefix trees).
        dataframe: DataFrame providing the ``clean_title``, ``image_path``
            and ``posting_id`` columns.
    """

    def __init__(self, permutations, number_of_recommendations, depth, dataframe):
        self.permutations = permutations
        self.number_of_recommendations = number_of_recommendations
        self.depth = depth
        self.dataframe = dataframe
        # List of signatures built by minhash_data(); set to None once
        # prepare_forest() has consumed and released them.
        self.minhash = []
        self.forest = None

    def _minhash_tokens(self, tokens):
        """Return a MinHash signature updated with the UTF-8 bytes of each token."""
        signature = MinHash(num_perm=self.permutations)
        for token in tokens:
            signature.update(token.encode('utf-8'))
        return signature

    def minhash_data(self):
        """Build one MinHash signature per ``clean_title`` row.

        The list is rebuilt from scratch on every call, so re-running this
        step neither duplicates signatures nor fails after
        ``prepare_forest()`` has released the previous ones.
        """
        self.minhash = []
        for title in self.dataframe['clean_title']:
            self.minhash.append(self._minhash_tokens(title.split(' ')))

    def prepare_forest(self):
        """Index all signatures in a new ``MinHashLSHForest``.

        Raises:
            RuntimeError: if the signatures were already released by an
                earlier call and ``minhash_data()`` has not been re-run.
        """
        if self.minhash is None:
            raise RuntimeError(
                'MinHash signatures were already released; call minhash_data() '
                'again before prepare_forest().'
            )
        self.forest = MinHashLSHForest(num_perm=self.permutations, l=self.depth)
        # The key is the row position, which query_forest() later feeds to iloc.
        for position, signature in enumerate(self.minhash):
            self.forest.add(position, signature)
        self.forest.index()
        # Release the signatures but keep the attribute, so later calls fail
        # with a clear message instead of an AttributeError.
        self.minhash = None
        gc.collect()

    def query_forest(self, query, number_of_results, cosine_sim=False):
        """Return the best matches for ``query``.

        ``number_of_recommendations`` candidates are fetched from the forest,
        scored against the query and sorted by descending score; the top
        ``number_of_results`` are kept.

        Args:
            query: space-separated query string.
            number_of_results: maximum number of matches to return.
            cosine_sim: score with ``cosine_similarity`` when True, otherwise
                with ``jaccard_similarity`` on space-split tokens.

        Returns:
            A tuple ``(image_paths, posting_ids)`` of two lists in rank order.

        Raises:
            RuntimeError: if ``prepare_forest()`` has not been called yet.
        """
        if self.forest is None:
            raise RuntimeError(
                'The forest has not been built; call minhash_data() and '
                'prepare_forest() before query_forest().'
            )
        query_tokens = query.split(' ')
        candidates = self.forest.query(
            self._minhash_tokens(query_tokens), self.number_of_recommendations
        )
        # Select the column once instead of materialising a full row per key.
        titles = self.dataframe['clean_title']
        if cosine_sim:
            print("Cosine Similarity")
            scored = [
                (key, self.cosine_similarity(titles.iloc[key], query))
                for key in candidates
            ]
        else:
            print("Jaccard Similarity")
            scored = [
                (key, self.jaccard_similarity(titles.iloc[key].split(' '), query_tokens))
                for key in candidates
            ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        best = [key for key, _ in scored[:number_of_results]]
        matches = self.dataframe.iloc[best]
        return matches.image_path.to_list(), matches.posting_id.to_list()

    def jaccard_similarity(self, l1, l2):
        """Return ``|A & B| / |A | B|`` over the distinct items of ``l1`` and ``l2``.

        Duplicated items count once. Returns ``0.0`` when both inputs are
        empty instead of raising ``ZeroDivisionError``.
        """
        set1, set2 = set(l1), set(l2)
        union = set1 | set2
        if not union:
            return 0.0
        return len(set1 & set2) / len(union)

    def cosine_similarity(self, string1, string2):
        """Return the spaCy ``Doc.similarity`` score of the two strings."""
        d1 = nlp(string1)
        d2 = nlp(string2)
        return d1.similarity(d2)
    

        

In [None]:
# 256 permutations per signature, 50 candidates requested per query,
# depth=10 (forwarded to MinHashLSHForest as `l`), indexing the rows of train_df.
obj = LSH(permutations=256, number_of_recommendations=50, depth=10, dataframe=train_df)

In [None]:
%%time
# Build one MinHash signature per row of train_df['clean_title'].
obj.minhash_data()

In [None]:
%%time
# Add every signature to a MinHashLSHForest and index it; the signature list
# is released afterwards, so this step needs minhash_data() to have run first.
obj.prepare_forest()

### Comparing product matches for Jaccard & Cosine similarity

In [None]:
# Keep the 30 candidates with the highest Jaccard similarity (the default scoring) and plot them.
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'focallure blush on powder mineral pigment warna'
im_path, train_path = obj.query_forest(query, 30)
plot_matched_images(im_path, train_path)

In [None]:
# Same query, but candidates are ranked with the spaCy-based cosine similarity (cosine_sim=True).
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'focallure blush on powder mineral pigment warna'
im_path, train_path = obj.query_forest(query, 30, cosine_sim=True)
plot_matched_images(im_path, train_path)

In [None]:
# Keep the 30 candidates with the highest Jaccard similarity (the default scoring) and plot them.
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'johnsonas top to toe hair body bath ml'
im_path, train_path = obj.query_forest(query, 30)
plot_matched_images(im_path, train_path)

In [None]:
# Same query, but candidates are ranked with the spaCy-based cosine similarity (cosine_sim=True).
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'johnsonas top to toe hair body bath ml'
im_path, train_path = obj.query_forest(query, 30, cosine_sim=True)
plot_matched_images(im_path, train_path)

In [None]:
# Keep the 30 candidates with the highest Jaccard similarity (the default scoring) and plot them.
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'pcs ikat rambut karet polos elastis gaya korea untuk wanita'
im_path, train_path = obj.query_forest(query, 30)
plot_matched_images(im_path, train_path)

In [None]:
# Same query, but candidates are ranked with the spaCy-based cosine similarity (cosine_sim=True).
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'pcs ikat rambut karet polos elastis gaya korea untuk wanita'
im_path, train_path = obj.query_forest(query, 30, cosine_sim=True)
plot_matched_images(im_path, train_path)

In [None]:
# Keep the 30 candidates with the highest Jaccard similarity (the default scoring) and plot them.
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'pashmina kusut polos rawis hitam'
im_path, train_path = obj.query_forest(query, 30)
plot_matched_images(im_path, train_path)

In [None]:
# Same query, but candidates are ranked with the spaCy-based cosine similarity (cosine_sim=True).
# Note: `train_path` receives the posting IDs returned by query_forest; they become the plot titles.
query = 'pashmina kusut polos rawis hitam'
im_path, train_path = obj.query_forest(query, 30, cosine_sim=True)
plot_matched_images(im_path, train_path)

<h3 align="center" style="background-color:#003300;color:white;">Thanks! More updates to come. WIP</h3> 