# Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import os
import re
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import unidecode
import codecs
from wordcloud import WordCloud, STOPWORDS
from fuzzywuzzy import fuzz
import cudf
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from cuml.common.sparsefuncs import csr_row_normalize_l2
import gc

In [None]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
np.random.seed(0)
# Load the spaCy pipeline used by cosine_similarity(). The trailing comment is a disabled
# option that would switch off the tagger, parser and NER components.
# NOTE(review): the 'en' shortcut name is presumably only accepted by older spaCy releases — confirm.
nlp = spacy.load('en')#, disable=["tagger", "parser", "ner"])

# Helper functions

In [None]:
def plot_bar_chart(x, y, title, rotation_angle=45):
    '''
    Draw a seaborn bar chart of y against x on a 20x15 figure and show it.

    x: values for the horizontal axis
    y: bar heights
    title: text placed above the chart
    rotation_angle: rotation (degrees) applied to the x tick labels
    '''
    plt.figure(figsize=(20, 15))
    bar_axes = sns.barplot(x=x, y=y)
    bar_axes.set_title(title)
    plt.xticks(rotation=rotation_angle)
    plt.show()

In [None]:
def plot_images(dataframe, column_name, value):
    '''
    Plot the images of every row where dataframe[column_name] == value.

    dataframe: frame with 'image_path' and 'posting_id' columns
    column_name: column to filter on
    value: value the column must equal

    Each image is titled with the posting_id of the same filtered row.
    Images are laid out 10 per row; the grid has at least 10 rows and grows
    when more than 100 images match.
    '''
    plt.figure(figsize = (30, 30))
    value_filter = dataframe[dataframe[column_name] == value]
    image_paths = value_filter['image_path'].to_list()
    print(f'Total images: {len(image_paths)}')
    # Titles must come from the filtered rows; taking them from the full frame
    # pairs each image with an unrelated posting_id.
    posting_ids = value_filter['posting_id'].to_list()
    n_cols = 10
    # Ceiling division, never fewer than the original 10 rows.
    n_rows = max(10, -(-len(image_paths) // n_cols))
    for position, (path, pid) in enumerate(zip(image_paths, posting_ids), start=1):
        plt.subplot(n_rows, n_cols, position)
        img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        plt.title(pid)
        plt.axis("off")
        plt.imshow(img)
    if image_paths:
        # One layout pass after all subplots exist is enough.
        plt.tight_layout()

In [None]:
def plot_matched_images(images_path, posting_id):
    '''
    Show each image on a 10x10 subplot grid, titled with its posting id.

    images_path: iterable of image file paths
    posting_id: iterable of titles, paired with images_path by position
    '''
    plt.figure(figsize=(50, 50))
    position = 0
    for path, pid in zip(images_path, posting_id):
        position += 1
        plt.subplot(10, 10, position)
        bgr = cv2.imread(path)
        # OpenCV reads BGR; convert so matplotlib shows the right colours.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        plt.title(pid)
        plt.axis("off")
        plt.tight_layout()
        plt.imshow(rgb)

In [None]:
def plot_images_by_label_group(label):
    '''
    Plot every train_df image whose label_group equals `label`,
    titled with its posting_id, on a 10x10 subplot grid.
    '''
    plt.figure(figsize=(30, 30))
    group_rows = train_df[train_df['label_group'] == label]
    paths = group_rows['image_path'].to_list()
    print(f'Total images: {len(paths)}')
    ids = group_rows['posting_id'].to_list()
    for slot, (path, pid) in enumerate(zip(paths, ids), start=1):
        plt.subplot(10, 10, slot)
        picture = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        plt.title(pid)
        plt.axis("off")
        plt.tight_layout()
        plt.imshow(picture)

In [None]:
def plot_images_by_phash(image_phash):
    '''
    Plot every train_df image whose image_phash equals the given value,
    titled with its posting_id, on a 10x10 subplot grid.
    '''
    plt.figure(figsize=(30, 30))
    same_hash_rows = train_df[train_df['image_phash'] == image_phash]
    paths = same_hash_rows['image_path'].to_list()
    print(f'Total images: {len(paths)}')
    ids = same_hash_rows['posting_id'].to_list()
    for slot, (path, pid) in enumerate(zip(paths, ids), start=1):
        plt.subplot(10, 10, slot)
        picture = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
        plt.title(pid)
        plt.axis("off")
        plt.tight_layout()
        plt.imshow(picture)

In [None]:
def hamming_distance(phash1, phash2):
    '''
    Count the differing bits between two hexadecimal phash strings.

    Each hash is expanded to a binary string, left-padded with zeros to
    64 characters, and the two strings are compared position by position.
    '''
    bits_a, bits_b = (bin(int(value, 16))[2:].zfill(64) for value in (phash1, phash2))
    mismatches = [a != b for a, b in zip(bits_a, bits_b)]
    return np.sum(mismatches)

In [None]:
def hamming_distance_bin(phash1, phash2):
    '''
    Count the positions at which two already-binary phash strings differ.
    Comparison stops at the end of the shorter input.
    '''
    differing = []
    for left, right in zip(phash1, phash2):
        differing.append(left != right)
    return np.sum(differing)

In [None]:
def get_record_from_df(dataframe, column_name, value):
    '''
    Return the rows of `dataframe` whose `column_name` equals `value`.
    '''
    row_mask = dataframe[column_name] == value
    return dataframe[row_mask]
    

In [None]:
def cosine_similarity(string1, string2):
    '''
    Return the similarity score between two strings as reported by the
    spaCy pipeline `nlp` (Doc.similarity of the two parsed documents).

    string1, string2: texts to compare
    '''
    d1 = nlp(string1)
    d2 = nlp(string2)
    # Compare the first document with the second; comparing d2 with itself
    # always yields the self-similarity.
    return d1.similarity(d2)

In [None]:
def find_matches(posting_id, dataframe, dist_thr=10, title_thr=60):
    '''
    Find rows of `dataframe` that match the given posting by phash or by title.

    posting_id: posting_id of the source row
    dataframe: train/test dataframe from which the phash & title can be pulled
    dist_thr: a row matches when its phash Hamming distance is <= dist_thr
    title_thr: otherwise a row matches when its fuzzy title score (0-100) is > title_thr

    Returns a dict mapping matched posting_id -> image_path.
    Raises IndexError when posting_id is not present in dataframe.
    '''
    results = {}
    # Look the source row up once instead of once per attribute.
    source = dataframe[dataframe['posting_id'] == posting_id]
    phash_value = source.image_phash.to_list()[0]
    title_value = source.clean_title.to_list()[0]
    print(title_value)
    # Loop-invariant: lower-case the query title a single time.
    query_title = title_value.lower()
    for row in dataframe.itertuples():
        if hamming_distance(phash_value, row.image_phash) <= dist_thr:
            results[row.posting_id] = row.image_path
            continue

        # The fuzzy score is only needed for rows the phash did not already match.
        if fuzz.token_set_ratio(query_title, row.clean_title.lower()) > title_thr:
            results[row.posting_id] = row.image_path
    return results

In [None]:
class ProductMatch:
    '''
    Aggregating phash | fuzzymatch | cosine similarity

    cudf_df: frame whose 'clean_title' column is used to fit the TF-IDF vectorizer
    pro_df: pandas frame with 'posting_id', 'hash', 'clean_title' and 'image_path'
            columns, used for the phash / fuzzy-title scoring. It is only read,
            never modified.

    Typical use: call find_phash_fuzz_match() first, then cos_match() to top the
    result up with TF-IDF matches that were not already found.
    '''
    # Cap on the matches taken from each of the phash and fuzzy-title lists, and on
    # the combined size that cos_match() tops the result up to.
    _MAX_MATCHES = 50

    def __init__(self, cudf_df, pro_df):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.vectorizer.fit_transform(cudf_df['clean_title'])
        self.pro_df = pro_df
        # Result of the latest find_phash_fuzz_match() call. Starts empty so that
        # cos_match() can run before it without raising AttributeError.
        self.fuz_ph = {}


    def find_phash_fuzz_match(self, posting_id, dist_thr=10, title_thr=60):
        '''
        Match by phash distance (<= dist_thr) or fuzzy title score (> title_thr).

        Returns (and stores in self.fuz_ph) a dict posting_id -> image_path holding
        up to _MAX_MATCHES phash matches plus up to _MAX_MATCHES title matches, both
        taken in descending title-score order.
        Raises IndexError when posting_id is not present in pro_df.
        '''
        source = self.pro_df.loc[self.pro_df['posting_id'] == posting_id]
        phash_val = source['hash'].to_list()[0]
        title_val = source['clean_title'].to_list()[0]

        # Score on a small working copy: writing the score columns into pro_df and
        # sorting it in place would reorder the caller's DataFrame on every query.
        scored = self.pro_df[['posting_id', 'image_path']].assign(
            image_phash_score=self.pro_df['hash'].apply(lambda x: hamming_distance_bin(phash_val, x)),
            title_score=self.pro_df['clean_title'].apply(lambda x: fuzz.token_set_ratio(title_val, x)),
        # A stable sort keeps tie order independent of earlier queries.
        ).sort_values(by='title_score', ascending=False, kind='stable')

        i_score = scored.loc[scored['image_phash_score'] <= dist_thr].head(self._MAX_MATCHES)
        t_score = scored.loc[scored['title_score'] > title_thr].head(self._MAX_MATCHES)

        self.fuz_ph = {
            **dict(zip(i_score['posting_id'].to_list(), i_score['image_path'].to_list())),
            **dict(zip(t_score['posting_id'].to_list(), t_score['image_path'].to_list())),
        }

        return self.fuz_ph


    # Ref: https://medium.com/rapids-ai/natural-language-processing-text-preprocessing-and-vectorizing-at-rocking-speed-with-rapids-cuml-74b8d751812e
    def efficient_csr_cosine_similarity(self, query, matrix_normalized=False):
        '''
        Dot product of the stored TF-IDF matrix with the L2-normalised query.

        query: sparse query vector(s); normalised here, the input is left untouched
        matrix_normalized: pass True when self.tfidf_matrix is already row-normalised;
                           otherwise it is normalised and the result is stored back
        '''
        query = csr_row_normalize_l2(query, inplace=False)
        if not matrix_normalized:
            self.tfidf_matrix = csr_row_normalize_l2(self.tfidf_matrix, inplace=False)
        return self.tfidf_matrix.dot(query.T)

    def cos_match(self, df, query, cos_thr=0.2, top_n=50):
        '''
        Top up the phash/fuzzy result with TF-IDF title matches.

        df: frame aligned row-for-row with the fitted TF-IDF matrix
        query: posting_id of the source row
        cos_thr: keep only rows with similarity > cos_thr
        top_n: number of best-scoring rows considered

        Returns a dict posting_id -> image_path of rows not already in self.fuz_ph,
        limited so that the combined result holds at most _MAX_MATCHES entries.
        '''
        title = self.pro_df.loc[self.pro_df['posting_id'] == query].clean_title.to_list()[0]
        query_vec = self.vectorizer.transform(cudf.Series([title]))
        # NOTE(review): matrix_normalized=True presumably relies on the vectorizer
        # emitting L2-normalised rows — confirm.
        similarities = self.efficient_csr_cosine_similarity(query_vec, matrix_normalized=True)
        similarities = similarities.todense().reshape(-1)
        best_idx = similarities.argsort()[-top_n:][::-1]
        op_df = cudf.DataFrame({
            'posting_id': df['posting_id'].iloc[best_idx],
            'image_path': df['image_path'].iloc[best_idx],
            'similarity': similarities[best_idx]
        })
        cos_df = op_df.to_pandas()
        # isin() needs the flat list of ids; wrapping it in another list would
        # compare each id against a single list object.
        cos_df = cos_df[~cos_df['posting_id'].isin(list(self.fuz_ph.keys()))]
        cos_df = cos_df.loc[cos_df['similarity'] > cos_thr]
        # Clamp at zero: a negative slice bound would drop rows from the end
        # instead of returning nothing.
        remaining = max(0, self._MAX_MATCHES - len(self.fuz_ph))
        cos_df = cos_df.head(remaining)
        return dict(zip(cos_df.posting_id.to_list(), cos_df.image_path.to_list()))

In [None]:
# Ref: https://medium.com/rapids-ai/natural-language-processing-text-preprocessing-and-vectorizing-at-rocking-speed-with-rapids-cuml-74b8d751812e

def efficient_csr_cosine_similarity(query, tfidf_matrix, matrix_normalized=False):
    '''
    Dot product of the TF-IDF matrix with the L2-normalised query.

    query: query vector(s); a normalised copy is used
    tfidf_matrix: document matrix
    matrix_normalized: True when tfidf_matrix rows are already normalised,
                       otherwise a normalised copy is used
    '''
    normalized_query = csr_row_normalize_l2(query, inplace=False)
    if matrix_normalized:
        corpus = tfidf_matrix
    else:
        corpus = csr_row_normalize_l2(tfidf_matrix, inplace=False)
    return corpus.dot(normalized_query.T)

def product_match(df, query, vectorizer, tfidf_matrix, top_n=50):
    '''
    Return the top_n rows of `df` most similar to the `query` title.

    df: frame aligned row-for-row with tfidf_matrix
    query: title text to search for
    vectorizer: fitted vectorizer used to transform the query
    tfidf_matrix: matrix produced by the same vectorizer (treated as normalised)
    top_n: number of best matches returned, best first

    Returns a cudf.DataFrame with posting_id, title, image_path and similarity.
    '''
    print(f"Product match: {query}")
    query_series = cudf.Series([query])
    scores = efficient_csr_cosine_similarity(
        vectorizer.transform(query_series), tfidf_matrix, matrix_normalized=True
    )
    scores = scores.todense().reshape(-1)
    # argsort is ascending: take the last top_n entries and reverse them.
    top_idx = scores.argsort()[-top_n:][::-1]
    columns = {
        'posting_id': df['posting_id'].iloc[top_idx],
        'title': df['clean_title'].iloc[top_idx],
        'image_path': df['image_path'].iloc[top_idx],
        'similarity': scores[top_idx]
    }
    return cudf.DataFrame(columns)

In [None]:
# Pre-compiled pattern matching a single decimal digit. Raw string: '\d' in a
# plain string literal is an invalid escape sequence.
digit_check = re.compile(r'\d')
def check_alpha_num(token):
    '''
    Return True when the token contains at least one digit, False otherwise.
    '''
    return bool(digit_check.search(token))

In [None]:
def handle_consecutive_char(string):
    '''
    Collapse every run of three or more identical characters into one.
    Runs of two are left untouched.
    '''
    triple_run = re.compile(r'(.)\1+\1+')
    return triple_run.sub(r'\1', string)

# Getting started with data

In [None]:
# Directory holding the CSV files and the train/test image folders read below.
source_path = '../input/shopee-product-matching'

In [None]:
# Load the training set, the test set and the sample submission from source_path.
train_df = pd.read_csv(f'{source_path}/train.csv')
test_df = pd.read_csv(f'{source_path}/test.csv')
sample_submission_df = pd.read_csv(f'{source_path}/sample_submission.csv')

In [None]:
# True if any cell of the training frame is null.
print(f'Is there any NaN values?: {train_df.isnull().values.any()}')

Preparing image paths

In [None]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Build the path of every image from its file name and the matching image folder.
train_df['image_path'] = train_df['image'].progress_apply(lambda x: f"{source_path}/train_images/{x}")
test_df['image_path'] = test_df['image'].progress_apply(lambda x: f"{source_path}/test_images/{x}")

In [None]:
tqdm.pandas()
# Expand each hexadecimal phash into a zero-padded 64-character bit string so that
# hamming_distance_bin() can compare hashes without re-parsing them.
train_df['hash'] = train_df['image_phash'].progress_apply(lambda x: bin(int(x, 16))[2:].zfill(64))
# The test hashes must be derived from the test set's own phashes; using
# train_df['image_phash'] here would assign training hashes by row index.
test_df['hash'] = test_df['image_phash'].progress_apply(lambda x: bin(int(x, 16))[2:].zfill(64))

In [None]:
# Row and column counts of both sets.
print(f'Trainset: {train_df.shape} \nTestset: {test_df.shape}')

In [None]:
# Column dtypes and non-null counts of the training set.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test set.
test_df.info()

In [None]:
# Preview the first training rows.
train_df.head()

In [None]:
# Preview the first test rows.
test_df.head()

## Label Group

Label group count

In [None]:
# Number of postings per label group, largest groups first.
label_group_count = (
    train_df.groupby(['label_group'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
label_group_count

In [None]:
# Rows whose label_group already occurred in an earlier row.
duplicate_label_rows = train_df["label_group"].duplicated().sum()
print(f'No. of Duplicate label group: {duplicate_label_rows}')

In [None]:
# Smallest and largest label-group sizes.
group_sizes = label_group_count['count']
print(f"Minimum product under single label group: {group_sizes.min()}\nMaximum product under single label group: {group_sizes.max()}")

In [None]:
# Bar chart of the first 50 rows of label_group_count.
top_label_groups = label_group_count.iloc[:50]
x, y = top_label_groups['label_group'], top_label_groups['count']
plot_bar_chart(x, y, title='Label Group Chart')

Checking the similarity of image phash that falls under same group

In [None]:
# All training rows of label group 509010932.
train_df[train_df['label_group'] == 509010932]

In [None]:
# Bit distance between two phash values.
# NOTE(review): presumably both hashes come from the label group displayed above — confirm.
hamming_distance('eab5c295966ac368', 'efc096b0d38e98c3')

Cannot rely on phash similarity, title similarity along with phash can be considered

In [None]:
# Show the images of label group 509010932.
plot_images(train_df, 'label_group', 509010932)

## Images

Image count

In [None]:
# Number of postings per image file, most reused images first.
image_count = (
    train_df.groupby(['image'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
image_count

In [None]:
# Rows whose image file name already occurred in an earlier row.
duplicate_image_rows = train_df["image"].duplicated().sum()
print(f'No. of Duplicate images: {duplicate_image_rows}')

In [None]:
# Bar chart of the first 50 rows of image_count.
top_images = image_count.iloc[:50]
x, y = top_images['image'], top_images['count']
plot_bar_chart(x, y, title='Image count')

Check whether rows that share the same image also share the same phash

In [None]:
# Rows whose image file name already occurred in an earlier row.
tmp_image = train_df[train_df["image"].duplicated() == True]

# Count the distinct phash values of every image once, instead of re-filtering
# the whole frame for each duplicated row.
phash_per_image = train_df.groupby('image')['image_phash'].nunique(dropna=False)
inconsistent_images = set(phash_per_image[phash_per_image != 1].index)

for i in tmp_image.itertuples():
    if i.image in inconsistent_images:
        print(f'phash mismatch: {i}')


In [None]:
# Show the images of label group 159351600.
plot_images(train_df, 'label_group', 159351600)

# Image phash

* Perceptual hashing acts as an image fingerprint, generated by mathematically analyzing the content of the image. 
* It's a 64-bit representation. 
* We can calculate the distance between two phashes using the Hamming distance to estimate how similar the two images are. The lower the score, the more likely they are to be identical. (An example is shown below.)
* It is also widely used for use-cases of copyright-infringement. 

[Read more about phash](https://en.wikipedia.org/wiki/Perceptual_hashing)

In [None]:
# Number of postings per phash value, most frequent hashes first.
phash_count = (
    train_df.groupby(['image_phash'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
phash_count

In [None]:
# Show every image that carries the phash e992966d4ba49761.
plot_images(train_df, 'image_phash', 'e992966d4ba49761')

All images belong to different **posting_id** values, yet visually they are the same.

In [None]:
# Collect every phash whose Hamming distance to the reference hash is below 10,
# stored as [phash, distance] pairs.
distance = [
    [candidate, gap]
    for candidate in phash_count['image_phash']
    for gap in [hamming_distance('fad28daa2ad05595', candidate)]
    if gap < 10
]
print(distance)

* Calculating distance between 2 image phash for similarity
* Considering fad28daa2ad05595 for reference to compare it with other hash for similarity
* Higher score = less similar

In [None]:
# Images of the reference phash used in the distance search above.
plot_images(train_df, 'image_phash', 'fad28daa2ad05595')

In [None]:
# Images of phash f2728d8b8ad055b5, to compare visually with the reference hash.
plot_images(train_df, 'image_phash', 'f2728d8b8ad055b5')

In [None]:
# Images of phash fad28dab22d05595, to compare visually with the reference hash.
plot_images(train_df, 'image_phash', 'fad28dab22d05595')

* Here, we have visualised all the images for the phash values mentioned above, selected by their Hamming distance 
* They are visually similar except for the watermark in the center

# Title

Title count

In [None]:
# Rows whose title already occurred in an earlier row.
duplicate_title_rows = train_df["title"].duplicated().sum()
print(f'No. of Duplicate titles: {duplicate_title_rows}')

In [None]:
# The rows counted above: every repeat of an already-seen title.
train_df[train_df["title"].duplicated() == True]

Handling unicode data

In [None]:
# Raw title at index label 4, before unicode handling.
train_df['title'][4]

In [None]:
tqdm.pandas()
# Interpret backslash escape sequences in each title, then transliterate the
# result to ASCII with unidecode.
# NOTE(review): decoding a str with 'unicode_escape' may garble characters that are
# already non-ASCII — confirm against real titles.
train_df['unicode_handled_title'] = train_df['title'].progress_apply(lambda x: unidecode.unidecode(codecs.decode(x, 'unicode_escape')))

In [None]:
# Same title as above after unicode handling.
train_df['unicode_handled_title'][4]

# WordCloud

In [None]:
# One space-separated string holding every unicode-handled title.
title_data = ' '.join(train_df['unicode_handled_title'])

In [None]:
# Word cloud over all titles, with the library's default stop words removed.
cloud_builder = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='black',
    colormap='Set2',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = cloud_builder.generate(title_data)
plt.figure(figsize=(40, 30))
plt.imshow(wordcloud)
plt.axis("off")

# Text cleaning

In [None]:
# Token counts per title, with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
count_m = vectorizer.fit_transform(train_df['unicode_handled_title'])

In [None]:
# get_feature_names() was removed in newer scikit-learn releases; prefer
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# Sum the columns on the sparse matrix directly: toarray() would materialise the
# full documents x vocabulary matrix in memory just to add it up.
token_totals = np.asarray(count_m.sum(axis=0)).ravel().tolist()

# Corpus-wide frequency of every token, rarest first.
count_df = pd.DataFrame({'tokens': feature_names, 'count': token_totals})
count_df = count_df.sort_values(by='count', ascending=True)

In [None]:
# Frequencies of the 50 rarest tokens (count_df is sorted ascending).
rarest_tokens = count_df.iloc[:50]
plt.figure(figsize=(15, 15))
sns.pointplot(x=rarest_tokens['tokens'], y=rarest_tokens['count'], linestyles="-")
plt.xlabel("tokens")
plt.ylabel("frequency")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Frequencies of the 50 most common tokens (count_df is sorted ascending).
most_common_tokens = count_df.iloc[-50:]
plt.figure(figsize=(15, 15))
sns.pointplot(x=most_common_tokens['tokens'], y=most_common_tokens['count'], color="green", linestyles="-")
plt.xlabel("tokens")
plt.ylabel("frequency")
plt.xticks(rotation=90)
plt.show()

Here, there are a lot of numbers, punctuation marks and consecutive repeated characters. Hence, we clean this data by:
* Dropping all tokens that contain a digit
* Fixing consecutive repeated characters
* Applying a regex to filter non-alphanumeric content from the titles

In [None]:
def _clean_title(text):
    '''
    Lower-case the title, replace every non-alphanumeric character with a space,
    drop tokens that contain a digit or consist of one repeated character
    (single-character tokens included), and collapse runs of three or more
    identical characters in the remaining tokens.
    '''
    tokens = str(re.sub('[^A-Za-z0-9]', ' ', text.lower().strip())).split()
    kept = []
    for token in tokens:
        if check_alpha_num(token):
            continue
        if token == len(token) * token[0]:
            continue
        kept.append(handle_consecutive_char(token))
    return ' '.join(kept)

tqdm.pandas()
train_df['clean_title'] = train_df['unicode_handled_title'].progress_apply(_clean_title)

# Product Match

Earlier, we saw that phash similarity is not enough. Hence we considered the phash + title matching.

Plotting the matched products/posting_ids from the data frame for the given posting_id. The results look pretty interesting, but of course there are mismatched products among them.

In the below test, the first image is from the source posting_id and the images in the grid are the match from other records in the data frame.

**The results have improved after text cleaning**

In [None]:
# Match by phash + fuzzy title, show the source posting, then the matched postings.
result = find_matches('train_1638187876', train_df)
plot_images(train_df, 'posting_id', 'train_1638187876')
plot_matched_images(result.values(), result.keys())

In [None]:
# Same check for posting train_3193897481: source image first, then its matches.
result = find_matches('train_3193897481', train_df)
plot_images(train_df, 'posting_id', 'train_3193897481')
plot_matched_images(result.values(), result.keys())

In [None]:
# Same check for posting train_2767483557; only the first 50 matches are plotted.
result = find_matches('train_2767483557', train_df)
plot_images(train_df, 'posting_id', 'train_2767483557')
plot_matched_images(list(result.values())[:50], list(result.keys())[:50])

In [None]:
# Same check for posting train_2406599165: source image first, then its matches.
result = find_matches('train_2406599165', train_df)
plot_images(train_df, 'posting_id', 'train_2406599165')
plot_matched_images(result.values(), result.keys())

In [None]:
# Same check for posting train_4085449742: source image first, then its matches.
result = find_matches('train_4085449742', train_df)
plot_images(train_df, 'posting_id', 'train_4085449742')
plot_matched_images(result.values(), result.keys())

# RAPIDS TfidfVectorizer cosine similarity match

Here, I am trying TfidfVectorizer + cosine similarity on the product titles. The results look reasonable, and the matched products that are returned seem more accurate compared to the results of the approach above. There are still mismatches, so I need to experiment with the score threshold. 

***Aggregating phash, fuzzymatch, cosine similarity***

In [None]:
# Build a cuDF frame from the pandas training frame. ProductMatch fits its TF-IDF
# vectorizer on the cuDF copy and uses the pandas frame for phash / fuzzy-title scoring.
cudf_df = cudf.DataFrame(train_df)
obj = ProductMatch(cudf_df, train_df)

In [None]:
def con(obj, posting_id, df, dist_thr=10, title_thr=60, cos_thr=0.2):
    '''
    Combine the phash/fuzzy-title matches with the TF-IDF cosine matches.

    obj: matcher exposing find_phash_fuzz_match() and cos_match()
    posting_id: posting to find matches for
    df: frame handed to cos_match()
    dist_thr: phash distance threshold
    title_thr: fuzzy title score threshold
    cos_thr: cosine similarity threshold

    Returns one dict posting_id -> image_path holding both result sets.
    '''
    # Forward the caller's thresholds instead of fixed values.
    ph = obj.find_phash_fuzz_match(posting_id, dist_thr=dist_thr, title_thr=title_thr)
    cs = obj.cos_match(df, posting_id, cos_thr=cos_thr, top_n=50)
    return {**ph, **cs}

In [None]:
%%time
# Combined phash + fuzzy title + TF-IDF matches for posting train_1638187876.
result = con(obj, 'train_1638187876', cudf_df, dist_thr=10, title_thr=60, cos_thr=0.2)
plot_matched_images(result.values(), result.keys())

In [None]:
%%time
# Combined matches for posting train_3193897481.
result = con(obj, 'train_3193897481', cudf_df, dist_thr=10, title_thr=60, cos_thr=0.2)
plot_matched_images(result.values(), result.keys())

In [None]:
# Combined matches for posting train_2767483557.
result = con(obj, 'train_2767483557', cudf_df, dist_thr=10, title_thr=60, cos_thr=0.2)
plot_matched_images(result.values(), result.keys())

In [None]:
# Combined matches for posting train_1827962737.
result = con(obj, 'train_1827962737', cudf_df, dist_thr=10, title_thr=60, cos_thr=0.2)
plot_matched_images(result.values(), result.keys())

In [None]:
# Combined matches for posting train_4085449742.
result = con(obj, 'train_4085449742', cudf_df, dist_thr=10, title_thr=60, cos_thr=0.2)
plot_matched_images(result.values(), result.keys())

<h3 align="center" style="background-color:green;">WIP</h3> 

# Baseline

This is based on the fuzzy matching + phash approach.

In [None]:
tqdm.pandas()
# Interpret backslash escape sequences in each title, then transliterate to ASCII.
test_df['unicode_handled_title'] = test_df['title'].progress_apply(lambda x: unidecode.unidecode(codecs.decode(x, 'unicode_escape')))
# Clean the test titles exactly like the training titles. Digits must survive the
# character filter ('[^A-Za-z0-9]'), otherwise check_alpha_num() can never drop a
# digit-bearing token and such tokens are split into letter fragments instead.
test_df['clean_title'] = test_df['unicode_handled_title'].progress_apply(lambda x: ' '.join(handle_consecutive_char(i) for i in str(
                re.sub('[^A-Za-z0-9]', ' ', x.lower().strip())).split() if i.strip() and not check_alpha_num(i.strip()) and not (i.strip(
                ) == len(i.strip()) * i.strip()[0])))

In [None]:
# Build the matcher for the test set. This rebinds `obj`, which until now referred
# to the matcher built on the training set.
cudf_test_df = cudf.DataFrame(test_df)
obj = ProductMatch(cudf_test_df, test_df)

In [None]:
tqdm.pandas()
# For every test posting, join the posting ids of its combined matches into one
# space-separated string.
test_df['matches'] = test_df['posting_id'].progress_apply(lambda x: ' '.join(con(obj, x, cudf_test_df, dist_thr=10, title_thr=60, cos_thr=0.2).keys()))

In [None]:
# Two-column submission frame: posting_id and its space-separated matches.
submission_csv = pd.DataFrame({'posting_id': test_df['posting_id'].to_list(), 'matches': test_df['matches'].to_list()})
submission_csv

In [None]:
# Write the submission file to the working directory, without the index column.
submission_csv.to_csv('submission.csv', index=False)

Please check out my other kernel: [Locality Sensitive Hashing(LSH)](https://www.kaggle.com/srcecde/shoppee-locality-sensitive-hashing-lsh-jaccard)

<h3 align="center" style="background-color:#003300;color:white;">Thanks! More updates to come. WIP</h3> 