# Shopee - Price Match Guarantee competition.
## We are given a set of products, each represented by an image and a text description. The goal is to discover products that should have the same price.
#### **NOTE**: the goal is quite different from looking for simply duplicated items, which is brilliantly explained by [Roman Glushko](https://www.kaggle.com/glushko) in [this discussion](https://www.kaggle.com/c/shopee-product-matching/discussion/236496)

### This notebook will walk you through the competition. We will perform EDA, have a look at the images, explore perceptual hashing, and peek into the textual data.
### For the modeling part we will create basic training pipelines with PyTorch and RAPIDS and do inference with separate models for image and text data.

## Before we start, I'd like to thank these guys for their insightful and much inspiring work:
[Chris Deotte](https://www.kaggle.com/cdeotte) - for awesome introductions to ML with CUDA, found at the top of the Shopee competition notebooks

[ragnar](https://www.kaggle.com/ragnar123) - https://www.kaggle.com/ragnar123/shopee-efficientnetb3-arcmarginproduct, https://www.kaggle.com/ragnar123/shopee-inference-efficientnetb1-tfidfvectorizer

[Mr_KnowNothing](https://www.kaggle.com/tanulsingh077) - https://www.kaggle.com/tanulsingh077/pytorch-metric-learning-pipeline-only-images

[Parth Dhameliya](https://www.kaggle.com/parthdhameliya77) - https://www.kaggle.com/parthdhameliya77/pytorch-eca-nfnet-l0-image-tfidf-inference

In [None]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
sys.path.append('../input/shopee-competition-rgr')

In [None]:
%%bash
# Copy the helper modules from the attached dataset into ./src so that the
# notebook can import them as the `src` package.
mkdir -p ./src && \
cp ../input/shopee-competition-rgr/*.py ./src

In [None]:
import gc
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm


import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import cv2
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import lr_scheduler
from torch.utils.data import Subset, DataLoader

from src.config import CFG
from src.dataset import ShopeeDataset
from src.loss import Mish, replace_activations
from src.model import ShopeeCNNModel
from src.train import train_fn, eval_fn
from src.transforms import get_train_transforms, get_test_transforms
from src.utils import read_dataset

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Root folder of the competition data; the CSV tables and image folders are read from here.
BASE_DIR = "../input/shopee-product-matching"

In [None]:
# Load the posting metadata tables for the train and test splits.
train = pd.read_csv(f'{BASE_DIR}/train.csv')
test = pd.read_csv(f'{BASE_DIR}/test.csv')

In [None]:
# Drop the 'train_' prefix from the posting ids.
train.posting_id = train.posting_id.str.replace('train_', '')

In [None]:
# Preview the first three rows of the training table.
train.head(3)

In [None]:
# Number of postings in the first ten label groups returned by value_counts().
train.label_group.value_counts()[:10]

# Explore our target

In [None]:
# Box plot of the label-group sizes (number of postings per label group).
ax = plt.axes()
sns.boxplot(train.label_group.value_counts(), ax=ax)
ax.set_xlabel("Count")
ax.set_ylabel("Labels groups")

In [None]:
# Bar chart of the first 20 label groups from value_counts(); the x ticks show
# the label_group ids and the bar heights show the number of postings.
duplicated_labels= train["label_group"].value_counts()[:20]
plt.xticks(range(len(duplicated_labels)), duplicated_labels.index, rotation=90)
plt.bar(range(len(duplicated_labels)), duplicated_labels.values)
plt.show()

### Visualization of these duplicates can provide us with insights into features to account for

In [None]:
# Pick three distinct label groups among the ones plotted above. Sampling
# without replacement guarantees that the same group is never examined twice.
labels_to_examine = duplicated_labels.sample(n=min(3, len(duplicated_labels))).to_frame()
labels_to_examine

In [None]:
# Compare the number of postings with the number of distinct image file names.
len(train), train["image"].nunique()

### Interesting! Identical images can appear in different label groups.
### This suggests that description will play quite a role when deciding whether two products have the same price

# Explore the image data

In [None]:
def visualize_similar_imgs(random=False, COLS=6, ROWS=4, base_path=BASE_DIR):
    """Plot a ROWS x COLS grid of training images together with their titles.

    Args:
        random: if True, every panel shows a row of ``train`` drawn at random;
            otherwise the first ``ROWS * COLS`` rows are shown in order.
        COLS: number of images per figure row.
        ROWS: number of figure rows; one matplotlib figure is created per row.
        base_path: dataset root that contains the ``train_images`` folder.
    """
    root = Path(base_path) / 'train_images'
    for k in range(ROWS):
        plt.figure(figsize=(20, 5))
        for j in range(COLS):
            row = np.random.randint(0, len(train)) if random else COLS * k + j
            # Columns are addressed by position: column 1 is used as the image
            # file name and column 3 as the title.
            name = train.iloc[row, 1]
            title = train.iloc[row, 3]

            # Break the title after every 20th character so it fits the panel.
            title_with_return = ""
            for i, ch in enumerate(title):
                title_with_return += ch
                if i != 0 and i % 20 == 0:
                    title_with_return += '\n'

            # Read the image once per panel (not once per title character).
            img = cv2.imread(str(root / name))

            plt.subplot(1, COLS, j + 1)
            plt.title(title_with_return)
            plt.axis('off')
            if img is None:
                # cv2.imread returns None for a missing or unreadable file;
                # leave the panel empty instead of crashing the whole grid.
                continue
            # OpenCV decodes to BGR while matplotlib expects RGB.
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()

In [None]:
# Show the image grid with the default settings (4 rows x 6 columns, not random).
visualize_similar_imgs()

In [None]:
# Number of postings shown per label group by visualize_dupl_images.
to_visualize = 5

def add_newlines_to_title(title: str, max_chars: int = 15) -> str:
    """Wrap *title* so that no line is longer than *max_chars* characters.

    The title is cut into consecutive chunks of ``max_chars`` characters that
    are joined with newlines. A trailing newline is always appended so the
    text keeps some distance from the image below it.

    Args:
        title: text to wrap.
        max_chars: maximum number of characters per line; must be positive.

    Returns:
        The wrapped title, ending with a newline.

    Raises:
        ValueError: if ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    chunks = [title[i:i + max_chars] for i in range(0, len(title), max_chars)]
    return '\n'.join(chunks) + '\n'

def visualize_dupl_images():
    """Plot up to ``to_visualize`` postings for every label group in ``labels_to_examine``.

    One figure is created per label group; every panel shows the posting's
    image with its wrapped title above it.
    """
    for lg, _ in labels_to_examine.iterrows():
        plt.figure(figsize=(20, 5))
        samples = train[train.label_group == lg][:to_visualize]
        titles = samples['title'].values
        names = samples['image'].values
        # Iterate over the postings actually present: a group may contain
        # fewer than `to_visualize` of them.
        for j, (name, title) in enumerate(zip(names, titles)):
            img_path = str(Path(BASE_DIR).joinpath(f'train_images/{name}'))
            img = cv2.imread(img_path)
            plt.subplot(1, to_visualize, j + 1)
            plt.title(add_newlines_to_title(title))
            plt.axis('off')
            if img is None:
                # cv2.imread returns None for a missing or unreadable file.
                continue
            # OpenCV decodes to BGR while matplotlib expects RGB.
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.show()

# Show example postings for the label groups selected in labels_to_examine.
visualize_dupl_images()

### Both images and descriptions give clear signs of duplication
### Images of the same product show it with some distortions (rotation, brightness, gamma transforms) and noisy objects. There are complete duplicates as well.
### Descriptions contain the name of the product with some extra text, which mildly contributes to higher algorithm performance

In [None]:
# Convenience handles on the training columns used in the cells below.
target = "label_group"
labels = train[target]
descr = train.title
phash = train.image_phash
images = train.image

In [None]:
# Display the title column.
descr

### Perceptual hashes should be similar (not necessarily identical!) for images that are considered similar

In [None]:
# Show the hash column next to the number of distinct hashes and distinct label groups.
phash, phash.nunique(), labels.nunique()

### If the phash is identical, the images are complete copies. We have plenty of such cases

### Let's examine perceptual hashing

#### Different hashing techniques use different image features. Maybe we can guess the hash that was used?

In [None]:
from PIL import Image
import imagehash
img_path1 = f'{BASE_DIR}/train_images/{images[0]}'
img_path2 = f'{BASE_DIR}/train_images/{images[1]}'

def find_img_phashes(img, true_hash=None):
    """Compute and print four perceptual hashes of *img*.

    Args:
        img: a PIL image.
        true_hash: the hash stored in the dataset for this image, printed for
            comparison. Defaults to ``phash[0]`` (the first training row) to
            stay compatible with calls that pass only the image.

    Returns:
        Tuple ``(avg_hash, diff_hash, dct_hash, wavelet_hash)`` as returned by
        the corresponding ``imagehash`` functions.
    """
    if true_hash is None:
        true_hash = phash[0]
    avg_hash = imagehash.average_hash(img)
    diff_hash = imagehash.dhash(img)
    dct_hash = imagehash.phash(img)
    wavelet_hash = imagehash.whash(img)
    print('Hashes:')
    print('AVG: ' + str(avg_hash))
    print('DIFF: ' + str(diff_hash))
    print('DCT: ' + str(dct_hash))
    print('Wavelet: ' + str(wavelet_hash))
    print('\nTrue hash: ' + true_hash)
    return avg_hash, diff_hash, dct_hash, wavelet_hash

# Each image is compared against its own dataset hash (row 0 and row 1).
print("Image 1 perceptual hashes:")
avg_hash1, diff_hash1, dct_hash1, wavelet_hash1 = find_img_phashes(Image.open(img_path1), phash[0])
print("\nImage 2 perceptual hashes:")
avg_hash2, diff_hash2, dct_hash2, wavelet_hash2 = find_img_phashes(Image.open(img_path2), phash[1])

fig, ax = plt.subplots(1, 2, figsize=(10, 10))
ax[0].imshow(Image.open(img_path1))
ax[1].imshow(Image.open(img_path2))
# Subtracting two imagehash objects yields the distance between them.
if(dct_hash1 == dct_hash2):
    print("\nThe pictures are perceptually the same !")
else:
    print(f"\nThe pictures are different, distance: {dct_hash1 - dct_hash2}")

### So, our perceptual hashing algorithm is DCT

### But! DCT is tolerant to minor transformations, of which we have a lot in our data (according to http://www.hackerfactor.com/blog/?/archives/432-Looks-Like-It.html).
### This implies using additional perceptual hashes and some voting rule?!

# Explore product descriptions

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

### What text appears to be most frequently used?

In [None]:
# Join all titles into one string and render a word cloud from it.
text = ' '.join(descr)
wordcloud = WordCloud(width=400, height=400, min_font_size=8, max_font_size=64, background_color='white').generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### The most frequent words can add noise to the model, and we should handle them accordingly

In [None]:
# Display the title column again before looking at its length distribution.
descr

In [None]:
# Distribution of the number of space-separated tokens per title.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(descr.map(lambda x: x.split(' ')).map(len), axlabel='#words in description')

### There are quite a few samples where the description is either too short or too long; the average length is ~8 words.
### A smart way to combine image- and text-based predictions for these samples looks beneficial

# Cooking the model

In [None]:
def run_training(base_dir):
    """Train the EfficientNet-B3 image model on the competition training set.

    Args:
        base_dir: dataset root containing ``train.csv`` and ``train_images/``.

    Returns:
        dict with the per-epoch losses under the keys ``'train'`` and ``'val'``.
        The model weights are saved to
        ``arcface_512x512_efficientnet_b3.pt`` after every epoch.
    """
    # base_dir is a directory, so the table has to be addressed explicitly.
    data = pd.read_csv(f"{base_dir}/train.csv")

    # Keep only rows whose image file exists; copy so that the column
    # assignments below modify an independent frame, not a view of the original.
    present_imgs = set(os.listdir(f"{base_dir}/train_images/"))
    data = data[data['image'].isin(present_imgs)].copy()
    data['image'] = data['image'].apply(lambda x: f"{base_dir}/train_images/" + x)

    # Map label_group ids to consecutive class indices.
    encoder = LabelEncoder()
    data['label_group'] = encoder.fit_transform(data['label_group'])
    train_dataset = ShopeeDataset(data, transforms=get_train_transforms())

    torch.cuda.empty_cache()

    # 90 % / 10 % random split over sample indices.
    n_train = int(0.9 * len(train_dataset))
    indices = np.arange(len(train_dataset))
    train_indices, val_indices = train_test_split(indices, train_size=n_train)

    train_subset = Subset(train_dataset, train_indices)
    val_subset = Subset(train_dataset, val_indices)

    train_dataloader = DataLoader(dataset=train_subset,
                                  batch_size=CFG.batch_size,
                                  num_workers=CFG.num_workers,
                                  shuffle=True,
                                  pin_memory=True,
                                  drop_last=True)
    # Validation must see every sample exactly once: no shuffling and no
    # dropped tail batch (which could even leave the loader empty).
    val_dataloader = DataLoader(dataset=val_subset,
                                batch_size=CFG.batch_size,
                                shuffle=False,
                                pin_memory=True,
                                drop_last=False)

    model = ShopeeCNNModel('efficientnet_b3')
    model.to(CFG.device)

    # Swap the SiLU activations for Mish.
    model = replace_activations(model, torch.nn.SiLU, Mish())

    lr_start = 1e-2
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr_start)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, 3, eta_min=lr_start * 1e-4)

    history = {'train': [], 'val': []}

    for epoch in range(CFG.train_epochs):
        epoch_loss_train = train_fn(model, train_dataloader, criterion, optimizer, scheduler, epoch)
        epoch_loss_val = eval_fn(model, val_dataloader, epoch)
        history['train'].append(epoch_loss_train)
        history['val'].append(epoch_loss_val)
        # Overwrite the checkpoint after every epoch.
        torch.save(model.state_dict(), 'arcface_512x512_efficientnet_b3.pt')

    print(history)
    return history

# Inference

## Image data

In [None]:
# Path to the test metadata table.
# NOTE(review): not referenced by the cells visible below; confirm it is still needed.
TEST_PATH = f"{BASE_DIR}/test.csv"

In [None]:
def get_image_embeddings(base_dir, model_name, model_path):
    """Compute model outputs for every image listed in ``test.csv``.

    Args:
        base_dir: dataset root containing ``test.csv`` and ``test_images/``.
        model_name: backbone name passed to ``ShopeeCNNModel``.
        model_path: path of the saved ``state_dict`` to load.

    Returns:
        numpy array with the model outputs of all batches concatenated along
        the first axis, in the row order of ``test.csv``.
    """
    model = ShopeeCNNModel(model_name=model_name)
    # map_location lets the checkpoint load on whatever device CFG selects,
    # regardless of the device it was saved from.
    model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    model.to(CFG.device)
    model.eval()

    test_data = pd.read_csv(f"{base_dir}/test.csv")
    test_data['image'] = test_data['image'].apply(lambda x: f"{base_dir}/test_images/" + x)

    test_dataset = ShopeeDataset(test_data, transforms=get_test_transforms(), is_training=False)
    image_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    embeds = []
    with torch.no_grad():
        for img, label in tqdm(image_loader):
            # Keep the inputs on the same device as the model.
            img = img.to(CFG.device)
            label = label.to(CFG.device)
            feat = model(img, label)
            embeds.append(feat.detach().cpu().numpy())

    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

In [None]:
def get_image_predictions(df, embeddings, threshold=0.0):
    """Find, for every posting, the postings whose image embedding is close to it.

    Args:
        df: frame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: one embedding per row of ``df``.
        threshold: neighbours are kept when their cosine distance is strictly
            below this value. With the default of 0.0 essentially nothing
            passes, so callers are expected to supply a positive value.

    Returns:
        list with one array of matching posting ids per row of ``embeddings``.
    """
    # Look at up to 50 neighbours, but never more than there are samples:
    # asking for more neighbours than fitted points is an error.
    KNN = min(50, embeddings.shape[0])

    model = NearestNeighbors(n_neighbors=KNN, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < threshold)[0]  # neighbours that are close enough
        ids = indices[k, idx]  # their row positions in df
        posting_ids = df['posting_id'].iloc[ids].values  # translate positions to posting ids
        predictions.append(posting_ids)

    del model, distances, indices
    gc.collect()
    return predictions

## Text data

#### Since the hidden test set contains approx. 70k samples, it requires GPU optimized ML to keep iterations relatively quick

In [None]:
def get_text_predictions(df, df_cu, max_features = 20_000, threshold = 0.75, chunk_size = 1024*4):
    """Find, for every posting, the postings with a similar title.

    Titles are vectorised with a binary TF-IDF model and compared through dot
    products, processed in chunks of rows to bound the size of the similarity
    matrix.

    Args:
        df: frame with a ``posting_id`` column, row-aligned with ``df_cu``.
        df_cu: frame whose ``title`` column is fed to the GPU vectoriser.
        max_features: vocabulary size cap of the TF-IDF model.
        threshold: postings are matched when their similarity is strictly
            above this value.
        chunk_size: number of rows compared against all titles at a time.

    Returns:
        list with one array of matching posting ids per row of ``df``.
    """
    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    text_embeddings = model.fit_transform(df_cu['title']).toarray()
    preds = []

    print('Finding similar titles...')
    n_rows = len(df)
    for a in range(0, n_rows, chunk_size):
        b = min(a + chunk_size, n_rows)
        print('chunk',a,'to',b)

        # Dot products of rows a..b against all rows. This equals the cosine
        # similarity only if the TF-IDF rows are L2-normalised (presumably the
        # vectoriser's default; confirm).
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

    del model,text_embeddings
    gc.collect()
    return preds

In [None]:
# Cosine-distance cut-off passed to get_image_predictions as `threshold`.
KNN_DISTANCE_THRESH = 0.21
# Vocabulary size cap passed to get_text_predictions as `max_features`.
MAX_TEXT_TOKENS = 15_000

In [None]:
# Load the test metadata table.
# NOTE(review): test_data is not referenced by the cells visible below; confirm it is still needed.
test_data = pd.read_csv(f"{BASE_DIR}/test.csv")

In [None]:
# Read the dataset through the project helper, then build the image-based and
# text-based match lists for every posting.
df,df_cu,image_paths = read_dataset(BASE_DIR)

image_embeddings = get_image_embeddings(BASE_DIR, CFG.model_name2, CFG.model_path2)
image_predictions = get_image_predictions(df, image_embeddings, threshold = KNN_DISTANCE_THRESH)
text_predictions = get_text_predictions(df, df_cu, max_features = MAX_TEXT_TOKENS)

In [None]:
def combine_predictions(row):
    """Merge a row's image- and text-based matches into one submission string.

    Args:
        row: mapping with the array-like entries ``'image_predictions'`` and
            ``'text_predictions'`` holding posting ids.

    Returns:
        The distinct posting ids from both entries, sorted and separated by
        single spaces.
    """
    # union1d returns the sorted, de-duplicated union of both id arrays.
    merged_ids = np.union1d(row['image_predictions'], row['text_predictions'])
    return ' '.join(merged_ids)

In [None]:
# Attach both prediction lists, merge them per posting and write the submission file.
df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions
df['matches'] = df.apply(combine_predictions, axis = 1)
df[['posting_id', 'matches']].to_csv('submission.csv', index = False)

In [None]:
# Read the written file back and show its first rows as a sanity check.
pd.read_csv('submission.csv').head()