## Setup: dirty workarounds to make CLIP importable...

In [None]:
import sys
# Copy the CLIP source tree from the input dataset into /tmp so that the
# re-compressed vocabulary file can be written next to it
# (presumably because the input directory is not writable — TODO confirm).
!cp -r ../input/openai-clip/CLIP/CLIP-main /tmp/

# Kaggle likes to unpack .gz files in datasets... so we have to pack it back
!gzip -c /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt > /tmp/CLIP-main/clip/bpe_simple_vocab_16e6.txt.gz
# Make the copied tree importable; the `clip` package imported below lives in it.
sys.path.append('/tmp/CLIP-main')

In [None]:
%%capture
# Install ftfy, torch, torchvision and faiss-gpu from files shipped in the input
# datasets (local paths, not package names). %%capture hides pip's output.
!pip install ../input/openai-clip/ftfy-5.9/ftfy-5.9 \
             ../input/openai-clip/torch-1.7.1+cu110-cp37-cp37m-linux_x86_64.whl \
             ../input/openai-clip/torchvision-0.8.2+cu110-cp37-cp37m-linux_x86_64.whl \
             ../input/faiss-163/faiss_gpu-1.6.3-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import clip
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import re
from clip.simple_tokenizer import SimpleTokenizer
import faiss
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Test postings, indexed by their `posting_id` column.
df_test = pd.read_csv('../input/shopee-product-matching/test.csv', index_col='posting_id')

In [None]:
# Run train only for commit.
# NOTE(review): a test.csv with exactly 3 rows is used as the signal for a
# commit run — presumably the full test set is swapped in at scoring time; confirm.
DO_TRAIN = df_test.shape[0] == 3

In [None]:
_tokenizer = SimpleTokenizer()

# Adapted from https://github.com/openai/CLIP/blob/beba48f35392a73c6c47ae67ddffced81ad1916d/clip/clip.py#L164
# with the exception relaxed: over-long inputs are truncated here.
def tokenize(texts, context_length: int = 77) -> torch.LongTensor:
    """Turn one string or a list of strings into a padded token-id tensor.

    Args:
        texts: A single string or an iterable of strings.
        context_length: Number of token slots per text.

    Returns:
        A ``torch.long`` tensor of shape ``(number of texts, context_length)``.
        Each row holds start-of-text, the encoded text and end-of-text,
        zero-padded on the right. A sequence longer than ``context_length`` is
        cut to fit and its last slot is overwritten with the sequence's final
        token (the end-of-text id).
    """
    if isinstance(texts, str):
        texts = [texts]

    start_id = _tokenizer.encoder["<|startoftext|>"]
    end_id = _tokenizer.encoder["<|endoftext|>"]

    encoded = []
    for text in texts:
        encoded.append([start_id, *_tokenizer.encode(text), end_id])

    result = torch.zeros(len(encoded), context_length, dtype=torch.long)
    for row, ids in enumerate(encoded):
        if len(ids) <= context_length:
            # Fits: copy everything, the remaining slots stay zero.
            result[row, :len(ids)] = torch.tensor(ids)
        else:
            # Too long: keep the head, then force the closing token into the last slot.
            result[row, :] = torch.tensor(ids)[:context_length]
            result[row, -1] = ids[-1]

    return result

In [None]:
# Matches a literal backslash followed by "x" and one or more letters, digits,
# "." or "/" — presumably escaped byte sequences left in the titles (TODO confirm).
RE_EMOJI = re.compile(r"\\x[A-Za-z0-9./]+", flags=re.UNICODE)


def strip_emoji(text):
    """Return `text` with every RE_EMOJI match deleted; other characters are untouched."""
    cleaned = RE_EMOJI.sub(r'', text)
    return cleaned

In [None]:
# Load CLIP: prefer the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# `preprocess` is the image transform returned together with the model.
model, preprocess = clip.load("../input/openai-clip/RN50.pt", device=device)

In [None]:
# Size of the text projection's second axis; used below as the width of each
# modality's half of a feature row (assumes image embeddings have the same
# width — TODO confirm).
embed_dim = model.text_projection.size(1)
embed_dim

In [None]:
class MyDataset(Dataset):
    """Dataset yielding one (preprocessed image, tokenized title) pair per DataFrame row.

    Args:
        df: DataFrame with at least an ``image`` column (file name inside
            ``images_path``) and a ``title`` column.
        images_path: Directory holding the image files. It is joined to the
            file name with ``/``, so a ``pathlib.Path`` is expected.
    """

    def __init__(self, df, images_path):
        super().__init__()
        self.df = df
        self.images_path = images_path

    def __len__(self):
        """Number of samples, i.e. number of rows in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, text)`` for the row at positional index ``idx``."""
        row = self.df.iloc[idx]

        # Open the file in a context manager so its handle is released as soon
        # as the image has been preprocessed, not whenever it is garbage-collected.
        with Image.open(self.images_path / row['image']) as img:
            image = preprocess(img)

        # tokenize() works on a batch; wrap the single title and take row 0.
        text = tokenize([strip_emoji(row['title'])])[0]

        return image, text

## Generate features for train

In [None]:
if DO_TRAIN:
    # Training postings (indexed by posting_id) and the directory with their images.
    df_train = pd.read_csv('../input/shopee-product-matching/train.csv', index_col='posting_id')
    train_images_path = Path('../input/shopee-product-matching/train_images')

    # shuffle=False keeps the feature rows in the same order as df_train.
    dstrain = MyDataset(df_train, train_images_path)
    dltrain = DataLoader(dstrain, batch_size=32, shuffle=False, num_workers=2)

    # One row per posting: image embedding in the first embed_dim columns,
    # text embedding in the remaining embed_dim columns.
    train_features = np.empty((len(df_train), 2*embed_dim), dtype=np.float32)

    i = 0  # row offset of the current batch
    with torch.no_grad():
        for images, texts in tqdm(dltrain):
            batch_rows = slice(i, i + len(images))
            train_features[batch_rows, :embed_dim] = model.encode_image(images.to(device)).cpu()
            train_features[batch_rows, embed_dim:] = model.encode_text(texts.to(device)).cpu()
            i = batch_rows.stop

    # Keep a copy of the raw (un-normalized) features on disk.
    np.save('train_features-no-norm.npy', train_features)

    # l2-normalize every row in place.
    row_norms = np.linalg.norm(train_features, 2, axis=1, keepdims=True)
    train_features /= row_norms

    # Create index: flat inner-product index over the normalized rows.
    index = faiss.IndexFlatIP(2*embed_dim)
    index.add(train_features)

In [None]:
%%time
if DO_TRAIN:
    # Query the train index with the train features themselves: for each
    # posting, the 50 best scores and the matching row positions.
    similatiries, indexes = index.search(train_features, 50)

In [None]:
if DO_TRAIN:
    # Persist the raw search output.
    np.save('similatiries.npy', similatiries)
    np.save('indexes.npy', indexes)

    # label_group of every retrieved neighbour; same shape as `indexes`.
    found_groups = df_train['label_group'].values[indexes]

    # Column 0 serves as the query's own group (the best hit is assumed to be
    # the query itself); compare every neighbour's group against it.
    is_same_group = (found_groups == found_groups[:, :1])

    # Score distributions for same-group and different-group neighbours.
    same_scores = similatiries[is_same_group]
    different_scores = similatiries[~is_same_group]
    plt.hist([same_scores, different_scores], density=True, bins=21,
             label=['Same group', 'Different group'])
    plt.legend();

## Run on test

In [None]:
# Similarity threshold: neighbours scoring above it are reported as matches.
GROUP_CUT = 0.84  # Use train code to find this number

In [None]:
# Directory containing the image files named in df_test's `image` column.
test_images_path = Path('../input/shopee-product-matching/test_images')

In [None]:
# Test-set dataset and loader; shuffle=False keeps batches in df_test row order,
# which the feature-filling loop below relies on.
dstest = MyDataset(df_test, test_images_path)
dltest = DataLoader(dstest, batch_size=32, shuffle=False, num_workers=2)

In [None]:
# One row per test posting: image embedding in the first embed_dim columns,
# text embedding in the remaining embed_dim columns.
test_features = np.empty((len(df_test), 2*embed_dim), dtype=np.float32)

i = 0  # row offset of the current batch
with torch.no_grad():
    for images, texts in tqdm(dltest):
        batch_rows = slice(i, i + len(images))
        test_features[batch_rows, :embed_dim] = model.encode_image(images.to(device)).cpu()
        test_features[batch_rows, embed_dim:] = model.encode_text(texts.to(device)).cpu()
        i = batch_rows.stop

In [None]:
# l2-normalize: scale every feature row to unit Euclidean length (in place).
row_norms = np.linalg.norm(test_features, ord=2, axis=1, keepdims=True)
test_features /= row_norms

In [None]:
# Create index
# Flat inner-product index sized for the 2*embed_dim-wide feature rows; the rows
# were l2-normalized above, so the scores are cosine similarities.
index_test = faiss.IndexFlatIP(2 * embed_dim)


index_test.add(test_features)

In [None]:
%%time
# For every test row, fetch the 50 best scores and the matching row positions.
# NOTE(review): when fewer than 50 rows are indexed, faiss presumably pads the
# result with index -1 and a very low score — verify the GROUP_CUT mask below
# filters those out.
similatiries, indexes = index_test.search(test_features, 50)

In [None]:
## TODO: try range_search
# lims, similatiries, indexes = index_test.range_search(test_features, GROUP_CUT)

In [None]:
# Boolean mask with the same shape as `similatiries`: True where a neighbour's
# score is strictly above GROUP_CUT.
test_are_same_groups = (similatiries > GROUP_CUT)

In [None]:
# One submission row per test posting: its id plus the space-separated ids of
# all retrieved neighbours that passed the similarity threshold.
results = [
    {
        'posting_id': posting_id,
        'matches': ' '.join(df_test.index[neighbour_rows[keep_mask]]),
    }
    for posting_id, keep_mask, neighbour_rows in zip(df_test.index, test_are_same_groups, indexes)
]

df_sub = pd.DataFrame(results)

In [None]:
# Write the submission file; index=False omits the DataFrame's row index.
df_sub.to_csv('submission.csv', index=False)

In [None]:
# Show the first lines of the file just written as a quick sanity check.
!head submission.csv