In [None]:
import numpy as np
import pandas as pd

# Toggle: True -> score the hidden test set, False -> local validation on train.
SUBMIT = False

if not SUBMIT:
    # Validation mode: keep label_group so a ground-truth match list can be built.
    train = pd.read_csv(
        '../input/shopee-product-matching/train.csv',
        usecols=["posting_id", "title", "label_group"],
    )
    # label_group -> array of every posting_id that shares that label.
    tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
    # Each row's target is the full set of postings in its own group (itself included).
    train['target'] = train['label_group'].map(tmp)
else:
    # Submission mode: the test file is loaded under the same name `train`
    # so the cells below work unchanged.
    train = pd.read_csv(
        '../input/shopee-product-matching/test.csv',
        usecols=["posting_id", "title"],
    )

In [None]:
from tqdm.notebook import tqdm
from transformers import *
import torch

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: indexable model result whose element 0 holds the
            per-token embeddings.
        attention_mask: mask with one entry per token; it is expanded over the
            embedding axis and used as a 0/1 weight.

    Returns:
        One pooled vector per sequence: the mask-weighted sum of the token
        embeddings divided by the mask total (floored at 1e-9 so that a fully
        masked sequence does not divide by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask to the embedding shape so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(1)
    token_count = weights.sum(1).clamp(min=1e-9)
    return weighted_total / token_count

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# downloaded from https://huggingface.co/sentence-transformers/stsb-xlm-r-multilingual/tree/main
tokenizer = AutoTokenizer.from_pretrained("../input/xlmxlm")
model = AutoModel.from_pretrained("../input/xlmxlm").to(device)
model.eval()  # inference only: make sure dropout-style layers are disabled


def _clean_title(title):
    """Return the lower-cased, ASCII-only form of one raw title.

    Backslash escape sequences written literally in the text are interpreted,
    every non-ASCII character is dropped, and escapes are interpreted once
    more. A title whose escapes are malformed (for example a trailing
    backslash) falls back to the raw text stripped to ASCII instead of
    raising; a non-string value (e.g. a missing title) becomes "".
    """
    if not isinstance(title, str):
        return ""
    try:
        cleaned = title.encode('utf-8').decode("unicode_escape")
        cleaned = cleaned.encode('ascii', 'ignore').decode("unicode_escape")
    except UnicodeDecodeError:
        cleaned = title.encode('ascii', 'ignore').decode('ascii')
    return cleaned.lower()


n_rows = train.shape[0]
# One embedding row per title; the width comes from the model instead of a hard-coded 768.
text_tensor = torch.zeros((n_rows, model.config.hidden_size), device=device)
chunk = 64
# Slicing already yields a short final batch, so no extra "tail" pass is needed
# and an empty frame simply skips the loop.
for i in tqdm(range(0, n_rows, chunk)):
    # .iloc makes the positional slice explicit.
    titles = [_clean_title(title) for title in train.title.iloc[i : i + chunk].values]

    encoded_input = tokenizer(titles, padding=True, truncation=True,
                              max_length=128, return_tensors='pt').to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)

    text_tensor[i : i + chunk] = mean_pooling(model_output, encoded_input['attention_mask'])

# L2-normalise every row so that a dot product between rows is a cosine similarity.
# The floor on the norm keeps an all-zero row at zero instead of turning it into NaN.
text_tensor /= torch.norm(text_tensor, p=2, dim=-1, keepdim=True).clamp_min(1e-12)

In [None]:
from tqdm.notebook import tqdm
import torch

# Minimum cosine similarity for two titles to be reported as a match.
SIM_THRESHOLD = 0.86

# Plain Python list so per-row lookups do not go through pandas.
posting_ids = train.posting_id.tolist()

out_preds = []
chunk = 32
# Slicing already yields a short final batch, so every row is visited exactly once
# and no trimming of duplicated tail rows is required afterwards.
for i in tqdm(range(0, train.shape[0], chunk)):
    # Rows i..i+chunk against every row; text_tensor rows are L2-normalised,
    # so this product is the cosine similarity.
    arr = text_tensor[i : i + chunk] @ text_tensor.T

    # (row, col) pairs above the threshold, moved to the host in one transfer
    # rather than one .item() call per element.
    hits = torch.nonzero(arr > SIM_THRESHOLD).tolist()

    preds = [[] for _ in range(arr.shape[0])]
    for row, col in hits:
        preds[row].append(col)

    # `row` is local to this chunk; the posting it describes is global row i + row.
    out_preds.extend(
        (posting_ids[i + row], [posting_ids[col] for col in cols])
        for row, cols in enumerate(preds)
    )

assert len(out_preds) == train.shape[0], "expected exactly one prediction per row"
df = pd.DataFrame(out_preds, columns=["index","pred"])
# Display only: the result is deliberately not assigned, because the next cell
# aligns `df` with `train` through the default positional index.
df.set_index("index")

In [None]:
if SUBMIT:
    # Submission mode: one line per posting with its space-separated matches.
    df["posting_id"] = train["posting_id"]
    df["matches"] = df["pred"].map(" ".join)
    df[['posting_id','matches']].to_csv('submission.csv',index=False)
else:
    # Validation mode: attach the ground-truth match lists (aligned on row position).
    df["true"] = train["target"]

    # Per-row F1 = 2 * |true & pred| / (|true| + |pred|).
    f1 = [
        (2 * len(set(true_ids) & set(pred_ids))) / (len(true_ids) + len(pred_ids))
        for true_ids, pred_ids in zip(df["true"], df["pred"])
    ]

    print(f'F1: {np.mean(f1)}')