In [None]:
import sys

# Unpack the RAPIDS conda environment archive shipped as a dataset input into /opt/conda/envs.
!cp ../input/rapids/rapids.0.18.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment to sys.path so its packages win over the kernel's own.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into the base conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

# Install textdistance from a local wheel and expose the vendored timm sources.
!pip install ../input/shopee-inference-setup/textdistance-4.2.1-py3-none-any.whl
sys.path.append("../input/timm-pytorch-image-models/pytorch-image-models-master")

In [None]:
import gc
import re
import sys
import string
import operator
import itertools
from math import sqrt
import multiprocessing 
from pathlib import Path
from functools import reduce, reduce
from collections import Counter
from joblib import Parallel, delayed
from string import digits, ascii_letters
from more_itertools import chunked

import numpy as np
import pandas as pd 

import torch

import timm, textdistance
from textdistance import Jaccard

import lightgbm

pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000

In [None]:
# ---- Run configuration -------------------------------------------------------
USE_GPU = True
INFERENCE = True
DEBUG = False
PCT_RANK = False                 # enables the optional score-promotion step before thresholding
POSTING_CHUNK_IDX = 1024 * 12    # bucket width used to split posting ids into chunks for pair scoring
THRESHOLD = 0.8                  # pairs with score > THRESHOLD are kept as matches
ELIMINATE_SCORE = 0.5            # candidate pairs need at least one score column above this value
RANK = 3                         # only referenced by commented-out code further down

test = pd.read_csv('../input/shopee-product-matching/test.csv')

# NOTE: INFERENCE is set to True above, so this condition always selects "test" mode.
if len(test) > 3 or INFERENCE: 
    INFERENCE = True
    config_mode = "test"
    
else:
    config_mode = "train"

configuration = {"train": {"image_dir": "train_images",
                           "filename": "train.csv"},
                 "test": {"image_dir": "test_images",
                          "filename": "test.csv"}}

# Re-read the frame for the selected mode and build one image path per row.
test = pd.read_csv(f'../input/shopee-product-matching/{configuration[config_mode]["filename"]}')
image_paths = f'../input/shopee-product-matching/{configuration[config_mode]["image_dir"]}/' + test['image']


# Replace posting ids by their row position; `post_mappings` maps position -> original id
# and is used later to restore the original ids.
post_mappings = dict(enumerate(test["posting_id"].values))
inverse_post_mappings = {val: key for key, val in post_mappings.items()}
test["posting_id"] = test["posting_id"].map(inverse_post_mappings)


if not INFERENCE:
    if DEBUG:
        test = pd.concat([test, test])
    # Ground truth: for every row, the array of (integer) posting ids sharing its label_group.
    test['target'] = test.label_group.map(test.groupby('label_group').posting_id.agg('unique').to_dict())

                   

# Model artefacts: image backbone weights and the LightGBM pair-scoring model.
image_model_path = "../input/shopee-inference-setup/efficientnet_b2_ra-bcdf34b7.pth"
lgbm_model_path = "../input/shopee-inference-setup/lgbm_interaction_model_v5.bin"
lgbm_model = lightgbm.Booster(model_file=lgbm_model_path)

In [None]:
from sklearn.metrics import make_scorer
from contextlib import contextmanager

@contextmanager
def timer(verbose, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Context manager that reports the wall-clock duration of its body.

    Args:
        verbose: When falsy, nothing is reported.
        logger: Optional object with an ``info`` method; when omitted the
            message is printed.
        format_str: ``str.format`` template receiving the elapsed seconds.
        prefix: Optional text prepended to ``format_str``.
        suffix: Optional text appended to ``format_str``.

    Nothing is reported if the body raises, because the code after ``yield``
    is not wrapped in ``try``/``finally``.
    """
    # Imported locally: the notebook's only `import time` lives in a later
    # cell, so the helper must not depend on that cell having been executed.
    import time

    if prefix: format_str = str(prefix) + format_str
    if suffix: format_str = format_str + str(suffix)
    start = time.time()
    yield
    elapsed = time.time() - start
    out_str = format_str.format(elapsed)
    if verbose:
        if logger:
            logger.info(out_str)
        else:
            print(out_str)
            
def get_word_frequency(df, title_col="unit_cleaned_text", sep=" "):
    """Count how often each token occurs across all values of ``title_col``.

    Every value of the column is split on ``sep``; the result is a two-column
    frame (``word``, ``frequency``) in first-seen token order.
    """
    token_lists = df[title_col].str.split(sep).values.tolist()
    counts = Counter(itertools.chain.from_iterable(token_lists))
    return pd.DataFrame(list(counts.items()), columns=["word", "frequency"])

def get_cv_metric(col):
    """Build a row-wise F1 scorer comparing ``row.target`` with ``row[col]``.

    The returned callable computes ``2 * |target ∩ pred| / (|target| + |pred|)``
    for a single row.
    """

    def f1score(row):
        truth, predicted = row.target, row[col]
        overlap = len(np.intersect1d(truth, predicted))
        return 2 * overlap / (len(truth) + len(predicted))

    return f1score

def get_character_ngrams(w, n):
    """Return the character n-grams of ``w``.

    For ``n > 1`` the word is wrapped in ``<w>`` / ``</w>`` boundary markers,
    each counting as one position; for ``n <= 1`` no markers are added.
    """
    symbols = list(w)
    if n > 1:
        symbols = ["<w>", *symbols, "</w>"]
    window_count = len(symbols) - n + 1
    return ["".join(symbols[start:start + n]) for start in range(window_count)]


In [None]:
import tokenizers
from functools import partial


def lowercase_title(title: str):
    """Return ``title`` converted to lower case."""
    lowered = title.lower()
    return lowered

def remove_string_group(title: str,
                        group):
    """Delete from ``title`` every character that appears in ``group``."""
    deletion_table = str.maketrans(dict.fromkeys(group))
    return title.translate(deletion_table)


# Pre-bound variants, each removing one character class from a title.
remove_punctuation, remove_digits, remove_ascii_letters = (
    partial(remove_string_group, group=character_class)
    for character_class in (string.punctuation, string.digits, string.ascii_letters)
)

def remove_multiple_whitespace(title: str):
    """Collapse every run of whitespace in ``title`` into a single space.

    Leading and trailing runs are collapsed to one space as well; they are
    not stripped.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', title)

def get_alphanumerical_code_matching(title: str):
    """Split every distinct ``letters+digits`` code in ``title`` into two tokens.

    Each distinct match of ``[a-z]+\\d+`` contributes ``"<letters> <digits>"``;
    the contributions are joined with single spaces. Returns ``""`` when the
    title contains no such code.
    """
    # Sorted so the output does not depend on set iteration order, which
    # varies between interpreter runs for strings.
    codes = sorted(set(re.findall(r"[a-z]+\d+", title)))
    return " ".join(remove_digits(code) + " " + remove_ascii_letters(code) for code in codes)

def extract_letter_plus_numeric_substr(title_col: pd.Series):
    """Remove every ``letters+digits`` substring (e.g. ``abc123``) from each title.

    Despite the name, the matched substrings are deleted, not returned.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return title_col.replace(regex=r"[a-z]+\d+", value='')

def extract_numeric_plus_letter_substr(title_col: pd.Series):
    """Remove every ``digits+letters`` substring (e.g. ``500ml``) from each title.

    Despite the name, the matched substrings are deleted, not returned.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return title_col.replace(regex=r"\d+[a-z]+", value='')

def extract_measurement_matching(col: pd.Series): #clean_text
    """Discover unit-like words in ``col`` and locate ``<number> <unit>`` spans.

    Returns:
        A tuple ``(repl, unit_match, unit_words)``:

        * ``repl`` maps each distinct ``digits[space]letters`` token found in
          the column to ``"<digits-part> <letters-part> "``.
        * ``unit_match`` holds, per row, the space-joined distinct matches of
          ``\\d+\\s+\\b(?:unit_words)\\b``.
        * ``unit_words`` is a ``|``-joined alternation of the letter parts that
          occur in at least 3 distinct tokens and are at most 6 characters long.
    """
    unit_word_regex = r"\d+\s+\b(?:{unit_words})\b"

    # Per row: comma-joined distinct "digits + optional space + letters" tokens.
    # Raw string: '\d' / '\s' in a normal literal are invalid escape sequences.
    unit_match = col.apply(lambda x: ",".join(set(re.findall(r"\d+\s*[a-z]+\s*", x))))
    unit_match_mask = unit_match[unit_match != ""]

    # Distinct tokens over the whole column, with whitespace runs collapsed.
    units = set(",".join(unit_match_mask.apply(remove_multiple_whitespace).drop_duplicates().values).split(","))

    repl = {unit: remove_ascii_letters(unit) + " " + remove_digits(unit) + " " for unit in units}

    # Count how many distinct tokens share each letter part.
    units = pd.Series(list(units)).apply(remove_digits).apply(remove_multiple_whitespace).str.replace(" ", "").value_counts()
    unit_words = "|".join([word for word in units[(units >= 3)].index.tolist() if len(word) <= 6])

    # The pattern is the same for every row, so it is built once.
    unit_pattern = unit_word_regex.format(unit_words=unit_words)
    unit_match = col.apply(lambda x: " ".join(set(re.findall(unit_pattern, x))))

    return repl, unit_match, unit_words

def extract_measurement_idenitifer(unit_col: pd.Series):
    """Return ``unit_col`` with every digit removed from each value."""
    digit_free = unit_col.apply(remove_digits)
    return digit_free
    
def extract_numbers(title: str):
    """Return all digit runs found in ``title`` joined by single spaces.

    Returns ``""`` when the title contains no digits.
    """
    # A raw string replaces the placeholder-less f-string, whose '\d' was an
    # invalid escape sequence.
    return " ".join(re.findall(r"\d+", title))

def remove_unit_measurements(col: pd.Series, unit_words):
    """Delete every ``<number> <unit>`` span from each value of ``col``.

    Args:
        col: Series of titles.
        unit_words: ``|``-joined alternation of unit words, in the form
            returned by ``extract_measurement_matching``.
    """
    # The template used to be read from a name that only exists as a local
    # variable of extract_measurement_matching, so this function raised
    # NameError. It is defined here instead.
    unit_word_regex = r"\d+\s+\b(?:{unit_words})\b"
    return col.replace(unit_word_regex.format(unit_words=unit_words), "", regex=True)

def get_ngrams(title: str, n=3):
    """Return the character n-grams of every space-separated word, space-joined."""
    pieces = []
    for word in title.split(" "):
        pieces.extend(get_character_ngrams(word, n))
    return " ".join(pieces)


def train_wordpice_tokenizer(title_col: pd.Series,
                             model_dir = "models",
                             model_out_path = "shopee_title",
                             vocab_size = 1000,
                             min_frequency=2
                             ):
    """Train a ``BertWordPieceTokenizer`` on the titles and save its model.

    All titles are joined with spaces and written to the file
    ``model_out_path`` (relative to the working directory), which is then used
    as the training corpus. The trained model is saved into ``model_dir``
    using ``model_out_path`` as its name.

    Returns:
        The trained tokenizer.
    """
    Path(model_dir).mkdir(exist_ok=True)

    # Explicit UTF-8 so the corpus does not depend on the platform's default
    # locale encoding.
    with open(model_out_path, "w", encoding="utf-8") as corpus_file:
        corpus_file.write(" ".join(title_col.tolist()))

    tokenizer = tokenizers.BertWordPieceTokenizer()
    tokenizer.train(model_out_path, vocab_size=vocab_size, min_frequency=min_frequency)
    tokenizer.save_model(model_dir, model_out_path)

    return tokenizer

def encode_title_with_tokenizer(title: str,
                                tokenizer):
    """Encode ``title`` with ``tokenizer`` and return its tokens space-joined."""
    encoding = tokenizer.encode(title)
    return " ".join(encoding.tokens)


#df_["number_match"] = df_["clean_text"].apply(lambda x: " ".join(re.findall(f"\d+", x)))

In [None]:
import collections

import timm
from tqdm import tqdm 

# Re-declared here (it is also set in the configuration cell).
USE_GPU = True

# RAPIDS libraries are imported only when GPU execution is requested.
if USE_GPU:
    import cudf, cuml, cupy
    from cuml.feature_extraction.text import TfidfVectorizer

def ngrams(words, n):
    """Return the '-'-joined n-grams of the sliceable sequence ``words``.

    A sequence of length ``L`` yields ``L - n + 1`` windows, or none when
    ``L < n``. The sliding-window loop this replaces appended a window only
    before pushing the next word, so the final window was never emitted.

    NOTE: a later cell runs ``from nltk import ngrams``, which rebinds this
    name at module level.
    """
    window_count = len(words) - n + 1
    return ['-'.join(words[start:start + n]) for start in range(window_count)]

def align_predictions_with_test_set(df_test, match, threshold=0.7):
    """Outer-join predictions onto the test rows and add each row's own id.

    ``match`` is merged with the ``posting_id`` / ``target`` columns of
    ``df_test`` on ``posting_id``. Rows that have a prediction get their own
    ``posting_id`` appended to the ``pred`` list; rows without one get a
    single-element list holding only their own ``posting_id``.

    ``threshold`` is accepted but not used in the body.

    NOTE(review): the concatenation below assumes ``pred`` holds iterables of
    strings and ``posting_id`` is a string column — confirm against callers.
    """

    match = pd.merge(df_test.loc[:, ["posting_id", "target"]],
                         match,
                         on=["posting_id"],
                         how="outer")#.set_index("posting_id")

    # True for rows that received no prediction from the merge.
    is_not_matched = match["pred"].isna()

    # Matched rows: join predictions with ",", append the row's own id, then split back into a list.
    match.loc[~is_not_matched, "pred"] = (match.loc[~is_not_matched, "pred"].apply(lambda x: ",".join(x)) \
                                          + "," + match.loc[~is_not_matched, "posting_id"]).str.split(",")
    # Unmatched rows: the prediction is just the row itself.
    match.loc[is_not_matched, "pred"] = match.loc[is_not_matched, "posting_id"].apply(lambda x: [x])
    
    return match

In [None]:
import multiprocess

In [None]:
import cv2
import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2
from torch.utils.data import Dataset

import matplotlib.pyplot as plt

from torch import nn
from torch.hub import load_state_dict_from_url

class ShopeeDataset(Dataset):
    """Dataset that loads images from a collection of file paths.

    ``image_paths`` must expose ``.shape`` and support ``[]`` lookup by index;
    ``transforms`` is an optional callable invoked as ``transforms(image=...)``
    whose result is read at the ``'image'`` key.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        n_images = self.image_paths.shape[0]
        return n_images

    def __getitem__(self, index):
        # OpenCV reads BGR; convert to RGB before any transform is applied.
        bgr = cv2.imread(self.image_paths[index])
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        if not self.augmentations:
            return rgb

        return self.augmentations(image=rgb)['image']
    
def get_transformations():
    """Return the image pipeline: resize to 288x288, normalise, convert to tensor."""
    pipeline_steps = [
        A.Resize(288, 288, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(pipeline_steps)

def image_viz(image_path):
    """
    Display the image stored at ``image_path`` with the axes hidden.

    The file is read with OpenCV and converted from BGR to RGB before plotting.
    """
    rgb_image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_image)
    plt.axis('off')
    
    
def extract_image_embeddings(image_paths,
                             model_name = "efficientnet_b2a",
                             model_path = image_model_path,
                             global_pool = "avg",
                             batch_size = 256,
                             num_workers = 4,
                             transform = None):
    """Compute L2-normalised image embeddings for ``image_paths``.

    A timm model is created without pretrained weights, loaded from
    ``model_path``, and its classifier is replaced by an identity layer. The
    pooled features are passed through SELU, gathered on the host and returned
    as a row-normalised ``cupy`` array.

    Args:
        image_paths: Paths handed to ``ShopeeDataset``.
        model_name: timm architecture name.
        model_path: State-dict file to load.
        global_pool: Pooling mode forwarded to ``timm.create_model``.
        batch_size: DataLoader batch size.
        num_workers: Accepted for interface compatibility.
            NOTE(review): the loader uses ``multiprocess.cpu_count()`` workers
            and ignores this value; kept as-is to preserve current throughput.
        transform: Optional transform passed to ``ShopeeDataset``.

    Returns:
        ``cupy`` array with one unit-norm row per image.
    """
    import multiprocess

    embeddings = []

    dataset = ShopeeDataset(image_paths, transform)
    dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # `global_pool` is forwarded here; it was previously hard-coded to "avg".
    model = timm.create_model(model_name, pretrained=False, global_pool=global_pool)
    model.to(dev)
    # map_location keeps the checkpoint loadable on whichever device was selected.
    model.load_state_dict(torch.load(model_path, map_location=dev))
    model.classifier = nn.Identity()
    model.eval()

    image_loader = torch.utils.data.DataLoader(
                                        dataset,
                                        batch_size=batch_size,
                                        pin_memory=True,
                                        drop_last=False,
                                        num_workers=multiprocess.cpu_count()
                                    )

    selu = torch.nn.SELU()

    with torch.no_grad():
        for image in tqdm(image_loader):
            # Move the batch to the model's device rather than assuming CUDA.
            feat = selu(model(image.to(dev)))
            embeddings.append(feat.detach().cpu().numpy())

    image_embeddings = cupy.array(np.concatenate(embeddings))
    # Divide every row by its L2 norm.
    image_embeddings = (image_embeddings / cupy.linalg.norm(image_embeddings, ord=2, axis=1, keepdims=False).reshape(-1, 1))

    del embeddings, model, dataset

    _ = gc.collect()

    return image_embeddings

In [None]:
def preprocess_df(df, substring_n=None, vocab_sizes=None):
    """Derive cleaned-title and measurement feature columns from ``df["title"]``.

    Works on a copy of ``df`` and adds, among others: ``title_``,
    ``unit_seperated_title``, ``matched_units``, ``numeric_cleaned_title``,
    ``numeric_text``, ``is_unit_available``, ``is_number_available``,
    ``matched_units_measurement``, ``matched_units_number`` and
    ``matched_codes``.

    Args:
        df: Frame with a ``title`` column.
        substring_n: Optional iterable of n-gram sizes; each adds a
            ``subword_{n}_text`` column.
        vocab_sizes: Optional iterable of vocabulary sizes; each trains a
            WordPiece tokenizer and adds a ``vocab_{size}_text`` column.

    Returns:
        The enriched copy.
    """
    df_ = df.copy()
    df_["title_"] = df_["title"].apply(lowercase_title).apply(remove_punctuation).apply(remove_multiple_whitespace)
    df_["title_"] = extract_letter_plus_numeric_substr(df_["title_"])

    # Separate "<digits><letters>" tokens into "<digits> <letters> " in the title, then
    # recompute the per-row unit matches on the rewritten title.
    repl, df_["unit_seperated_title"], unit_words = extract_measurement_matching(df_["title_"])
    df_["title_"] = df_["title_"].replace(repl, regex=True).apply(remove_multiple_whitespace)
    df_["unit_seperated_title"] = df_["title_"].apply(lambda x: ",".join(set(re.findall(r"\d+\s+\b(?:{unit_words})\b".format(unit_words=unit_words), x))))

    df_["matched_units"] = extract_measurement_idenitifer(df_["unit_seperated_title"]).apply(remove_multiple_whitespace)

    df_["numeric_cleaned_title"] = df_["title_"].apply(remove_digits)
    df_["numeric_text"] = df_["title_"].apply(extract_numbers).apply(remove_multiple_whitespace)

    if substring_n:
        for n in substring_n:
            # `n` must be passed by keyword: the second positional parameter of
            # Series.apply is `convert_dtype`, so a positional `n` never reached
            # get_ngrams and its default size was always used.
            df_[f"subword_{n}_text"] = df_["numeric_cleaned_title"].apply(get_ngrams, n=n)

    if vocab_sizes:
        for vocab_size in vocab_sizes:
            tokenizer = train_wordpice_tokenizer(df_.numeric_cleaned_title, vocab_size=vocab_size, model_out_path=f"shopee_{vocab_size}")
            df_[f"vocab_{vocab_size}_text"] = df_["numeric_cleaned_title"].apply(partial(encode_title_with_tokenizer, tokenizer=tokenizer))

    # Substring (regex) replacement that rewrites "gr" and "gram" to "g".
    df_["matched_units"] = df_["matched_units"].replace({"gr": "g", "gram": "g"}, regex=True)
    df_["is_unit_available"] = (df_["unit_seperated_title"] != '')# * 1
    df_["is_number_available"] = (df_["numeric_text"] != '')# * 1

    # Raw strings below: '\s' / '\d' in normal literals are invalid escape sequences.
    df_["matched_units_measurement"] = df_["matched_units"].replace(r"\s", "", regex=True).apply(lambda x: " ".join(set(x.split(","))))
    df_["matched_units_number"] = df_["numeric_text"].apply(lambda x: " ".join(set(x.split(" "))))

    # NOTE(review): codes are searched in the raw `title`, not the lower-cased
    # `title_`, so upper-case codes do not match `[a-z]+` — confirm this is intended.
    df_["matched_codes"] = df_["title"].apply(lambda x: " ".join(re.findall(r"[a-z]+\d+", x)))
    df_["numeric_cleaned_title"] = df_["numeric_cleaned_title"].apply(remove_multiple_whitespace)

    return df_


In [None]:
%%time
COMPUTE_WORD_FREQUENCY = False

idx = test.index

DEBUG = False
# Number of rows used for the image nearest-neighbour search (all rows unless DEBUG).
chunk = test.shape[0]

if DEBUG: chunk = 1000
# Character n-gram sizes to generate; no WordPiece tokenizers are trained.
subword_list = [3]
tokenization_list = None
 
print("Computing preprocessing ...\n")
# `df` holds the per-posting text features used by the later cells.
df = preprocess_df(test, subword_list, tokenization_list)

In [None]:
import time

In [None]:
%%time
import torch
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned titles (word level).
model = TfidfVectorizer(stop_words='english', **{"binary": False,
                                                "lowercase": False,
                                                "sublinear_tf": False
                                                #"norm": "l2",
                                                })
text_embeddings = model.fit_transform(cudf.from_pandas(df.numeric_cleaned_title))
print('text embeddings shape is',text_embeddings.shape)

# Up to 75 cosine nearest neighbours per title.
KNN = min(test.shape[0], 75)
model = NearestNeighbors(n_neighbors=KNN, metric="cosine")
model.fit(text_embeddings)


distances, indices = model.kneighbors(text_embeddings)

del model, text_embeddings
_ = gc.collect()

# Cosine distance -> cosine similarity.
distances = 1 - distances

# Long format: one row per (query, neighbour) pair. After transposing and
# flattening, the query row is recovered as position % number_of_rows.
title_matches2 = cudf.DataFrame(distances.T.reshape(-1)).melt().reset_index()
title_matches2["index"] = title_matches2["index"] % test.shape[0]
title_matches2["variable"] = indices.T.reshape(-1)
title_matches2.columns = ["posting_id", "pred", "score__title__text"]
# uint32 rather than uint16: row ids above 65,535 would silently wrap in a
# 16-bit column and point at the wrong postings.
title_matches2 = title_matches2.astype({"posting_id": "uint32",
                                      "pred": "uint32"})

del distances, indices
_ = gc.collect()

title_matches2 = title_matches2.to_pandas()

In [None]:
%%time

# Compute the image embeddings for every row and report the elapsed time.
with timer(True):
    embeddings = extract_image_embeddings(image_paths, 
                                          transform = get_transformations(), 
                                          global_pool="avg",
                                          batch_size = 512)
    # Release cached GPU memory held by PyTorch once the embeddings are computed.
    torch.cuda.empty_cache()

In [None]:
%%time
from cuml.neighbors import NearestNeighbors

# Up to 75 cosine nearest neighbours per image, restricted to the first `chunk` rows.
KNN = min(test.shape[0], 75)
model = NearestNeighbors(n_neighbors=KNN, metric="cosine")
model.fit(embeddings[:chunk])
distances, indices = model.kneighbors(embeddings[:chunk])

del model, embeddings
_ = gc.collect()

# Cosine distance -> cosine similarity.
distances = 1 - distances

# Long format: one row per (query, neighbour) pair; the query row is
# recovered as position % chunk.
image_matches = cudf.DataFrame(distances.T.reshape(-1)).melt().reset_index()
image_matches["index"] = image_matches["index"] % chunk
image_matches["variable"] = indices.T.reshape(-1)
image_matches.columns = ["posting_id", "pred", "score__image"]
# uint32 rather than uint16: row ids above 65,535 would silently wrap in a
# 16-bit column and point at the wrong postings.
image_matches = image_matches.astype({"posting_id": "uint32",
                                      "pred": "uint32"})

del distances, indices
_ = gc.collect()

In [None]:
%%time
from cuml.feature_extraction.text import TfidfVectorizer

# TF-IDF over the character 3-gram text, with sublinear term frequency.
model = TfidfVectorizer(stop_words='english', **{"binary": False,
                                                "lowercase": False,
                                                "sublinear_tf": True
                                                #"norm": "l2",
                                                })
text_embeddings = model.fit_transform(cudf.from_pandas(df.subword_3_text))
print('text embeddings shape is',text_embeddings.shape)

# Up to 75 cosine nearest neighbours per row.
KNN = min(test.shape[0], 75)
model = NearestNeighbors(n_neighbors=KNN, metric="cosine")
model.fit(text_embeddings)


distances, indices = model.kneighbors(text_embeddings)

del model, text_embeddings
_ = gc.collect()

# Cosine distance -> cosine similarity.
distances = 1 - distances

# Long format: one row per (query, neighbour) pair; the query row is
# recovered as position % number_of_rows.
matches = cudf.DataFrame(distances.T.reshape(-1)).melt().reset_index()
matches["index"] = matches["index"] % test.shape[0]
matches["variable"] = indices.T.reshape(-1)
matches.columns = ["posting_id", "pred", "score__subword3__text"]
# uint32 rather than uint16: row ids above 65,535 would silently wrap in a
# 16-bit column and point at the wrong postings.
matches = matches.astype({"posting_id": "uint32",
              "pred": "uint32"})

del distances, indices
_ = gc.collect()

In [None]:
%%time

# Union of the candidate pairs from the three neighbour searches. The outer
# joins keep a pair found by any search; scores from searches that did not
# return the pair are NaN.
matches = matches.to_pandas()
matches = pd.merge(matches,
                   title_matches2,
                   on=["posting_id", "pred"],
                   how="outer")

del title_matches2

image_matches = image_matches.to_pandas()
matches = matches.merge(image_matches, on=["posting_id", "pred"], how="outer")

del image_matches

In [None]:
# Keep a single text score per pair: the row-wise maximum of the subword and
# word-level similarities, stored under `score__title__text`.
matches["score__title__text"] = matches.loc[:, ["score__subword3__text", "score__title__text"]].max(axis=1)
del matches["score__subword3__text"]

In [None]:
%%time

# Token counts over the cleaned titles.
freq_df = get_word_frequency(df, "numeric_cleaned_title")
# word -> integer code, following the sorted category order of the words.
mappings = dict(zip(freq_df["word"].astype("category").cat.categories, range(0, freq_df.shape[0])))
# Share of all token occurrences taken by each word.
freq_df["freq_ratio"] = freq_df["frequency"] / freq_df["frequency"].sum()
# word -> frequency share; read as a global by extract_valuable_info.
ratio_map = dict(freq_df.loc[:, ["word", "freq_ratio"]].values)

In [None]:
from nltk import ngrams

def get_n_gramlist(text,n=2):
    """Return the set of word n-grams (as tuples) of whitespace-split ``text``.

    Returns an empty set when ``text`` has fewer than ``n`` tokens or cannot
    be split, e.g. when it is ``None`` or NaN.

    The windows are built locally instead of through the module-level
    ``ngrams`` name, which this notebook binds twice with different behaviour
    (a custom helper and ``nltk.ngrams``), so the result no longer depends on
    which cell ran last.
    """
    try:
        tokens = text.split()
        # zip over n shifted views yields every run of n consecutive tokens.
        return set(zip(*(tokens[offset:] for offset in range(n))))
    except Exception:
        # Best-effort as before, but without swallowing KeyboardInterrupt/SystemExit.
        return set()

def jacc(x, y):
    """Return ``(number of distinct tokens in x, Jaccard similarity of x and y)``.

    Both strings are split on single spaces and compared as token sets.
    """
    left, right = set(x.split(" ")), set(y.split(" "))
    shared = left & right
    combined = left | right
    return len(left), len(shared) / len(combined)

def extract_valuable_info(x, y):
    """Return token-overlap features for the titles ``x`` and ``y``.

    The result is a five-element list: the number of distinct tokens in ``x``,
    the Jaccard similarity of the two token sets, and the minimum, mean and
    sum of ``ratio_map`` values over the shared tokens (NaN for all three when
    no token is shared).
    """
    left = set(x.split(" "))
    right = set(y.split(" "))
    shared = left.intersection(right)

    features = [len(left), len(shared) / len(left.union(right))]

    if not shared:
        return features + [np.nan, np.nan, np.nan]

    shared_ratios = np.array([ratio_map[word] for word in shared])
    return features + [shared_ratios.min(), shared_ratios.mean(), shared_ratios.sum()]

In [None]:
def extract_features(df):
    """Attach per-group summary statistics of the three similarity scores.

    For each grouping key (``posting_id``, then ``pred``) and each score
    column, the group mean, standard deviation and skewness are merged back
    onto a copy of ``df``. The skewness is emitted twice, under the ``_skew``
    and ``_ske`` suffixes.
    """
    score_columns = ("score__subword3__text", "score__title__text", "score__image")
    # (column suffix, aggregation) in output order.
    stat_by_suffix = (("mean", "mean"), ("std", "std"), ("skew", "skew"), ("ske", "skew"))

    enriched = df.copy()

    for group_col in ("posting_id", "pred"):
        for feat_col in score_columns:
            named_aggs = {
                f"cos_dist__{group_col}_{feat_col}_{suffix}": (feat_col, stat)
                for suffix, stat in stat_by_suffix
            }
            group_stats = enriched.groupby(group_col).aggregate(**named_aggs).reset_index()
            enriched = enriched.merge(group_stats, on=group_col)

    return enriched

In [None]:
# Feature names that are filtered out when the grouped aggregations are built
# (see the `key not in non_tr_cols` filter in the next cell).
non_tr_cols = ['label',
 'cos_dist__posting_id_score__title__text_quantile_75',
 'cos_dist__pred_score__image_skew',
 'cos_dist__pred_score__image_rank_image',
 'cos_dist__posting_id_score__title__text_mean',
 'cos_dist__pred_score__title__text_skew',
 'cos_dist__pred_score__title__text_quantile_75',
 'cos_dist__pred_score__title__text_mean',
 'cos_dist__posting_id_score__subword3__text_rank_text']

In [None]:
# Named-aggregation specs, one entry per (grouping key, score column), in the
# order: (posting_id, title), (posting_id, image), (pred, title), (pred, image).
grouped_features = []

for group_col in ["posting_id", "pred"]:
    for title in ["title", "image"]:    
        if title == "title":
            feat_col = f"score__{title}__text"
        
        if title == "image":   
            feat_col = f"score__{title}"

        # Output column name -> (source column, aggregation).
        features = {f"cos_dist__{group_col}_{feat_col}_mean": (feat_col, "mean"),
                    f"cos_dist__{group_col}_{feat_col}_std": (feat_col, "std"),
                    f"cos_dist__{group_col}_{feat_col}_skew": (feat_col, "skew"),
                    f"cos_dist__{group_col}_{feat_col}_quantile_75": (feat_col, lambda x: x.quantile(0.75))}
        
        # Drop the aggregations listed in non_tr_cols.
        features = {key:val for key, val in features.items() if key not in non_tr_cols}
        
        grouped_features.append((group_col, features))

In [None]:
def extract_ordered_features(df):
    """Add within-group rank features for the text and image scores.

    For each grouping key (``posting_id``, then ``pred``) and each of
    ``score__subword3__text`` and ``score__image``, a descending percentile
    rank of the score within its group is added. For the ``pred`` grouping a
    ``_pct`` column is added as well: the percentage change of the score
    between consecutive rows of the group, taken in descending order of that
    rank column.

    Returns a modified copy; ``df`` itself is left untouched.
    """
    # (score column, label used in the rank column's name), in output order.
    score_sources = (("score__subword3__text", "text"), ("score__image", "image"))

    ranked = df.copy()

    for group_col in ("posting_id", "pred"):
        for feat_col, label in score_sources:
            rank_col = f"cos_dist__{group_col}_{feat_col}_rank_{label}"

            ranked[rank_col] = ranked.groupby([group_col])[feat_col].transform(
                lambda scores: scores.rank(ascending=False, pct=True)
            )

            if group_col == "posting_id":
                continue

            # Assignment aligns on the index, so the pre-sort only affects the
            # order in which pct_change sees the rows of each group.
            ordered = ranked.sort_values(rank_col, ascending=False)
            ranked[rank_col + "_pct"] = ordered.groupby([group_col])[feat_col].transform(
                lambda scores: scores.pct_change()
            )

    return ranked

In [None]:
def extract_match_text_features(df, 
                                metadata, 
                                threshold=0.925,
                                inference=True):
    """Build pairwise text features for candidate pairs and score them.

    Args:
        df: Candidate pairs with ``posting_id`` / ``pred`` columns plus the
            score and ``cos_dist__*`` feature columns.
        metadata: Per-posting text features; the columns listed in
            ``feat_cols`` are joined once for each side of the pair.
        threshold: Not used by the active code (the filtering line that read
            it is commented out).
        inference: When False the feature frame is returned unscored.

    Returns:
        If ``inference`` is False, the feature frame. Otherwise a frame with
        ``posting_id_1``, ``posting_id_2`` and a ``score`` column holding the
        output of the module-level ``lgbm_model``.
    """
    
    df_ = df.copy()
    
    # Metadata columns attached to both sides of every pair.
    feat_cols = ["posting_id",
                 "numeric_cleaned_title",
                 "is_unit_available",
                 "is_number_available",
                 "matched_units_number",
                 "matched_units_measurement",
                 "matched_codes"]

    df_ = df_.rename(columns={"posting_id": "posting_id_1",
                              "pred": "posting_id_2"})

    # Join metadata for side 1, then for side 2; the side-1 columns are given
    # the "_1" suffix by the trailing rename.
    # NOTE(review): the second merge appears to produce another "posting_id_2"
    # column through the "_2" suffix; the `~columns.duplicated()` filters below
    # look like the workaround — confirm on the pandas version in use.
    df_ = df_.merge(metadata.loc[:, feat_cols],
                 left_on=["posting_id_1"],
                 right_on=["posting_id"],
                 suffixes=("", "_1"),
                 how="left")\
              .merge(metadata.loc[:, feat_cols],
                     left_on=["posting_id_2"],
                     right_on=["posting_id"],
                     suffixes=("", "_2"),
                    how="left").rename(columns={col: col + "_1" for col in feat_cols if col != 'posting_id'}).drop(columns=["posting_id"])
    
    # `get_similarity_condition` is defined but not used below.
    get_similarity_condition = lambda col: f"{col}_1 != '' and {col}_2 != '' and score__subword3__text > 0.05"
    # Rows eligible for the pairwise text features: both cleaned titles are
    # non-empty and the text score exceeds 0.05.
    match_idx = ((df_[f"numeric_cleaned_title_1"] != '') & (df_[f"numeric_cleaned_title_2"] != '') & (df_["score__subword3__text"] > 0.05))
    match_idx = match_idx[match_idx == True].index
    
    
    sim_cols = ["matched_units_number", "matched_units_measurement"]

    jaccard = Jaccard(qval=None)

    # Jaccard similarity between the two sides for each measurement column.
    for col in sim_cols:
        df_.loc[match_idx, [f"{col}_1", f"{col}_2"]] = df_.loc[match_idx, [f"{col}_1", f"{col}_2"]].fillna("")
        df_.loc[match_idx, f"{col}__jaccard_similarity"] = [jaccard(*keys) for idx, keys in df_.loc[match_idx, [f"{col}_1", f"{col}_2"]].iterrows()]
        #df_.loc[match_idx, f"both__{col}"] = (df_.loc[match_idx, [f"{col}_1", f"{col}_2"]].all(axis=1) * 1)

    _ = gc.collect()
    
    # Cast every column whose name contains "lt" or "gt" to uint8.
    df_ = df_.astype({wd: "uint8" for wd in df_.columns[df_.columns.str.contains("|".join(["lt", "gt"]))]})

    df_.loc[match_idx, ["numeric_cleaned_title_1", "numeric_cleaned_title_2"]] = df_.loc[match_idx, ["numeric_cleaned_title_1", "numeric_cleaned_title_2"]].fillna("")
        
    # Number of word bigrams shared by the two titles.
    df_.loc[match_idx, "ngram2__intersection"] = [len(get_n_gramlist(key).intersection(get_n_gramlist(val))) \
                                       for idx, (key, val) in df_.loc[match_idx, ["numeric_cleaned_title_1", "numeric_cleaned_title_2"]].iterrows()]

    # Token-overlap features returned by extract_valuable_info.
    df_.loc[match_idx, ["tit__int", "jacc__intsc", "word__match__min", "word__match__mean", "word__match__sum"]] = [extract_valuable_info(titles[0], titles[1]) \
                                                                                         for idx, (titles) in df_.loc[match_idx, 
                                                                                           ["numeric_cleaned_title_1", "numeric_cleaned_title_2"]].iterrows()]
    
    # Pair identifiers are kept separately because the id columns are not model inputs.
    preds = df_.loc[:, ["posting_id_1", "posting_id_2"]]
    preds = preds.loc[:, ~preds.columns.duplicated()]
    
    non_training_cols =  pd.Index(["label", 
                                   "both__matched_units_number",
                                  ])\
                         .union(df_.columns[df_.columns.isin([col + f"_{num}" for num in [1,2] for col in sim_cols])])

    # Keep only the feature columns (names containing "__") plus the two id columns.
    cols = df_.columns[df_.columns.str.contains("__")].difference(non_training_cols).union(["posting_id_1", "posting_id_2"])
    df_.drop(columns=df_.columns[~df_.columns.isin(cols)], inplace=True)
    df_ = df_.loc[:, ~df_.columns.duplicated()]
    
    # The string literal below is disabled code (an earlier column order and
    # scoring step); it is evaluated as an expression and discarded.
    """col_order = ['both__matched_units_measurement',
                 'cos_dist__posting_id_score__image_mean',
                 'cos_dist__posting_id_score__image_rank_image',
                 'cos_dist__posting_id_score__image_rank_image_pct',
                 'cos_dist__posting_id_score__image_skew',
                 'cos_dist__posting_id_score__image_quantile_75',
                 'cos_dist__posting_id_score__image_std',
                 'cos_dist__posting_id_score__subword3__text_mean',
                 'cos_dist__posting_id_score__subword3__text_quantile_75',
                 'cos_dist__posting_id_score__subword3__text_skew',
                 'cos_dist__posting_id_score__subword3__text_std',
                 'cos_dist__posting_id_score__title__text_mean',
                 'cos_dist__posting_id_score__title__text_ske',
                 'cos_dist__posting_id_score__title__text_skew',
                 'cos_dist__posting_id_score__title__text_std',
                 'cos_dist__pred_score__image_mean',
                 'cos_dist__pred_score__image_rank_image',
                 'cos_dist__pred_score__image_rank_image_pct',
                 'cos_dist__pred_score__image_ske',
                 'cos_dist__pred_score__image_skew',
                 'cos_dist__pred_score__image_std',
                 'cos_dist__pred_score__subword3__text_mean',
                 'cos_dist__pred_score__subword3__text_ske',
                 'cos_dist__pred_score__subword3__text_skew',
                 'cos_dist__pred_score__subword3__text_std',
                 'cos_dist__pred_score__title__text_mean',
                 'cos_dist__pred_score__title__text_ske',
                 'cos_dist__pred_score__title__text_skew',
                 'cos_dist__pred_score__title__text_std',
                 'jacc__intsc',
                 'matched_units_measurement__jaccard_similarity',
                 'matched_units_number__jaccard_similarity',
                 'ngram2__intersection',
                 'score__image',
                 'score__subword3__text',
                 'score__title__text',
                 'tit__int',
                 'word__match__mean',
                 'word__match__min',
                 'word__match__sum']
    df_ = df_.loc[:, ~df_.columns.duplicated()]
    
    preds["score"] = lgbm_model.predict(df_.loc[:, col_order].values)
    preds = preds.query(f"score > {threshold}").loc[:, ["posting_id_1", "posting_id_2"]].iloc[:, [0, 1]].reset_index(drop=True)"""
    
    if not inference:
        return df_
    
    # Column order passed to the model as a plain array.
    # NOTE(review): presumably this must match the order used when the model
    # was trained — confirm before editing.
    col_order = ['cos_dist__posting_id_score__image_mean',
       'cos_dist__posting_id_score__image_quantile_75',
       'cos_dist__posting_id_score__image_rank_image',
       'cos_dist__posting_id_score__image_skew',
       'cos_dist__posting_id_score__image_std',
       'cos_dist__posting_id_score__title__text_skew',
       'cos_dist__posting_id_score__title__text_std',
       'cos_dist__pred_score__image_mean',
       'cos_dist__pred_score__image_quantile_75',
       'cos_dist__pred_score__image_rank_image_pct',
       'cos_dist__pred_score__image_std',
       'cos_dist__pred_score__subword3__text_rank_text',
       'cos_dist__pred_score__subword3__text_rank_text_pct',
       'cos_dist__pred_score__title__text_std', 'jacc__intsc',
       'matched_units_measurement__jaccard_similarity',
       'matched_units_number__jaccard_similarity', 'ngram2__intersection',
       'score__image', 'score__subword3__text', 'tit__int',
       'word__match__mean', 'word__match__min', 'word__match__sum']
    
    preds["score"] = lgbm_model.predict(df_.loc[:, col_order].values)
    #preds = preds.query(f"score > {threshold}").loc[:, ["posting_id_1", "posting_id_2"]].iloc[:, [0, 1]].reset_index(drop=True)

    return preds

In [None]:
# Per-query statistics: entries 0 and 1 of grouped_features are the posting_id specs (title, image).
posts = matches.groupby("posting_id").aggregate(**{**grouped_features[0][1], **grouped_features[1][1]})#,  **grouped_features[2][1]})
# Per-candidate statistics: entries 2 and 3 are the pred specs (title, image).
post_preds = matches.groupby("pred").aggregate(**{**grouped_features[2][1], **grouped_features[3][1]})#,  **grouped_features[5][1]})

In [None]:
_ = gc.collect()

# From here on the combined text score is carried under the name `score__subword3__text`.
matches = matches.rename(columns={"score__title__text": "score__subword3__text"})
# Bucket the pairs by query id so they can be scored chunk by chunk.
matches["chunk_idx"] = (matches["posting_id"] // POSTING_CHUNK_IDX).astype("uint8")
# Keep a pair only if at least one of its score columns exceeds ELIMINATE_SCORE.
matches = matches[((matches.loc[:, matches.columns.str.contains("score")] > ELIMINATE_SCORE).any(axis=1))].reset_index(drop=True)#.groupby("label").size()
matches = extract_ordered_features(matches)

In [None]:
# Score the candidate pairs chunk by chunk. The per-chunk results are collected
# in a list and concatenated once, rather than re-concatenating a growing frame
# on every iteration.
chunk_predictions = []

# The group-level statistics do not change between chunks; flatten them once.
posts_flat = posts.reset_index()
post_preds_flat = post_preds.reset_index()

for chunk_idx in matches["chunk_idx"].unique():
    with timer(True, prefix=f"Part - {chunk_idx} computing ... "):
        chunk_pairs = (matches.query(f"chunk_idx == {chunk_idx}")
                              .merge(posts_flat, on="posting_id", how="left")
                              .merge(post_preds_flat, on="pred", how="left"))
        match_df = extract_match_text_features(chunk_pairs, df, threshold=THRESHOLD, inference=True)
        #match_df.to_parquet(f"final_feats{chunk_idx}-t1.parquet")
        # Translate the integer row ids back to the original posting ids.
        match_df["posting_id_1"] = match_df.posting_id_1.map(post_mappings)
        match_df["posting_id_2"] = match_df.posting_id_2.map(post_mappings)
        chunk_predictions.append(match_df)

# With no chunks, fall back to an empty frame as before.
match_predictions = pd.concat(chunk_predictions) if chunk_predictions else pd.DataFrame()

In [None]:
#match_predictions_ = match_predictions
#match_predictions = match_predictions_.copy()

In [None]:
"""self_match_groups = pd.DataFrame([match_predictions.posting_id_1.unique(),
              match_predictions.posting_id_1.unique()], index=["posting_id_1", "posting_id_2"]).T
self_match_groups["score"] = 1

match_predictions = match_predictions[~(match_predictions["posting_id_1"] == match_predictions["posting_id_2"])].reset_index(drop=True)"""

#match_idx = match_predictions["posting_id_1"] != match_predictions["posting_id_2"]

#match_predictions["score_rank"] = match_predictions.groupby(["posting_id_1"])["score"].transform(lambda x: x.rank(ascending=False))

#pred_matches = match_predictions[(match_predictions["score"] > THRESHOLD)].groupby(["posting_id_1"])["posting_id_2"].nunique()
#non_match_groups = pred_matches[pred_matches == 1].index

#match_predictions.loc[(match_predictions["posting_id_1"].isin(non_match_groups)) & (match_predictions["score_rank"] < RANK), "score"] = 1

In [None]:
# Every posting that appears as a query is paired with itself at score 1.
self_match_groups = pd.DataFrame([match_predictions.posting_id_1.unique(),
              match_predictions.posting_id_1.unique()], index=["posting_id_1", "posting_id_2"]).T
self_match_groups["score"] = 1

# Drop the model-scored self pairs; the score-1 self pairs are appended below.
match_predictions = match_predictions[~(match_predictions["posting_id_1"] == match_predictions["posting_id_2"])].reset_index(drop=True)

#print(match_predictions.shape)

#print((match_predictions["score"] > THRESHOLD).sum())

if PCT_RANK:
    # For queries with no pair scoring above 0.825, promote every pair scoring
    # in (0.5, 0.825) to score 1.
    match_predictions["rank_value"] = match_predictions.groupby(["posting_id_1"])["score"].transform(lambda x: (x > 0.825).sum())
    match_predictions["rank_pct"] = match_predictions.groupby(["posting_id_1"])["score"].transform(lambda x: x.rank(ascending=False, pct=True))
    non_match_idx = match_predictions.query("rank_value == 0 and score > 0.5 and score < 0.825").index
    match_predictions.loc[non_match_idx, "score"] = 1
    
    del match_predictions["rank_value"]
    del match_predictions["rank_pct"]

    #print((match_predictions["score"] > THRESHOLD).sum())
match_predictions = pd.concat([match_predictions, self_match_groups]).reset_index(drop=True)

#print(match_predictions.shape)

In [None]:
# Keep the pairs whose score clears the decision threshold.
match_predictions = match_predictions.loc[match_predictions["score"] > THRESHOLD, ["posting_id_1", "posting_id_2"]]

# Make the relation symmetric: if (a, b) is a match, so is (b, a).
# pd.concat aligns frames on column labels, so merely reordering the columns
# with iloc[:, [1, 0]] appended the same rows again and drop_duplicates then
# removed them. The columns have to be renamed to actually swap the two sides.
reversed_pairs = match_predictions.rename(columns={"posting_id_1": "posting_id_2",
                                                   "posting_id_2": "posting_id_1"})
match_predictions = pd.concat([match_predictions, reversed_pairs]).drop_duplicates()

In [None]:
# Restore the original posting ids on the test frame.
test["posting_id"] = test["posting_id"].map(post_mappings)

# One row per query posting holding the array of its matched posting ids.
match_predictions = pd.DataFrame(match_predictions.groupby(["posting_id_1"])["posting_id_2"].unique().to_dict().items(), 
                                 columns=["posting_id", "pred"])

# Matches are written as a single space-separated string.
match_predictions["pred"] = match_predictions["pred"].apply(lambda x: " ".join(x))
match_predictions.columns = ['posting_id', 'matches']

# Align with the test rows; a posting without any match is matched to itself.
match_predictions = test.loc[:, ["posting_id"]].merge(match_predictions, on=["posting_id"], how="left")
match_predictions.loc[match_predictions["matches"].isna(), "matches"] = match_predictions.loc[match_predictions["matches"].isna(), "posting_id"]

In [None]:
# Local validation only: compute the mean row-wise F1 against the label groups.
if not INFERENCE:
    
    # Translate the integer ids in `target` back to the original posting ids.
    test["target"] = test["target"].apply(lambda x: [post_mappings[target] for target in x])
    
    match_predictions = match_predictions.merge(test.loc[:, ["posting_id", "target"]], on=["posting_id"], how="left")
    # The scorer works on arrays, so split the match strings temporarily.
    match_predictions.matches = match_predictions.matches.apply(lambda x: np.array(x.split(" ")))
    match_predictions.loc[:, 'f1'] = match_predictions.apply(get_cv_metric('matches'),axis=1).values
    print("CV SCORE : {}".format(match_predictions["f1"].mean()))
    
    # Back to the two-column layout with space-separated match strings.
    match_predictions = match_predictions.loc[:, ["posting_id", "matches"]]
    match_predictions["matches"] = match_predictions["matches"].apply(lambda x: " ".join(x))

In [None]:
match_predictions

In [None]:
# Write the final (posting_id, matches) table to submission.csv without the index.
match_predictions.to_csv("submission.csv", index=False)