In [None]:
import sys
from shutil import copyfile

# Copy the helper module from the read-only input dataset into the working
# directory; the later `from utils import ...` cell relies on it being importable
# (presumably because the working directory is on the import path -- confirm).
copyfile(src = "../input/shopee-utils/utils.py", dst = "../working/utils.py")
# Put the bundled timm source tree on sys.path so `import timm` can resolve to it.
sys.path.append("../input/timm-pytorch-image-models/pytorch-image-models-master")

In [None]:
class config:
    """Static settings for the notebook: dataset root, checkpoints and sizes."""

    # Root folder of the competition data (CSV files and image directories).
    PATH = "../input/shopee-product-matching/"

    # Sizes: number of output classes of the ArcFace heads and a batch size.
    # NOTE(review): batch_size is not referenced by the visible cells (the
    # DataLoaders hard-code 16) -- confirm whether it is still needed.
    n_classes = 9024
    batch_size = 8

    # Image branch: timm architecture name and fine-tuned checkpoint.
    image_model_name = "eca_nfnet_l0"
    image_model_path = "../input/shopeemodel/eca_nfnet_l0_flexibleMargin_epoch_8.pt"

    # Text branch: Hugging Face model name and fine-tuned checkpoint.
    text_model_name = "distilbert-base-multilingual-cased"
    text_model_path = "../input/shopeemodel/distilbert-base-multilingual-cased_epoch_6.pt"

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import functional as F
from torchvision import transforms
from utils import ShopeeTrainDataset, ShopeeImageDataset, ShopeeTextTrainDataset, ShopeeTextDataset
from utils import get_metric, validate

from transformers import AutoTokenizer, AutoModel
import timm

import cudf
import cuml

import os
from tqdm import tqdm
import math

In [None]:
class ArcFace(nn.Module):
    """Additive angular margin head (ArcFace).

    Computes the cosine between L2-normalised embeddings and L2-normalised
    class weights, adds the angular margin ``m`` to the target class only and
    scales everything by ``s``.

    Args:
        in_features: dimensionality of the incoming embeddings.
        out_features: number of classes (rows of the weight matrix).
        s: scale applied to the final logits.
        m: angular margin in radians added to the target-class angle.
    """

    def __init__(self, in_features, out_features, s=30, m=0.5):
        super(ArcFace, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        # Single source of truth for m and every constant derived from it.
        self._update_margin(m)
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def _update_margin(self, new_margin):
        """Set a new margin and refresh the cached trigonometric constants."""
        self.m = new_margin
        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        # cos(theta) below this value means theta + m would exceed pi.
        self.arc_min = math.cos(math.pi - self.m)
        # Linear penalty used instead of cos(theta + m) in that region.
        self.margin_min = math.sin(math.pi - self.m) * self.m

    def forward(self, embedding, label):
        """Return scaled logits of shape (batch, out_features).

        Args:
            embedding: 2-D tensor of embeddings, one row per sample.
            label: tensor with one class index per sample.
        """
        cos = F.linear(F.normalize(embedding), F.normalize(self.weight))
        # Clamp BEFORE the square root: rounding can push cos^2 slightly above
        # 1, and sqrt of a negative number is NaN (clamping afterwards cannot
        # repair a NaN).
        sin = torch.sqrt((1.0 - torch.pow(cos, 2)).clamp(0, 1))
        phi = cos * self.cos_m - sin * self.sin_m
        phi = torch.where(cos > self.arc_min, phi, cos - self.margin_min)

        # Build the one-hot mask on the same device/dtype as the logits rather
        # than relying on a notebook-level `device` global.
        one_hot = torch.zeros_like(cos)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cos.device), 1)
        logits = one_hot * phi + (1.0 - one_hot) * cos
        logits *= self.s
        return logits

In [None]:
class Model(nn.Module):
    """Image embedding network: timm backbone -> pooling -> FC -> BN -> L2 norm.

    With ``labels`` the forward pass returns ArcFace logits; without labels it
    returns the L2-normalised embedding.

    Args:
        model_name: timm architecture name; must contain "eca_nfnet" or
            "efficientnet".
        n_classes: number of classes of the ArcFace head.
        margin: angular margin forwarded to ArcFace.
        fc_dim: size of the embedding produced by the FC layer.

    Raises:
        ValueError: if ``model_name`` is not one of the supported families.
    """

    def __init__(self, model_name, n_classes, margin=0.5, fc_dim=1024):
        super(Model, self).__init__()
        print("Building Model Backbone for {} model".format(model_name))
        self.model_name = model_name
        self.backbone = timm.create_model(model_name)

        # Strip the backbone's own classifier and pooling; the feature map is
        # pooled by self.pooling below instead.
        if "eca_nfnet" in model_name:
            feat_size = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        elif "efficientnet" in model_name:
            feat_size = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        else:
            # Previously fell through and crashed later on an unbound
            # `feat_size`; fail early with an actionable message instead.
            raise ValueError(
                "Unsupported backbone '{}': expected a name containing "
                "'eca_nfnet' or 'efficientnet'".format(model_name)
            )

        self.pooling =  nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(feat_size, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self.margin = ArcFace(fc_dim, n_classes, m=margin)
        self._init_params()

    def _init_params(self):
        """Initialise the projection layer and batch norm to neutral values."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, x, labels=None):
        """Embed a batch of images.

        Args:
            x: image batch; the first dimension is the batch size.
            labels: optional class indices; when given, ArcFace logits are
                returned instead of embeddings.
        """
        batch_size = x.shape[0]
        x = self.backbone(x)
        # Global average pool, then flatten to (batch, features).
        x = self.pooling(x).view(batch_size, -1)

        x = self.dropout(x)
        x = self.fc(x)
        x = self.bn(x)
        x = F.normalize(x,dim=1)
        if labels is not None:
            return self.margin(x,labels)
        else:
            return x

In [None]:
class TextModel(nn.Module):
    """Text embedding network: transformer backbone -> FC -> BN -> L2 norm.

    With ``labels`` the forward pass returns ArcFace logits; without labels it
    returns the L2-normalised embedding.

    Args:
        model_name: name of the sub-folder under
            ``../input/huggingface-bert-variants/`` holding tokenizer and weights.
        n_classes: number of classes of the ArcFace head.
        margin: angular margin forwarded to ArcFace.
        fc_dim: size of the embedding produced by the FC layer.
    """

    def __init__(self, model_name, n_classes, margin=0.5, fc_dim=1024):
        super(TextModel, self).__init__()
        print("Building Model Backbone for {} model".format(model_name))
        self.model_name = model_name
        pretrained_dir = "../input/huggingface-bert-variants/{model_name}/{model_name}/".format(model_name=model_name)
        # NOTE(review): TOKENIZERS_PARALLELISM is normally an environment
        # variable; passed as a keyword here it presumably has no effect on
        # parallelism -- confirm and move to os.environ if it is needed.
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, TOKENIZERS_PARALLELISM=False)
        self.backbone = AutoModel.from_pretrained(pretrained_dir)
        self.feat_size = self.backbone.config.hidden_size

        # Not used by forward(); kept so the attribute set stays unchanged.
        self.pooling =  nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(self.feat_size, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self.margin = ArcFace(fc_dim, n_classes, m=margin)
        self._init_params()

    def _init_params(self):
        """Initialise the projection layer and batch norm to neutral values."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, text, labels=None):
        """Tokenize and embed a batch of titles.

        Args:
            text: batch of strings in a form the tokenizer accepts
                (get_bert_feature passes a list of titles).
            labels: optional class indices; when given, ArcFace logits are
                returned instead of embeddings.
        """
        # Follow the module's own placement instead of a notebook-level
        # `device` global, so the model works wherever it has been moved.
        target_device = self.fc.weight.device
        inputs = self.tokenizer(text, max_length=100, truncation=True, padding=True, return_tensors="pt")
        output = self.backbone(
            input_ids=inputs["input_ids"].to(target_device),
            attention_mask=inputs["attention_mask"].to(target_device),
        )
        # First output, token position 0 of every sequence (presumably the
        # [CLS] hidden state -- confirm against the backbone's output type).
        embedding = output[0][:, 0, :]
        x = self.dropout(embedding)
        x = self.fc(x)
        x = self.bn(x)
        x = F.normalize(x,dim=1)
        if labels is not None:
            return self.margin(x,labels)
        else:
            return x

In [None]:
def read_dataset(name="train"):
    """Load ``<name>.csv`` from ``config.PATH`` and add an ``image_path`` column.

    Args:
        name: either "train" or "test" (enforced with an assert).

    Returns:
        The DataFrame read from the CSV, with ``image_path`` set to the
        ``<name>_images/`` folder under ``config.PATH`` joined with each row's
        ``image`` value.
    """
    assert name in {"train", "test"}
    csv_file = config.PATH + '{}.csv'.format(name)
    image_dir = config.PATH + '{}_images/'.format(name)

    frame = pd.read_csv(csv_file)
    frame["image_path"] = image_dir + frame['image']
    return frame

In [None]:
# A test CSV with more than 3 rows selects inference mode; 3 rows or fewer
# selects local evaluation on the training data.
test = pd.read_csv(config.PATH + 'test.csv')
TRAIN = not (len(test) > 3)

# Hard override: always run in inference mode, regardless of the check above.
TRAIN = False

In [None]:
# Load the split selected by TRAIN; in train mode also attach, per row, the
# array of posting_ids sharing its label_group (the ground-truth matches).
df = read_dataset("train" if TRAIN else "test")
if TRAIN:
    label_group_dict = df.groupby("label_group").posting_id.agg("unique").to_dict()
    df['target'] = df.label_group.map(label_group_dict)

# Lookups between posting_id and the DataFrame index, in both directions.
idx_to_id = df["posting_id"].to_dict()
id_to_idx = df.reset_index().set_index("posting_id")["index"].to_dict()

# GPU copy of the frame; df_cu only exists when CUDA is available.
if torch.cuda.is_available():
    df_cu = cudf.DataFrame(df)

df.head()

In [None]:
# Image preprocessing: fixed 256x256 resize, tensor conversion, per-channel
# normalisation (these are the commonly used ImageNet mean/std values).
preprocess_steps = [
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
]
transform = transforms.Compose(preprocess_steps)

# Both loaders iterate in DataFrame order (shuffle=False) so that extracted
# features line up with the rows of df.
loader_options = dict(batch_size=16, shuffle=False, num_workers=2)

df_dataset = ShopeeImageDataset(df, transform=transform)
df_dataloader = torch.utils.data.DataLoader(df_dataset, **loader_options)

df_text_dataset = ShopeeTextDataset(df)
df_text_dataloader = torch.utils.data.DataLoader(df_text_dataset, **loader_options)

In [None]:
# Notebook-wide compute device: CUDA when available, otherwise CPU. Read as a
# global by the model loaders and feature-extraction helpers.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

def get_model(model_name, model_path, n_classes):
    """Build an image ``Model``, load its checkpoint and move it to ``device``.

    Args:
        model_name: architecture name passed to ``Model``.
        model_path: path of the state-dict checkpoint to load.
        n_classes: number of classes passed to ``Model``.

    Returns:
        The model with weights loaded, on the global ``device``.
    """
    net = Model(model_name, n_classes)
    weights = torch.load(model_path, map_location=device)
    net.load_state_dict(weights)
    net = net.to(device)
    return net

def get_text_model(model_name, model_path, n_classes):
    """Build a ``TextModel``, load its checkpoint and move it to ``device``.

    Args:
        model_name: model name passed to ``TextModel``.
        model_path: path of the state-dict checkpoint to load.
        n_classes: number of classes passed to ``TextModel``.

    Returns:
        The text model with weights loaded, on the global ``device``.
    """
    net = TextModel(model_name, n_classes)
    weights = torch.load(model_path, map_location=device)
    net.load_state_dict(weights)
    net = net.to(device)
    return net

In [None]:
def get_image_feature(model, dataloader):
    """Run ``model`` over every batch of ``dataloader`` without gradients.

    Args:
        model: callable applied to each image batch after it is moved to the
            global ``device``.
        dataloader: iterable yielding image batches.

    Returns:
        All per-batch outputs concatenated along dimension 0.
    """
    collected = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = batch.to(device)
            collected.append(model(batch))
            del batch

    stacked = torch.cat(collected, axis=0)
    # Release cached GPU blocks before the next stage runs.
    torch.cuda.empty_cache()
    return stacked


def get_tfidf_feature(df, max_features):
    """Fit a TF-IDF vectorizer on ``df.title`` and return dense features.

    Uses the cuML vectorizer when CUDA is available, scikit-learn otherwise.

    Args:
        df: frame exposing a ``title`` column.
        max_features: vocabulary size limit passed to the vectorizer.

    Returns:
        A float tensor of the dense TF-IDF matrix on the global ``device``.
    """
    # Imported lazily because the right backend depends on the runtime.
    if torch.cuda.is_available():
        from cuml.feature_extraction.text import TfidfVectorizer
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    vectorizer.fit(df.title)

    dense = vectorizer.transform(df.title).toarray()
    return torch.Tensor(dense).to(device)

def get_bert_feature(model, dataloader):
    """Run the text ``model`` over every batch of ``dataloader`` without gradients.

    Args:
        model: callable applied to each batch after it is converted to a list.
        dataloader: iterable yielding batches of titles.

    Returns:
        All per-batch outputs concatenated along dimension 0.
    """
    collected = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            titles = list(batch)
            collected.append(model(titles))
            del titles

    stacked = torch.cat(collected, axis=0)
    # Release cached GPU blocks before the next stage runs.
    torch.cuda.empty_cache()
    return stacked

In [None]:
# Number of query rows processed per similarity block in the helpers below.
CHUNK = 1024

def cosine_dist(x,y):
    """Pairwise cosine similarity between the rows of ``x`` and the rows of ``y``.

    Despite the name, larger values mean *more* similar (1.0 for identical
    directions).

    Args:
        x: 2-D tensor with m rows.
        y: 2-D tensor with n rows and the same number of columns as ``x``.

    Returns:
        An (m, n) tensor whose entry [i, j] is dot(x[i], y[j]) divided by the
        product of the two rows' L2 norms.
    """
    x_norms = x.norm(p=2, dim=1, keepdim=True)        # column of m norms
    y_norms = y.norm(p=2, dim=1, keepdim=True).t()    # row of n norms
    dots = torch.matmul(x, y.t())
    # Broadcasting forms the (m, n) grid of norm products.
    return dots / (x_norms * y_norms)

def KNN(embeddings, k=50):
    """Top-k neighbours of every row by raw dot product, computed in chunks.

    The score is ``x @ y.T`` with no normalisation, so it equals cosine
    similarity only if the rows are already unit length.

    Args:
        embeddings: 2-D tensor, one row per item.
        k: neighbours to keep per row (capped at the number of rows).

    Returns:
        ``(indices, scores)`` as numpy arrays with one row per item, ordered
        from highest to lowest score.
    """
    total = embeddings.size(0)
    neighbours = min(k, total)
    index_chunks, score_chunks = [], []

    with torch.no_grad():
        for start in range(0, total, CHUNK):
            queries = embeddings[start:start + CHUNK]
            scores, indices = torch.topk(queries @ embeddings.T, k=neighbours)
            index_chunks.append(indices.detach().cpu().numpy())
            score_chunks.append(scores.detach().cpu().numpy())

    all_indices = np.concatenate(index_chunks, axis=0)
    all_scores = np.concatenate(score_chunks, axis=0)
    return all_indices, all_scores

def DistancePredict(embeddings, df, threshold=0.9, least_threshold=0.5, k=50):
    """Predict matching posting_ids for every row by thresholding cosine similarity.

    For each row the ``k`` most similar rows are retrieved and those with
    similarity >= ``threshold`` are kept. If the second-best neighbour fails
    that test it is still kept when its similarity is >= ``least_threshold``,
    so an item can get a second match under the relaxed cut-off.

    Args:
        embeddings: 2-D tensor whose row i belongs to ``df`` row i.
        df: DataFrame with a ``posting_id`` column, positionally aligned with
            ``embeddings``.
        threshold: main similarity cut-off.
        least_threshold: relaxed cut-off applied only to the second neighbour.
        k: neighbours retrieved per row (capped at the number of rows).

    Returns:
        A list with one numpy array of posting_ids per row of ``embeddings``.
    """
    predict = []

    # Resolve ids once; indexing this array is equivalent to
    # df.iloc[idx].posting_id.to_numpy() but avoids a DataFrame lookup per row.
    posting_ids = df.posting_id.to_numpy()
    total = embeddings.size(0)
    top_k = min(k, total)

    with torch.no_grad():
        for start in range(0, total, CHUNK):
            chunk_distance = cosine_dist(embeddings[start:start + CHUNK], embeddings)
            topK = torch.topk(chunk_distance, k=top_k)
            topK_idx, topK_dist = topK[1].detach().cpu().numpy(), topK[0].detach().cpu().numpy() # size: [chunk, k]

            for idx, dist in zip(topK_idx, topK_dist):
                mask = dist >= threshold
                # Relax the threshold for the second neighbour. Guarded so a
                # single candidate (k == 1 or a one-row input) no longer raises
                # IndexError.
                if mask.size > 1 and not mask[1]:
                    mask[1] = dist[1] >= least_threshold
                predict.append(posting_ids[idx[mask]])

    return predict

def QueryExpansion(query_embeddings, embeddings, alpha=2, k=3):
    """Replace each query by a similarity-weighted blend of its nearest neighbours.

    For every query the ``k`` most similar rows of ``embeddings`` are found
    (cosine similarity), weighted by ``similarity ** alpha``, summed and
    L2-normalised.

    Args:
        query_embeddings: 2-D tensor of queries, one per row.
        embeddings: 2-D tensor searched for neighbours.
        alpha: exponent applied to the similarities to form the weights.
        k: neighbours blended per query (capped at the number of rows in
            ``embeddings``).

    Returns:
        A tensor with one expanded, unit-length embedding per query row.
    """
    qe_embeddings = []

    # Chunk over the QUERY rows. The previous upper bound used
    # embeddings.size(0), which silently dropped queries whenever the query
    # set was larger than the gallery.
    num_queries = query_embeddings.size(0)
    top_k = min(k, embeddings.size(0))

    with torch.no_grad():
        for start in range(0, num_queries, CHUNK):
            queries = query_embeddings[start:start + CHUNK]

            dist = cosine_dist(queries, embeddings)
            topK_dist, topK_idx = torch.topk(dist, k=top_k)  # size: [CHUNK, k]

            coef = topK_dist ** alpha
            # Gather neighbours as [CHUNK, k, dim], weight them, sum over k.
            qe_embedding = (embeddings[topK_idx] * coef.unsqueeze(-1)).sum(-2)
            qe_embedding = F.normalize(qe_embedding, dim=-1)

            qe_embeddings.append(qe_embedding)

            del dist, topK_idx, topK_dist, coef
            torch.cuda.empty_cache()

    qe_embeddings = torch.cat(qe_embeddings, dim=0)

    return qe_embeddings

IMAGE

In [None]:
# Similarity cut-offs handed to DistancePredict for the image embeddings:
# the main threshold and the relaxed one used for the second neighbour.
IMAGE_THRESHOLD = 0.85
IMAGE_LEAST_THRESHOLD = 0.6

# Load the image model in eval mode and embed every row of df.
model = get_model(config.image_model_name, config.image_model_path, config.n_classes)
model.eval()
image_features = get_image_feature(model, df_dataloader)
# Query expansion against the same set, blending each row with its top-2 rows.
image_features_aug = QueryExpansion(image_features, image_features, k=2)

# Threshold the expanded embeddings into per-row lists of matching posting_ids.
image_pred = DistancePredict(image_features_aug, df, threshold=IMAGE_THRESHOLD, least_threshold=IMAGE_LEAST_THRESHOLD)
df["image_pred"] = image_pred

# Free the model and the expanded features; image_features itself is kept
# because the concat cell further down still reads it.
del model, image_features_aug
torch.cuda.empty_cache()

# Metrics need the ground-truth `target` column, which exists only in train mode.
if TRAIN:
    f1, prec, rec = get_metric(df["target"], df["image_pred"])
    print("Mean F1: {:f}".format(f1))
    print("Mean Precision: {:f}".format(prec))
    print("Mean Recall: {:f}".format(rec))

In [None]:
# Similarity cut-offs handed to DistancePredict for the text embeddings:
# the main threshold and the relaxed one used for the second neighbour.
BERT_THRESHOLD = 0.85
BERT_LEAST_THRESHOLD = 0.65

# Load the text model in eval mode and embed every title of df.
text_model = get_text_model(config.text_model_name, config.text_model_path, config.n_classes)
text_model.eval()
bert_features = get_bert_feature(text_model, df_text_dataloader)
# Query expansion against the same set, blending each row with its top-2 rows.
bert_features_aug = QueryExpansion(bert_features, bert_features, k=2)

# Threshold the expanded embeddings into per-row lists of matching posting_ids.
bert_pred = DistancePredict(bert_features_aug, df, threshold=BERT_THRESHOLD, least_threshold=BERT_LEAST_THRESHOLD)
df["bert_pred"] = bert_pred

# Free the model and the expanded features; bert_features itself is kept
# because the concat cell further down still reads it.
del text_model, bert_features_aug
torch.cuda.empty_cache()

# Metrics need the ground-truth `target` column, which exists only in train mode.
if TRAIN:
    f1, prec, rec = get_metric(df["target"], df["bert_pred"])
    print("Mean F1: {:f}".format(f1))
    print("Mean Precision: {:f}".format(prec))
    print("Mean Recall: {:f}".format(rec))

In [None]:
# TFIDF_FEATURES = 25000
# TFIDF_THRESHOLD = 0.7

# if torch.cuda.is_available():
#     tfidf_features = get_tfidf_feature(df_cu, TFIDF_FEATURES)
# else:
#     tfidf_features = get_tfidf_feature(df, TFIDF_FEATURES)

# tfidf_pred = DistancePredict(df, tfidf_features, threshold = TFIDF_THRESHOLD, distance_type="cosine")
# df["tfidf_pred"] = tfidf_pred

# if TRAIN:
#     f1, prec, rec = get_metric(df["target"], df["tfidf_pred"])
#     print("Mean F1: {:f}".format(f1))
#     print("Mean Precision: {:f}".format(prec))
#     print("Mean Recall: {:f}".format(rec))

In [None]:
# Similarity cut-offs handed to DistancePredict for the joint image+text
# embeddings: the main threshold and the relaxed second-neighbour one.
CONCAT_THRESHOLD = 0.65
CONCAT_LEAST_THRESHOLD = 0.5

# Join image and text embeddings column-wise (one wider vector per row), then
# apply the same query expansion as the single-modality cells.
concat_features = torch.cat([image_features, bert_features], axis=1)
concat_features_aug = QueryExpansion(concat_features, concat_features, k=2)


# Threshold the expanded embeddings into per-row lists of matching posting_ids.
concat_pred = DistancePredict(concat_features_aug, df, threshold=CONCAT_THRESHOLD, least_threshold=CONCAT_LEAST_THRESHOLD)
df["concat_pred"] = concat_pred

# Last consumer of the raw feature tensors: release all of them.
del image_features, bert_features, concat_features, concat_features_aug
torch.cuda.empty_cache()

# Metrics need the ground-truth `target` column, which exists only in train mode.
if TRAIN:
    f1, prec, rec = get_metric(df["target"], df["concat_pred"])
    print("Mean F1: {:f}".format(f1))
    print("Mean Precision: {:f}".format(prec))
    print("Mean Recall: {:f}".format(rec))

In [None]:
# candidate_pairs = []

# knn_idx, knn_dist = KNN(image_features, k=50)
# df["img_recall"] = list(knn_idx)

# knn_idx, knn_dist = KNN(bert_features, k=50)
# df["bert_recall"] = list(knn_idx)

# knn_idx, knn_dist = KNN(concat_features, k=50)
# df["concat_recall"] = list(knn_idx)

# def union(x,y):
#     return np.union1d(x,y)

# def intersect(x,y):
#     return np.intersect1d(x,y)

# df["recall"] = df.apply(lambda row: union(row['img_recall'], row["bert_recall"]),axis=1)
# df["recall"] = df.apply(lambda row: union(row['recall'], row["concat_recall"]),axis=1)


# for i, row in df.iterrows():
#     for j in row.recall:
#         candidate_pairs.append({"subject": i, "object": j})


# del knn_idx, knn_dist
# torch.cuda.empty_cache()

# candidate_pairs = pd.DataFrame.from_records(candidate_pairs)
# candidate_pairs.drop_duplicates(inplace=True)
# candidate_pairs.head()

In [None]:
# def get_anchor_feature(embeddings, feat_type = "median"):
#     features = []
    
#     n = (embeddings.size(0) + CHUNK - 1) // CHUNK
#     with torch.no_grad():
#         for i in range(n):
#             a = i*CHUNK
#             b = min((i+1)*CHUNK, embeddings.size(0))
#             x = embeddings[a:b]
#             y = embeddings
#             chunk_distance = cosine_dist(x, y)
#             if feat_type == "median":
#                 chunk_feat = chunk_distance.median(dim=1).values
#             elif feat_type == "mean":
#                 chunk_feat = chunk_distance.mean(dim=1)
#             elif feat_type == "max":
#                 chunk_feat = chunk_distance.max(dim=1).values
#             elif feat_type == "min":
#                 chunk_feat = chunk_distance.min(dim=1).values
                
#             features.append(chunk_feat)        

#     features = torch.cat(features, dim=0)
    
#     return features.detach().cpu().numpy()

# from torch.nn import CosineSimilarity
# cos = CosineSimilarity(dim=-1)

# def get_dist(embeddings, df):
#     distance = []
    
#     n = (len(df) + CHUNK - 1) // CHUNK
#     with torch.no_grad():
#         for i in range(n):
#             a = i*CHUNK
#             b = min((i+1) * CHUNK, len(df))
#             sub_idx = df.iloc[a:b].subject.to_numpy()
#             obj_idx = df.iloc[a:b].object.to_numpy()
#             dist = cos(embeddings[sub_idx], embeddings[obj_idx]).detach().cpu().numpy()
#             distance.append(dist)
            
#     distance = np.concatenate(distance, axis=0)
    
#     return distance

In [None]:
# candidate_pairs["img_dist"] = get_dist(image_features, candidate_pairs)
# candidate_pairs["bert_dist"] = get_dist(bert_features, candidate_pairs)
# candidate_pairs["concat_dist"] = get_dist(concat_features, candidate_pairs)

# print("done 0")

# for feat_type in ["median", "mean", "max", "min"]:
#     img_dist_type = get_anchor_feature(image_features, feat_type = feat_type)
#     bert_dist_type = get_anchor_feature(bert_features, feat_type = feat_type)
#     concat_dist_type = get_anchor_feature(concat_features, feat_type = feat_type)
    
#     candidate_pairs["img_dist_{}".format(feat_type)] = candidate_pairs.subject.apply(lambda x: img_dist_type[x])
#     candidate_pairs["bert_dist_{}".format(feat_type)] = candidate_pairs.subject.apply(lambda x: bert_dist_type[x])
#     candidate_pairs["concat_dist_{}".format(feat_type)] = candidate_pairs.subject.apply(lambda x: concat_dist_type[x])
    
# print("done 1")
    
# for feat in ["img", "bert", "concat"]:
#     candidate_pairs["{}_dist_max_percent".format(feat)] = candidate_pairs["{}_dist".format(feat)] / candidate_pairs["{}_dist_max".format(feat)]
#     candidate_pairs["{}_dist_min_percent".format(feat)] = candidate_pairs["{}_dist".format(feat)] / candidate_pairs["{}_dist_min".format(feat)]

# candidate_pairs.head()

In [None]:
# import xgboost as xgb

# xgb_model = xgb.XGBClassifier()
# xgb_model.load_model("../input/shopeemodel/xgb(2).json")

# feature = candidate_pairs.iloc[:, 2:]

# pair_predict = xgb_model.predict_proba(feature)
# candidate_pairs["predict_proba"] = pair_predict[:, 1]
# candidate_pairs.head()

In [None]:
# pred_df = candidate_pairs[["subject", "object","predict_proba"]]
# pred_df.head()

In [None]:
# THRESHOLD = 0.8

# final_df = []

# pred_df = pred_df[pred_df.predict_proba >= THRESHOLD]

# for i in range(len(df)):
#     matchs = []
#     for _, row in pred_df[pred_df.subject == i].iterrows():
#         matchs.append(idx_to_id[np.int64(row.object)])
#     final_df.append({"posting_id": idx_to_id[i], "pred": matchs})

# final_df = pd.DataFrame.from_records(final_df)

# if TRAIN:
#     f1, prec, rec = get_metric(df["target"], final_df["pred"])
#     print("Mean F1: {:f}".format(f1))
#     print("Mean Precision: {:f}".format(prec))
#     print("Mean Recall: {:f}".format(rec))

In [None]:
# def submission(row):
#     return ' '.join(row)

# final_df["matches"] = final_df["pred"].apply(lambda x: submission(x))

# # submit
# final_df[['posting_id','matches']].to_csv('submission.csv',index=False)
# submission = pd.read_csv('submission.csv')
# submission.head()

In [None]:
# def union(x,y):
#     return np.union1d(x,y)

# def intersect(x,y):
#     return np.intersect1d(x,y)

# df["pred"] = df.apply(lambda row: union(row['image_pred'], row["bert_pred"]),axis=1)
# df["pred"] = df.apply(lambda row: union(row['pred'], row["concat_pred"]),axis=1)

# #df["wait"] = df.apply(lambda row: intersect(row['image_pred_wait'], row["text_pred_wait"]),axis=1)
# #df["pred"] = df.apply(lambda row: union(row['pred'], row["wait"]),axis=1)

# if TRAIN:
#     f1, prec, rec = get_metric(df["target"], df["pred"])
#     print("Mean F1: {:f}".format(f1))
#     print("Mean Precision: {:f}".format(prec))
#     print("Mean Recall: {:f}".format(rec))

In [None]:
def union(x,y):
    """Sorted, de-duplicated union of two 1-D collections."""
    return np.union1d(x,y)

def intersect(x,y):
    """Sorted, de-duplicated intersection of two 1-D collections."""
    return np.intersect1d(x,y)

def submission(row):
    """Join a collection of posting_ids into one space-separated string."""
    return ' '.join(row)

# Final prediction per row = union of the image, text and concat predictions.
df["pred"] = df.apply(lambda row: union(row['image_pred'], row["bert_pred"]),axis=1)
df["pred"] = df.apply(lambda row: union(row['pred'], row["concat_pred"]),axis=1)

df["matches"] = df["pred"].apply(submission)
# submit
df[['posting_id','matches']].to_csv('submission.csv',index=False)
# Read the file back under its own name: previously this rebound `submission`,
# replacing the function above with a DataFrame in the notebook namespace.
submission_df = pd.read_csv('submission.csv')
submission_df.head()