## Summary
This is the inference notebook for https://www.kaggle.com/underwearfitting/pytorch-densenet-arcface-validation-training. In this notebook, I submitted a single-fold ArcFace DenseNet121 model with a CV score of 0.731. I computed the cosine similarities between the feature vectors. To make this faster, I run the computation on the GPU and process the data in batches to avoid out-of-memory (OOM) errors. Don't hesitate to ask if you have any questions; answering them helps me learn as well.

## Configuration

In [None]:
# Side length (pixels) that `transforms_valid` resizes images to.
image_size = 640
# Batch size used by the image and DistilBERT DataLoaders.
batch_size = 32
# Worker processes per DataLoader.
num_workers = 4
n_batch = 10 # to avoid oom, split 70000+ images into 10 batches
# Similarity cut-off applied to the similarity matrix in `return_dba_feat`.
sim_thresh = 0.8
# NOTE(review): presumably the text-embedding counterpart of `sim_thresh`;
# it is not referenced in the visible part of the notebook -- confirm.
text_sim_thresh = 0.9
# When True, the model-loading cells call `change_p()` on each image model.
CHANGE_P = False
# NOTE(review): not referenced in the visible part of the notebook -- confirm usage.
text_filter_threshold = 0.1

## Imports

In [None]:
!pip install ../input/pytorchtabnet/pytorch_tabnet-3.1.0-py3-none-any.whl

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import os
import sys
import time
import cv2
import PIL.Image
import random
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import albumentations
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import gc
from sklearn.metrics import roc_auc_score
%matplotlib inline
import seaborn as sns
from pylab import rcParams
import timm
from warnings import filterwarnings
from sklearn.preprocessing import LabelEncoder
import math
import glob
filterwarnings("ignore")

device = torch.device('cuda') 
import pickle
import cudf
import cuml
import cupy
from sklearn.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and current CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    print(f'Setting all seeds to be {seed} to reproduce...')


seed_everything(42)

## Transforms

In [None]:
# Inference-time pipeline: resize to `image_size` x `image_size`, then
# normalise with albumentations' default `Normalize()` settings.
transforms_valid = albumentations.Compose([
    albumentations.Resize(image_size, image_size),
    albumentations.Normalize()
])

# Same pipeline at a fixed 768 x 768 resolution; SHOPEEDataset always applies
# this one to produce its second image output.
transforms_valid_768 = albumentations.Compose([
    albumentations.Resize(768, 768),
    albumentations.Normalize()
])

## Dataset

In [None]:
from transformers import AutoTokenizer

class SHOPEEDataset(Dataset):
    """Dataset yielding two image tensors and the tokenised title per row.

    Each item is the tuple
    ``(img_512, img_768, input_ids, attention_mask[, token_type_ids])``:

    * ``img_512`` -- the image after ``transform`` (if given), as a float
      CHW tensor. The name is historical; the size is whatever ``transform``
      produces.
    * ``img_768`` -- the image after the module-level ``transforms_valid_768``
      pipeline, as a float CHW tensor.
    * ``token_type_ids`` is appended only when the tokenizer returns it.

    With ``use_image=False`` no file is read and both image slots hold the
    dummy tensor ``[0., 1., 2.]``.

    Args:
        df: DataFrame with a ``title`` column and, when images are used, a
            ``file_path`` column. Its index is reset internally.
        mode: Stored on the instance; not used by this class itself.
        tokenizer_path: Path/name passed to ``AutoTokenizer.from_pretrained``.
        transform: Optional callable invoked as ``transform(image=img)`` and
            expected to return a mapping with an ``'image'`` entry.
        use_image: Whether to load and transform the image.

    Raises:
        FileNotFoundError: from ``__getitem__`` when an image cannot be read.
    """

    def __init__(self, df, mode, tokenizer_path="../input/distilbert-base-indonesian", transform=None, use_image=True):
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.transform = transform
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.use_image = use_image

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _to_chw(img):
        """Convert an HWC array to a float32 CHW array."""
        return img.astype(np.float32).transpose(2, 0, 1)

    def _load_images(self, file_path):
        """Read ``file_path`` and return the (transformed, 768-px) CHW pair."""
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals failure by returning None; fail with the path
            # instead of an opaque error inside cvtColor.
            raise FileNotFoundError(f"could not read image: {file_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        img_512 = img.copy()
        if self.transform is not None:
            img_512 = self.transform(image=img_512)['image']
        img_768 = transforms_valid_768(image=img)['image']
        return self._to_chw(img_512), self._to_chw(img_768)

    def __getitem__(self, index):
        row = self.df.loc[index]
        if self.use_image:
            img_512, img_768 = self._load_images(row.file_path)
        else:
            # Placeholders so the item layout is the same for text-only use.
            img_512 = [0, 1, 2]
            img_768 = [0, 1, 2]

        text = self.tokenizer(row.title, padding='max_length', truncation=True, max_length=35, return_tensors="pt")
        item = [
            torch.tensor(img_512).float(),
            torch.tensor(img_768).float(),
            text['input_ids'][0],
            text['attention_mask'][0],
        ]
        if "token_type_ids" in text:
            item.append(text["token_type_ids"][0])
        return tuple(item)

In [None]:
class SHOPEETextDataset(Dataset):
    """Dataset of vectorised titles, one dense float tensor per row.

    The titles are vectorised once in ``__init__`` and kept as the matrix the
    vectorizer returns (sparse for scikit-learn vectorizers); rows are
    densified lazily in ``__getitem__``.

    Args:
        df: DataFrame with ``image`` and ``title`` columns. It is copied, so
            the ``file_path`` column added here does not touch the caller's
            frame.
        data_dir: Directory joined with ``image`` to build ``file_path``.
        vectorizer: Fitted object exposing ``transform(list_of_titles)``.
            When None, a character 1-2-gram ``CountVectorizer`` is fitted on
            the titles of ``df``.
    """

    def __init__(self, df, data_dir, vectorizer=None):
        # Copy first: assigning a column on the caller's frame would silently
        # replace any existing `file_path` there.
        self.df = df.copy()
        self.df['file_path'] = self.df.image.apply(lambda x: os.path.join(data_dir, x))
        self.data_dir = data_dir

        titles = self.df["title"].tolist()
        if vectorizer is None:
            # Imported here because the notebook's import cell only brings in
            # TfidfVectorizer.
            from sklearn.feature_extraction.text import CountVectorizer

            self.vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 2))
            # Keep the result sparse so both branches store the same kind of
            # matrix and __getitem__ can treat them alike.
            self.array = self.vectorizer.fit_transform(titles)
        else:
            self.vectorizer = vectorizer
            self.array = self.vectorizer.transform(titles)
        print(self.array.shape)

    def get_vectorizer(self):
        """Return ``(vectorizer, n_features)``."""
        # shape[1] works for sparse matrices, where len() of a row does not.
        return self.vectorizer, self.array.shape[1]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Row `index` is a 1 x n_features matrix; densify and flatten it.
        arr = self.array[index].toarray()[0]
        return torch.tensor(arr).float()


## Model

In [None]:
from torch.nn.parameter import Parameter

class ArcMarginProduct(nn.Module):
    """ArcFace-style additive angular margin head.

    Computes cosine similarities between L2-normalised inputs and
    L2-normalised class weights, replaces the target-class cosine
    ``cos(theta)`` with ``cos(theta + m)`` and scales everything by ``s``.

    Args:
        in_features: Size of each input embedding.
        out_features: Number of classes.
        s: Scale applied to the output logits.
        m: Angular margin in radians.
    """

    def __init__(self, in_features, out_features, s=30, m=0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_normal_(self.weight)

        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # cos(theta) must exceed `th` for theta + m to stay below pi; otherwise
        # the linear fallback `cos(theta) - mm` is used instead.
        self.th = torch.tensor(math.cos(math.pi - m))
        self.mm = torch.tensor(math.sin(math.pi - m) * m)

    def forward(self, inputs, labels):
        """Return margin-adjusted, scaled logits of shape (batch, out_features).

        Args:
            inputs: Embeddings, one row per sample.
            labels: Class indices, either 1-D or with a trailing dim of 1.
        """
        cos_th = F.linear(F.normalize(inputs.float()), F.normalize(self.weight.float()))
        cos_th = cos_th.clamp(-1, 1).float()
        sin_th = torch.sqrt(1.0 - torch.pow(cos_th, 2)).float()
        # cos(theta + m) via the angle-addition identity.
        cos_th_m = cos_th * self.cos_m - sin_th * self.sin_m
        cos_th_m = torch.where(cos_th > self.th, cos_th_m, cos_th - self.mm)

        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        # Follow the device of the logits instead of assuming CUDA, so the
        # head also works on CPU or a non-default GPU.
        labels = labels.to(device=cos_th.device, dtype=torch.long)
        onehot = torch.zeros_like(cos_th)
        onehot.scatter_(1, labels, 1.0)
        # Margin only on the target class; plain cosine everywhere else.
        outputs = onehot * cos_th_m + (1.0 - onehot) * cos_th
        outputs = outputs * self.s
        return outputs


def gem(x, p=3, eps=1e-6):
    """Generalised-mean pooling over the last two dimensions of ``x``.

    Values are clamped to at least ``eps``, raised to the power ``p``,
    averaged over the full spatial window, and the ``p``-th root is taken.
    """
    window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    averaged = F.avg_pool2d(powered, window)
    return averaged.pow(1. / p)


class GeM(nn.Module):
    """Module wrapper around the ``gem`` pooling function.

    Args:
        p: Initial pooling exponent.
        eps: Lower clamp applied to the input before exponentiation.
        p_trainable: If truthy, ``p`` is stored as a learnable one-element
            ``Parameter``; otherwise it is kept as the plain value passed in.
    """

    def __init__(self, p=3, eps=1e-6, p_trainable=True):
        super(GeM, self).__init__()
        if p_trainable:
            self.p = Parameter(torch.ones(1) * p)
        else:
            self.p = p
        self.eps = eps

    def forward(self, x):
        return gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        # `p` is a tensor only in the trainable case; a plain number has no
        # `.data`, so handle both.
        p_value = self.p.item() if isinstance(self.p, torch.Tensor) else self.p
        return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(p_value) + ', ' + 'eps=' + str(
            self.eps) + ')'


class Backbone(nn.Module):
    """timm model wrapper exposing the feature map and the head's input width.

    ``out_features`` is read from the ``in_features`` of the model's
    classification layer; where that layer lives is chosen by the first
    keyword below that occurs in ``name``.
    """

    def __init__(self, name='resnet18', pretrained=True):
        super(Backbone, self).__init__()
        self.net = timm.create_model(name, pretrained=pretrained)

        # (substring of the model name, dotted path to the classifier layer).
        # Order matters: the first matching substring wins.
        head_by_keyword = (
            ('regnet', 'head.fc'),
            ('csp', 'head.fc'),
            ('res', 'fc'),  # works also for resnest
            ('efficientnet', 'classifier'),
            ('densenet', 'classifier'),
            ('senet', 'fc'),
            ('inception', 'last_linear'),
        )
        head_path = next(
            (path for keyword, path in head_by_keyword if keyword in name),
            'classifier',
        )

        head_layer = self.net
        for attr_name in head_path.split('.'):
            head_layer = getattr(head_layer, attr_name)
        self.out_features = head_layer.in_features

    def forward(self, x):
        """Return the output of the wrapped model's ``forward_features``."""
        return self.net.forward_features(x)
    
    

sigmoid = torch.nn.Sigmoid()


class Swish(torch.autograd.Function):
    """Swish activation ``x * sigmoid(x)`` with a hand-written backward.

    Only the input is saved for backward; the sigmoid is recomputed there.
    """

    @staticmethod
    def forward(ctx, i):
        result = i * sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # `saved_tensors` is the supported accessor; `saved_variables` is a
        # deprecated alias.
        i = ctx.saved_tensors[0]
        sigmoid_i = sigmoid(i)
        # d/dx [x * s(x)] = s(x) * (1 + x * (1 - s(x)))
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))


class Swish_module(nn.Module):
    """``nn.Module`` wrapper so ``Swish`` can be used inside ``nn.Sequential``."""

    def forward(self, x):
        return Swish.apply(x)

In [None]:
from transformers import AutoModel
from torch.nn.parameter import Parameter

class TransformerModel(nn.Module):
    """Pretrained transformer followed by masked pooling over tokens.

    Args:
        transformer_type: Name or path given to ``AutoModel.from_pretrained``.
        pooling: ``"avg"`` for a masked mean; any other value selects the
            power-based pooling branch.
        p: Exponent for the non-average branch.
        p_trainable: Store ``p`` as a learnable ``Parameter`` when truthy.
    """

    def __init__(self, transformer_type, pooling="avg", p=3., p_trainable=True):
        super(TransformerModel, self).__init__()
        self.bert = AutoModel.from_pretrained(transformer_type)
        self.pooling = pooling
        self.eps = 1e-6
        self.p = Parameter(torch.ones(1) * p) if p_trainable else p

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one pooled vector per sequence.

        ``token_type_ids`` is forwarded to the transformer only when it is
        not None. The pooling mask is derived from ``input_ids > 0`` rather
        than from ``attention_mask``.
        """
        optional_inputs = {}
        if token_type_ids is not None:
            optional_inputs["token_type_ids"] = token_type_ids
        hidden = self.bert(input_ids, attention_mask=attention_mask, **optional_inputs)[0]

        token_mask = (input_ids > 0).unsqueeze(-1)
        masked_sum = (hidden * token_mask).sum(dim=1)
        token_count = token_mask.sum(dim=1)

        if self.pooling == "avg":
            return masked_sum / token_count
        return (masked_sum.clamp(self.eps).pow(self.p) / token_count).pow(1. / self.p)

class Net(nn.Module):
    """Image embedding model: backbone -> global pool -> neck -> L2 normalise.

    ``args`` is a mapping with the keys ``backbone``, ``pool``, ``neck``,
    ``embedding_size``, ``n_classes``, ``s``, ``m`` and
    ``pretrained_weights``.

    Both ``neck`` and ``neckv2`` are created (with identical layouts) and an
    ArcFace ``head`` is attached, although ``forward`` only uses ``neckv2``;
    they are kept so the module's state-dict keys stay unchanged.
    """

    def __init__(self, args, pretrained=True):
        super(Net, self).__init__()

        self.args = args
        self.image_features(args, pretrained=pretrained)
        self.embedding_size = args["embedding_size"]

        # https://www.groundai.com/project/arcface-additive-angular-margin-loss-for-deep-face-recognition
        in_features = self.backbone.out_features
        self.neck = self._make_neck(args["neck"], in_features, self.embedding_size)
        self.neckv2 = self._make_neck(args["neck"], in_features, self.embedding_size)
        self.swish = Swish_module()
        self.dropout = nn.Dropout(0.5)

        self.head = ArcMarginProduct(self.embedding_size, args["n_classes"], s=args["s"], m=args["m"])

        # `args` is indexed like a mapping everywhere else, so read the
        # checkpoint path the same way (attribute access fails on a dict).
        weights_path = args["pretrained_weights"]
        if weights_path is not None:
            self.load_state_dict(torch.load(weights_path, map_location='cpu'), strict=False)
            print('weights loaded from', weights_path)

    @staticmethod
    def _make_neck(kind, in_features, embedding_size):
        """Build the projection block selected by ``kind``."""
        if kind == "option-D":
            return nn.Sequential(
                nn.Linear(in_features, embedding_size, bias=True),
                nn.BatchNorm1d(embedding_size),
                torch.nn.PReLU()
            )
        if kind == "option-F":
            return nn.Sequential(
                nn.Dropout(0.3),
                nn.Linear(in_features, embedding_size, bias=True),
                nn.BatchNorm1d(embedding_size),
                torch.nn.PReLU()
            )
        return nn.Sequential(
            nn.Linear(in_features, embedding_size, bias=False),
            nn.BatchNorm1d(embedding_size),
        )

    def image_features(self, args, pretrained):
        """Create ``self.backbone`` and ``self.global_pool`` from ``args``."""
        self.args = args
        self.backbone = Backbone(args["backbone"], pretrained=pretrained)

        if args["pool"] == "gem":
            # Trainable exponent (the original passed the truthy value 3).
            self.global_pool = GeM(p_trainable=True)
        elif args["pool"] == "identity":
            self.global_pool = torch.nn.Identity()
        else:
            self.global_pool = nn.AdaptiveAvgPool2d(1)

    def change_p(self):
        """Replace the pooling exponent with a new Parameter equal to p + 1.

        Requires ``global_pool`` to hold a tensor-valued ``p``.
        """
        self.global_pool.p = Parameter(self.global_pool.p.data + torch.tensor(1.0))

    def forward(self, images, labels,input_ids, attention_mask, token_type_ids, get_embeddings=False, get_attentions=False):
        """Return L2-normalised embeddings for ``images``.

        All arguments other than ``images`` are accepted for signature
        compatibility and are not used.
        """
        x = self.backbone(images)
        x = self.global_pool(x)
        # Drop the two trailing spatial dims left by the pooling layer.
        x = x[:, :, 0, 0]
        x = self.neckv2(x)
        return F.normalize(x)
        
class NetNLP(nn.Module):
    """Text embedding model: transformer pooling -> neck -> L2 normalise.

    ``args`` is a mapping; this class reads ``transformer_type``,
    ``embedding_size`` and ``pretrained_weights`` (plus ``backbone`` and
    ``pool`` if ``image_features`` is called explicitly).
    """

    def __init__(self, args, pretrained=True):
        super(NetNLP, self).__init__()

        self.args = args
        self.text_features(args, pretrained)
        self.embedding_size = args["embedding_size"]

        # Projects the 768-wide pooled transformer output to the embedding.
        self.neckv2 = nn.Sequential(
                nn.Dropout(0.1),
                nn.Linear(768, self.embedding_size, bias=False),
                nn.BatchNorm1d(self.embedding_size),
            )

        # `args` is indexed like a mapping everywhere else, so read the
        # checkpoint path the same way (attribute access fails on a dict).
        weights_path = args["pretrained_weights"]
        if weights_path is not None:
            self.load_state_dict(torch.load(weights_path, map_location='cpu'), strict=False)
            print('weights loaded from', weights_path)

    def image_features(self, args, pretrained):
        """Create an image backbone and pooling layer.

        Not called by ``__init__`` or ``forward`` of this class.
        """
        self.args = args
        self.backbone = Backbone(args["backbone"], pretrained=pretrained)

        if args["pool"] == "gem":
            self.global_pool = GeM(p_trainable=True)
        elif args["pool"] == "identity":
            self.global_pool = torch.nn.Identity()
        else:
            self.global_pool = nn.AdaptiveAvgPool2d(1)

    def text_features(self, args, pretrained):
        """Create ``self.bert`` with average pooling; ``pretrained`` is unused."""
        self.bert = TransformerModel(args["transformer_type"], pooling="avg")

    def forward(self, images, labels,input_ids, attention_mask, token_type_ids, get_embeddings=False, get_attentions=False):
        """Return L2-normalised text embeddings.

        Only ``input_ids``, ``attention_mask`` and ``token_type_ids`` are
        used; the remaining arguments exist for signature compatibility.
        """
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        x = self.neckv2(out)
        return F.normalize(x)
        
        
class NetV2(nn.Module):
    """Joint image + text embedding model.

    The pooled backbone features are concatenated with the pooled transformer
    output, projected by ``neckv2`` and L2-normalised. ``args`` is a mapping
    with the keys ``backbone``, ``pool``, ``neck``, ``embedding_size``,
    ``transformer_type`` and ``pretrained_weights``.
    """

    def __init__(self, args, pretrained=True):
        super(NetV2, self).__init__()

        self.args = args
        self.image_features(args, pretrained=pretrained)
        self.text_features(args, pretrained)

        self.embedding_size = args["embedding_size"]

        # 768 is the width of the text vector concatenated in forward().
        fused_features = 768 + self.backbone.out_features
        if args["neck"] == "option-D":
            self.neckv2 = nn.Sequential(
                nn.Linear(fused_features, self.embedding_size, bias=True),
                nn.BatchNorm1d(self.embedding_size),
                torch.nn.PReLU()
            )
        elif args["neck"] == "option-F":
            self.neckv2 = nn.Sequential(
                nn.Dropout(0.3),
                nn.Linear(fused_features, self.embedding_size, bias=True),
                nn.BatchNorm1d(self.embedding_size),
                torch.nn.PReLU()
            )
        else:
            self.neckv2 = nn.Sequential(
                nn.Linear(fused_features, self.embedding_size, bias=False),
                nn.BatchNorm1d(self.embedding_size),
                Swish_module()
            )

        # `args` is indexed like a mapping everywhere else, so read the
        # checkpoint path the same way (attribute access fails on a dict).
        weights_path = args["pretrained_weights"]
        if weights_path is not None:
            self.load_state_dict(torch.load(weights_path, map_location='cpu'), strict=False)
            print('weights loaded from', weights_path)

    def image_features(self, args, pretrained):
        """Create ``self.backbone`` and ``self.global_pool`` from ``args``."""
        self.args = args
        self.backbone = Backbone(args["backbone"], pretrained=pretrained)

        if args["pool"] == "gem":
            # Fixed (non-trainable) pooling exponent for this model.
            self.global_pool = GeM(p_trainable=False)
        elif args["pool"] == "identity":
            self.global_pool = torch.nn.Identity()
        else:
            self.global_pool = nn.AdaptiveAvgPool2d(1)

    def text_features(self, args, pretrained):
        """Create ``self.bert`` with average pooling; ``pretrained`` is unused."""
        self.bert = TransformerModel(args["transformer_type"], pooling="avg")

    def forward(self, images, labels,input_ids, attention_mask, token_type_ids, get_embeddings=False, get_attentions=False):
        """Return L2-normalised joint embeddings.

        ``labels``, ``get_embeddings`` and ``get_attentions`` are accepted for
        signature compatibility and are not used.
        """
        x = self.backbone(images)
        x = self.global_pool(x)
        # Drop the two trailing spatial dims left by the pooling layer.
        x = x[:, :, 0, 0]
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        x = torch.cat([x, out], 1)
        x = self.neckv2(x)

        return F.normalize(x)

## Read in test data

In [None]:
def fix_encoding(x):
    """Decode backslash escape sequences in ``x`` and return NFD-normalised text.

    Escapes such as ``\\xc3\\xa9`` are turned into raw bytes, the bytes are
    decoded as UTF-8, and the result is normalised to Unicode form NFD.
    """
    # Imported locally: at this point in the notebook neither name has been
    # imported yet (the import cell comes later).
    import codecs
    from unicodedata import normalize

    return normalize("NFD", codecs.escape_decode(x, 'hex')[0].decode("utf-8"))

In [None]:
# Load the training metadata and attach the full path of every training image.
data_dir = '../input/shopee-product-matching/train_images'
df_train = pd.read_csv('../input/shopee-product-matching/train.csv')
df_train['file_path'] = df_train.image.apply(lambda x: os.path.join(data_dir, x))

In [None]:
# torch.load('../input/shopeae-models/efficientnetb3_baseline.pth', map_location='cuda:0')

In [None]:
# Image model #2: ResNet-200D backbone with GeM pooling. `pretrained=False`
# because the weights come from the checkpoint loaded right below.
model2 = Net(args={
            "backbone": "resnet200d",
            "pool": "gem",
            "s": 30,
            "m": 0.5,
            "neck": "",
            "embedding_size": 512,
            "n_classes": 11014,
            "pretrained_weights": None,
            "transformer_type": "bert-base-uncased"
        }, pretrained=False)
# strict=True: every checkpoint key must match the module exactly.
model2.load_state_dict(torch.load('../input/shopee-models/resnet200d_images_baseline_optionH_cutout_bh_gnoise_all_img640.pth', map_location='cuda:0')["state_dict"], strict=True)
model2.to(device);
# Optionally add 1.0 to the GeM exponent, printing it before and after.
if CHANGE_P:
    print (model2.global_pool.p.data)
    model2.change_p()
    print (model2.global_pool.p.data)

In [None]:
# Image model #1: EfficientNet-B3 (noisy-student weights name) with GeM
# pooling. `pretrained=False` because the weights come from the checkpoint.
model = Net(args={
            "backbone": "tf_efficientnet_b3_ns",
            "pool": "gem",
            "s": 30,
            "m": 0.5,
            "neck": "",
            "embedding_size": 512,
            "n_classes": 11014,
            "pretrained_weights": None,
            "transformer_type": "bert-base-uncased"
        }, pretrained=False)
# NOTE(review): loaded with strict=False, unlike model2/model3, so missing or
# unexpected checkpoint keys are ignored silently -- confirm this is intended.
model.load_state_dict(torch.load('../input/shopee-models/efficientnetb3_ns_images_baseline_optionH_cutout_bh_v2_768_all.pth', map_location='cuda:0')["state_dict"], strict=False)
model.to(device);
# Optionally add 1.0 to the GeM exponent, printing it before and after.
if CHANGE_P:
    print (model.global_pool.p.data)
    model.change_p()
    print (model.global_pool.p.data)

In [None]:
# Image model #3: EfficientNet-B5 (noisy-student weights name) with GeM
# pooling. `pretrained=False` because the weights come from the checkpoint.
model3 = Net(args={
            "backbone": "tf_efficientnet_b5_ns",
            "pool": "gem",
            "s": 30,
            "m": 0.5,
            "neck": "",
            "embedding_size": 512,
            "n_classes": 11014,
            "pretrained_weights": None,
            "transformer_type": "bert-base-uncased"
        }, pretrained=False)
# strict=True: every checkpoint key must match the module exactly.
model3.load_state_dict(torch.load('../input/shopee-models/efficientnetb5_ns_images_baseline_optionH_cutout_bh_gnoise_all_size640.pth', map_location='cuda:0')["state_dict"], strict=True)
model3.to(device);
# Optionally add 1.0 to the GeM exponent, printing it before and after.
if CHANGE_P:
    print (model3.global_pool.p.data)
    model3.change_p()
    print (model3.global_pool.p.data)

In [None]:
# Text model: NetNLP on top of the local DistilBERT (Indonesian) weights.
# NetNLP.__init__ only reads "transformer_type", "embedding_size" and
# "pretrained_weights"; the image-related entries are not used by it.
nlp_model = NetNLP(args={
            "backbone": "tf_efficientnet_b3",
            "pool": "gem",
            "s": 30,
            "m": 0.5,
            "neck": "option-D",
            "embedding_size": 512,
            "n_classes": 11014,
            "pretrained_weights": None,
            "transformer_type": "../input/distilbert-base-indonesian/",
#     "pretrained_weights": 
})
# strict=False: checkpoint keys without a counterpart in the module (and vice
# versa) are ignored.
nlp_model.load_state_dict(torch.load('../input/shopee-bert/distilbert_baseline_optionH_count_bs255_all.pth', map_location='cuda:0')["state_dict"], strict=False)
nlp_model.to(device);

In [None]:
# nlp_bert_model = NetNLP(args={
#             "backbone": "tf_efficientnet_b3",
#             "pool": "gem",
#             "s": 30,
#             "m": 0.5,
#             "neck": "option-D",
#             "embedding_size": 512,
#             "n_classes": 11014,
#             "pretrained_weights": None,
#             "transformer_type": "../input/bertindo15g/",
# #     "pretrained_weights": 
# })
# nlp_bert_model.load_state_dict(torch.load('../input/shopee-bert/bert_indonesian15G_baseline_optionH_count_bs255_all.pth', map_location='cuda:0')["state_dict"], strict=False)
# nlp_bert_model.to(device);

In [None]:
class NLPMLP(nn.Module):
    """Two-layer MLP (ReLU) mapping a title feature vector to a unit-norm embedding.

    Args:
        n_feat: Width of the input feature vector.
        n_class: Accepted for signature compatibility; not used.
    """

    def __init__(self, n_feat, n_class):
        super(NLPMLP, self).__init__()
        hidden_dim, embed_dim = 1024, 512
        stages = [
            nn.Linear(n_feat, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.BatchNorm1d(embed_dim),
        ]
        self.feature_extract = nn.Sequential(*stages)

    def forward(self, feat1):
        """Return the L2-normalised 512-d embedding of ``feat1``."""
        return F.normalize(self.feature_extract(feat1))

class NLPMLPV3(nn.Module):
    """Two-layer MLP (PReLU) mapping a title feature vector to a unit-norm embedding.

    Args:
        n_feat: Width of the input feature vector.
        n_class: Accepted for signature compatibility; not used.
    """

    def __init__(self, n_feat, n_class):
        super(NLPMLPV3, self).__init__()
        hidden_dim, embed_dim = 1024, 512
        stages = [
            nn.Linear(n_feat, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.PReLU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.BatchNorm1d(embed_dim),
        ]
        self.feature_extract = nn.Sequential(*stages)

    def forward(self, feat1):
        """Return the L2-normalised 512-d embedding of ``feat1``."""
        return F.normalize(self.feature_extract(feat1))

In [None]:
# MLP over count-vectorised titles (2716 input features) and the fitted
# vectorizer that produces those features.
nlp_model2 = NLPMLP(2716, 11014)
# Context manager so the file handle is closed after loading.
# NOTE: unpickling runs arbitrary code; only load pickles from trusted datasets.
with open("../input/shopee-mlp/count_vector.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
nlp_model2.load_state_dict(torch.load('../input/shopee-mlp/epoch_0099.pth', map_location='cuda:0')["state_dict"], strict=False)
nlp_model2.to(device);

In [None]:
# Second MLP over vectorised titles (30210 input features) and its fitted
# vectorizer.
nlp_model3 = NLPMLPV3(30210, 11014)
# Context manager so the file handle is closed after loading.
# NOTE: unpickling runs arbitrary code; only load pickles from trusted datasets.
with open("../input/shopee-mlp/count_vector_1_3_v2.pkl", "rb") as vectorizer_file:
    vectorizer2 = pickle.load(vectorizer_file)
nlp_model3.load_state_dict(torch.load('../input/shopee-mlp/mlpv3_images_cutout_bh_ngram13_v2.pth', map_location='cuda:0')["state_dict"], strict=False)
nlp_model3.to(device);

In [None]:
import codecs
from unicodedata import normalize

def fix_encoding(x):
    """Decode backslash escape sequences in ``x`` and return NFD-normalised text.

    The escapes are turned into raw bytes, the bytes are decoded as UTF-8,
    and the resulting string is normalised to Unicode form NFD.
    """
    raw_bytes, _consumed = codecs.escape_decode(x, 'hex')
    decoded = raw_bytes.decode("utf-8")
    return normalize("NFD", decoded)

In [None]:
# Load the test metadata, attach the full image path per row, and decode the
# escape sequences in each title with `fix_encoding`.
test = pd.read_csv('../input/shopee-product-matching/test.csv')
# test = pd.concat([test]*25000)
test['file_path'] = test.image.apply(lambda x: os.path.join('../input/shopee-product-matching/test_images',x))
test["title"] = test["title"].apply(fix_encoding)

In [None]:
# Image loader: each batch carries the `transforms_valid` image, the 768-px
# image and the tokenised title. shuffle=False keeps rows in `test` order.
dataset_test = SHOPEEDataset(test, 'test', transform=transforms_valid)
test_loader = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

In [None]:
# Text-only loader for the DistilBERT model: use_image=False makes the
# dataset skip image loading and emit dummy image tensors instead.
distilbert_dataset_test = SHOPEEDataset(test, 'test', transform=transforms_valid,tokenizer_path="../input/distilbert-base-indonesian",use_image=False)
distilbert_test_loader = torch.utils.data.DataLoader(distilbert_dataset_test, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

In [None]:
# bert_dataset_test = SHOPEEDataset(test, 'test', transform=transforms_valid,tokenizer_path="../input/bertindo15g",use_image=False)
# bert_test_loader = torch.utils.data.DataLoader(bert_dataset_test, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

In [None]:
# Re-derive `file_path` (same expression as when `test` was loaded), then
# build the loader of titles vectorised with `vectorizer` for nlp_model2.
# The second positional argument ('test') is the dataset's `data_dir`.
test['file_path'] = test.image.apply(lambda x: os.path.join('../input/shopee-product-matching/test_images',x))
dataset_text_test = SHOPEETextDataset(test, 'test', vectorizer)
test_text_loader = torch.utils.data.DataLoader(dataset_text_test, batch_size=128, shuffle=False, num_workers=num_workers, pin_memory=True)

In [None]:
# Re-derive `file_path` (same expression as when `test` was loaded), then
# build the loader of titles vectorised with `vectorizer2` for nlp_model3.
# The second positional argument ('test') is the dataset's `data_dir`.
test['file_path'] = test.image.apply(lambda x: os.path.join('../input/shopee-product-matching/test_images',x))
dataset_text_test2 = SHOPEETextDataset(test, 'test', vectorizer2)
test_text_loader2 = torch.utils.data.DataLoader(dataset_text_test2, batch_size=32, shuffle=False, num_workers=num_workers, pin_memory=True)

## Generate Features

In [None]:
def generate_test_features(test_loader):
    """Embed every batch with the three image models and concatenate the results.

    Uses the module-level ``model`` (fed the 768-px images) and ``model2`` /
    ``model3`` (fed the other image tensor of each batch). The three
    embeddings are concatenated along dim 1 per batch.

    Args:
        test_loader: Iterable of
            ``(images512, images768, input_ids, attention_mask)`` batches.

    Returns:
        NumPy array holding one concatenated embedding row per sample.
    """
    for net in (model, model2, model3):
        net.eval()

    progress = tqdm(test_loader)
    collected = []

    with torch.no_grad():
        for images512, images768, input_ids, attention_mask in progress:
            images768 = images768.to(device)
            images512 = images512.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            per_model = [
                model(images768, None, input_ids, attention_mask, None, get_embeddings=True),
                model2(images512, None, input_ids, attention_mask, None, get_embeddings=True),
                model3(images512, None, input_ids, attention_mask, None, get_embeddings=True),
            ]
            collected.append(torch.cat(per_model, axis=1).detach().cpu())

    return torch.cat(collected).cpu().numpy()

In [None]:
# Concatenated image embeddings of the three image models for the test set;
# the bare `FEAS.shape` displays the array shape as the cell output.
FEAS = generate_test_features(test_loader)
FEAS.shape

In [None]:
# Number of test rows and the chunk size used when the similarity computation
# is split into `n_batch` pieces. Derived from `n_batch` (instead of a literal
# 10) so it stays in step with the loop that iterates `range(n_batch)`.
n, _ = FEAS.shape
bs = n // n_batch

In [None]:
# Drop the notebook's reference to `model`, then run the garbage collector
# and release PyTorch's cached GPU memory.
del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
def generate_test_features2(test_loader):
    """Embed every batch with the module-level ``model2`` only.

    Accepts batches in the layout produced by ``SHOPEEDataset`` --
    ``(images, images768, input_ids, attention_mask[, token_type_ids])`` --
    as well as the plain ``(images, input_ids, attention_mask)`` triple this
    function originally unpacked.

    Args:
        test_loader: Iterable of batches in one of the layouts above.

    Returns:
        NumPy array with one embedding row per sample.
    """
    model2.eval()

    feats = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            if len(batch) == 3:
                images, input_ids, attention_mask = batch
            else:
                # SHOPEEDataset layout: skip the 768-px image in slot 1 and
                # any trailing token_type_ids.
                images, _, input_ids, attention_mask, *_ = batch

            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            features = model2(images, None, input_ids, attention_mask, None, get_embeddings=True)
            feats.append(features.detach().cpu())

    return torch.cat(feats).cpu().numpy()

In [None]:
# FEAS_MODEL2 = generate_test_features2(test_loader)

In [None]:
# Drop the notebook's reference to `model2`, then run the garbage collector
# and release PyTorch's cached GPU memory.
del model2
gc.collect()
torch.cuda.empty_cache()

In [None]:
def generate_test_features3(test_loader):
    """Embed every batch with the module-level ``model3`` only.

    Batches are unpacked in the layout produced by ``SHOPEEDataset``:
    ``(images, images768, input_ids, attention_mask[, token_type_ids])``.
    The first image tensor is the one passed to ``model3``.

    Args:
        test_loader: Iterable of batches with at least four entries.

    Returns:
        NumPy array with one embedding row per sample.
    """
    model3.eval()

    feats = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            # Slot 1 holds the 768-px image, not token ids; skip it (and any
            # trailing token_type_ids) so the names match their contents.
            images, _, input_ids, attention_mask, *_ = batch

            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            features = model3(images, None, input_ids, attention_mask, None, get_embeddings=True)
            feats.append(features.detach().cpu())

    return torch.cat(feats).cpu().numpy()

In [None]:
# FEAS_MODEL3  = generate_test_features3(test_loader)

In [None]:
# Drop the notebook's reference to `model3`, then run the garbage collector
# and release PyTorch's cached GPU memory.
del model3
gc.collect()
torch.cuda.empty_cache()

In [None]:
def generate_nlp_test_features(test_loader):
    """Embed vectorised titles with the module-level ``nlp_model2``.

    NOTE: a later cell defines another function under this same name (for the
    DistilBERT model), so which one a call reaches depends on cell order.

    Args:
        test_loader: Iterable of feature-tensor batches.

    Returns:
        NumPy array with one embedding row per sample.
    """
    nlp_model2.eval()
    progress = tqdm(test_loader)

    collected = []
    with torch.no_grad():
        for text_feat in progress:
            embeddings = nlp_model2(text_feat.to(device))
            collected.append(embeddings.detach().cpu())

    return torch.cat(collected).cpu().numpy()

In [None]:
# Title embeddings from nlp_model2 (this call reaches the definition of
# `generate_nlp_test_features` that is current at this point in the notebook).
TEXT_FEAS = generate_nlp_test_features(test_text_loader)

In [None]:
def generate_nlp_test_features3(test_loader):
    """Embed vectorised titles with the module-level ``nlp_model3``.

    Args:
        test_loader: Iterable of feature-tensor batches.

    Returns:
        NumPy array with one embedding row per sample.
    """
    nlp_model3.eval()
    progress = tqdm(test_loader)

    collected = []
    with torch.no_grad():
        for text_feat in progress:
            embeddings = nlp_model3(text_feat.to(device))
            collected.append(embeddings.detach().cpu())

    return torch.cat(collected).cpu().numpy()

In [None]:
# Title embeddings from nlp_model3, computed over the `vectorizer2` features.
TEXT_FEAS4 = generate_nlp_test_features3(test_text_loader2)

In [None]:
def generate_nlp_test_features(test_loader):
    """Embed tokenised titles with the module-level ``nlp_model``.

    NOTE: this reuses the name of an earlier function in the notebook (the
    ``nlp_model2`` version) and replaces it from this cell onwards.

    Args:
        test_loader: Iterable of
            ``(images, images2, input_ids, attention_mask)`` batches; the
            second entry is not used.

    Returns:
        NumPy array with one embedding row per sample.
    """
    nlp_model.eval()
    progress = tqdm(test_loader)

    collected = []
    with torch.no_grad():
        for images, _images2, input_ids, attention_mask in progress:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            embeddings = nlp_model(images, None, input_ids, attention_mask, None, get_embeddings=True)
            collected.append(embeddings.detach().cpu())

    return torch.cat(collected).cpu().numpy()

In [None]:
# DistilBERT title embeddings; the bare `.shape` displays the array shape as
# the cell output.
TEXT_FEAS_2 = generate_nlp_test_features(distilbert_test_loader)
TEXT_FEAS_2.shape

In [None]:
# def generate_nlp_test_features_v2(test_loader):
#     nlp_bert_model.eval()
#     bar = tqdm(test_loader)
    
#     TEXT_FEAS_3 = []
#     TARGETS = []

#     with torch.no_grad():
#         for batch_idx, (images, images2, input_ids, attention_mask, token_type_ids) in enumerate(bar):

#             images = images.to(device)
#             input_ids = input_ids.to(device)
#             attention_mask = attention_mask.to(device)
#             token_type_ids = token_type_ids.to(device)

#             features = nlp_bert_model(images, None, input_ids, attention_mask, token_type_ids, get_embeddings=True)

#             TEXT_FEAS_3 += [features.detach().cpu()]

#     TEXT_FEAS_3 = torch.cat(TEXT_FEAS_3).cpu().numpy()
    
#     return TEXT_FEAS_3

In [None]:
# TEXT_FEAS_3 = generate_nlp_test_features_v2(bert_test_loader)
# TEXT_FEAS_3.shape

### TFIDF

In [None]:
#https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700#Use-Text-Embeddings

def get_text_predictions(df, max_features = 25_000):
    """Find, for every row, the rows whose binary TF-IDF title similarity exceeds 0.7.

    NOTE(review): the vectorizer is fitted on the module-level ``df_cu['title']``,
    not on ``df``; ``df`` is only used for its length and to look up
    ``posting_id``.  The two frames are presumably row-aligned -- confirm.

    Args:
        df: frame with a ``posting_id`` column.
        max_features: vocabulary cap handed to ``TfidfVectorizer``.

    Returns:
        ``(preds, idx_list)``: per row, the matching ``posting_id`` values and
        the matching row indices.
    """
    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    title_vectors = vectorizer.fit_transform(df_cu['title']).toarray()

    chunk_size = 1024*4
    n_rows = len(df)
    preds, idx_list = [], []

    print('Finding similar titles...')
    # Walk the rows in fixed-size chunks so only one slab of the similarity
    # matrix exists at a time.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        print('chunk',start,'to',stop)

        # Rows of `sims` are the chunk's titles, columns are all titles.
        sims = cupy.matmul(title_vectors, title_vectors[start:stop].T).T

        for offset in range(stop - start):
            hit_idx = cupy.where(sims[offset,]>0.7)[0]
            idx_list.append(hit_idx)
            preds.append(df.iloc[cupy.asnumpy(hit_idx)].posting_id.values)

    del vectorizer, title_vectors
    gc.collect()
    return preds, idx_list

In [None]:
# df_cu = cudf.DataFrame(test)
# text_predictions,text_idx_list = get_text_predictions(test, max_features = 25_000)

In [None]:
def return_dba_feat(feat, thresh, k):
    """Replace each feature vector by the mean of its above-threshold neighbours.

    The features are moved to the GPU and processed in ``n_batch`` slices of
    ``bs`` rows (the last slice runs to ``n``); ``n_batch``, ``bs`` and ``n``
    are module-level values.  For every row the dot product against all rows
    is compared with ``thresh``; rows with more than one hit are replaced by
    the mean of the hit vectors, all others keep their own vector.

    Args:
        feat: 2-D array-like of features (assumed L2-normalised so the dot
            product is a cosine similarity -- TODO confirm with caller).
        thresh: similarity a neighbour must exceed to be averaged in.
            Previously ignored in favour of the global ``sim_thresh``.
        k: unused; kept so existing call sites keep working.

    Returns:
        CUDA tensor with the same shape as ``feat``.
    """
    feat = torch.tensor(feat).cuda()

    dba_feat = torch.zeros_like(feat)
    cnt = 0
    for i in tqdm(range(n_batch)):
        left = bs * i
        # The last slice absorbs the remainder rows.
        right = n if i == n_batch - 1 else bs * (i + 1)
        batch = feat[left:right, :]

        selection = ((batch @ feat.T) > thresh).cpu().numpy()

        for i_row, row in enumerate(selection):
            if row.sum() <= 1:
                # Zero hits would average an empty set (NaN); one hit is
                # normally the row itself.  Keep the original vector.
                dba_feat[cnt] = batch[i_row]
            else:
                dba_feat[cnt] = feat[row].mean(axis=0)
            cnt += 1

    return dba_feat

## Inference by batches

In [None]:
def db_aug(V):
    """Blend every row of ``V`` with its 2 nearest neighbours (cosine metric).

    Each neighbour is weighted by ``sqrt(clip(2 - distance, 0, 2))`` and the
    weighted sum is divided by the sum of the two weights.

    Args:
        V: 2-D NumPy array of feature vectors.

    Returns:
        Array of blended vectors, one per input row.
    """
    knn = NearestNeighbors(n_neighbors=2, metric="cosine")
    knn.fit(V)
    distances, indices = knn.kneighbors(V)

    weights = np.power(np.clip(2.0 - distances, 0, 2.0), 0.5)

    first_term = weights[:, 0, None] * V[indices[:, 0]]
    second_term = weights[:, 1, None] * V[indices[:, 1]]
    weight_total = weights.sum(axis=1)[:, None]

    return (first_term + second_term) / weight_total

In [None]:
def get_neighbors(embeddings, KNN = 2, image = True):
    """Average each embedding with the rows returned by a ``KNN``-neighbour query.

    Args:
        embeddings: 2-D array of vectors; also used as the query set.
        KNN: number of neighbours requested from ``NearestNeighbors``.
        image: not used by this function.

    Returns:
        Array shaped (and typed) like ``embeddings`` where row ``i`` is the
        mean of the rows indexed by the i-th neighbour list.
    """
    knn = NearestNeighbors(n_neighbors = KNN)
    knn.fit(embeddings)
    _distances, neighbour_ids = knn.kneighbors(embeddings)

    averaged = np.zeros_like(embeddings)
    for row_no, ids in enumerate(neighbour_ids):
        averaged[row_no] = embeddings[ids, :].mean(axis=0)
    return averaged

In [None]:
import collections
def combine_for_sub(row):
    """Build the submission string for one row.

    Merges the space-separated ids in ``row.image_matches`` with those in
    ``row.nlp_matches`` that are not listed in ``row.text_unmatches``, and
    returns the sorted unique ids joined by single spaces.
    """
    vetoed = row.text_unmatches.split(" ")
    image_ids = row.image_matches.split(" ")
    nlp_ids = [pid for pid in row.nlp_matches.split(" ") if pid not in vetoed]

    merged = np.concatenate([image_ids, nlp_ids])
    return ' '.join(np.unique(merged))

def combine_for_cv(row):
    """Return the sorted unique union of ``row.image_matches`` and ``row.nlp_matches``."""
    merged = np.concatenate((row.image_matches, row.nlp_matches))
    return np.unique(merged)

def combine_graph(df):
    """Expand each posting's matches with ids that its neighbours agree on.

    For every posting, the ``matches`` strings of the *other* rows that
    mention it are collected.  Any id occurring in more than half as many of
    those strings as there are other rows is added to the posting's own
    matches.  The ``matches`` column of ``df`` is overwritten in place and
    ``df`` is returned.

    Fixes over the previous version:
      * the base match list is the posting's own row (it used to be taken
        from the first row with a *different* posting_id, which also crashed
        on a single-row frame);
      * a posting nobody mentions no longer contributes an empty-string id;
      * the posting id is matched literally rather than as a regex.

    NOTE(review): the vote threshold is half the number of *all other rows*,
    not half the number of mentioning rows -- confirm this is intended.
    NOTE(review): mentions are found by substring search, so an id that is a
    prefix of another id also matches.

    Args:
        df: frame with string columns ``posting_id`` and ``matches``
            (space-separated ids).

    Returns:
        The same ``df`` with ``matches`` replaced by sorted, de-duplicated,
        space-separated ids.
    """
    posting_id_list = df["posting_id"].tolist()
    own_match_list = df["matches"].tolist()

    match_list = []
    for posting_id, orig_match in zip(posting_id_list, own_match_list):
        others = df["posting_id"] != posting_id
        mentions = df["matches"].str.contains(posting_id, regex=False)
        neighbour_matches = df.loc[mentions & others, "matches"].tolist()

        # Count tokens per neighbour string; an empty neighbour list gives an
        # empty counter instead of a bogus '' token.
        c = collections.Counter(
            token for match in neighbour_matches for token in match.split(" ")
        )
        vote_floor = others.sum() / 2
        expand_items = [key for key, value in c.items() if value > vote_floor]

        match_str = ' '.join(np.unique(orig_match.split(" ") + expand_items))
        match_list.append(match_str)

    df["matches"] = match_list
    return df

In [None]:
# First descriptor: FEAS and TEXT_FEAS_2 stacked column-wise and normalised
# with F.normalize, then passed through db_aug and normalised again.
# DBA_FEAS is the matrix the "image" similarities are computed from below.
CONCAT_FEAS = F.normalize(torch.tensor(np.hstack([FEAS, TEXT_FEAS_2])).cuda())
DBA_FEAS = db_aug(CONCAT_FEAS.cpu().numpy())
DBA_FEAS = F.normalize(torch.tensor(DBA_FEAS).cuda())
    
# Second descriptor: TEXT_FEAS4 and TEXT_FEAS stacked column-wise, db_aug,
# then normalised.
# NOTE(review): unlike CONCAT_FEAS, this stack is not normalised before
# db_aug -- confirm that is intended.
CONCAT_TEXT_FEAS = torch.tensor(np.hstack([TEXT_FEAS4, TEXT_FEAS])).cuda()
CONCAT_TEXT_FEAS = db_aug(CONCAT_TEXT_FEAS.cpu().numpy())
CONCAT_TEXT_FEAS = F.normalize(torch.tensor(CONCAT_TEXT_FEAS).cuda())

In [None]:
# Cut both descriptor matrices into the same n_batch row slices of bs rows;
# the final slice runs to n so the remainder rows are not dropped.
image_batches, text_batches = [], []
for i in range(n_batch):
    left = bs * i
    right = n if i == n_batch - 1 else bs * (i + 1)
    image_batches.append(DBA_FEAS[left:right,:])
    text_batches.append(CONCAT_TEXT_FEAS[left:right,:])

In [None]:
# Plain list of posting ids in `test` row order, indexed by the running row
# counter in the pair-generation loop.
posting_ids = test["posting_id"].tolist()

In [None]:
# Candidate generation: one record per (query posting, candidate posting)
# pair whose similarity exceeds sim_thresh on either descriptor.
# NOTE(review): the text descriptor is thresholded with sim_thresh as well;
# text_sim_thresh from the configuration is not used here -- confirm.
match_list = []
cnt = 0  # row index of the current query posting across all batches

for image_batch, text_batch in zip(image_batches, text_batches):
    batch = image_batch.cuda()
    text_batch = text_batch.cuda()

    # Similarity of every row in the slice against every posting.
    similarity_matrix = batch@DBA_FEAS.T
    text_similarity_matrix = text_batch@CONCAT_TEXT_FEAS.T

    selection = (similarity_matrix > sim_thresh).cpu().numpy()
    text_selection = (text_similarity_matrix > sim_thresh).cpu().numpy()

    for idx, (img_row, text_row) in enumerate(zip(selection, text_selection)):
        # A posting is a candidate if either descriptor selects it.
        row = img_row | text_row
        match_ids = test.iloc[row].posting_id.tolist()

        text_distances = text_similarity_matrix[idx, row]
        image_distances = similarity_matrix[idx, row]
        max_text_dist = text_distances.max().item()
        max_image_dist = image_distances.max().item()

        posting_id = posting_ids[cnt]
        # One host transfer per row instead of one .item() per candidate.
        text_values = text_distances.tolist()
        image_values = image_distances.tolist()
        for match_id, text_value, image_value in zip(match_ids, text_values, image_values):
            match_list.append({
                "posting_id": posting_id,
                "matches": match_id,
                "text_dist": text_value,
                "image_dist": image_value,
                "max_text_dist": max_text_dist,
                "max_image_dist": max_image_dist,
                "diff_max_text_dist": max_text_dist - text_value,
                "diff_max_image_dist": max_image_dist - image_value,
            })
        cnt += 1

In [None]:
# One row per candidate pair collected in match_list; print the frame's
# shape and display the first rows.
res_df = pd.DataFrame(match_list)
print(res_df.shape)
res_df.head()

In [None]:
# Pairwise interaction features of the two similarity scores.
res_df["multiply_dist"] = res_df["text_dist"].mul(res_df["image_dist"])
res_df["total_dist"] = res_df["text_dist"].add(res_df["image_dist"])

# Rank of each candidate within its query posting, per similarity score.
for rank_col, score_col in (("dist_rank", "text_dist"), ("image_dist_rank", "image_dist")):
    res_df[rank_col] = res_df.groupby("posting_id")[score_col].rank()

In [None]:
# Keep only rows whose posting_id sorts strictly before the candidate id:
# this drops self-pairs and the (b, a) mirror of any pair also present as (a, b).
preds_df = res_df[res_df["posting_id"] < res_df["matches"]]

In [None]:
# Attach the title of both sides of every pair.  Both merges bring in a
# "title" column, so pandas' default suffixes produce title_x (query
# posting) and title_y (candidate posting).
preds_df = preds_df.merge(test[["posting_id", "title"]], on="posting_id", how="left")
preds_df = preds_df.merge(test[["posting_id", "title"]].rename(columns={"posting_id": "matches"}), on="matches", how="left")

In [None]:
# Two TF-IDF vocabularies fitted on all test titles: binary word unigrams
# and character 5-grams.
tfidf = TfidfVectorizer(ngram_range=(1,1), binary=True)
tfidf.fit(test["title"])
tfidf2 = TfidfVectorizer(analyzer="char", ngram_range=(5, 5))
tfidf2.fit(test["title"])

# Row-wise dot product between the TF-IDF vectors of the two titles in each
# pair (element-wise product, then sum over the vocabulary axis).  With the
# vectorizer's default L2 row normalisation this is a cosine similarity.
preds_df["cos_sim"] = tfidf.transform(preds_df["title_x"]).multiply(tfidf.transform(preds_df["title_y"])).sum(axis=1)
preds_df["cos_sim2"] = tfidf2.transform(preds_df["title_x"]).multiply(tfidf2.transform(preds_df["title_y"])).sum(axis=1)

In [None]:
# Second stage: score every candidate pair with pre-trained classifiers and
# store the blended probability in preds_df["pred"].
import xgboost as xgb

# Feature columns fed to both model families, in this order.
features = [
    "image_dist", "text_dist", "dist_rank", "image_dist_rank",
    "multiply_dist", "total_dist", "cos_sim", "cos_sim2",
    "max_text_dist", "max_image_dist", 
    "diff_max_text_dist", "diff_max_image_dist"
]

xgb_model = xgb.XGBClassifier()
xgb_model.load_model("../input/shopee-2ndstage/xgb_8076.json")

# Probability of the positive class from the XGBoost model.
xgb_pred = xgb_model.predict_proba(preds_df[features])[:, 1]
# Average of the positive-class probabilities of the 4 pickled fold models.
# NOTE(review): pickle.load executes arbitrary code from the file; acceptable
# only because these files come from a trusted dataset -- confirm.
nn_pred = np.zeros(len(preds_df))
for f in range(4):
    with open(f"../input/shopee-2ndstage/model_{f}.pkl", 'rb') as model_file:
        new_clf = pickle.load(model_file)
    nn_pred += new_clf.predict_proba(np.array(preds_df[features]).astype(np.float32))[:, 1].ravel()
nn_pred /= 4.
# Equal-weight blend of the two model families.
normal_pred = (nn_pred + xgb_pred) / 2.0
preds_df["pred"] = normal_pred
# preds_df[features] = preds_df[features].rank(pct=True, axis=0)

# xgb_model = xgb.XGBClassifier()
# xgb_model.load_model("../input/shopee-2ndstage/xgb_8111_pct.json")
# xgb_pct_pred = xgb_model.predict_proba(preds_df[features])[:, 1]

# nn_pct_pred = np.zeros(len(preds_df))
# for f in range(4):
#     with open(f"../input/shopee-2ndstage/model_{f}.pkl", 'rb') as model_file:
#         new_clf = pickle.load(model_file)
#     nn_pct_pred += new_clf.predict_proba(np.array(preds_df[features]).astype(np.float32))[:, 1].ravel()
# nn_pct_pred /= 4.
# pct_pred = (nn_pct_pred + xgb_pct_pred) / 2.0

# preds_df["pred"] = (normal_pred + pct_pred) / 2.0
preds_df.head()

In [None]:
def agglomerative_clustering(preds_df, single_link_threshold=0.30, group_link_threshold=0.50, group_merge_threshold=0.60):
    """Greedily group postings from scored pairs, highest score first.

    Pairs are visited in descending ``pred`` order and ignored unless
    ``pred > single_link_threshold``.  For a surviving pair:

      * neither posting grouped yet -> the two form a new group;
      * exactly one grouped -> the other joins that group if
        ``pred > group_link_threshold``;
      * both grouped in different groups -> the groups are merged if
        ``pred > group_merge_threshold``.

    The number of grouped postings is printed before returning.

    Args:
        preds_df: frame with columns ``posting_id``, ``matches`` and ``pred``.
        single_link_threshold: minimum score for a pair to be considered.
        group_link_threshold: minimum score to attach a posting to a group.
        group_merge_threshold: minimum score to merge two groups.

    Returns:
        DataFrame with columns ``posting_id`` and ``matches`` holding every
        ordered pair of distinct postings that share a group (empty, but with
        both columns, when nothing was grouped).
    """
    groups = dict()         # posting id -> group id
    group_members = dict()  # group id -> set of posting ids

    gix = 0
    ranked = preds_df.sort_values("pred", ascending=False)
    # Iterating the three columns directly avoids building a Series per row.
    pairs = zip(ranked["posting_id"], ranked["matches"], ranked["pred"])
    for posting_id, match_id, pred in tqdm(pairs, total=preds_df.shape[0]):
        if not pred > single_link_threshold:
            continue

        g1 = groups.get(posting_id)
        g2 = groups.get(match_id)

        if g1 is None and g2 is None:
            groups[posting_id] = gix
            groups[match_id] = gix
            group_members[gix] = {posting_id, match_id}
            gix += 1
        elif g1 is None:
            if pred > group_link_threshold:
                groups[posting_id] = g2
                group_members[g2].add(posting_id)
        elif g2 is None:
            if pred > group_link_threshold:
                groups[match_id] = g1
                group_members[g1].add(match_id)
        elif (g1 != g2) and (pred > group_merge_threshold):
            # Relabel only the absorbed group's members instead of scanning
            # every grouped posting.
            absorbed = group_members.pop(g2)
            for member in absorbed:
                groups[member] = g1
            group_members[g1].update(absorbed)

    print(len(groups))

    out_rows = [
        {"posting_id": k, "matches": k2}
        for k, v in groups.items()
        for k2 in group_members[v]
        if k != k2
    ]

    # Explicit columns keep the schema stable when no pair was grouped.
    return pd.DataFrame(out_rows, columns=["posting_id", "matches"])

# Cluster the scored pairs with stricter link/merge thresholds than the defaults.
out_df = agglomerative_clustering(preds_df, group_link_threshold=0.75, group_merge_threshold=0.80)
out_df.shape

# Every posting matches itself, so add one (id, id) row per test posting.
same_df = test[["posting_id"]].copy()
same_df["matches"] = same_df["posting_id"].values

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
out_df = pd.concat([out_df, same_df])
out_df.shape

In [None]:
# THRESHOLD = 0.63

# out_df = preds_df[preds_df["pred"] > THRESHOLD][["posting_id", "matches"]]
# out_df = out_df.append(out_df.rename(columns={"posting_id": "matches", "matches": "posting_id"})).drop_duplicates()
# out_df.shape

In [None]:
# same_df = test[["posting_id"]].copy()
# same_df["matches"] = same_df["posting_id"].values

# out_df = out_df.append(same_df)
# out_df.shape

In [None]:
# Collapse to one row per posting: gather its matches into a list, join them
# with single spaces, and restore posting_id as a regular column.
out_df = (
    out_df.groupby("posting_id")["matches"]
    .agg(list)
    .map(" ".join)
    .reset_index()
)
out_df.head()

In [None]:
# Write the submission file with exactly the two columns posting_id and
# matches, without the DataFrame index.
out_df.to_csv('submission.csv', index=False, columns=['posting_id', 'matches'])

In [None]:
# if True:
# #     FEAS = return_dba_feat(FEAS, 0.95, 3)
#     FEAS = F.normalize(torch.tensor(np.hstack([FEAS, TEXT_FEAS_2])).cuda())
# #     FEAS = F.normalize(torch.tensor(np.hstack([FEAS, FEAS_MODEL2])).cuda())
#     FEAS = db_aug(FEAS.cpu().numpy())
#     FEAS = F.normalize(torch.tensor(FEAS).cuda())
# #     FEAS = torch.tensor(FEAS).cuda()
#     batches = []
#     for i in range(n_batch):
#         left = bs * i
#         right = bs * (i+1)
#         if i == n_batch - 1:
#             right = n
#         batches.append(FEAS[left:right,:])
    
#     matches = []
#     un_matches = []
#     cnt = 0
#     for batch in tqdm(batches):
#         batch = batch.cuda()
#         similarity_matrix = batch@FEAS.T
#         selection = (similarity_matrix > sim_thresh).cpu().numpy()
        
#         for row in selection:
#             matches.append(' '.join(test.iloc[row].posting_id.tolist()))
        
#         for i in range(len(similarity_matrix)):
#             un_matches_ids = []
#             for text_idx in text_idx_list[cnt]:
#                 if similarity_matrix[i, int(text_idx)] < text_filter_threshold:
#                     un_matches_ids.append(text_idx)
#             un_matches.append(' '.join(test.iloc[un_matches_ids].posting_id.tolist()))
#             cnt += 1
            
# #     print (matches)

#     submission = pd.read_csv('../input/shopee-product-matching/sample_submission.csv')
#     submission['image_matches'] = matches
#     submission['text_unmatches'] = un_matches
            
# #     FEAS_MODEL2 = torch.tensor(FEAS_MODEL2).cuda()
# #     batches = []
# #     for i in range(n_batch):
# #         left = bs * i
# #         right = bs * (i+1)
# #         if i == n_batch - 1:
# #             right = n
# #         batches.append(FEAS_MODEL2[left:right,:])
    
# #     matches = []
# #     for batch in tqdm(batches):
# #         batch = batch.cuda()
# #         selection = ((batch@FEAS_MODEL2.T) > sim_thresh).cpu().numpy()
        
# #         for row in selection:
# #             matches.append(' '.join(test.iloc[row].posting_id.tolist()))
    
# #     submission = pd.read_csv('../input/shopee-product-matching/sample_submission.csv')
# #     submission['image_matches2'] = matches

#     TEXT_FEAS = torch.tensor(np.hstack([TEXT_FEAS, TEXT_FEAS4])).cuda()
#     TEXT_FEAS = db_aug(TEXT_FEAS.cpu().numpy())
#     TEXT_FEAS = F.normalize(torch.tensor(TEXT_FEAS).cuda())
#     batches = []
#     for i in range(n_batch):
#         left = bs * i
#         right = bs * (i+1)
#         if i == n_batch - 1:
#             right = n
#         batches.append(TEXT_FEAS[left:right,:])
    
#     matches = []
#     for batch in tqdm(batches):
#         batch = batch.cuda()
#         selection = ((batch@TEXT_FEAS.T) > text_sim_thresh).cpu().numpy()
        
#         for row in selection:
#             matches.append(' '.join(test.iloc[row].posting_id.tolist()))

#     submission['nlp_matches'] = matches
# #     submission['matches'] = matches
#     submission['text_predictions'] = text_predictions
#     submission['matches'] = submission.apply(combine_for_sub, axis = 1)

# #     submission = pd.read_csv('../input/shopee-product-matching/sample_submission.csv')
# #     TEXT_FEAS = torch.tensor(np.hstack([TEXT_FEAS, FEAS])).cuda()
# #     batches = []
# #     for i in range(n_batch):
# #         left = bs * i
# #         right = bs * (i+1)
# #         if i == n_batch - 1:
# #             right = n
# #         batches.append(TEXT_FEAS[left:right,:])
    
# #     matches = []
# #     for batch in tqdm(batches):
# #         batch = batch.cuda()
# #         selection = ((batch@TEXT_FEAS.T) > text_sim_thresh).cpu().numpy()
        
# #         for row in selection:
# #             matches.append(' '.join(test.iloc[row].posting_id.tolist()))
# #     submission = pd.read_csv('../input/shopee-product-matching/sample_submission.csv')
# #     submission['nlp_matches'] = matches
# #     submission['matches'] = matches
# #     submission['matches'] = submission.apply(combine_for_sub, axis = 1)
# #     submission['matches'] = matches
# #     submission = combine_graph(submission)
# submission[['posting_id', 'matches']].to_csv('submission.csv', index=False)