In [None]:
# Root folder of the competition data; the CSV files and the train/test image folders are read from here.
DATA_PATH = '../input/shopee-product-matching/'
# Sanity check: list the text-model checkpoint so a missing input dataset shows up in the first cell.
!ls ../input/shopee-pytorch-xlmroberta-doubles-relativebinclf/xlm-roberta-large_128_fold0_min_val_loss.pth

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
from   tqdm import tqdm_notebook
import pickle
import gc
import torch
import torch.nn as nn
from   torch.utils.data import Dataset, DataLoader

from   sklearn.preprocessing import normalize
from   transformers import BertConfig, AutoTokenizer, AutoModelForSequenceClassification
from   transformers import RobertaTokenizer, RobertaForSequenceClassification, XLMRobertaModel
import cudf, cuml, cupy
from   cuml.feature_extraction.text import TfidfVectorizer
from   cuml.neighbors import NearestNeighbors

def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column `col`.

    The returned function takes a row that exposes the ground-truth ids as
    `row.target` and the predicted ids as `row[col]`, and returns
    2 * |target ∩ prediction| / (|target| + |prediction|).
    """
    def f1score(row):
        truth     = row.target
        predicted = row[col]
        n_common  = np.intersect1d(truth, predicted).size
        return 2 * n_common / (len(truth) + len(predicted))
    return f1score

# Decide between CV mode (score on train) and submission mode (predict on test).
test = pd.read_csv (DATA_PATH + 'test.csv')
# A test.csv with at most 3 rows keeps CV mode on; a longer file turns it off.
COMPUTE_CV = not (len (test) > 3)
if COMPUTE_CV:
    print('this submission notebook will compute CV score, but commit notebook will not')

In [None]:
# Debug toggle: set COMPUTE_CV = False here to exercise the submission path by hand.

if not COMPUTE_CV:
    # Submission mode: the frame called `train` actually holds the test rows (no labels).
    train = pd.read_csv(DATA_PATH + 'test.csv')
    train['image'] = DATA_PATH + 'test_images/' + train['image']
else:
    train = pd.read_csv(DATA_PATH + 'train.csv')
    train['image'] = DATA_PATH + 'train_images/' + train['image']
    # Ground truth: every row gets the posting ids that share its label_group.
    tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
    train['target'] = train['label_group'].map(tmp)

print('train shape is', train.shape )
train.head()

In [None]:
# Run the models on the GPU when PyTorch can see one, otherwise on the CPU.
# (The disabled `test_df` loading snippet that used to follow was a bare string literal;
#  as the cell's last expression it was echoed as the cell output, so it has been removed.)
DEVICE = torch.device ('cuda' if torch.cuda.is_available () else 'cpu')

In [None]:
# cuDF (GPU) copy of the CSV selected by COMPUTE_CV; the TF-IDF step reads its `title` column.
csv_name = 'train.csv' if COMPUTE_CV else 'test.csv'
train_gf = cudf.read_csv(DATA_PATH + csv_name)

# Text BERT

# Local directory of the pretrained model; both the tokenizer and the encoder are loaded from it.
bert_model_name = '../input/xlm-roberta-large'
# Every title is truncated/padded to this many tokens by encode().
max_len         = 128
tokenizer       = AutoTokenizer.from_pretrained (bert_model_name)
# Checkpoint path; in this notebook it is only referenced by the commented-out load_state_dict call.
TXT_MODEL_PATH  = "../input/shopee-pytorch-xlmroberta-doubles-relativebinclf/xlm-roberta-large_128_fold0_min_val_loss.pth"

def encode (premise):
    """Tokenize one title into fixed-length tensors for the text model.

    Relies on the module-level `tokenizer` and `max_len`. The text is truncated
    and padded to `max_len` tokens and returned as PyTorch tensors; every tensor
    is then squeezed so that a single title yields 1-D fields
    (e.g. shape [1, max_len] becomes [max_len]).

    Returns the tokenizer's own mapping object, updated in place.
    """
    tokenizer_options = dict (
        add_special_tokens    = True,          # wrap the text in the model's special tokens
        truncation            = True,          # max_length alone does not truncate
        max_length            = max_len,       # pad & truncate to this length
        padding               = 'max_length',
        return_attention_mask = True,          # also build the attention mask
        return_tensors        = 'pt',          # PyTorch tensors
    )
    features = tokenizer (premise, **tokenizer_options)

    # A dataset sample must be 1-D per field, so drop the size-1 batch dimension.
    for field, tensor in list (features.items ()):
        features[field] = tensor.squeeze ()
    return features

class TextDataset (Dataset):
    """Dataset that yields the encoded `title` of each row of a DataFrame.

    `df` must have a `title` column. Item `i` is `encode(title)` for the row at
    *position* `i`, so the frame may carry any index (filtered, shuffled, ...).
    """

    def __init__(self, df):
        self.df = df

    def __getitem__(self, index):
        # Positional lookup: samplers hand out positions 0..len-1, and those only
        # coincide with index labels when the frame has a default RangeIndex.
        title = self.df['title'].iloc[index]
        return encode (title)

    def __len__(self):
        return self.df.shape[0]


class MyGAPModelForSeqClf (nn.Module):
    """Transformer encoder whose output is the average of its last hidden states.

    Only the base encoder of the sequence-classification model is kept.
    `outputCount`, `drop_prob` and `nonlin` are accepted for signature
    compatibility but are not used: no dense/batch-norm/dropout head is built.
    """

    def __init__(self, bert_model_name=bert_model_name, outputCount=1, drop_prob=0.2, nonlin=nn.SiLU ()):
        super ().__init__()
        # .base_model keeps the encoder and leaves the classification head behind.
        classifier = AutoModelForSequenceClassification.from_pretrained (bert_model_name)
        self.model = classifier.base_model

    def _set_trainable (self, flag):
        """Set requires_grad on every encoder parameter."""
        for weight in self.model.base_model.parameters ():
            weight.requires_grad = flag

    def freeze (self):
        """Stop gradients for all encoder parameters."""
        self._set_trainable (False)

    def unfreeze (self):
        """Re-enable gradients for all encoder parameters."""
        self._set_trainable (True)

    def forward (self, input_ids, attention_mask, token_type_ids=None, labels=None, **kwargs):
        """Return one embedding per sample: the mean over dim 1 of the encoder's first output.

        `labels` and any extra keyword arguments are ignored (the base encoder
        does not take labels). `token_type_ids` is forwarded only when given.
        """
        encoder_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if token_type_ids is not None:
            encoder_inputs['token_type_ids'] = token_type_ids

        # Output [0] holds the last hidden states: batch_size * max_seq_len * emb_dim.
        token_states = self.model (**encoder_inputs)[0]

        # Global average pooling over the sequence axis -> batch_size * emb_dim.
        # (Alternative kept for reference: the first-token embedding, token_states[:, 0, :].)
        return token_states.mean (dim=1)
    
text_model = MyGAPModelForSeqClf (bert_model_name)
# The checkpoint stored at TXT_MODEL_PATH was trained with this same model code.
# Uncomment these four lines to load it instead of the plain pretrained weights:
# try:
#     text_model.load_state_dict (torch.load (TXT_MODEL_PATH)['model_state_dict'])
# except:
#     text_model.load_state_dict (torch.load (TXT_MODEL_PATH, map_location='cpu')['model_state_dict'])

import warnings
warnings.filterwarnings ("ignore")
text_model.to (DEVICE)

text_dataset = TextDataset (train)
text_dl = DataLoader (text_dataset, batch_size=10, num_workers=4)

# Embed the titles batch by batch; gradients are never needed here.
batch_embeddings = []
with torch.no_grad ():
    for batch in text_dl:
        on_device = {name: tensor.to (DEVICE) for name, tensor in batch.items ()}
        batch_embeddings.append (text_model (**on_device).cpu ().numpy ())

# One row per title, normalised with sklearn's default (L2) norm.
text_embeddings = normalize (np.vstack (batch_embeddings))

# Store data (serialize)
with open ('text_embeddings.np', 'wb') as handle:
    pickle.dump (text_embeddings, handle)

# Release the model before the next stage.
del text_model
gc.collect ()
torch.cuda.empty_cache ()

!ls ../input/shopee-img-txt-knn-inf-v2/text_embeddings.np

from scipy.spatial.distance import cdist

# Pickled text-embedding files, tried in order: the published copy first (the file this
# cell has always read), then the file written by the embedding step of this session.
TEXT_EMBEDDING_PATHS = [
    '../input/k/sapthrishi007/shopee-img-txt-knn-inf-v2/text_embeddings.np',
    'text_embeddings.np',
]

def _load_text_embeddings (paths, n_rows):
    """Return the first pickled embedding matrix in `paths` that has `n_rows` rows.

    Missing files and files with a different row count are skipped; the latter
    would pair embeddings with the wrong postings (e.g. train embeddings while
    `train` holds the test rows). Raises ValueError when no file qualifies.
    """
    for path in paths:
        try:
            # NOTE(security): unpickling can execute arbitrary code; only read trusted files.
            with open (path, 'rb') as handle:
                candidate = pickle.load (handle)
        except FileNotFoundError:
            continue
        if candidate.shape[0] == n_rows:
            return candidate
        print (f'Ignoring {path}: it has {candidate.shape[0]} rows but {n_rows} are needed')
    raise ValueError (f'No text embedding file with {n_rows} rows found in {paths}')

# Load data (deserialize)
text_embeddings = cupy.array (_load_text_embeddings (TEXT_EMBEDDING_PATHS, len (train)))
gc.collect ()
for thresh in [0.9993]:

    preds = []
    CHUNK = 1024*4
    print('Finding similar titles...')

    # Compare every row against the whole matrix, CHUNK query rows at a time.
    for a in range (0, len (train), CHUNK):
        b = min (a + CHUNK, len (train))
        print('chunk',a,'to',b)

        # Dot products; these are cosine similarities if the rows are L2-normalised
        # (the embedding step normalises before saving - confirm for the published file).
        cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,] > thresh)[0]
            o = train.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

    train['oof_bert'] = preds
    if COMPUTE_CV:
        train['f1'] = train.apply (getMetric ('oof_bert'), axis=1)
        print(f'CV score for bert-gap thresh {thresh} baseline =', train.f1.mean ())

del text_embeddings; gc.collect ()

## CV score for bert-gap thresh 0.9993 baseline = 0.5149385335423771

# Text TFIDF

In [None]:
# title TFIDF

# Binary TF-IDF over the titles with the cuML vectorizer, converted to a dense matrix
# so the similarity below is a plain matrix product.
model = TfidfVectorizer (stop_words=None, binary=True, max_features=25000)
text_embeddings = model.fit_transform (train_gf.title).toarray()
print ('text embeddings shape',text_embeddings.shape)

preds = []
CHUNK = 1024*4
n_rows = len(train)

print('Finding similar titles...')
# Walk over the rows CHUNK at a time; each chunk is compared against every title.
for a in range(0, n_rows, CHUNK):
    b = min(a + CHUNK, n_rows)
    print('chunk',a,'to',b)

    # Similarity of each chunk row to all rows (one row of `cts` per chunk row).
    cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

    for k in range(b - a):
        # Keep the postings whose similarity to this row exceeds 0.75.
        IDX = cupy.where(cts[k,]>0.75)[0]
        preds.append(train.iloc[cupy.asnumpy(IDX)].posting_id.values)
del model, text_embeddings

train['oof_text'] = preds

if COMPUTE_CV:
    tfidf_f1 = train.apply(getMetric('oof_text'), axis=1)
    train['f1'] = tfidf_f1
    print('CV score for tfidf baseline =', tfidf_f1.mean())

> # image hash 

In [None]:
# Every row is matched with all postings that carry exactly the same perceptual hash.
tmp = train.groupby('image_phash')['posting_id'].unique().to_dict()
train['oof_hash'] = train['image_phash'].map(tmp)

In [None]:
# Score the exact-phash matches; needs the `target` column, which only exists in CV mode.
if COMPUTE_CV:
    phash_f1 = train.apply(getMetric('oof_hash'), axis=1)
    train['f1'] = phash_f1
    print('CV score for phash baseline =', phash_f1.mean())
# imagehash

# image Siamese

In [None]:
!ls ../input/shopee-siamese-effnetb6-img224/efficientnet_b6_224_fold0_min_val_loss.pth
IMG_MODEL_PATH = "../input/shopee-siamese-effnetb6-img224/efficientnet_b6_224_fold0_min_val_loss.pth"

In [None]:
import sys
sys.path.append ('/kaggle/input/timm-pytorch-image-models/pytorch-image-models-master')
import timm
import albumentations as A
from   albumentations.pytorch import ToTensorV2

# Side length, in pixels, that every image is resized to before entering the network.
IMG_SIZE = 224
# Deterministic inference-time preprocessing (no augmentation): resize to a square,
# normalise with the library's default statistics, then convert to a tensor.
valid_transforms = A.Compose ([
    A.Resize (IMG_SIZE, IMG_SIZE),
    A.Normalize (),
    ToTensorV2 (p=1.0),
])

class SiameseImageDataset (Dataset):
    """Dataset that reads image files with OpenCV and applies a transform.

    :param img_paths: indexable collection of image file paths
    :param transform: callable invoked as `transform(image=array)`; its
        `'image'` entry is returned as the sample

    NOTE(review): cv2.imread yields BGR channel order and the array is passed
    on unchanged - confirm this matches what the model saw during training.
    """

    def __init__(self, img_paths, transform=valid_transforms):
        self.img_paths = img_paths
        self.transform = transform

    def __getitem__(self, index):
        img_path = self.img_paths[index]
        img = cv2.imread (img_path)
        if img is None:
            # cv2.imread does not raise on a missing or unreadable file, it returns
            # None; fail here with the path instead of deep inside the transform.
            raise FileNotFoundError (f'Could not read image: {img_path}')
        return self.transform (image=img)['image']

    def __len__(self):
        return len (self.img_paths)
    
class ShopeeNet (nn.Module):
    """timm backbone -> global average pooling -> optional dropout/fc/bn neck -> final linear layer."""

    def __init__(self,
                 n_classes=512,
                 model_name='efficientnet_b6',
                 use_fc=False,
                 fc_dim=0,
                 dropout=0.1,
                 pretrained=False):
        """
        :param n_classes: output size of the final linear layer
        :param model_name: architecture name handed to timm.create_model
        :param use_fc: insert a dropout -> linear -> batch-norm neck before the final layer
        :param fc_dim: width of that neck (only used when use_fc is true)
        :param dropout: dropout probability of the neck
        :param pretrained: forwarded to timm.create_model
        """
        super ().__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        backbone = timm.create_model (model_name, pretrained=pretrained)
        feature_dim = backbone.classifier.in_features
        # Disable the backbone's own classifier and pooling; pooling is done by self.pooling.
        backbone.classifier  = nn.Identity()
        backbone.global_pool = nn.Identity()
        self.backbone = backbone

        self.pooling = nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc
        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(feature_dim, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            feature_dim = fc_dim

        self.final = nn.Linear (feature_dim, n_classes)

    def _head_modules (self):
        """Modules that stay trainable when the network is frozen."""
        head = [self.final]
        if self.use_fc:
            head += [self.fc, self.bn]
        return head

    def freeze (self):
        """Disable gradients everywhere except the final layer and, if present, fc and bn."""
        for weight in self.parameters ():
            weight.requires_grad = False
        for module in self._head_modules ():
            for weight in module.parameters ():
                weight.requires_grad = True

    def unfreeze (self):
        """Enable gradients for every parameter of the network."""
        for weight in self.parameters ():
            weight.requires_grad = True

    def _init_params (self):
        """Xavier-normal fc weight, zero fc bias, and unit-weight / zero-bias batch norm."""
        nn.init.xavier_normal_(self.fc.weight)
        for tensor, value in ((self.fc.bias, 0), (self.bn.weight, 1), (self.bn.bias, 0)):
            nn.init.constant_(tensor, value)

    def forward(self, x):
        """Return the final linear layer applied to the extracted features."""
        return self.final (self.extract_feat (x))

    def extract_feat (self, x):
        """Backbone feature map, average-pooled and flattened to (batch, features), then the optional neck."""
        pooled = self.pooling (self.backbone (x))
        feats = pooled.view (x.shape[0], -1)
        if self.use_fc:
            feats = self.bn (self.fc (self.dropout (feats)))
        return feats
    
class TripletModel (nn.Module):
    """Run one shared embedding network over the three inputs of a triplet."""

    def __init__(self, embeddingModel):
        super ().__init__()
        self.embeddingModel = embeddingModel

    def forward (self, i1, i2, i3):
        """Return the embeddings of i1, i2 and i3 (in that order) as a tuple."""
        embed = self.embeddingModel
        return embed (i1), embed (i2), embed (i3)

    def freeze (self):
        """Delegate to the embedding network's freeze()."""
        self.embeddingModel.freeze ()

    def unfreeze (self):
        """Delegate to the embedding network's unfreeze()."""
        self.embeddingModel.unfreeze ()
    
# The checkpoint holds a TripletModel state dict, so rebuild that wrapper, load into it,
# and keep only the inner embedding network for inference.
triplet_wrapper = TripletModel (ShopeeNet ())
triplet_wrapper.load_state_dict (
    torch.load (IMG_MODEL_PATH, map_location=torch.device ('cpu'))['model_state_dict']
)
imgmodel = triplet_wrapper.embeddingModel
del triplet_wrapper

In [None]:
# Images are served in frame order (no shuffling) so embedding row i belongs to train row i.
imagedataset = SiameseImageDataset (train['image'].values, valid_transforms)
imageloader = DataLoader (imagedataset, batch_size=256, shuffle=False, num_workers=8)

In [None]:
# Prefer the GPU, but fall back to the CPU instead of failing when CUDA is unavailable.
DEVICE = torch.device ('cuda' if torch.cuda.is_available () else 'cpu')

imgmodel = imgmodel.to (DEVICE)
# Switch to inference mode. A freshly built nn.Module is in training mode, where
# BatchNorm normalises with the statistics of the current batch (and keeps updating
# its running averages), so an image's embedding would depend on its batch mates.
imgmodel.eval ()

# One (batch, features) array per loader batch, in loader order.
imagefeat = []
with torch.no_grad():
    for data in tqdm_notebook(imageloader):
        feat = imgmodel(data.to(DEVICE))
        feat = feat.reshape(feat.shape[0], feat.shape[1])
        imagefeat.append(feat.cpu().numpy())

In [None]:
# Original notes: an L2 norm would keep all similarities within 0-1; the matrix is
# about 38k x 512 ("3.8w * 512"); "normalisation".
# The normalisation itself is disabled (commented line below), so the neighbour
# search runs on the raw embeddings.
imagefeat = np.vstack (imagefeat)
# imagefeat = normalize(imagefeat)

In [None]:
gc.collect ()

print('Finding similar images...')

# Find nearest neighbours using the Siamese image embeddings.
# n_neighbors is capped so a frame with fewer than 50 rows does not break the query.
model = NearestNeighbors (n_neighbors = min (50, imagefeat.shape[0]))
model.fit (imagefeat)
distances, indices = model.kneighbors (imagefeat)

def _image_matches (threshold):
    """Return, for every row, the posting ids of its neighbours closer than `threshold`."""
    matches = []
    for k in range (imagefeat.shape[0]):
        idx = np.where (distances[k,] < threshold)[0]
        ids = indices[k,idx]
        matches.append (train.iloc[cupy.asnumpy(ids)].posting_id.values)
    return matches

thresholds = list (np.arange (1, 10, 1))  # 0.04
# Without labels there is nothing to tune on, so fall back to the last swept value
# (the threshold `preds` used to end up with). Replace it with the CV-tuned value.
best_threshold = thresholds[-1]

# The sweep scores against `target`, which only exists in CV mode.
if COMPUTE_CV:
    scores = []
    for threshold in thresholds:
        # Scratch results go to oof_cnn so the text model's oof_bert column is not clobbered.
        train['oof_cnn'] = _image_matches (threshold)
        train['f1'] = train.apply (getMetric ('oof_cnn'), axis=1)
        score = train.f1.mean ()
        print (f'Our f1 score for threshold {threshold} is {score}')
        scores.append (score)
    thresholds_scores = pd.DataFrame ({'thresholds': thresholds, 'scores': scores})
    max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
    best_threshold = max_score['thresholds'].values[0]
    best_score = max_score['scores'].values[0]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

# `preds` is what the next cell stores as oof_cnn: build it at the selected threshold
# rather than leaving it at whichever threshold the sweep happened to finish on.
preds = _image_matches (best_threshold)

In [None]:
# Store the image-neighbour matches left in `preds` by the previous cell.
train['oof_cnn'] = preds

if COMPUTE_CV:
    cnn_f1 = train.apply(getMetric('oof_cnn'), axis=1)
    train['f1'] = cnn_f1
    print('CV score for cnn baseline =', cnn_f1.mean())

In [None]:
def combine_for_sub(row):
    """Merge a row's text and image matches into the submission string.

    Returns the sorted, de-duplicated posting ids of `row.oof_text` and
    `row.oof_cnn`, joined by single spaces (`oof_hash` is not included).
    """
    merged = np.concatenate((row.oof_text, row.oof_cnn))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_for_cv(row):
    """Merge a row's text and image matches for scoring.

    Returns the sorted, de-duplicated posting ids of `row.oof_text` and
    `row.oof_cnn` as an array (`oof_hash` is not included).
    """
    merged = np.concatenate((row.oof_text, row.oof_cnn))
    return np.unique(merged)

In [None]:
if COMPUTE_CV:
    # Rebuild the ground truth so this cell does not depend on an earlier `target` column.
    tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
    train['target'] = train.label_group.map(tmp)
    train['oof'] = train.apply(combine_for_cv,axis=1)
    train['f1'] = train.apply(getMetric('oof'),axis=1)
    # combine_for_cv merges oof_text and oof_cnn only (oof_hash is commented out there),
    # so the label must not advertise phash.
    print('CV tfidf+cnn Score =', train.f1.mean() )

# Space-separated match list per posting, as written to the submission file.
train['matches'] = train.apply(combine_for_sub,axis=1)

In [None]:
# Write the two submission columns, then read the file back to inspect what was saved.
submission_columns = ['posting_id', 'matches']
train[submission_columns].to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
sub.head()