In [None]:
# Root folder of the competition data; the cells below read test.csv / train.csv
# and the train_images/ and test_images/ folders relative to it.
# Alternative root (disabled):
# DATA_PATH = '../input/'
DATA_PATH = '../input/shopee-product-matching/'

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import normalize
import math
import torch

# import cudf, cuml, cupy
# from cuml.feature_extraction.text import TfidfVectorizer
# from cuml.neighbors import NearestNeighbors


# Evaluation metric: per-row precision, recall and F1.
def getMetric(col):
    """Build a row-wise scorer comparing ``row[col]`` against ``row.target``.

    Args:
        col: name of the column holding the predicted ids for each row.

    Returns:
        A function ``f1score(row)`` that returns ``(precision, recall, f1)``.
        ``row`` must expose ``row.target`` and ``row[col]`` as sized
        collections of ids. Each of the three values falls back to 0 when its
        denominator would be zero, so an empty prediction together with an
        empty target scores ``(0, 0, 0)`` instead of raising
        ZeroDivisionError.
    """
    def f1score(row):
        target = row.target
        predicted = row[col]
        # intersect1d de-duplicates, so n counts distinct ids present in both.
        n = len(np.intersect1d(target, predicted))

        p = n / len(predicted) if len(predicted) else 0
        r = n / len(target) if len(target) else 0

        # F1 written as 2*TP / (|target| + |predicted|); guard the empty/empty case.
        total = len(target) + len(predicted)
        f1 = 2 * n / total if total else 0
        return p, r, f1
    return f1score

In [None]:
from contextlib import contextmanager
import os, sys, time, psutil

# Report the process memory and the wall-clock time used by the wrapped code.
@contextmanager
def timer_memory(name):
    """Context manager that prints resident memory and elapsed time on exit.

    Args:
        name: label used in the timing line (``'<name> done in <N>s'``).

    The report is emitted from a ``finally`` clause, so it is printed even
    when the wrapped block raises; the exception then propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Resident set size of the current process, converted from bytes to GiB.
        rss_gb = psutil.Process(os.getpid()).memory_info().rss / 2**30
        print(f'Memory: {rss_gb:.02f}GB')
        print(f'{name} done in {time.time()-t0:.0f}s')

In [None]:
# Run mode: score against the labelled train set (CV) unless a larger test.csv is present.
COMPUTE_CV = True
device = 'cuda'  # NOTE(review): not referenced by the cells visible here

test = pd.read_csv(DATA_PATH + 'test.csv')
if len(test) > 3:
    # Presumably the public sample test.csv has at most 3 rows, so a longer
    # file means the real submission run - TODO confirm.
    COMPUTE_CV = False
else:
    print('this submission notebook will compute CV score, but commit notebook will not')

# COMPUTE_CV = False
with timer_memory('Reading CSV'):
    if not COMPUTE_CV:
        # Submission run: the working frame (still named `train`) is the test set.
        train = pd.read_csv(DATA_PATH + 'test.csv')
        train['image'] = DATA_PATH + 'test_images/' + train['image']
    else:
        train = pd.read_csv(DATA_PATH + 'train.csv')
        train['image'] = DATA_PATH + 'train_images/' + train['image']
        # Ground truth per row: every posting_id sharing the row's label_group.
        tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
        train['target'] = train['label_group'].map(tmp)

print('train shape is', train.shape )
train.head()

# image hash

In [None]:
# Baseline matcher: every posting sharing the same image_phash value is a match.
tmp = train.groupby('image_phash')['posting_id'].unique().to_dict()
train['oof_hash'] = train['image_phash'].map(tmp)

In [None]:
# Mean precision / recall / F1 of the phash matches (only when labels are available).
if COMPUTE_CV:
    train['cv_score'] = train.apply(getMetric('oof_hash'), axis=1)
    # Each cv_score entry is a (precision, recall, f1) tuple; report the mean of each slot.
    for pos, label in enumerate(('P', 'R', 'F1')):
        print(f'{label} score for baseline =', train['cv_score'].apply(lambda x: x[pos]).mean())

# image CNN

In [None]:
from PIL import Image

import torch
# Seed torch's global RNG.
torch.manual_seed(0)
# cuDNN determinism is switched off and benchmark mode on; presumably this trades
# bit-exact run-to-run reproducibility for speed - TODO confirm this is intended.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

class SHOPEEDataset(Dataset):
    """Torch dataset that loads the images listed in a DataFrame.

    Args:
        df: frame with an ``image`` column of file paths. A ``label_group``
            column is additionally read when ``mode`` is not ``'test'``.
        mode: ``'test'`` makes ``__getitem__`` return only the image tensor;
            any other value returns ``(image, label)``.
        transform: optional callable invoked as ``transform(image=img)`` that
            returns a mapping with an ``'image'`` entry (the call style used
            by the albumentations pipeline elsewhere in this notebook).
    """

    def __init__(self, df, mode, transform=None):
        # Positional index 0..n-1 so that __getitem__ can look rows up by `index`.
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the float image tensor in (C, H, W) layout, plus the label unless in test mode.

        Raises:
            OSError: if ``cv2.imread`` cannot load the file at ``row.image``.
        """
        row = self.df.loc[index]
        img = cv2.imread(row.image)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than inside cvtColor.
            raise OSError(f'cv2.imread could not load image (missing or unreadable): {row.image}')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            res = self.transform(image=img)
            img = res['image']

        img = img.astype(np.float32)
        # HWC -> CHW
        img = img.transpose(2, 0, 1)

        if self.mode == 'test':
            return torch.tensor(img).float()
        else:
            return torch.tensor(img).float(), torch.tensor(row.label_group).float()

class ArcModule(nn.Module):
    """ArcFace-style additive angular margin head.

    Computes the cosine between each input row and each (L2-normalised) class
    weight, replaces the cosine of the labelled class with ``cos(theta + m)``
    and scales everything by ``s``.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes (rows of ``weight``).
        s: scale applied to the output logits.
        m: angular margin in radians.
    """

    def __init__(self, in_features, out_features, s = 10, m = 0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_normal_(self.weight)

        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Plain tensor attributes (not registered buffers), so the module's
        # state_dict keys stay exactly as before.
        self.th = torch.tensor(math.cos(math.pi - m))
        self.mm = torch.tensor(math.sin(math.pi - m) * m)

    def forward(self, inputs, labels):
        """Return scaled logits of shape ``(batch, out_features)``.

        Args:
            inputs: ``(batch, in_features)`` embeddings. Only the weights are
                normalised here - presumably the caller passes L2-normalised
                rows; verify against caller.
            labels: class indices, shape ``(batch,)`` or ``(batch, 1)``; any
                numeric dtype or device, cast to int64 on the logits' device.
        """
        cos_th = F.linear(inputs, F.normalize(self.weight))
        cos_th = cos_th.clamp(-1, 1)
        sin_th = torch.sqrt(1.0 - torch.pow(cos_th, 2))
        # cos(theta + m) via the angle-addition identity.
        cos_th_m = cos_th * self.cos_m - sin_th * self.sin_m
        # Where theta + m would pass pi (cos_th <= th), fall back to the linear
        # penalty cos_th - mm. This single torch.where covers every element.
        cos_th_m = torch.where(cos_th > self.th, cos_th_m, cos_th - self.mm)

        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        # Build the one-hot mask on the logits' own device instead of assuming CUDA.
        labels = labels.to(device=cos_th.device, dtype=torch.long)
        onehot = torch.zeros_like(cos_th)
        onehot.scatter_(1, labels, 1.0)

        # Margin-adjusted cosine for the labelled class, plain cosine elsewhere.
        outputs = onehot * cos_th_m + (1.0 - onehot) * cos_th
        outputs = outputs * self.s
        return outputs
    
    
class SHOPEEDenseNet(nn.Module):
    """Image embedding network: CNN backbone -> dropout -> linear -> L2 normalise.

    Despite the class name and the default argument value, the only backbone
    implemented is ``'resnet101'``.

    Args:
        channel_size: dimensionality of the output embedding.
        out_feature: number of classes for the ``ArcModule`` margin head.
        dropout: probability passed to ``nn.Dropout2d``.
        backbone: must be ``'resnet101'``.
        pretrained: accepted for interface compatibility but not used - the
            backbone is always built with ``models.resnet101(False)``.

    Raises:
        ValueError: if ``backbone`` is not ``'resnet101'`` (previously this
            surfaced as an AttributeError on ``self.backbone``).
    """

    def __init__(self, channel_size, out_feature, dropout=0.5, backbone='densenet121', pretrained=True):
        super(SHOPEEDenseNet, self).__init__()
        if backbone != 'resnet101':
            raise ValueError(
                f"unsupported backbone {backbone!r}; only 'resnet101' is implemented"
            )
        self.channel_size = channel_size
        self.out_feature = out_feature

        resnet = models.resnet101(False)
        self.in_features = resnet.fc.in_features
        # Drop the last two children of the torchvision model, keeping the rest as the feature extractor.
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])
        # The 7 * 7 factor fixes the spatial size fc1 expects from the backbone
        # (presumably what a 224x224 input yields - TODO confirm).
        self.fc1 = nn.Linear(self.in_features * 7 * 7 , self.channel_size)
        print(self.backbone)

        self.margin = ArcModule(in_features=self.channel_size, out_features = self.out_feature)
        # bn1 / bn2 are not used in forward(); they are kept so the module's
        # parameter names stay unchanged for load_state_dict.
        self.bn1 = nn.BatchNorm2d(self.in_features)
        self.dropout = nn.Dropout2d(dropout)
        self.bn2 = nn.BatchNorm1d(self.channel_size)

    def forward(self, x, labels=None):
        """Return L2-normalised embeddings, or margin logits when ``labels`` is given."""
        features = self.backbone(x)
        features = self.dropout(features)
        features = features.view(features.size(0), -1)
        features = self.fc1(features)
        features = F.normalize(features)
        if labels is not None:
            return self.margin(features, labels)
        return features

    def test(self):
        """Smoke test: print the embedding of one random 224x224 image."""
        # Follow the device the model lives on instead of assuming CUDA.
        device = next(self.parameters()).device
        x = torch.rand(1, 3, 224, 224).to(device)
        print(self.forward(x))

In [None]:
# Build the embedding network and restore the trained weights.
# NOTE: the checkpoint file name says "densenet", but it is loaded into the resnet101 variant.
CHECKPOINT = '../input/shopee-models/baseline_fold0_densenet_224_epoch50.pth'
model = SHOPEEDenseNet(channel_size=512, out_feature=11014, backbone='resnet101')
model.load_state_dict(torch.load(CHECKPOINT))
model.to('cuda')

In [None]:
def generate_test_features(test_loader, net=None):
    """Run a network over a loader in eval mode and stack its outputs.

    Args:
        test_loader: iterable yielding image batches (tensors).
        net: network to run. Defaults to the module-level ``model``. Pass it
            explicitly when ``model`` may have been rebound - a later cell in
            this notebook reuses that name for a TfidfVectorizer.

    Returns:
        numpy array of all batch outputs concatenated along dim 0.

    Note:
        ``torch.cat`` is called on the collected batches, so an empty loader
        fails there.
    """
    net = model if net is None else net
    net.eval()

    # Send batches to wherever the network lives rather than assuming 'cuda'.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    feats = []
    with torch.no_grad():
        for images in tqdm_notebook(test_loader):
            features = net(images.to(device))
            feats.append(features.cpu())
    return torch.cat(feats).numpy()

In [None]:
# !mkdir -p /root/.cache/torch/hub/checkpoints/
# !cp ../input/pretrained-pytorch-models/resnet18-5c106cde.pth /root/.cache/torch/hub/checkpoints/

In [None]:
import albumentations
# Inference-time preprocessing: resize to 224x224, then albumentations' default Normalize.
transforms_valid = albumentations.Compose(
    [albumentations.Resize(224, 224), albumentations.Normalize()]
)

# 'test' mode makes the dataset yield images only (no labels).
dataset_test = SHOPEEDataset(df=train, mode='test', transform=transforms_valid)
test_loader = torch.utils.data.DataLoader(
    dataset_test,
    batch_size=16,
    shuffle=False,  # keep row order so features line up with `train`
    num_workers=2,
    pin_memory=True,
)

# One embedding row per row of `train`, converted back to a torch tensor.
imagefeat = torch.tensor(generate_test_features(test_loader))

In [None]:
# Move the image embeddings onto the GPU for the similarity search that follows.
imagefeat = imagefeat.to('cuda')

In [None]:
print('Finding similar images...')

preds = []
preds_index = []
CHUNK = 1024*4

# Number of chunks needed to cover every row (ceiling division).
CTS = (len(imagefeat) + CHUNK - 1) // CHUNK

for j in range(CTS):
    a = j * CHUNK
    b = min(a + CHUNK, len(imagefeat))
    print('chunk',a,'to',b)

    # Similarity of rows a..b against all rows: shape (b - a, n_rows).
    # The embeddings are L2-normalised in the model's forward, so the dot
    # product is the cosine similarity.
    distances = torch.matmul(imagefeat, imagefeat[a:b].T).T.cpu().numpy()

    for sims in distances:
        # Keep every posting whose similarity to this row exceeds 0.9.
        IDX = np.flatnonzero(sims > 0.9)
        preds.append(train.iloc[IDX].posting_id.values)
        preds_index.append(IDX)
        
# del imagefeat, imgmodel

In [None]:
# Store the CNN matches and show the average number of matches per posting.
train['oof_cnn'] = preds
print(train['oof_cnn'].apply(len).mean())

if COMPUTE_CV:
    train['cv_score'] = train.apply(getMetric('oof_cnn'), axis=1)
    # Each cv_score entry is a (precision, recall, f1) tuple; report the mean of each slot.
    for pos, label in enumerate(('P', 'R', 'F1')):
        print(f'{label} score for baseline =', train['cv_score'].apply(lambda x: x[pos]).mean())

# title TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Binary TF-IDF over the raw titles, densified for the matmul-based search below.
# NOTE: this rebinds the name `model`, which referred to the CNN until here.
model = TfidfVectorizer(binary=True, stop_words=None, max_features=None)
title_tfidf = model.fit_transform(train['title'])
text_embeddings = title_tfidf.toarray()
print('text embeddings shape',text_embeddings.shape)

In [None]:
# Wrap the dense TF-IDF matrix as a tensor and move it to the GPU.
text_embeddings = torch.from_numpy(text_embeddings).cuda()

In [None]:
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
# Number of chunks needed to cover every row (ceiling division).
CTS = (len(train) + CHUNK - 1) // CHUNK
CTS_index = 0  # running count of rows processed so far

for j in range(CTS):
    a = j * CHUNK
    b = min(a + CHUNK, len(train))
    print('chunk',a,'to',b)

    # Dot product of rows a..b against all rows, used as the cosine similarity
    # (relies on the TF-IDF rows being unit-normalised, TfidfVectorizer's default).
    cts = torch.matmul(text_embeddings, text_embeddings[a:b].T).T.cpu().numpy()
    print(cts.shape)

    for sims in cts:
        # Keep every posting whose title similarity to this row exceeds 0.7.
        IDX = np.flatnonzero(sims > 0.7)
        preds.append(train.iloc[IDX].posting_id.values)
        CTS_index += 1
# del model, text_embeddings

In [None]:
# Store the title matches and, when labels are available, score them.
train['oof_text'] = preds

if COMPUTE_CV:
    train['cv_score'] = train.apply(getMetric('oof_text'), axis=1)
    # Each cv_score entry is a (precision, recall, f1) tuple; report the mean of each slot.
    for pos, label in enumerate(('P', 'R', 'F1')):
        print(f'{label} score for baseline =', train['cv_score'].apply(lambda x: x[pos]).mean())

In [None]:
def combine_for_sub(row):
    """Union of the CNN, phash and title matches of a row as one space-separated string.

    The ids are de-duplicated and sorted by ``np.unique`` before joining.
    """
    sources = ('oof_cnn', 'oof_hash', 'oof_text')
    merged = np.concatenate([row[name] for name in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_for_cv(row):
    """Union of the CNN, phash and title matches of a row as a sorted, de-duplicated array."""
    sources = ('oof_cnn', 'oof_hash', 'oof_text')
    merged = np.concatenate([row[name] for name in sources])
    return np.unique(merged)

In [None]:
# Score the union of all three matchers (CV runs only), then build the submission column.
if COMPUTE_CV:
    # Rebuild the ground truth: every posting_id sharing the row's label_group.
    tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
    train['target'] = train['label_group'].map(tmp)
    train['oof'] = train.apply(combine_for_cv, axis=1)

    train['cv_score'] = train.apply(getMetric('oof'), axis=1)
    # Each cv_score entry is a (precision, recall, f1) tuple; report the mean of each slot.
    for pos, label in enumerate(('P', 'R', 'F1')):
        print(f'{label} score for baseline =', train['cv_score'].apply(lambda x: x[pos]).mean())

# Computed in both modes: the space-separated match list written to the submission.
train['matches'] = train.apply(combine_for_sub, axis=1)

In [None]:
# Write the two submission columns, then read the file back as a sanity check.
submission_columns = ['posting_id', 'matches']
train[submission_columns].to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
sub.head()