In [None]:
# 要添加一个新单元，输入 '# %%'
# 要添加一个新的标记单元，输入 '# %% [markdown]'

## 环境配置

In [None]:

# Optional directory of extra libraries that is put at the front of the import
# path; leave empty to disable.
EXT_PATH=r''
# EXT_PATH=r'/home/aistudio/external-libraries'

# Root directory of the competition data (CSV files and image folders).
DATA_PATH=r'../input/shopee-product-matching/'

# Backbone name understood by get_model() and the checkpoint it is loaded from.
CNN_MODEL='resnet34'
CNN_MODEL_PATH='../input/shopee-models/shopee34_119.pth'

# Similarity above which two image embeddings are reported as a match, and the
# number of rows compared per similarity-matrix chunk.
IMG_DIST_THRESHOLD = 0.7
IMG_CHUNK = 1024*4

# Worker processes used by the image DataLoader.
NUM_WORKERS=2

import os
import sys
if EXT_PATH:
    # Use sys.path directly rather than reaching it through the undocumented
    # `os.sys` attribute.
    sys.path.insert(0, EXT_PATH)

---
## import

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm
# from tqdm import tqdm
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

---
## 配置

In [None]:
# Whether to compute the cross-validation (CV) score; may be switched off below.
COMPUTE_CV = True
HAS_CUDA = torch.cuda.is_available()
DEVICE = 'cuda' if HAS_CUDA else 'cpu'
print(
    'this submission notebook will compute CV score, but commit notebook will not'
    if COMPUTE_CV else
    'this submission notebook will not compute CV score'
)

test = pd.read_csv(DATA_PATH + 'test.csv')
# A test file with more than 3 rows disables CV for this run.
COMPUTE_CV = COMPUTE_CV and len(test) <= 3

print('COMPUTE_CV:', COMPUTE_CV)

---
## utils

In [None]:
def getMetric(col_name):
    """Build a row-wise F1 scorer for one prediction column.

    Args:
        col_name: name of the row entry holding the predicted ids.

    Returns:
        A function that takes a row exposing ``target`` and ``col_name`` and
        returns the F1 score of the prediction against the target.
    """
    def f1score(row):
        actual = row.target
        predicted = row[col_name]
        # F1 = 2*TP / (2*TP + FN + FP) = 2*TP / (len(target) + len(predict))
        true_positives = len(np.intersect1d(actual, predicted))
        return 2*true_positives / (len(actual) + len(predicted))
    return f1score

---
## 加载数据

In [None]:
# Work on the test set for real submissions and on the labelled train set for CV.
if not COMPUTE_CV:
    train = pd.read_csv(DATA_PATH + 'test.csv')
    train['image'] = DATA_PATH + 'test_images/' + train['image']
else:
    train = pd.read_csv(DATA_PATH + 'train.csv')
    train['image'] = DATA_PATH + 'train_images/' + train['image']
    # target: array of every posting_id that shares the row's label_group
    label_group_members = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
    train['target'] = train['label_group'].map(label_group_members)

print('train shape is', train.shape )
train.head()

---
## image hash 特征

In [None]:
# Postings with an identical perceptual hash are predicted to match each other.
phash_members = train.groupby('image_phash')['posting_id'].unique().to_dict()
train['oof_hash'] = train['image_phash'].map(phash_members)

In [None]:
if COMPUTE_CV:
    # Score the hash-only predictions row by row and report the mean F1.
    hash_scorer = getMetric('oof_hash')
    train['f1'] = train.apply(hash_scorer, axis=1)
    print('CV score for baseline =',train['f1'].mean())

---
## image CNN 特征

In [None]:
# Copy the bundled pretrained weights into torch hub's checkpoint directory
# when the bundle directory is present.
import os
import shutil
pretrained_pytorch_models = r'../input/pretrained-pytorch-models/'
if os.path.isdir(pretrained_pytorch_models):
    pretrained_dir = f'{torch.hub.get_dir()}/checkpoints/'
    os.makedirs(pretrained_dir, exist_ok=True)
    weights_file = os.path.join(pretrained_pytorch_models, 'resnet18-5c106cde.pth')
    shutil.copy(weights_file, pretrained_dir)

---
### Models

In [None]:

import torch
# Seed PyTorch's global random number generator.
torch.manual_seed(0)
# Do not force deterministic cuDNN algorithms and let cuDNN benchmark candidate
# convolution kernels. Per PyTorch's reproducibility notes this favours speed
# and may make GPU results vary slightly between runs despite the seed above.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
from torch.nn import functional as F
from torch.nn import DataParallel

import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Channel attention gate built from pooled channel descriptors.

    The input is reduced to one value per channel twice (average pooling and
    max pooling). Both descriptors go through the same two-layer 1x1-conv
    bottleneck, are summed, and are squashed with a sigmoid.

    Args:
        in_planes: number of input (and output) channels.
        ratio: channel reduction factor of the bottleneck; the hidden width is
            ``in_planes // ratio``. The default of 16 keeps the layer shapes
            of existing checkpoints.

    Returns (forward):
        Tensor of shape ``(N, in_planes, 1, 1)`` with values in (0, 1), meant
        to be multiplied onto the input feature map.
    """
    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Honour `ratio` instead of a hard-coded 16 so the argument has an effect.
        hidden_planes = in_planes // ratio
        self.fc1   = nn.Conv2d(in_planes, hidden_planes, 1, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2   = nn.Conv2d(hidden_planes, in_planes, 1, bias=False)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # The same bottleneck (fc1 -> relu -> fc2) is shared by both descriptors.
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))
        out = avg_out + max_out
        return self.sigmoid(out)

class SpatialAttention(nn.Module):
    """Spatial attention gate computed from channel-wise statistics.

    The per-pixel mean and max over the channel axis are stacked into a
    two-channel map, convolved down to one channel and passed through a
    sigmoid.

    Args:
        kernel_size: size of the convolution kernel; must be 3 or 7.

    Returns (forward):
        Tensor of shape ``(N, 1, H, W)`` with values in (0, 1).
    """
    def __init__(self, kernel_size=7):
        super().__init__()

        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        # "Same" padding for the two supported kernel sizes (3 -> 1, 7 -> 3).
        same_padding = kernel_size // 2

        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=same_padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]
        stacked = torch.cat((channel_mean, channel_max), dim=1)
        return self.sigmoid(self.conv1(stacked))


def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)

class BasicBlockShopee(nn.Module):
    """Two-convolution residual block with optional channel/spatial attention.

    Args:
        inplanes: number of input channels.
        planes: number of output channels.
        stride: stride of the first convolution.
        downsample: optional module applied to the input to form the shortcut;
            when ``None`` the input itself is the shortcut.
        use_se: when true, the channel and spatial attention gates are applied
            to the main branch before the shortcut is added. The attention
            modules are created either way.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True):
        super().__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)

        self.use_se = use_se
        self.ca = ChannelAttention(planes)
        self.sa = SpatialAttention()

        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Main branch: conv-bn-relu followed by conv-bn.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        if self.use_se:
            # Re-weight channels first, then spatial positions.
            out = self.ca(out) * out
            out = self.sa(out) * out

        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)

class ResNetShopee(nn.Module):
    """ResNet-style encoder producing a 512-dimensional embedding.

    The stem takes a single-channel image. The final linear layer expects a
    512 x 8 x 8 feature map, which a 128 x 128 input yields after the max-pool
    and the three stride-2 stages.

    Args:
        block: residual block class; must expose ``expansion`` and accept
            ``(inplanes, planes, stride, downsample, use_se=...)``.
        layers: four integers, the number of blocks in each stage.
        use_se: forwarded to every block.
    """
    def __init__(self, block, layers, use_se=True):
        super().__init__()
        self.inplanes = 64
        self.use_se = use_se
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.prelu = nn.PReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.bn4 = nn.BatchNorm2d(512)
        self.dropout = nn.Dropout()
        self.fc5 = nn.Linear(512 * 8 * 8, 512)
        self.bn5 = nn.BatchNorm1d(512)

        # Xavier-normal weights for conv/linear layers, identity batch norm,
        # zero bias for linear layers.
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_normal_(module.weight)
                if isinstance(module, nn.Linear):
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm1d)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks; only the first may change stride/width."""
        out_planes = planes * block.expansion
        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            # 1x1 projection so the shortcut matches the main branch's shape.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        stage = [block(self.inplanes, planes, stride, downsample, use_se=self.use_se)]
        self.inplanes = planes
        stage.extend(
            block(self.inplanes, planes, use_se=self.use_se)
            for _ in range(1, blocks)
        )
        return nn.Sequential(*stage)

    def forward(self, x):
        feature_extractor = (
            self.conv1, self.bn1, self.prelu, self.maxpool,
            self.layer1, self.layer2, self.layer3, self.layer4,
            self.bn4, self.dropout,
        )
        for step in feature_extractor:
            x = step(x)

        # Flatten the feature map to one vector per sample, then project to 512-d.
        x = x.view(x.size(0), -1)
        return self.bn5(self.fc5(x))

def resnet_shopee18(use_se=True, **kwargs):
    """Build a ResNetShopee with the 18-layer stage layout [2, 2, 2, 2]."""
    return ResNetShopee(BasicBlockShopee, [2, 2, 2, 2], use_se=use_se, **kwargs)

def resnet_shopee34(use_se=True, **kwargs):
    """Build a ResNetShopee with the 34-layer stage layout [3, 4, 6, 3]."""
    return ResNetShopee(BasicBlockShopee, [3, 4, 6, 3], use_se=use_se, **kwargs)

def get_model(model, model_path=None, device='cuda', use_se=True) -> nn.Module:
    """Build an image-embedding network and optionally load its checkpoint.

    Args:
        model: backbone name, ``'resnet18'`` or ``'resnet34'``.
        model_path: path of a checkpoint to load with ``torch.load``. When
            ``None`` the network keeps its freshly initialised weights.
        device: device the checkpoint is mapped to and the model is moved to.
        use_se: forwarded to the backbone constructor.

    Returns:
        The backbone wrapped in ``DataParallel`` and moved to ``device``.

    Raises:
        NotImplementedError: if ``model`` is not a supported backbone name.
    """
    if model == 'resnet18':
        net = resnet_shopee18(use_se=use_se)
    elif model == 'resnet34':
        net = resnet_shopee34(use_se=use_se)
    else:
        # `NotImplemented` is a comparison sentinel, not an exception; raising it
        # produces a confusing TypeError instead of the intended error.
        raise NotImplementedError(f'unsupported model: {model!r}')

    # The wrapper is applied before loading, so the checkpoint is loaded into
    # the DataParallel module (presumably its keys carry the 'module.' prefix).
    net = DataParallel(net)
    if model_path is not None:
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        net.load_state_dict(torch.load(model_path, map_location=device))
    return net.to(device)

---
### Datasets

In [None]:
from torch.utils.data.dataset import Dataset
from PIL import Image
import torch
import numpy as np
from torchvision import transforms as T
import pandas as pd

class ShopeeImageDataset(Dataset):
    """Dataset that turns image file paths into normalised greyscale tensors.

    Args:
        imgs: indexable collection of image file paths.
        input_shape: ``(channels, height, width)``; only the trailing
            ``(height, width)`` pair is used, as the resize target.
    """

    def __init__(self, imgs, input_shape=(1, 128, 128)):
        self.input_shape = input_shape
        self.imgs = imgs

        resize_to = self.input_shape[1:]
        self.transforms = T.Compose([
            T.Resize(resize_to),
            T.ToTensor(),
            T.Normalize(mean=[0.5], std=[0.5]),
        ])

    def __getitem__(self, index):
        # Load as single-channel ('L') so the tensor matches the one-channel stem.
        grey_image = Image.open(self.imgs[index]).convert('L')
        return self.transforms(grey_image).float()

    def __len__(self):
        return len(self.imgs)

### 加载数据集

In [None]:
imagedataset = ShopeeImageDataset(train['image'].values)

# Keep the original row order (no shuffling, no dropped tail batch) so that
# embedding i corresponds to row i of `train`.
loader_options = dict(batch_size=10, shuffle=False, num_workers=NUM_WORKERS, drop_last=False)
imageloader = torch.utils.data.DataLoader(imagedataset, **loader_options)

### 加载模型

In [None]:
# Report which backbone/checkpoint is used, then build the model on DEVICE.
for label, value in (('CNN_MODEL:', CNN_MODEL), ('CNN_MODEL_PATH:', CNN_MODEL_PATH)):
    print(label, value)

imgmodel = get_model(CNN_MODEL, os.path.join(CNN_MODEL_PATH), device=DEVICE).to(DEVICE)

### 计算 image CNN 特征

In [None]:
# Inference only: evaluation mode and no gradient tracking.
imgmodel.eval()
imagefeat = []
with torch.no_grad():
    for batch in tqdm(imageloader):
        embeddings = imgmodel(batch.to(DEVICE))
        # Embeddings stay on DEVICE; they are concatenated after the loop.
        imagefeat.append(embeddings.reshape(embeddings.shape[0], embeddings.shape[1]))

# Normalise every embedding with F.normalize's defaults (L2 norm along dim 1).
imagefeat = F.normalize(torch.cat(imagefeat))

print('img embeddings shape',imagefeat.shape)
print('img embeddings device',imagefeat.device)

### 计算 image CNN 预测结果

In [None]:
preds = []
CHUNK = IMG_CHUNK

print('Finding similar images...')
n_images = len(imagefeat)
# Compare CHUNK rows at a time against all embeddings to bound memory use.
for a in range(0, n_images, CHUNK):
    b = min(a + CHUNK, n_images)
    print('chunk',a,'to',b)

    distances = torch.matmul(imagefeat[a:b], imagefeat.T)

    for k in range(b - a):
        row = distances[k]
        # Candidates above the threshold, then at most the 51 most similar of them.
        keep = torch.where(row > IMG_DIST_THRESHOLD)[0]
        best = torch.topk(row[keep], min(len(keep), 51))[1]
        keep = keep[best]

        preds.append(train.iloc[keep.data.cpu().numpy()].posting_id.values)

    # Drop the chunk and the view into it before the next chunk is allocated.
    del distances, row

In [None]:
# The model and embeddings are no longer needed once predictions exist.
del imgmodel, imagefeat
train['oof_cnn'] = preds
if COMPUTE_CV:
    cnn_scorer = getMetric('oof_cnn')
    train['f1'] = train.apply(cnn_scorer, axis=1)
    print('CV score for baseline =',train['f1'].mean())

---
## title TFIDF

In [None]:
# Binary TF-IDF over titles, limited to the 25000 most frequent terms.
# float32 is requested explicitly: the matrix is densified below, and the
# default float64 would need twice the host and device memory.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000, dtype=np.float32)
text_embeddings = model.fit_transform(train.title).toarray()

text_embeddings = torch.from_numpy(text_embeddings).to(DEVICE)
print('text embeddings shape',text_embeddings.shape)
print('text embeddings device',text_embeddings.device)

In [None]:
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
n_titles = len(text_embeddings)
# Compare CHUNK rows at a time against all title vectors to bound memory use.
for a in range(0, n_titles, CHUNK):
    b = min(a + CHUNK, n_titles)
    print('chunk',a,'to',b)

    # Dot products between title vectors (cosine similarity, given that
    # TfidfVectorizer L2-normalises rows by default).
    cts = torch.matmul(text_embeddings[a:b], text_embeddings.T)

    for k in range(b - a):
        row = cts[k]
        # Candidates above 0.7, then at most the 51 most similar of them.
        keep = torch.where(row > 0.7)[0]
        best = torch.topk(row[keep], min(len(keep), 51))[1]
        keep = keep[best]

        preds.append(train.iloc[keep.data.cpu().numpy()].posting_id.values)
    # Drop the chunk and the view into it before the next chunk is allocated.
    del cts, row

In [None]:
# The vectoriser and the dense title matrix are no longer needed.
del model, text_embeddings
train['oof_text'] = preds

if COMPUTE_CV:
    text_scorer = getMetric('oof_text')
    train['f1'] = train.apply(text_scorer, axis=1)
    print('CV score for baseline =',train['f1'].mean())

---
## 组合所有特征

In [None]:
def combine_for_sub(row):
    """Merge a row's text, CNN and hash predictions into a space-separated string.

    The result holds each id once, in the sorted order np.unique returns.
    """
    sources = (row.oof_text, row.oof_cnn, row.oof_hash)
    unique_ids = np.unique(np.concatenate(sources))
    return ' '.join(unique_ids)

def combine_for_cv(row):
    """Merge a row's text, CNN and hash predictions into a sorted array of unique ids."""
    sources = (row.oof_text, row.oof_cnn, row.oof_hash)
    return np.unique(np.concatenate(sources))

In [None]:
if COMPUTE_CV:
    # Rebuild the ground truth, then score the union of all three predictors.
    label_group_members = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
    train['target'] = train['label_group'].map(label_group_members)
    train['oof'] = train.apply(combine_for_cv, axis=1)
    combined_scorer = getMetric('oof')
    train['f1'] = train.apply(combined_scorer, axis=1)
    print('CV Score =', train['f1'].mean() )

# Space-separated match list per posting, as written to the submission file.
train['matches'] = train.apply(combine_for_sub, axis=1)

In [None]:
# Write the submission file, then read it back to preview what was saved.
submission_columns = ['posting_id', 'matches']
train[submission_columns].to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
sub.head()