In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd, gc
import matplotlib.pyplot as plt
import cv2

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision

import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

import cudf, cuml, cupy
from cuml.neighbors import NearestNeighbors

from tqdm import tqdm
# timm.list_models(pretrained=True)

In [None]:
# ---- Notebook configuration ----
# Every competition file hangs off one read-only data root.
DATA_DIR = "../input/shopee-product-matching"
TRAIN_PATH = DATA_DIR + "/train.csv"
TEST_PATH = DATA_DIR + "/test.csv"
# The image folders keep their trailing slash because some cells build a
# path by plain string concatenation (folder + image file name).
TRAIN_IMAGE_PATH = DATA_DIR + "/train_images/"
TEST_IMAGE_PATH = DATA_DIR + "/test_images/"

# Backbone names kept for reference (this bare string literal is a no-op).
'''
efficientnet_b0
efficientnet_b4
efficientnetv2_rw_m
vgg16
swin_large_patch4_window12_384
gluon_resnext101_32x4d
adv_inception_v3
'''
BASE_MODEL = "efficientnet_b0"

# Size handed to cv2.resize for every image.
# Alternatives listed by the author: (384, 384), (320, 320), (256, 256).
DIM = (512, 512)

# Placeholder; the data-loading cell reassigns it from train.csv.
N_CLASS = 11014

BATCH_SIZE = 32

# True  -> score the pipeline on train.csv.
# False -> run on test.csv.
COMPUTE_CV = False

In [None]:
def get_device():
    """Return the torch device to run on.

    Returns ``cuda:0`` when CUDA is available (and, as a side effect, turns
    on ``torch.backends.cudnn.benchmark``); otherwise returns the CPU device.
    """
    if not torch.cuda.is_available():
        # No GPU in this session.
        return torch.device('cpu')
    torch.backends.cudnn.benchmark = True
    return torch.device('cuda:0')


DEVICE = get_device()
print(DEVICE)

In [None]:
# Load the competition metadata tables.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Ground truth for each row: the unique posting_ids that share its label_group.
train_df['target'] = train_df.label_group.map(
    train_df.groupby('label_group').posting_id.agg('unique').to_dict()
)

# Count the label groups once and reuse the value in the summary line.
N_CLASS = train_df['label_group'].nunique()
print("n_trian: {} n_unique: {} n_per_images: {}"
         .format(len(train_df), N_CLASS, len(train_df)/N_CLASS)
         )

# Read one training image; a later cell uses it to probe the transform.
sample_path = TRAIN_IMAGE_PATH + train_df.loc[0, 'image']
sample_img = cv2.imread(sample_path)
if sample_img is None:
    # cv2.imread signals failure by returning None instead of raising;
    # fail here with the path rather than inside cvtColor.
    raise FileNotFoundError("cv2.imread could not read sample image: " + sample_path)
sample_img = cv2.cvtColor(sample_img, cv2.COLOR_BGR2RGB)
# plt.imshow(sample_img); plt.show()  # uncomment to eyeball the sample
train_df.head()

In [None]:
'''Transform only image'''
def train_transforms():
    """Build the image-only albumentations pipeline.

    The pipeline applies ``A.Normalize`` (with ``max_pixel_value=255.0``)
    followed by ``ToTensorV2``. Call the result as ``pipeline(image=img)``
    and read the ``'image'`` entry of the returned dict.
    """
    normalize = A.Normalize(max_pixel_value=255.0, always_apply=True)
    to_tensor = ToTensorV2()
    return A.Compose([normalize, to_tensor])


# Probe the pipeline on the sample image loaded earlier.
x = train_transforms()(image=sample_img)['image']
print(x.shape, x.min(), x.max())

In [None]:
# Baseline prediction: every posting that shares the same image_phash
# value is treated as a match.
tmp = (
    train_df
    .groupby('image_phash')['posting_id']
    .agg('unique')
    .to_dict()
)
train_df['oof'] = train_df['image_phash'].map(tmp)
def getMetric(col):
    """Return a row-wise F1 scorer that compares ``row.target`` with ``row[col]``.

    The returned function computes ``2 * |target ∩ pred| / (|target| + |pred|)``,
    where the intersection is taken with ``np.intersect1d``.
    """
    def f1score(row):
        truth, guess = row.target, row[col]
        hits = len(np.intersect1d(truth, guess))
        return 2 * hits / (len(truth) + len(guess))
    return f1score
# Score the phash baseline row by row and report the mean F1.
score_row = getMetric('oof')
train_df['f1'] = train_df.apply(score_row, axis=1)
print('CV score for baseline =', train_df['f1'].mean())

In [None]:
# Choose the frame that plays the role of "test" for the rest of the notebook.
# The paths come from the configuration constants instead of being repeated
# here, so a change to TRAIN_PATH / TEST_PATH cannot silently diverge.
if COMPUTE_CV:
    # Validation mode: reuse train.csv so predictions can be scored.
    test = pd.read_csv(TRAIN_PATH)
    test_gf = cudf.DataFrame(test)
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape )
else:
    test = pd.read_csv(TEST_PATH)
    test_gf = cudf.read_csv(TEST_PATH)
    print('Test shape is', test_gf.shape )
test_gf.head()

In [None]:
class ImageModel(nn.Module):
    """Wrapper around an image backbone whose output is used as the embedding.

    Args:
        model_name: ``"resnet50"`` selects torchvision's ResNet-50; any other
            value is passed to ``timm.create_model``.
        pretrained: forwarded to ``timm.create_model``. When False, weights
            are loaded from ``checkpoint_path`` instead.
        checkpoint_path: state-dict file loaded into the timm backbone when
            ``pretrained`` is False. Pass ``None`` to skip loading and keep
            the freshly initialised weights. The default is the checkpoint
            this notebook originally hard-coded; judging by its file name it
            matches ``efficientnet_b0`` only, so another ``model_name`` needs
            its own checkpoint (``load_state_dict`` raises on mismatched keys).
    """

    def __init__(self, model_name=BASE_MODEL, pretrained=True,
                 checkpoint_path='../input/new-efficentnetb0weights/efficientnet_b0_ra-3dd342df.pth'):
        # Initialise nn.Module before assigning any attribute.
        super().__init__()
        self.model_name = model_name
        self.pretrained = pretrained

        if model_name == "resnet50":
            # NOTE(review): this branch always requests pretrained weights and
            # ignores `pretrained` -- confirm whether that is intended.
            self.backbone_model = torchvision.models.resnet50(pretrained=True)
        else:
            self.backbone_model = timm.create_model(model_name, pretrained=self.pretrained)
            if not self.pretrained and checkpoint_path is not None:
                # Deserialize onto the CPU so the checkpoint loads whatever
                # device it was saved from; the caller moves the model
                # to its target device afterwards.
                pretrained_weights = torch.load(checkpoint_path, map_location='cpu')
                self.backbone_model.load_state_dict(pretrained_weights)

    def forward(self, image):
        """Run ``image`` through the backbone and return its output unchanged."""
        return self.backbone_model(image)


In [None]:
class ShopeeDataset(Dataset):
    """Dataset that yields one RGB image per item, resized and optionally transformed.

    Args:
        image: sequence of image file names, relative to the train or test
            image folder.
        labels: stored on the instance but not used by ``__getitem__``.
        dim: size passed to ``cv2.resize``.
        augmentation: optional callable invoked as ``augmentation(image=img)``;
            the ``'image'`` entry of its result is returned.
        is_train: True reads from ``TRAIN_IMAGE_PATH``, False from
            ``TEST_IMAGE_PATH``.
    """

    def __init__(self, image,labels,dim=DIM, augmentation=None, is_train=True):
        self.image = image
        self.labels = labels
        self.dim = dim
        self.is_train = is_train
        self.augmentation = augmentation

    def get_image(self, image_path, is_train=True):
        """Read ``image_path`` from disk and return it as a resized RGB array.

        The ``is_train`` argument is kept for backward compatibility and is
        ignored, as it always was: the folder is chosen by ``self.is_train``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file.
        """
        root = TRAIN_IMAGE_PATH if self.is_train else TEST_IMAGE_PATH
        full_path = os.path.join(root, image_path)
        img = cv2.imread(full_path)
        if img is None:
            # cv2.imread returns None on failure; report the offending path
            # instead of crashing inside cvtColor.
            raise FileNotFoundError("cv2.imread could not read image: " + full_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return cv2.resize(img, self.dim)

    def __len__(self):
        return len(self.image)

    def __getitem__(self, idx):
        img = self.get_image(self.image[idx])

        if self.augmentation:
            img = self.augmentation(image=img)['image']

        return img
        

In [None]:
# Embed every image of the evaluation set with the backbone network.
model = ImageModel(pretrained=False)
model = model.to(DEVICE)
# Inference only: switch BatchNorm/Dropout layers to evaluation behaviour.
# torch.no_grad() alone does not do this, and in training mode an image's
# output depends on which other images share its batch.
# NOTE(review): the distance thresholds recorded in this notebook may have been
# tuned without eval(); re-run the threshold sweep with COMPUTE_CV=True.
model.eval()

# Select the image list and source folder once instead of building two
# nearly identical datasets.
if COMPUTE_CV:
    image_files, read_from_train = train_df['image'].values.tolist(), True
else:
    image_files, read_from_train = test_df['image'].values.tolist(), False
# Print a count rather than the whole file list.
print(COMPUTE_CV, len(image_files), 'images to embed')

train_ds = ShopeeDataset(
    image=image_files,
    labels=0,
    dim=DIM,
    is_train=read_from_train,
    augmentation=train_transforms(),
)

train_dl = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,  # use the config constant instead of a literal
    num_workers=2,
)

print('Computing image embeddings...')
embeds = []
with torch.no_grad():
    for batch in tqdm(train_dl):
        # Move each result to the CPU right away so GPU memory is not held
        # by the accumulated outputs.
        embeds.append(model(batch.to(DEVICE)).cpu())

image_embeddings = torch.cat(embeds).numpy()
# Release the network and loader before the nearest-neighbour stage.
del train_ds, train_dl, embeds, model
_ = gc.collect()
print('image embeddings shape',image_embeddings.shape)

In [None]:
# Neighbours requested per query: 50, except that a 3-row test frame gets
# only 2 (presumably the small placeholder test set -- confirm).
KNN = 2 if len(test) == 3 else 50
# `model` now names the nearest-neighbour index; predict() reads this global.
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)

In [None]:
def predict(TS=18.0, print_data=True):
    """Return, for every row of ``test``, the posting_ids of its close neighbours.

    Queries the fitted global ``model`` with the global ``image_embeddings``
    in chunks of 1024 rows and keeps the neighbours whose distance is
    strictly below ``TS``.

    Args:
        TS: distance threshold.
        print_data: when True, print the row range of each chunk.

    Returns:
        A list with one array of ``posting_id`` values per embedding row.
    """
    CHUNK = 1024
    total = len(image_embeddings)
    preds = []

    print('Finding similar images...')
    for a in range(0, total, CHUNK):
        b = min(a + CHUNK, total)
        if print_data: print('chunk',a,'to',b)
        distances, indices = model.kneighbors(image_embeddings[a:b,])

        for k in range(b - a):
            close = np.where(distances[k] < TS)[0]
            neighbour_rows = indices[k, close]
            preds.append(test.iloc[neighbour_rows].posting_id.values)
    return preds

# Run the neighbour search once with the default threshold (TS=18.0).
preds = predict()
# `model` and `image_embeddings` are deliberately not deleted here:
# predict() reads both globals again if the threshold sweep is run later.
# Best thresholds recorded so far (backbone: TS, CV F1, input size):
#   efficientnet_b0:     TS 18.0  f1 0.6358098027565081  (512, 512)
#   efficientnet_b0:     TS 24.0  f1 0.6201323875459158  (256, 256)
#   efficientnetv2_rw_m: TS 13.5  f1 0.5539241111324119  (320, 320) from paper
#   efficientnetv2_rw_m: TS 12.0  f1 0.5642019980069831  (512, 512)
#   vgg16:               TS 17.0  f1 0.6318011447485347  (512, 512)

In [None]:
# Attach the predictions and print the size of the largest match list.
test['preds'] = preds
tmp = list(map(lambda matched: matched.shape[0], test['preds']))
print(max(tmp))
test.head()

In [None]:
def combine_for_sub(row):
    """Merge ``row.preds``, ``row.preds2`` and ``row.preds3`` into one string.

    Returns the sorted unique ids joined by single spaces.
    """
    merged = np.concatenate((row.preds, row.preds2, row.preds3))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_for_cv(row):
    """Merge ``row.preds``, ``row.preds2`` and ``row.preds3`` into one array.

    Returns the sorted unique ids as produced by ``np.unique``.
    """
    merged = np.concatenate((row.preds, row.preds2, row.preds3))
    return np.unique(merged)

def my_combine_for_sub(row):
    """Format ``row.preds`` for submission: sorted unique ids, space-separated."""
    unique_ids = np.unique(row.preds)
    return ' '.join(unique_ids)

def my_for_cv(row):
    """Return ``row.preds`` unchanged (single-model predictions for CV scoring)."""
    single_model_preds = row.preds
    return single_model_preds

In [None]:
def get_score(combine=my_for_cv):
    """Print the mean row-wise F1 of the current predictions (CV mode only).

    Does nothing unless ``COMPUTE_CV`` is true. Otherwise it writes the
    ``target``, ``oof`` and ``f1`` columns on the global ``test`` frame and
    prints the mean of ``f1``.

    Args:
        combine: row-wise function applied to ``test`` to produce the
            predictions that are scored.
    """
    if not COMPUTE_CV:
        return
    members_by_group = test.groupby('label_group')['posting_id'].agg('unique').to_dict()
    test['target'] = test['label_group'].map(members_by_group)
    test['oof'] = test.apply(combine, axis=1)
    test['f1'] = test.apply(getMetric('oof'), axis=1)
    print('CV Score =', test['f1'].mean() )


get_score()

In [None]:
def something():
    """Sweep the distance threshold and report the CV F1 for each value.

    Only runs when ``COMPUTE_CV`` is true. Thresholds 9.0 to 29.5 in steps of
    0.5 are passed to ``predict`` and scored with ``getMetric``.

    The sweep works on a private copy of ``test``. Writing each trial into
    ``test['preds']`` would leave the last threshold's predictions in that
    column, and the submission cell reads it.

    Returns:
        ``(index_log, log)``: the thresholds tried and the matching mean F1
        scores. Both lists are empty when ``COMPUTE_CV`` is false.
    """
    index_log, log = [], []
    if not COMPUTE_CV:
        return index_log, log

    scratch = test.copy()
    # The ground truth does not depend on the threshold, so build it once.
    members_by_group = scratch.groupby('label_group').posting_id.agg('unique').to_dict()
    scratch['target'] = scratch.label_group.map(members_by_group)

    for i in np.arange(9.0, 30.0, 0.5):
        scratch['preds'] = predict(i, print_data=False)
        scratch['oof'] = scratch.apply(my_for_cv, axis=1)
        score = scratch.apply(getMetric('oof'), axis=1).mean()
        print('TS = {},CV Score = {}'.format(i, score))
        index_log.append(i)
        log.append(score)
    return index_log, log


# Keep the sweep results available for inspection.
ts_grid, ts_scores = something()

In [None]:
# Build the space-separated match strings and write the submission file,
# then read it back to check the result.
test['matches'] = test.apply(my_combine_for_sub, axis=1)
submission_columns = ['posting_id', 'matches']
test.loc[:, submission_columns].to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
sub.head()