In [None]:
# Directory prefix for every input file read by this notebook (train.csv,
# test.csv and the *_images/ folders). Paths are built by plain string
# concatenation, so the value must end with '/'.
# DATA_PATH = '../input/'
DATA_PATH = '../input/shopee-product-matching/'

import psutil

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import gc
import cupy

# import cudf, cuml, cupy
# from cuml.feature_extraction.text import TfidfVectorizer
# from cuml.neighbors import NearestNeighbors

def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column `col`.

    The returned callable takes one row exposing `target` and `col`
    (for example via `DataFrame.apply(..., axis=1)`) and returns
    2 * |target ∩ prediction| / (|target| + |prediction|).
    """
    def f1score(row):
        truth, guess = row.target, row[col]
        # intersect1d yields the sorted unique values present in both inputs.
        shared = np.intersect1d(truth, guess).size
        return (2 * shared) / (len(truth) + len(guess))
    return f1score

In [None]:
# Decide between a local cross-validation run and a scoring run.
# A test.csv with more than 3 rows switches CV off; the rest of the notebook
# then runs on the test postings (still stored under the name `train`).
# Presumably the publicly visible test.csv is only a tiny stub -- TODO confirm.
test = pd.read_csv(DATA_PATH + 'test.csv')
COMPUTE_CV = len(test) <= 3
if COMPUTE_CV:
    print('this submission notebook will compute CV score, but commit notebook will not')

# COMPUTE_CV = False

# Both branches load the same way, only the split name differs.
_split = 'train' if COMPUTE_CV else 'test'
train = pd.read_csv(DATA_PATH + _split + '.csv')
train['image'] = DATA_PATH + _split + '_images/' + train['image']
# train_gf = cudf.read_csv(DATA_PATH + 'train.csv')  /  ... + 'test.csv'

if COMPUTE_CV:
    # Ground truth per row: every posting_id that shares the row's label_group.
    tmp = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
    train['target'] = train['label_group'].map(tmp)

print('train shape is', train.shape )
train.head()

# image hash

In [None]:
# Baseline prediction: every posting whose image_phash equals this row's hash.
ids_by_phash = train.groupby('image_phash')['posting_id'].agg('unique')
tmp = ids_by_phash.to_dict()
train['oof_hash'] = train['image_phash'].map(tmp)

In [None]:
# Preview the first rows of the frame, which now carries the oof_hash column.
train.head()

In [None]:
if COMPUTE_CV:
    # Mean row-wise F1 of the phash-only predictions.
    hash_metric = getMetric('oof_hash')
    train['f1'] = train.apply(hash_metric, axis=1)
    print('CV score for baseline =', train['f1'].mean())

# text word2vec

In [None]:
# train['title_word'] = train['title'].apply(lambda x: x.lower().split(' '))

# from gensim.test.utils import get_tmpfile
# from gensim.models import KeyedVectors

# vectors = KeyedVectors.load_word2vec_format("../input/glove2word2vec/glove_w2v.txt") # import the data file

In [None]:
# title_feats = []
# for title in tqdm(train['title_word'].values[:]):
#     title_feat = []
#     for word in title:
#         if word in vectors:
#             title_feat.append(vectors[word])
#     if len(title_feat) == 0:
#         title_feat = np.random.rand(200)
#     else:
#         title_feat = np.vstack(title_feat).max(0)
#     title_feats.append(title_feat)
#     # break
    
# del vectors;

In [None]:
# from sklearn.preprocessing import normalize

# # l2 norm to kill all the sim in 0-1
# title_feats = np.vstack(title_feats)
# title_feats = normalize(title_feats)

In [None]:
# import cupy

# preds = []
# CHUNK = 1024*4

# title_feats = cupy.array(title_feats)

# print('Finding similar images...')
# CTS = len(title_feats)//CHUNK
# if len(title_feats)%CHUNK!=0: CTS += 1
# for j in range( CTS ):
    
#     a = j*CHUNK
#     b = (j+1)*CHUNK
#     b = min(b, len(title_feats))
#     print('chunk',a,'to',b)
    
#     distances = cupy.matmul(title_feats, title_feats[a:b].T).T
#     # distances = np.dot(imagefeat[a:b,], imagefeat.T)
    
#     for k in range(b-a):
#         IDX = cupy.where(distances[k,]>0.90)[0]
#         # IDX = np.where(distances[k,]>0.95)[0][:]
#         o = train.iloc[cupy.asnumpy(IDX)].posting_id.values
#         preds.append(o)
        
# # del imagefeat, imgmodel

In [None]:
# train['oof_w2v'] = preds

# if COMPUTE_CV:
#     train['f1'] = train.apply(getMetric('oof_w2v'),axis=1)
#     print('CV score for baseline =',train.f1.mean())

# image CNN

In [None]:
from PIL import Image

import torch
torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset

class ShopeeImageDataset(Dataset):
    """Map-style dataset that yields one transformed RGB image per path.

    img_path:  indexable collection of image file paths.
    transform: callable applied to the decoded PIL image; its return value
               is what `__getitem__` returns.
    """

    def __init__(self, img_path, transform):
        self.img_path, self.transform = img_path, transform

    def __len__(self):
        # One sample per path.
        return len(self.img_path)

    def __getitem__(self, index):
        # Force three channels so every sample has the same layout
        # regardless of the mode the file was stored in.
        rgb = Image.open(self.img_path[index]).convert('RGB')
        return self.transform(rgb)

In [None]:
# Preprocessing: fixed 512x512 resize, tensor conversion, then per-channel
# normalisation with the mean/std that torchvision's ImageNet-pretrained
# models are trained with.
image_transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

imagedataset = ShopeeImageDataset(train['image'].values, image_transform)

# shuffle=False keeps batch order equal to row order in `train`, which the
# later neighbour search relies on when mapping positions back to posting_ids.
imageloader = torch.utils.data.DataLoader(
    dataset=imagedataset,
    batch_size=40,
    shuffle=False,
    num_workers=2,
)

In [None]:
class ShopeeImageEmbeddingNet(nn.Module):
    """Pretrained ResNet-50 trunk used as a fixed image feature extractor.

    The stock average pool is swapped for a global max pool with a 1x1
    output, and the last child module (the classifier head) is dropped, so
    `forward` returns pooled feature maps instead of class logits.
    """

    def __init__(self):
        super().__init__()

        # backbone = models.resnet18(True)
        backbone = models.resnet50(True)
        backbone.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))

        # Keep every child except the final one.
        layers = list(backbone.children())
        layers.pop()
        trunk = nn.Sequential(*layers)

        # Inference only: put the trunk in eval mode.
        trunk.eval()
        self.model = trunk

    def forward(self, img):
        """Return the trunk's output for the image batch `img`."""
        return self.model(img)

In [None]:
# Copy the pretrained checkpoints from the attached input dataset into
# torch's hub checkpoint directory before the model is instantiated --
# presumably so the weights are found locally instead of being downloaded
# (TODO confirm the kernel has no internet access).
# NOTE(review): the resnet18 file is copied as well, although the only
# resnet18 use visible in this notebook is commented out.
!mkdir -p /root/.cache/torch/hub/checkpoints/
!cp ../input/pretrained-pytorch-models/resnet18-5c106cde.pth /root/.cache/torch/hub/checkpoints/
!cp ../input/pretrained-pytorch-models/resnet50-19c8e357.pth /root/.cache/torch/hub/checkpoints/

In [None]:
DEVICE = 'cuda'

imgmodel = ShopeeImageEmbeddingNet().to(DEVICE)

# Collect one NumPy array of embeddings per batch; they are stacked later.
imagefeat = []
with torch.no_grad():
    for batch in tqdm(imageloader):
        pooled = imgmodel(batch.to(DEVICE))
        # Collapse the pooled output to (batch, channels).
        n_items, n_channels = pooled.shape[0], pooled.shape[1]
        flat = pooled.reshape(n_items, n_channels)
        imagefeat.append(flat.detach().cpu().numpy())

In [None]:
from sklearn.preprocessing import normalize

# Stack the per-batch arrays into one matrix and L2-normalise every row, so
# that the dot product of two rows is their cosine similarity.
imagefeat = normalize(np.vstack(imagefeat))

In [None]:
# Move the normalised embedding matrix to the GPU for the chunked matmul below.
imagefeat = torch.from_numpy(imagefeat).cuda()

In [None]:
# Threshold tuning
# preds_list = []
# for thresshold in range(98, 90, -1):
#     preds = []
#     CHUNK = 1024*4


# #     print('Finding similar images...')
#     CTS = len(imagefeat)//CHUNK
#     if len(imagefeat)%CHUNK!=0: CTS += 1
#     for j in range( CTS ):

#         a = j*CHUNK
#         b = (j+1)*CHUNK
#         b = min(b, len(imagefeat))
# #         print('chunk',a,'to',b)

#         distances = torch.matmul(imagefeat, imagefeat[a:b].T).T
#         distances = distances.data.cpu().numpy()
#         # distances = np.dot(imagefeat[a:b,], imagefeat.T)

#         for k in range(b-a):
#             # IDX = cupy.where(distances[k,]>0.95)[0]
#             IDX = np.where(distances[k,]>thresshold/100)[0][:]
#             o = train.iloc[IDX].posting_id.values
#     #         o = train.iloc[cupy.asnumpy(IDX)].posting_id.values
#             preds.append(o)
#     preds_list.append(preds)
        
# # del imagefeat, imgmodel

In [None]:
# for thres, preds in zip(range(98, 90, -1), preds_list):
#     train['oof_cnn'] = preds

#     if COMPUTE_CV:
#         train['f1'] = train.apply(getMetric('oof_cnn'),axis=1)
#         print('Thresshold: ', thres/100, 'CV score for baseline =',train.f1.mean())

In [None]:
# Image matches: for every posting, collect the posting_ids whose embedding
# has a dot product above IMAGE_SIM_THRESHOLD with it. The embeddings were
# L2-normalised earlier, so the dot product is the cosine similarity.
# The full N x N matrix is never materialised; rows are scored in chunks.
IMAGE_SIM_THRESHOLD = 0.95
preds = []
CHUNK = 1024*4

# Looked up once: indexing this array by position gives the same values as
# train.iloc[IDX].posting_id.values without building a DataFrame per row.
posting_ids = train.posting_id.values

print('Finding similar images...')
CTS = len(imagefeat)//CHUNK
if len(imagefeat)%CHUNK!=0: CTS += 1
for j in range( CTS ):

    a = j*CHUNK
    b = min((j+1)*CHUNK, len(imagefeat))
    print('chunk',a,'to',b)

    # Row k of `distances` holds the similarity of posting a+k to every
    # posting (despite the name these are similarities, not distances).
    distances = torch.matmul(imagefeat, imagefeat[a:b].T).T
    distances = distances.data.cpu().numpy()

    for k in range(b-a):
        # IDX is already a host-side NumPy array, so no cupy round trip is
        # needed (the original repeated the lookup through cupy.asnumpy).
        IDX = np.where(distances[k,]>IMAGE_SIM_THRESHOLD)[0]
        preds.append(posting_ids[IDX])

# imagefeat is intentionally kept: a later diagnostic cell still reads it.
# del imagefeat, imgmodel

In [None]:
# Store the image-embedding matches next to the other prediction columns.
train['oof_cnn'] = preds

if COMPUTE_CV:
    cnn_metric = getMetric('oof_cnn')
    train['f1'] = train.apply(cnn_metric, axis=1)
    print('CV score for baseline =', train['f1'].mean())

# Logged results from earlier runs (presumably "CV score, threshold"):
# 0.6527899883424048 0.95
# 0.6686372611222741 0.94
# 0.6762305764407363 0.93

In [None]:
if COMPUTE_CV:
    # Number of postings per label_group, largest groups first.
    label_group_count = train.groupby(['label_group']).size().reset_index()
    label_group_count.columns = ['label_group', 'count']
    label_group_count = label_group_count.sort_values(by='count', ascending=False)
    # A bare expression inside an `if` block is not rendered by Jupyter
    # (only a cell's last top-level expression is), so display explicitly.
    display(label_group_count)

In [None]:
if COMPUTE_CV:
    # Diagnostic: histogram of image-embedding similarities between the
    # postings of two hard-coded label groups.
    # NOTE(review): the two ids look hand-picked from label_group_count --
    # confirm why these particular groups were chosen.
    t1 = train.loc[train['label_group'] == 1163569239].index
    t2 = train.loc[train['label_group'] == 159351600].index

    cross = torch.matmul(imagefeat[t1], imagefeat[t2].T).T
    cts = cross.data.cpu().numpy()

    # Flatten the pairwise matrix row by row into one list of values.
    vs = [value for row in cts for value in row]

    plt.hist(vs, bins=50)
    plt.show()

# title TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Binary TF-IDF over the product titles, capped at 55000 vocabulary terms.
# The sparse result is densified because the neighbour search below runs as
# a dense torch matmul.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=55000)
title_tfidf = model.fit_transform(train.title)
text_embeddings = title_tfidf.toarray()
print('text embeddings shape',text_embeddings.shape)

In [None]:
# Move the dense title embeddings to the GPU for the chunked matmul below.
text_embeddings = torch.from_numpy(text_embeddings).cuda()

In [None]:
# Title matches: for every posting, collect the posting_ids whose TF-IDF
# vector has a dot product above TEXT_SIM_THRESHOLD with it. Rows are scored
# in chunks so the full N x N matrix is never held at once.
TEXT_SIM_THRESHOLD = 0.7
preds = []
CHUNK = 1024*4

# Looked up once: indexing this array by position gives the same values as
# train.iloc[IDX].posting_id.values without building a DataFrame per row.
posting_ids = train.posting_id.values

print('Finding similar titles...')
CTS = len(train)//CHUNK
if len(train)%CHUNK!=0: CTS += 1
for j in range( CTS ):

    a = j*CHUNK
    b = min((j+1)*CHUNK, len(train))
    print('chunk',a,'to',b)

    # COSINE SIMILARITY DISTANCE
    # Row k of `cts` holds the similarity of posting a+k to every posting.
    cts = torch.matmul(text_embeddings, text_embeddings[a:b].T).T
    cts = cts.data.cpu().numpy()
    print(cts.shape)
    for k in range(b-a):
        IDX = np.where(cts[k,]>TEXT_SIM_THRESHOLD)[0]
        preds.append(posting_ids[IDX])

# text_embeddings is intentionally kept: a later diagnostic cell still reads it.
# del model, text_embeddings

In [None]:
# Store the title matches next to the other prediction columns.
train['oof_text'] = preds

if COMPUTE_CV:
    text_metric = getMetric('oof_text')
    train['f1'] = train.apply(text_metric, axis=1)
    print('CV score for baseline =', train['f1'].mean())

# Logged results from earlier runs (presumably "CV score, threshold"):
# 0.6137154152579091 0.7
# 0.6507316994356058 0.6

In [None]:
if COMPUTE_CV:
    # Number of postings per label_group, largest groups first.
    label_group_count = train.groupby(['label_group']).size().reset_index()
    label_group_count.columns = ['label_group', 'count']
    label_group_count = label_group_count.sort_values(by='count', ascending=False)
    # A bare expression inside an `if` block is not rendered by Jupyter
    # (only a cell's last top-level expression is), so display explicitly.
    display(label_group_count)

In [None]:
def combine_for_sub(row):
    """Merge the text, CNN and phash matches of `row` into the submission string.

    Returns the de-duplicated, sorted posting ids joined by single spaces.
    """
    sources = (row.oof_text, row.oof_cnn, row.oof_hash)
    merged = np.unique(np.concatenate(sources))
    return ' '.join(merged)

def combine_for_cv(row):
    """Merge the text, CNN and phash matches of `row` for CV scoring.

    Returns a sorted array of the unique posting ids across the three sources.
    """
    sources = (row.oof_text, row.oof_cnn, row.oof_hash)
    return np.unique(np.concatenate(sources))

In [None]:
if COMPUTE_CV:
    # Diagnostic: histogram of title-embedding similarities between the
    # postings of two hard-coded label groups.
    # NOTE(review): the two ids look hand-picked from label_group_count --
    # confirm why these particular groups were chosen.
    t1 = train.loc[train['label_group'] == 1163569239].index
    t2 = train.loc[train['label_group'] == 159351600].index

    cross = torch.matmul(text_embeddings[t1], text_embeddings[t2].T).T
    cts = cross.data.cpu().numpy()

    # Flatten the pairwise matrix row by row into one list of values.
    vs = [value for row in cts for value in row]

    plt.hist(vs, bins=50)
    plt.show()

In [None]:
if COMPUTE_CV:
    # Rebuild the ground truth, then score the union of all three match sources.
    tmp = train.groupby('label_group')['posting_id'].agg('unique').to_dict()
    train['target'] = train['label_group'].map(tmp)
    train['oof'] = train.apply(combine_for_cv, axis=1)
    final_metric = getMetric('oof')
    train['f1'] = train.apply(final_metric, axis=1)
    print('CV Score =', train['f1'].mean() )

# The submission string is needed in both modes.
train['matches'] = train.apply(combine_for_sub, axis=1)

In [None]:
# Write the two submission columns, then read the file back as a sanity check.
submission_columns = ['posting_id', 'matches']
train.loc[:, submission_columns].to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
sub.head()