# Baseline

## 0. Info

### Features
* Image hash + Image embedding + Text embedding
* Image embedding: pretrained resnet50
* Text embedding: TF-IDF


### Reference
* https://www.kaggle.com/finlay/unsupervised-image-text-baseline-in-20min

## 1. Setting

In [1]:
# Libraries
import os
from glob import glob
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

In [2]:
class CONFIG:
    img_size = 256
    chunk_size = 1024
    batch_size = 32
    data_dir = '../input/shopee-product-matching'
    phase = 'train' if len(pd.read_csv(os.path.join(data_dir, 'test.csv'))) == 3 else 'test'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

## 2. Data

In [3]:
def f1_score(true:list, pred:list):
    intersection = set(true) & set(pred)
    f1 = 2 * len(intersection) / (len(true) + len(pred))
    return f1

In [4]:
data = pd.read_csv(os.path.join(CONFIG.data_dir, f'{CONFIG.phase}.csv'))
data['image'] = data['image'].apply(lambda x : os.path.join(CONFIG.data_dir, 'train_images', x))
data.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,../input/shopee-product-matching/train_images/...,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,../input/shopee-product-matching/train_images/...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,../input/shopee-product-matching/train_images/...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,../input/shopee-product-matching/train_images/...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,../input/shopee-product-matching/train_images/...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [5]:
print("Shape")
display(data.shape)
print('\n')

print("Info")
display(data.info())
print('\n')

print("Nunique")
display(data.nunique())

Shape


(34250, 5)



Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


None



Nunique


posting_id     34250
image          32412
image_phash    28735
title          33117
label_group    11014
dtype: int64

In [6]:
if CONFIG.phase == 'train':
    group_members = data.groupby('label_group')['posting_id'].agg('unique').to_dict()
    data['true_matches'] = data['label_group'].map(group_members)
    display(data.head())

Unnamed: 0,posting_id,image,image_phash,title,label_group,true_matches
0,train_129225211,../input/shopee-product-matching/train_images/...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]"
1,train_3386243561,../input/shopee-product-matching/train_images/...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]"
2,train_2288590299,../input/shopee-product-matching/train_images/...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]"
3,train_2406599165,../input/shopee-product-matching/train_images/...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]"
4,train_3369186413,../input/shopee-product-matching/train_images/...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]"


In [7]:
image_hash_members = data.groupby('image_phash')['posting_id'].agg('unique').to_dict()
data['hash_matches'] = data['image_phash'].map(image_hash_members)
data['hash_matches'] = data['hash_matches'].apply(list)

In [8]:
if CONFIG.phase == 'train':
    data['hash_f1'] = data.apply(lambda x : f1_score(x['true_matches'], x['hash_matches']), axis=1)
    hash_f1 = data['hash_f1'].mean()
    print(f'hash f1: {hash_f1:.3f}')

hash f1: 0.553


## 3. Model

In [9]:
def predict(features, threshold=0.9):
    preds = []
    length = len(features) // CONFIG.chunk_size
    if len(features) % CONFIG.chunk_size != 0: length += 1
        
    for i in tqdm(range(length)):
        l = CONFIG.chunk_size * i
        r = CONFIG.chunk_size * (i + 1)
        r = min(r, len(features))
        
        sim = torch.matmul(features, features[l:r].T).T
        sim = sim.data.cpu().numpy()
        
        for j in range(len(sim)):
            idx = np.where(sim[j] > threshold)[0]
            preds.append(data.iloc[idx]['posting_id'].to_list())
    
    return preds

### Image

In [10]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, image_path, transform=None):
        self.image_path = image_path
        self.transform = transform
    
    def __len__(self):
        return len(self.image_path)
    
    def __getitem__(self, idx):
        img = Image.open(self.image_path[idx]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img

In [11]:
transform = transforms.Compose([
    transforms.Resize((CONFIG.img_size, CONFIG.img_size)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

dataset = Dataset(data['image'].tolist(), transform)
dataloader = torch.utils.data.DataLoader(dataset, CONFIG.batch_size, shuffle=False)

x = next(iter(dataloader))
x.size()

torch.Size([32, 3, 256, 256])

In [12]:
class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        # Download pretrained weights and upload to the dataset
        # 'https://download.pytorch.org/models/resnet50-0676ba61.pth'
        model = torchvision.models.resnet50()
        weights = torch.load('../input/pytorch-models/resnet50.pth')
        model.load_state_dict(weights)
        self.featurizer = nn.Sequential(*list(model.children())[:-2])
        self.pool = nn.AdaptiveMaxPool2d(1)
        self.flatten = nn.Flatten()
        
    def forward(self, x):
        x = self.featurizer(x)
        x = self.pool(x)
        x = self.flatten(x)
        return x

In [13]:
image_model = ImageModel().to(CONFIG.device)
image_model = image_model.eval()

In [14]:
image_features = []
with torch.no_grad():
    for x in tqdm(dataloader):
        x = x.to(CONFIG.device)
        feat = image_model(x)
        feat = feat.data.cpu().numpy()
        image_features.append(feat)

image_features = np.vstack(image_features)
image_features = normalize(image_features)
image_features = torch.from_numpy(image_features).to(CONFIG.device)

  0%|          | 0/1071 [00:00<?, ?it/s]

In [15]:
image_matches = predict(image_features, threshold=0.95)
data['image_matches'] = image_matches

  0%|          | 0/34 [00:00<?, ?it/s]

In [16]:
if CONFIG.phase == 'train':
    data['image_f1'] = data.apply(lambda x : f1_score(x['true_matches'], x['image_matches']), axis=1)
    image_f1 = data['image_f1'].mean()
    print(f'image f1: {image_f1:.3f}')

image f1: 0.635


### Title

In [17]:
vectorizer = TfidfVectorizer(max_features=50000)

In [18]:
text_features = vectorizer.fit_transform(data['title']).toarray()
text_features = torch.from_numpy(text_features).to(CONFIG.device)

In [19]:
text_matches = predict(text_features, threshold=0.7)
data['text_matches'] = text_matches

  0%|          | 0/34 [00:00<?, ?it/s]

In [20]:
if CONFIG.phase == 'train':
    data['text_f1'] = data.apply(lambda x : f1_score(x['true_matches'], x['text_matches']), axis=1)
    text_f1 = data['text_f1'].mean()
    print(f'text f1: {text_f1:.3f}')

text f1: 0.617


### Total

In [21]:
data['total_matches'] = data.apply(lambda x : list(set(x['hash_matches'] + x['image_matches'] + x['text_matches'])), axis=1)

In [22]:
if CONFIG.phase == 'train':
    data['total_f1'] = data.apply(lambda x : f1_score(x['true_matches'], x['total_matches']), axis=1)
    total_f1 = data['total_f1'].mean()
    print(f'total f1: {total_f1:.3f}')

total f1: 0.723


In [23]:
submission = pd.DataFrame({'posting_id':data['posting_id'], 'matches':data['total_matches']})
submission['matches'] = submission['matches'].apply(lambda x : ' '.join(x))
submission.head()

Unnamed: 0,posting_id,matches
0,train_129225211,train_129225211 train_2278313361
1,train_3386243561,train_3386243561
2,train_2288590299,train_2288590299
3,train_2406599165,train_1744956981 train_3576714541 train_240659...
4,train_3369186413,train_3369186413


In [24]:
submission.to_csv('submission.csv', index=False)