# EfficientNet Image Embeddings + DBSCAN Baseline [CV 0.658]

Using the EfficientNet image-embedding feature extraction from this [notebook](https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700), we cluster the embeddings with [DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) and match the images / postings that fall within the same cluster.

# Load Libraries & Data

In [None]:
import os
import gc
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm.notebook import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.cluster import DBSCAN
from sklearn import metrics

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0

In [None]:
# Read the test listing and decide whether this run is a real submission.
data_dir = '../input/shopee-product-matching/'
test = pd.read_csv(data_dir + 'test.csv')

# Any test row count other than 3 is treated as a submission run; in that case
# the test postings themselves are processed under the name `train`.
SUBMISSION = len(test) != 3
if SUBMISSION:
    train = test.copy()
    BASE = '../input/shopee-product-matching/test_images/'
else:
    train = pd.read_csv(data_dir + 'train.csv')
    BASE = '../input/shopee-product-matching/train_images/'

print(train.shape)
train.head(3)

# F1 Score & pHash Baseline
source: https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700/

In [None]:
def getMetric(col):
    """Return a row-wise F1 scorer for the predictions stored in column `col`.

    The returned function takes one row, compares `row.target` with
    `row[col]`, and returns 2 * |intersection| / (|target| + |prediction|).
    It is meant to be passed to `DataFrame.apply(..., axis=1)`.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        overlap = np.intersect1d(truth, predicted)
        return 2 * len(overlap) / (len(truth) + len(predicted))
    return f1score

# pHash baseline: every posting is matched with all postings sharing its image_phash.
phash_groups = train.groupby('image_phash')['posting_id']
tmp = phash_groups.agg('unique').to_dict()
train['matches_phash'] = train['image_phash'].map(tmp)

if not SUBMISSION:
    # Ground truth: all postings that share the row's label_group.
    label_groups = train.groupby('label_group')['posting_id']
    tmp = label_groups.agg('unique').to_dict()
    train['target'] = train['label_group'].map(tmp)

    phash_scorer = getMetric('matches_phash')
    train['f1_phash'] = train.apply(phash_scorer, axis=1)
    print('Train F1 Score - method:pHash =', train['f1_phash'].mean())

# Image Embeddings via EfficientNet
source: https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700/

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that yields batches of resized images (no labels).

    Parameters
    ----------
    df : DataFrame with an `image` column holding file names relative to `path`.
    img_size : side length, in pixels, that every image is resized to.
    batch_size : number of images per batch; the last batch may be smaller.
    path : prefix prepended to each file name before reading.
    """

    def __init__(self, df, img_size=256, batch_size=32, path=''):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number `index` as a float32 image array."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Read and resize the images at the given positional indexes.

        Returns an array of shape (len(indexes), img_size, img_size, 3).

        Raises
        ------
        OSError
            If an image cannot be read; the message names the offending path.
        """
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3), dtype='float32')
        image_names = self.df.iloc[indexes]['image']
        for i, image_name in enumerate(image_names):
            image_path = self.path + image_name
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising;
                # fail here with the path rather than deep inside cv2.resize.
                raise OSError('could not read image: {}'.format(image_path))
            # NOTE(review): cv2.imread yields BGR channel order; the pretrained
            # weights presumably expect RGB - confirm before changing.
            # Pixel values are left unscaled (the original had "/128.0 - 1.0"
            # commented out).
            X[i,] = cv2.resize(img, (self.img_size, self.img_size))
        return X

In [None]:
# Pretrained EfficientNetB0 without its classification head; pooling='avg'
# makes the model emit one pooled feature vector per image.
WGT = '../input/effnetb0/efficientnetb0_notop.h5'
model = EfficientNetB0(weights=WGT, include_top=False, pooling='avg', input_shape=None)

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
# Number of chunks needed to cover every row (ceil division).
CTS = (len(train) + CHUNK - 1) // CHUNK
for j in range(CTS):
    a = j*CHUNK
    b = min(a + CHUNK, len(train))
    print('chunk',a,'to',b)

    train_gen = DataGenerator(train.iloc[a:b], batch_size=32, path=BASE)
    chunk_embeddings = model.predict(train_gen,verbose=1,use_multiprocessing=True, workers=4)
    embeds.append(chunk_embeddings)

# The model is no longer needed once all chunks are embedded.
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)

# with open('../working/ShopeeEffNetEmbeddings.pkl', 'wb') as handle:
    # pickle.dump(image_embeddings, handle)

print('image embeddings shape',image_embeddings.shape)

# DBSCAN Clustering

### Functions: fit, match, and combine

In [None]:
# source: https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700/

def dbscan_fit_match(train_sample, embeddings_sample, n_dims=200, eps=2, min_samp=2, show_vc=True, verbose=True, metric='euclidean', sub=False, max_matches=50):
    """Cluster embeddings with DBSCAN and turn the clusters into match lists.

    Parameters
    ----------
    train_sample : DataFrame with a `posting_id` column (and a `target`
        column when `sub` is False); it is copied, never modified.
    embeddings_sample : 2-D array with one row per row of `train_sample`.
    n_dims : number of leading embedding columns passed to DBSCAN.
    eps, min_samp, metric : forwarded to DBSCAN as `eps`, `min_samples`
        and `metric`.
    show_vc : if True, display the cluster-size value counts.
    verbose : if True, print the fit parameters.
    sub : if True, skip the F1 computation.
    max_matches : maximum number of posting ids kept per cluster.

    Returns
    -------
    (db, clusters, matches_dbscan, f1_dbscan) where `db` is the fitted
    DBSCAN object, the next two are Series aligned with `train_sample`,
    and `f1_dbscan` is a Series, or None when `sub` is True.
    """
    use_train = train_sample.copy()

    if verbose:
        print('fitting dbscan using {} samples and params: n_dims={}, eps={}'.format(len(use_train), n_dims, eps))

    # The embeddings are only read here, so they are not copied first.
    db = DBSCAN(eps=eps,
                min_samples=min_samp,
                metric=metric,
                n_jobs=-1).fit(embeddings_sample[:, :n_dims])
    use_train['clusters'] = db.labels_

    if show_vc:
        display(use_train['clusters'].value_counts())

    # Rows labelled -1 are unclustered and start with an empty match list.
    clustered = (use_train['clusters'] != -1)
    members = use_train.loc[clustered].groupby('clusters')['posting_id'].agg('unique').to_dict()
    # Store plain lists, not numpy arrays: match_self prepends the row's own
    # id with `[id] + matches`, which on an object array is an element-wise
    # string concatenation that corrupts every id of a truncated cluster.
    tmp = {label: list(ids[:max_matches]) for label, ids in members.items()}
    tmp[-1] = []
    use_train['matches_dbscan'] = use_train['clusters'].map(tmp)

    # Add the row's own posting_id when it is missing (unclustered rows and
    # members cut by the cap), so such a row can hold max_matches + 1 ids.
    use_train['matches_dbscan'] = use_train.apply(match_self, axis=1)

    if sub:
        return db, use_train['clusters'], use_train['matches_dbscan'], None

    use_train['f1_dbscan'] = use_train.apply(getMetric('matches_dbscan'), axis=1)
    return db, use_train['clusters'], use_train['matches_dbscan'], use_train['f1_dbscan']

def match_self(row):
    """Return the row's DBSCAN matches, guaranteed to contain its own posting_id.

    If `row['posting_id']` is already in `row['matches_dbscan']` the matches
    are returned unchanged; otherwise a new list is returned with the
    posting_id first, followed by the existing matches.
    """
    posting_id = row['posting_id']
    matches = row['matches_dbscan']
    if posting_id in matches:
        return matches
    # Unpack instead of `[posting_id] + matches`: when `matches` is a numpy
    # array, `+` broadcasts and concatenates the id onto every element.
    return [posting_id, *matches]

def combine_for_cv(row, match_cols):
    """Merge the match lists in `match_cols` into one sorted array of unique ids."""
    return np.unique(np.concatenate([row[name] for name in match_cols]))

def combine_for_sub(row, match_cols):
    """Merge the match lists in `match_cols` into one space-separated string.

    The ids are de-duplicated and sorted before being joined.
    """
    pieces = []
    for name in match_cols:
        pieces.append(row[name])
    unique_ids = np.unique(np.concatenate(pieces))
    return ' '.join(unique_ids)

### Optimize eps param on euclidean distance

In [None]:
if not SUBMISSION:
    # Tune eps on a subset for speed: draw 10% of the label groups with a
    # fixed seed and keep every posting that belongs to a drawn group.
    all_label_groups = pd.Series(train['label_group'].unique())
    sample_labels = all_label_groups.sample(frac=0.1, random_state=0)
    in_sample = train['label_group'].isin(sample_labels)
    train_sample = train.loc[in_sample].copy()
    # The frame's index labels are used directly as row positions in the embedding array.
    image_embeddings_sample = image_embeddings[train_sample.index].copy()

In [None]:
if not SUBMISSION:
    # Grid search over eps with euclidean distance on the sampled postings.
    n_dims = 1280
    eps_range = np.arange(1, 11, 1)
    min_samp = 2

    opt_matrix = []
    for eps in tqdm(eps_range, total=len(eps_range)):
        print('fitting dbscan w/ eps={}...'.format(round(eps, 3)))
        db, clusters, train_sample['matches_dbscan'], train_sample['f1_dbscan'] = dbscan_fit_match(
            train_sample,
            image_embeddings_sample,
            n_dims=n_dims,
            eps=eps,
            min_samp=min_samp,
            show_vc=False,
            verbose=False,
            metric='euclidean',
        )

        # Union of the dbscan matches and the pHash matches, scored like the others.
        train_sample['matches'] = train_sample.apply(
            combine_for_cv, axis=1, match_cols=['matches_phash', 'matches_dbscan'])
        train_sample['f1_combined'] = train_sample.apply(getMetric('matches'), axis=1)

        # One result row per eps value.
        opt_matrix.append([
            n_dims,
            eps,
            clusters.value_counts(),
            train_sample['f1_phash'].mean(),
            train_sample['f1_dbscan'].mean(),
            train_sample['f1_combined'].mean(),
        ])

    opt_df = pd.DataFrame(opt_matrix, columns=['n_dims', 'eps', 'counts', 'f1_phash', 'f1_dbscan', 'f1_combined'])
    display(opt_df[['n_dims', 'eps', 'f1_phash', 'f1_dbscan', 'f1_combined']])

    print('best dbscan f1 score = {}'.format(opt_df['f1_dbscan'].max()))
    # Show the cluster-size counts of the best-scoring run.
    ranked = opt_df.sort_values(by='f1_dbscan', ascending=False)
    display(ranked['counts'].iloc[0])

### Optimize eps param on cosine distance

In [None]:
if not SUBMISSION:
    # Grid search over eps with cosine distance on the sampled postings.
    # first attempt was np.arange(0.5, 1, 0.5) then used finer ranges
    n_dims=1280
    # linspace gives exactly 0.15, 0.16, ..., 0.25. The previous
    # np.arange(0.15, 0.26, 0.01) produced an extra ~0.26 point, because a
    # non-integer step makes the element count depend on float rounding.
    eps_range = np.linspace(0.15, 0.25, 11)
    min_samp= 2

    opt_matrix = []

    for eps in tqdm(eps_range, total=len(eps_range)):
        print('fitting dbscan w/ eps={}...'.format(round(eps, 3)))
        db, clusters, train_sample['matches_dbscan'], train_sample['f1_dbscan'] = dbscan_fit_match(train_sample, image_embeddings_sample, n_dims=n_dims, eps=eps, min_samp=min_samp, show_vc=False, verbose=False, metric='cosine')

        # combine dbscan matches and pHash matches
        train_sample['matches'] = train_sample.apply(combine_for_cv,
                                                     axis=1,
                                                     match_cols=['matches_phash', 'matches_dbscan'])
        train_sample['f1_combined'] = train_sample.apply(getMetric('matches'),axis=1)
        # One result row per eps value.
        opt_row = [n_dims,
                   eps,
                   clusters.value_counts(),
                   train_sample['f1_phash'].mean(),
                   train_sample['f1_dbscan'].mean(),
                   train_sample['f1_combined'].mean()]
        opt_matrix.append(opt_row)

    opt_df = pd.DataFrame(opt_matrix, columns=['n_dims', 'eps', 'counts', 'f1_phash', 'f1_dbscan', 'f1_combined'])
    display(opt_df[['n_dims', 'eps', 'f1_phash', 'f1_dbscan', 'f1_combined']])

    print('best dbscan f1 score = {}'.format(opt_df['f1_dbscan'].max()))
    # Show the cluster-size counts of the best-scoring run.
    display(opt_df.sort_values(by='f1_dbscan', ascending=False)['counts'].iloc[0])

### Fit on whole train set (~18 mins run time)
- Time complexity appears to be $O(n^{2})$, so the run on the test set is estimated to take about 72 minutes.
- Using eps=0.15, because the best eps from the sample search above (0.22) performed worse when applied to the whole train set.

In [None]:
%%time
# Fit DBSCAN (cosine distance) on every posting; F1 is skipped on submission runs.
db, clusters, train['matches_dbscan'], f1_dbscan = dbscan_fit_match(
    train,
    image_embeddings,
    n_dims=1280,
    eps=0.15,
    min_samp=2,
    show_vc=True,
    verbose=True,
    metric='cosine',
    sub=SUBMISSION,
)

if not SUBMISSION:
    train['f1_dbscan'] = f1_dbscan
    # Union of the dbscan matches and the pHash matches, scored like the others.
    train['matches'] = train.apply(combine_for_cv, axis=1, match_cols=['matches_phash', 'matches_dbscan'])
    train['f1_combined'] = train.apply(getMetric('matches'), axis=1)

    # Report the mean F1 of each method.
    score_report = [
        ('Train F1 Score - method:pHash =', 'f1_phash'),
        ('Train F1 Score - method:dbscan =', 'f1_dbscan'),
        ('Train F1 Score - combined =', 'f1_combined'),
    ]
    for label, score_col in score_report:
        print(label, train[score_col].mean())

# Submission

In [None]:
# Merge both match sources into one space-separated string per posting,
# write the submission file, then read it back to preview it.
matches_to_combine = ['matches_phash', 'matches_dbscan']
train['matches'] = train.apply(
    combine_for_sub,
    axis=1,
    match_cols=matches_to_combine,
)
submission_frame = train[['posting_id', 'matches']]
submission_frame.to_csv('submission.csv',index=False)
sub = pd.read_csv('submission.csv')
sub.head()