In [None]:
import numpy as np, pandas as pd, gc
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0

Here we cap the amount of GPU memory that TensorFlow may allocate, so that more GPU memory remains available for RAPIDS.

In [None]:
# Cap TensorFlow's allocation on the first GPU so the remainder stays free for RAPIDS.
# memory_limit is passed as 1024*LIMIT; presumably the unit is MB, which would make
# LIMIT a size in GB — TODO confirm against the TensorFlow documentation.
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Only the first visible GPU (gpus[0]) is configured; any others are left as they are.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    # logical_gpus is not read anywhere else in the visible notebook.
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
  except RuntimeError as e:
    # NOTE(review): presumably raised when the GPU was already initialised before
    # this cell ran — confirm. The error is only printed, so the notebook carries
    # on without the memory cap in that case.
    print(e)

# Train Data
Here we load the training dataset and create a `target` column that lists, for every item, the `posting_id` of all items sharing its label group.

In [None]:
# CV is computed by default; it is switched off below when test.csv has more than 3 rows.
COMPUTE_CV = True

test = pd.read_csv('../input/shopee-product-matching/test.csv')

if len(test) <= 3:
    print('this submission notebook will compute CV score, but commit notebook will not')
else:
    COMPUTE_CV = False

In [None]:
train = pd.read_csv('../input/shopee-product-matching/train.csv')

# Map each label_group to the array of posting_ids it contains, then attach that
# array to every row as its ground-truth list of matches.
tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
train['target'] = train['label_group'].map(tmp)

print('train shape is', train.shape)
train.head()

In [None]:
def countHashinGroups(train):
    """Find image hashes that appear under more than one label_group.

    input:
        train: DataFrame with ``image_phash`` and ``label_group`` columns.
    output:
        (hash_li, count_li)
        hash_li  - hashes mapped to more than one distinct label_group, in order
                   of first appearance in ``train``.
        count_li - number of distinct label_groups for each hash in hash_li.

    Rows with a missing image_phash are ignored. Summary lines are printed;
    an empty frame reports 0.00% instead of dividing by zero.
    """
    # One grouped pass instead of re-filtering the whole frame for every hash.
    # sort=False keeps the groups in first-appearance order.
    labels_per_hash = train.groupby('image_phash', sort=False)['label_group'].nunique()
    conflicting = labels_per_hash[labels_per_hash > 1]

    hash_li = conflicting.index.tolist()   # hashes labelled inconsistently in the dataset
    count_li = conflicting.tolist()        # how many label groups each of them spans

    count = len(hash_li)
    total = len(labels_per_hash)
    share = 100 * count / total if total else 0.0
    # The selection threshold is "> 1", so the message says "more than 1 group".
    print('{:.2f}% of the image hashes are labelled in more than 1 group'.format(share))
    print('{} out of {} image hashes are labelled in more than 1 group'.format(count, total))
    return hash_li, count_li

In [None]:
# Collect the image hashes in train that appear under more than one label_group,
# together with the number of label groups each of them spans.
hash_li, count_li = countHashinGroups(train)

In [None]:
def makeOneLabel(train, phash):
    '''
    Relabel every row that carries ``phash`` with that hash's most frequent label_group.

    input:
        train: DataFrame with ``image_phash`` and ``label_group`` columns.
               It is modified IN PLACE, so repeated calls on the same frame
               accumulate their corrections.
        phash: the image_phash value whose rows should share one label.
    output:
        the same DataFrame object that was passed in (not a copy). If no row
        carries ``phash`` the frame is returned untouched and nothing is printed.
    '''
    mask = train.image_phash == phash
    if not mask.any():
        # Nothing to relabel; avoids indexing into an empty value_counts result.
        return train

    label_counts = train.loc[mask, 'label_group'].value_counts(ascending=False)
    allLabels = label_counts.index.tolist()
    label = label_counts.index[0]          # most frequent label for this hash
    idx = train.index[mask].tolist()       # index labels of the affected rows (for the log line)

    # A single .loc assignment with a boolean mask writes into the frame itself.
    # The previous chained `frame.label_group.iloc[idx] = ...` could write to a
    # temporary, and it passed index *labels* to the positional .iloc indexer.
    train.loc[mask, 'label_group'] = label

    print('phash: {}, index: {}, \n all label: {}, new label: {} \n'.format(phash, idx, allLabels, label))
    return train

# Compute Baseline CV Score

We compute a baseline CV score by predicting that items with identical `image_phash` values are matches.

In [None]:
# train_correct is not created by any earlier cell, so build it here: unify the
# label of every inconsistently labelled hash found by countHashinGroups(train).
# Starting from `train` keeps the name defined even when hash_li is empty.
train_correct = train
for phash in hash_li:
    train_correct = makeOneLabel(train, phash)

# Baseline prediction: every item matches all items that share its image_phash.
tmp = train_correct.groupby('image_phash').posting_id.agg('unique').to_dict()
train_correct['oof'] = train_correct.image_phash.map(tmp)

train_correct.head()

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row.target`` with ``row[col]``.

    input:
        col: name of the column holding the predicted matches.
    output:
        a function taking one row and returning
        2 * |target ∩ prediction| / (|target| + |prediction|).
    """
    def f1score(row):
        truth = row.target
        guess = row[col]
        shared = np.intersect1d(truth, guess).size
        return 2 * shared / (len(truth) + len(guess))
    return f1score

In [None]:
# Row-wise F1 between the label_group ground truth (target) and the phash-only
# prediction (oof). train_correct must already carry both columns; the score is
# stored on train.
train['f1'] = train_correct.apply(getMetric('oof'),axis=1)
print('CV score for baseline =',train.f1.mean())

# Compute RAPIDS Model CV
We are now using the images, image_phash and titles combined to get a better CV score.

In [None]:
# Choose the frame that the rest of the notebook treats as "test".
if COMPUTE_CV:
    # CV mode: score on the training data. It is re-read from disk rather than
    # reusing the train frame that earlier cells added columns to.
    test = pd.read_csv('../input/shopee-product-matching/train.csv')
    test_gf = cudf.DataFrame(test)
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape )
else:
    # Inference mode: the test set is read twice, once as pandas (test) and once as cuDF (test_gf).
    test = pd.read_csv('../input/shopee-product-matching/test.csv')
    test_gf = cudf.read_csv('../input/shopee-product-matching/test.csv')
    print('Test shape is', test_gf.shape )
test_gf.head()

In [None]:
def countHashinGroups(train):
    """Find image hashes that appear under more than one label_group.

    input:
        train: DataFrame with ``image_phash`` and ``label_group`` columns.
    output:
        (hash_li, count_li)
        hash_li  - hashes mapped to more than one distinct label_group, in order
                   of first appearance in ``train``.
        count_li - number of distinct label_groups for each hash in hash_li.

    Rows with a missing image_phash are ignored. Summary lines are printed;
    an empty frame reports 0.00% instead of dividing by zero.
    """
    # One grouped pass instead of re-filtering the whole frame for every hash.
    # sort=False keeps the groups in first-appearance order.
    labels_per_hash = train.groupby('image_phash', sort=False)['label_group'].nunique()
    conflicting = labels_per_hash[labels_per_hash > 1]

    hash_li = conflicting.index.tolist()   # hashes labelled inconsistently in the dataset
    count_li = conflicting.tolist()        # how many label groups each of them spans

    count = len(hash_li)
    total = len(labels_per_hash)
    share = 100 * count / total if total else 0.0
    # The selection threshold is "> 1", so the message says "more than 1 group".
    print('{:.2f}% of the image hashes are labelled in more than 1 group'.format(share))
    print('{} out of {} image hashes are labelled in more than 1 group'.format(count, total))
    return hash_li, count_li

In [None]:
# countHashinGroups reads test.label_group. Elsewhere the notebook only touches
# that column under COMPUTE_CV, so the inference-mode frame may not have it;
# in that case there is nothing to relabel.
if 'label_group' in test.columns:
    hash_li, count_li = countHashinGroups(test)
else:
    hash_li, count_li = [], []

In [None]:
# Start from test so that test_correct exists even when hash_li is empty;
# otherwise the following `test_correct.copy()` cell raises NameError.
test_correct = test
for phash in hash_li:
    # Every call receives `test`, so this relies on the corrections accumulating on that frame.
    test_correct = makeOneLabel(test, phash)

In [None]:
# Continue with an independent copy, so later column assignments on test do not touch test_correct.
test = test_correct.copy()

In [None]:
# Rebuild the cuDF frame from the current pandas frame; this replaces the
# test_gf that was created when the data was loaded.
test_gf = cudf.DataFrame(test)

# Using images to make a model to predict labels.
We are making use of RAPIDS cuml K-Nearest Neighbour model to predict labels.
In order to avoid memory errors, the computation is done in chunks.

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding batches of resized images listed in a DataFrame.

    Parameters
    ----------
    df : DataFrame with an ``image`` column holding file names relative to ``path``.
    img_size : side length in pixels; every image is resized to img_size x img_size.
    batch_size : number of images per batch (the last batch may be smaller).
    path : prefix prepended to each file name (include the trailing separator).

    Each batch is a float32 array of shape (n, img_size, img_size, 3) holding the
    pixel values exactly as cv2 returns them; no scaling or channel reordering
    is applied. Only inputs are produced, no labels.
    """
    def __init__(self, df, img_size=256, batch_size=32, path=''):
        # Let the Keras base class set up its own state before ours.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Number of batches per epoch (ceiling of rows / batch_size)."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an image array."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given positional row indexes.

        Raises FileNotFoundError when cv2.imread returns None for a file,
        instead of letting cv2.resize fail later with an opaque error.
        """
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3), dtype='float32')
        # Read the file names straight from the column; no need to build a
        # Series per row with iterrows().
        image_names = self.df['image'].iloc[indexes].values
        for i, name in enumerate(image_names):
            img_path = self.path + name
            img = cv2.imread(img_path)
            if img is None:
                raise FileNotFoundError('cv2.imread could not read image: {}'.format(img_path))
            X[i,] = cv2.resize(img, (self.img_size, self.img_size))
        return X

In [None]:
# Image folder: the test images by default, the train images when computing CV
# (in CV mode `test` holds the training rows).
BASE = '../input/shopee-product-matching/test_images/'
if COMPUTE_CV: BASE = '../input/shopee-product-matching/train_images/'

# EfficientNetB0 without its classification head, loaded from a local weights file.
# NOTE(review): pooling='avg' presumably yields one pooled feature vector per image — confirm.
WGT = '../input/model1/efficientnetb0_notop.h5'
model = EfficientNetB0(weights=WGT,include_top=False, pooling='avg', input_shape=None)

embeds = []
# Rows are processed CHUNK at a time; the surrounding text says this is to avoid memory errors.
CHUNK = 1024*4

print('Computing image embeddings...')
# Number of chunks = ceil(len(test) / CHUNK).
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
# i mirrors j; it is only referenced by the commented-out debug break below.
for i,j in enumerate( range( CTS ) ):
    
    # [a, b) is the row range of this chunk, clipped to the end of the frame.
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    
    test_gen = DataGenerator(test.iloc[a:b], batch_size=32, path=BASE)
    image_embeddings = model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)

    # Debug aid: uncomment to stop after the first two chunks.
    #if i>=1: break
    
# Drop the CNN before the neighbour search to release its memory.
del model
_ = gc.collect()
# Stack the per-chunk results back into one array in the original row order.
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)

In [None]:
# Number of neighbours retrieved per image; reduced to 2 when test has exactly 3 rows.
KNN = 50
if len(test)==3: KNN = 2
# From here on `model` refers to the nearest-neighbour index, not the CNN deleted earlier.
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)

In [None]:
# preds collects, per row of test, the posting_ids of its visually similar items.
preds = []
CHUNK = 1024*4

print('Finding similar images...')
# Number of chunks = ceil(len(image_embeddings) / CHUNK).
CTS = len(image_embeddings)//CHUNK
if len(image_embeddings)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    # [a, b) is the row range of this chunk, clipped to the end of the array.
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(image_embeddings))
    print('chunk',a,'to',b)
    # Query the fitted index with this chunk's embeddings.
    distances, indices = model.kneighbors(image_embeddings[a:b,])
    
    for k in range(b-a):
        # Keep only neighbours closer than 6.0.
        # NOTE(review): the threshold is an unexplained constant; presumably tuned on CV — confirm.
        IDX = np.where(distances[k,]<6.0)[0]
        IDS = indices[k,IDX]
        # Translate neighbour row positions into posting_ids.
        o = test.iloc[IDS].posting_id.values
        preds.append(o)
        
# Release the index and the embeddings before the text stage.
del model, distances, indices, image_embeddings, embeds
_ = gc.collect()

In [None]:
# Image-based matches are stored as preds2 (one array of posting_ids per row).
test['preds2'] = preds
test.head()

# Using title to make a model to predict labels.
We are making use of a TF-IDF vectorizer to predict labels.

In [None]:
print('Computing text embeddings...')
# Binary TF-IDF over the titles, with English stop words removed and at most 25,000 features.
model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
# .toarray() densifies the result.
# NOTE(review): presumably a GPU (cupy) array, since a later cell multiplies it with cupy.matmul — confirm.
text_embeddings = model.fit_transform(test_gf.title).toarray()
print('text embeddings shape',text_embeddings.shape)

In [None]:
# Display the text embedding matrix for a quick visual check.
text_embeddings

In [None]:
# preds is rebuilt here for the title-based matches (one array of posting_ids per row).
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
# Number of chunks = ceil(len(test) / CHUNK).
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    # [a, b) is the row range of this chunk, clipped to the end of the frame.
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    
    # COSINE SIMILARITY DISTANCE
    # Dot products of this chunk's rows against every row; after the transpose,
    # cts[k] holds the scores of chunk row k against all titles.
    # NOTE(review): a dot product equals cosine similarity only if the rows are
    # L2-normalised; presumably the vectorizer's default does that — confirm.
    cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T
    
    for k in range(b-a):
        # Keep titles scoring above 0.7.
        # NOTE(review): the threshold is an unexplained constant; presumably tuned on CV — confirm.
        IDX = cupy.where(cts[k,]>0.7)[0]
        # Bring the positions back to host memory to index the pandas frame.
        o = test.iloc[cupy.asnumpy(IDX)].posting_id.values
        preds.append(o)
        
# Release the vectorizer and the embeddings.
del model, text_embeddings
_ = gc.collect()

In [None]:
# Title-based matches are stored as preds (one array of posting_ids per row).
test['preds'] = preds
test.head()

# Using image_phash to get labels.
Creating a prediction column by collecting all posting_id values that share each phash.

In [None]:
# Hash-based matches: every item is paired with all items sharing its image_phash.
tmp = test.groupby('image_phash')['posting_id'].unique().to_dict()
test['preds3'] = test['image_phash'].map(tmp)
test.head()

# Calculating CV Score
Here, we concatenate the three prediction columns into a single one for calculating the CV score.

In [None]:
def combine_for_sub(row):
    """Merge a row's three prediction arrays into the submission string.

    Returns the distinct values of preds, preds2 and preds3 joined by single
    spaces, in the sorted order produced by np.unique.
    """
    parts = [getattr(row, name) for name in ('preds', 'preds2', 'preds3')]
    merged = np.concatenate(parts)
    return ' '.join(np.unique(merged))

def combine_for_cv(row):
    """Merge a row's three prediction arrays into one array for scoring.

    Returns the distinct values of preds, preds2 and preds3 as returned by np.unique.
    """
    stacked = np.concatenate((row.preds, row.preds2, row.preds3))
    return np.unique(stacked)

In [None]:
if COMPUTE_CV:
    # Ground truth: all posting_ids sharing the row's label_group.
    tmp = test.groupby('label_group')['posting_id'].unique().to_dict()
    test['target'] = test['label_group'].map(tmp)
    # Union of the three prediction columns, scored row by row against the target.
    test['oof'] = test.apply(combine_for_cv, axis=1)
    test['f1'] = test.apply(getMetric('oof'), axis=1)
    print('CV Score =', test['f1'].mean())

# The submission string is built in both modes.
test['matches'] = test.apply(combine_for_sub, axis=1)

## Got a CV score of >0.72!

# Write Submission CSV


In [None]:
# Write only the two submission columns, without the index.
test[['posting_id','matches']].to_csv('submission.csv',index=False)
# Read the file back to check what was actually written.
sub = pd.read_csv('submission.csv')
sub.head()