# RAPIDS cuML kNN - Find Similar Images
In this notebook, we show how to use RAPIDS cuML kNN with CNN image embeddings to find similar images. This is a useful technique that can be used in many computer vision tasks. If you have one image and want to find similar images from a collection of images, you can use this technique. (For example, search internet images for similar images to a specific image, or search a database of images, or compete in Kaggle's Google Landmark Retrieval Competition [here][1])

Using this technique, we find 6 images in this competition's test dataset that are also in last year's (2019) train dataset. Therefore we know these 6 answers perfectly.

Extracting CNN embeddings is a useful skill by itself. Once you have embeddings in a dataframe, you can train any ML model to classify Melanoma images using the embeddings dataframe and ignoring the original images. And you can add more features, like meta features, to your embeddings for improved accuracy. I will demonstrate training a simple ML model using embeddings.

Lastly we will explore clusters of images in the embeddings space using RAPIDS cuML KMeans and RAPIDS cuML TSNE.

[1]: https://www.kaggle.com/c/landmark-retrieval-2020


# Initialize Environment
We will use TensorFlow to extract CNN image embeddings and RAPIDS cuML kNN to compare them. We will restrict TensorFlow to only use 75% of GPU VRAM so that RAPIDS has 25% of GPU VRAM. Note that we only need to extract embeddings in the first version of this notebook and save them to a Kaggle dataset. Then in subsequent versions, we can load the embeddings from the Kaggle dataset. Do this if you wish to explore larger embeddings that take longer to extract.

In [None]:
# NOTEBOOK CONFIGURATION
DIM = 256          # side length in pixels of the square images that get embedded
EFFN = 0           # index into the EFNS list of EfficientNet constructors
BATCH_SIZE = 128   # batch size handed to get_dataset

# False -> extract embeddings with the CNN, True -> load previously saved ones
LOAD_EMBEDDINGS = False
# Directory holding the saved .npy files read when LOAD_EMBEDDINGS is True
PATH_TO_EMBEDDINGS = '../input/embeddings-melanoma/'

print('We will read embeddings from Kaggle dataset' if LOAD_EMBEDDINGS
      else 'We will extract embeddings from pretrained CNN')

In [None]:
# INSTALL RAPIDS
# Copy the RAPIDS archive (rapids.0.14.0) from the attached input dataset into
# the conda envs directory and unpack it there; tar's file listing is discarded.
import sys
!cp ../input/rapids/rapids.0.14.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path. Each line inserts
# at the front, so the final search order is lib, lib/python3.7, site-packages.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled libxgboost.so into the main conda lib directory
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

# INSTALL EFFICIENT NET
# NOTE(review): the package version is not pinned, so results may change when a
# new release is published - consider pinning.
!pip install -q efficientnet >> /dev/null

In [None]:
# LOAD LIBRARIES
import pandas as pd, numpy as np
import tensorflow as tf, re, math
import tensorflow.keras.backend as K
import efficientnet.tfkeras as efn
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt, cv2
import cuml #, cupy
print('RAPIDS version',cuml.__version__)

In [None]:
# RESTRICT TENSORFLOW'S SHARE OF GPU RAM
# SO THAT WE HAVE GPU RAM FOR RAPIDS CUML KNN
# LIMIT is the cap in GB. TensorFlow is only needed to extract embeddings, so
# when embeddings are loaded from disk it gets no GPU at all (LIMIT = 0).
LIMIT = 0 if LOAD_EMBEDDINGS else 12
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    if LIMIT > 0:
      # Cap the first GPU at LIMIT GB; memory_limit is expressed in MB.
      # The cap is driven by LIMIT so the message printed below is always
      # the value that was really applied.
      tf.config.experimental.set_virtual_device_configuration(
          gpus[0],
          [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    else:
      # Hide every GPU from TensorFlow so it cannot reserve any GPU RAM
      tf.config.experimental.set_visible_devices([], 'GPU')
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Raised by TensorFlow when the device configuration can no longer be
    # changed; report it and carry on with whatever configuration is active
    print(e)
print('Restrict TensorFlow to max %iGB GPU RAM'%LIMIT)

# TFRecords Helper Functions
The following are helper functions to load TFRecords. These are from AgentAuers' notebook [here][1]

[1]: https://www.kaggle.com/agentauers/incredible-tpus-finetune-effnetb0-b6-at-once

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialized TFRecord example that carries a label.

    The record is parsed with three scalar features: 'image' (string),
    'image_name' (string) and 'target' (int64).

    Returns:
        Tuple (image, target) taken from the parsed example; 'image_name' is
        parsed but not returned.
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_name': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return parsed['image'], parsed['target']


def read_unlabeled_tfrecord(example, return_image_name):
    """Parse one serialized TFRecord example without reading a label.

    Args:
        example: serialized example to parse.
        return_image_name: when truthy the second returned element is the
            parsed 'image_name'; otherwise it is the constant 0.

    Returns:
        Tuple (image, image_name_or_zero).
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_name': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    if return_image_name:
        second = parsed['image_name']
    else:
        second = 0
    return parsed['image'], second

 
def prepare_image(img, augment=True, dim=256):
    """Decode JPEG bytes into a normalized float32 tensor of shape [dim, dim, 3].

    Args:
        img: encoded JPEG image.
        augment: accepted for interface compatibility; not used in the body.
        dim: side length the decoded image is reshaped to.

    Returns:
        The decoded 3-channel image, scaled to [0, 1], then shifted by 0.449
        and divided by 0.226.
    """
    decoded = tf.image.decode_jpeg(img, channels=3)
    # NORMALIZE IMAGES TO IMAGENET PRETRAIN
    scaled = tf.cast(decoded, tf.float32) / 255.0
    normalized = (scaled - 0.449) / 0.226
    return tf.reshape(normalized, [dim,dim, 3])

def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each name is expected to contain '-<digits>.' (for example
    'train00-2071.tfrec'); the digits are read as that file's item count.

    Returns:
        The numpy sum of all per-file counts.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        counts.append(int(count_pattern.search(filename).group(1)))
    return np.sum(counts)

In [None]:
def get_dataset(files, augment = False, shuffle = False, repeat = False, 
                labeled=True, return_image_names=True, batch_size=32, dim=128):
    """Build a batched tf.data pipeline over a list of TFRecord files.

    Pipeline order: read -> cache -> (repeat) -> (shuffle) -> parse -> decode
    -> batch -> prefetch. Relies on the module-level AUTO and REPLICAS.

    Args:
        files: TFRecord file paths.
        augment: forwarded to prepare_image.
        shuffle: shuffle with a buffer of 1024*8 and allow non-deterministic order.
        repeat: repeat the dataset indefinitely.
        labeled: parse with read_labeled_tfrecord, else read_unlabeled_tfrecord.
        return_image_names: forwarded to read_unlabeled_tfrecord.
        batch_size: per-replica batch size (multiplied by REPLICAS).
        dim: image side length forwarded to prepare_image.

    Returns:
        Dataset of (image, image_name_or_label) batches.
    """
    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()

    if repeat:
        dataset = dataset.repeat()

    if shuffle:
        options = tf.data.Options()
        options.experimental_deterministic = False
        dataset = dataset.shuffle(1024*8).with_options(options)

    # Pick the record parser once, then apply it with a single map call
    if labeled:
        parse_record = read_labeled_tfrecord
    else:
        def parse_record(example):
            return read_unlabeled_tfrecord(example, return_image_names)
    dataset = dataset.map(parse_record, num_parallel_calls=AUTO)

    def decode_image(img, imgname_or_label):
        return prepare_image(img, augment=augment, dim=dim), imgname_or_label
    dataset = dataset.map(decode_image, num_parallel_calls=AUTO)

    return dataset.batch(batch_size * REPLICAS).prefetch(AUTO)

# Extract Image Features
We will extract image embeddings from EfficientNet B0 using images of size `256x256`. If the variable `LOAD_EMBEDDINGS=True`, then we will load them from a Kaggle dataset saved by a previous version of this notebook. If `LOAD_EMBEDDINGS=False`, we will extract them. Note that we are using `noisy-student` pretrained weights for better features. And in the `prepare_image` helper above, we normalize our inputs by subtracting the ImageNet mean of `0.449` and dividing by the ImageNet standard deviation of `0.226`. This is important for image feature extraction because we are not training the EfficientNet anymore. If we were training it further, the model would correct itself even if we did not normalize.

In [None]:
# EXTRACT LAST LAYER OF EFFICIENT NET WITH GLOBAL AVERAGE POOLING
# EfficientNet constructors, indexed by variant number (0 -> B0, ..., 6 -> B6)
EFNS = [efn.EfficientNetB0, efn.EfficientNetB1, efn.EfficientNetB2, efn.EfficientNetB3, 
        efn.EfficientNetB4, efn.EfficientNetB5, efn.EfficientNetB6]

def build_model(dim=256, ef=EFFN, weights='noisy-student'):
    """Build a headless EfficientNet that maps an image to one embedding vector.

    Args:
        dim: side length of the square RGB input.
        ef: index into EFNS selecting the EfficientNet variant.
        weights: pretrained weights passed to the EfficientNet constructor.
            Defaults to 'noisy-student', the weights this notebook's text says
            it uses; pass 'imagenet' to get the previous behavior.

    Returns:
        A Keras model whose output is the global average pool of the
        backbone's final feature map.
    """
    inp = tf.keras.layers.Input(shape=(dim,dim,3))
    base = EFNS[ef](input_shape=(dim,dim,3),weights=weights,include_top=False)
    x = base(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    model = tf.keras.Model(inputs=inp,outputs=x)
    return model

In [None]:
# TFRECORDS PATH
# tf.data tuning constant and replica count read by get_dataset
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = 1

GCS_PATH = '../input/melanoma-%ix%i'%(DIM,DIM) # GPU does not need KaggleDatasets()
# Sorted arrays of the train / test TFRecord files found under GCS_PATH
files_train = np.sort(np.array(
    tf.io.gfile.glob(GCS_PATH + '/train*.tfrec')
))
files_test = np.sort(np.array(
    tf.io.gfile.glob(GCS_PATH + '/test*.tfrec')
))

## Test Image Embeddings

In [None]:
# Test embeddings: load the saved array, or extract it with the CNN and save it
if LOAD_EMBEDDINGS:
    embed_test = np.load(PATH_TO_EMBEDDINGS+'embed_test_%i_%i.npy'%(DIM,EFFN))
else:
    model = build_model(dim=DIM, ef=EFFN)
    ds_test = get_dataset(
        files_test, augment=False, shuffle=False, repeat=False,
        labeled=False, return_image_names=False,
        batch_size=BATCH_SIZE, dim=DIM,
    )
    embed_test = model.predict(ds_test, verbose=1)
    # np.save appends the .npy extension to the given name
    np.save('embed_test_%i_%i'%(DIM,EFFN), embed_test.astype('float32'))
print('test embeddings shape',embed_test.shape)

## Train Image Embeddings

In [None]:
# Train embeddings: load the saved array, or extract it with the CNN and save it
if LOAD_EMBEDDINGS:
    embed = np.load(PATH_TO_EMBEDDINGS+'embed_train_%i_%i.npy'%(DIM,EFFN))
else:
    model = build_model(dim=DIM, ef=EFFN)
    ds_train = get_dataset(
        files_train, augment=False, shuffle=False, repeat=False,
        labeled=False, return_image_names=False,
        batch_size=BATCH_SIZE, dim=DIM,
    )
    embed = model.predict(ds_train, verbose=1)
    # np.save appends the .npy extension to the given name
    np.save('embed_train_%i_%i'%(DIM,EFFN), embed.astype('float32'))
print('train embeddings shape',embed.shape)

## External Image Embeddings (2019 comp data)

In [None]:
# Sorted array of the external (2019) TFRecord files
GCS_PATH2 = '../input/isic2019-%ix%i'%(DIM,DIM) # GPU does not need KaggleDatasets()
files_ext = np.sort(np.array(
    tf.io.gfile.glob(GCS_PATH2 + '/train*.tfrec')
))

# External embeddings: load the saved array, or extract with the CNN and save
if LOAD_EMBEDDINGS:
    embed_ext = np.load(PATH_TO_EMBEDDINGS+'embed_ext_%i_%i.npy'%(DIM,EFFN))
else:
    model = build_model(dim=DIM, ef=EFFN)
    ds_ext = get_dataset(
        files_ext, augment=False, shuffle=False, repeat=False,
        labeled=False, return_image_names=False,
        batch_size=BATCH_SIZE, dim=DIM,
    )
    embed_ext = model.predict(ds_ext, verbose=1)
    # np.save appends the .npy extension to the given name
    np.save('embed_ext_%i_%i'%(DIM,EFFN), embed_ext.astype('float32'))
print('ext embeddings shape',embed_ext.shape)

## Train Test Image Names
Here we extract the image names from TFRecords. We use our smallest TFRecord of size `128x128` for maximum speed.

In [None]:
# WE WILL READ IMAGE NAMES FROM 128X128 IMAGES for speed
# Dedicated constants are used so that the notebook-wide DIM, BATCH_SIZE,
# GCS_PATH and files_train / files_test keep their configured values. Paths
# built from DIM further down are then the same whether embeddings are
# extracted or loaded, and earlier cells can be re-run safely.
NAMES_DIM = 128; NAMES_BATCH_SIZE = 32
if not LOAD_EMBEDDINGS:
    NAMES_PATH = '../input/melanoma-%ix%i'%(NAMES_DIM,NAMES_DIM)
    files_train_names = np.sort(np.array(tf.io.gfile.glob(NAMES_PATH + '/train*.tfrec')))
    files_test_names  = np.sort(np.array(tf.io.gfile.glob(NAMES_PATH + '/test*.tfrec')))

# READ TEST IMAGE NAMES
if not LOAD_EMBEDDINGS:
    ds_test = get_dataset(files_test_names,labeled=False,return_image_names=True,augment=False,
            repeat=False,shuffle=False,dim=NAMES_DIM,batch_size=NAMES_BATCH_SIZE)
    # Unbatch so each element is one (image, name) pair, then decode the name bytes
    names_test = np.array([img_name.numpy().decode("utf-8") for img, img_name in iter(ds_test.unbatch())])
    np.save('names_test',names_test)
else:
    names_test = np.load(PATH_TO_EMBEDDINGS+'names_test.npy')
print('test names',names_test.shape)

# READ TRAIN IMAGE NAMES
if not LOAD_EMBEDDINGS:
    ds_train = get_dataset(files_train_names,labeled=False,return_image_names=True,augment=False,
            repeat=False,shuffle=False,dim=NAMES_DIM,batch_size=NAMES_BATCH_SIZE)
    names = np.array([img_name.numpy().decode("utf-8") for img, img_name in iter(ds_train.unbatch())])
    np.save('names_train',names)
else:
    names = np.load(PATH_TO_EMBEDDINGS+'names_train.npy')
print('train names',names.shape)

## External Image Names

In [None]:
# READ EXT IMAGE NAMES
# The TFRecord size used for reading names is fixed here (128x128, the small
# records) instead of being taken from the notebook-wide DIM, so this cell
# reads the same files regardless of what earlier cells did to DIM.
EXT_NAMES_DIM = 128; EXT_NAMES_BATCH_SIZE = 32

if not LOAD_EMBEDDINGS:
    GCS_PATH2 = '../input/isic2019-%ix%i'%(EXT_NAMES_DIM,EXT_NAMES_DIM) # GPU does not need KaggleDatasets()
    files_ext = np.sort(np.array(tf.io.gfile.glob(GCS_PATH2 + '/train*.tfrec')))
    ds_ext = get_dataset(files_ext,labeled=False,return_image_names=True,augment=False,
            repeat=False,shuffle=False,dim=EXT_NAMES_DIM,batch_size=EXT_NAMES_BATCH_SIZE)
    # Unbatch so each element is one (image, name) pair, then decode the name bytes
    names_ext = np.array([img_name.numpy().decode("utf-8") for img, img_name in iter(ds_ext.unbatch())])
    np.save('names_ext',names_ext)
else:
    names_ext = np.load(PATH_TO_EMBEDDINGS+'names_ext.npy')
print('ext names',names_ext.shape)

# CSV Files

In [None]:
# LOAD TRAIN AND TEST CSV
# Each frame is indexed by image_name, reordered to follow the names array read
# from the TFRecords, then given a fresh positional index.
test = (
    pd.read_csv( '../input/siim-isic-melanoma-classification/test.csv' )
    .set_index('image_name', drop=True)
    .loc[names_test]
    .reset_index()
)
print('Test csv shape',test.shape)

train = (
    pd.read_csv( '../input/melanoma-%ix%i/train.csv'%(DIM,DIM) )
    .set_index('image_name', drop=True)
    .loc[names]
    .reset_index()
)
train['target'] = train['target'].astype('float32')
print('Train csv shape',train.shape)

# LOAD EXTERNAL DATA CSV
train_ext = (
    pd.read_csv( '../input/isic2019-%ix%i/train.csv'%(DIM,DIM) )
    .set_index('image_name', drop=True)
    .loc[names_ext]
    .reset_index()
)
train_ext['target'] = train_ext['target'].astype('float32')
print('Train_ext csv shape',train_ext.shape)

print('Displaying train.csv below...')
train.head()

# RAPIDS cuML kNN - Find Duplicates

## Find Duplicates

In [None]:
# Number of nearest 2019 images retrieved for every test image
KNN = 3

# Fit on the external (2019) embeddings, then query with the test embeddings
model = cuml.neighbors.NearestNeighbors(n_neighbors=KNN)
model.fit(embed_ext)
distances, indices = model.kneighbors(embed_test)

# Smallest of the KNN distances for each test image
mm = np.min(distances, axis=1)

plt.hist(mm)
plt.title('Shortest Distances of Test Images to any 2019 Image')
plt.show()

In [None]:
# Test images whose shortest distance is below CUTOFF are treated as potential duplicates
CUTOFF = 2
idx = np.where(mm < CUTOFF)[0]
print('There are %i potential duplicate images that have distance < %i'
      % (len(idx), CUTOFF))

## Display Duplicates

In [None]:
PATH_TEST = '../input/jpeg-melanoma-128x128/test/'
PATH_EXT = '../input/jpeg-isic2019-128x128/train/'

# Per duplicate: a = test image name, b = target of its match, c = name of its match
a = []; b = []; c = []

for k in idx:

    # Column 0 of indices is used as the matching 2019 image
    nearest = int(indices[k,0])
    t = train_ext.target.iloc[nearest]

    a.append(names_test[k])
    b.append(t)
    c.append(names_ext[nearest])

    plt.figure(figsize=(10,5))

    # Left panel: the 2020 test image, whose target is unknown
    plt.subplot(1,2,1)
    img = cv2.cvtColor(cv2.imread(PATH_TEST+names_test[k]+'.jpg'), cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.title('2020 Test Image - Target = ?\n%s'%names_test[k])

    # Right panel: the matching 2019 image with its known target
    plt.subplot(1,2,2)
    img = cv2.cvtColor(cv2.imread(PATH_EXT+names_ext[nearest]+'.jpg'), cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.title('2019 Image - Target = %i\n%s'%(t,names_ext[nearest]))

    plt.show()

# RAPIDS cuML kNN - Train and Predict Test
Let's build a simple kNN model using the embeddings. We will use triple stratified KFold validation and inference.

In [None]:
# 5 folds built over the 15 tfrecord ids rather than over individual rows
skf = KFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train))     # out-of-fold prediction per train row
folds = np.zeros(len(train))   # fold in which each train row was validated
preds = np.zeros(len(test))    # test prediction averaged over the folds

for fold, (idxT, idxV) in enumerate(skf.split(np.arange(15))):

    # DISPLAY FOLD INFO
    print('#'*25)
    print('#### FOLD',fold+1)

    # CREATE TRAIN AND VALIDATION SUBSETS
    # Row labels of train whose tfrecord id belongs to this fold's split
    idxT2 = train.index.values[train.tfrecord.isin(idxT).values] #2020 train
    idxV2 = train.index.values[train.tfrecord.isin(idxV).values] #2020 valid

    model = cuml.neighbors.KNeighborsClassifier(n_neighbors=299)
    model.fit(embed[idxT2], train.target.values[idxT2])

    # Column 1 of predict_proba is taken as the prediction
    oof[idxV2] = model.predict_proba(embed[idxV2])[:,1]
    preds += model.predict_proba(embed_test)[:,1] / skf.n_splits

    # REPORT RESULTS
    folds[idxV2] = fold
    auc = roc_auc_score(train.target.values[idxV2], oof[idxV2])
    print('#### OOF AUC = %.3f'%auc)
    print('#'*25)
    print()

auc = roc_auc_score(train.target.values, oof)
print('Overall OOF AUC = %.3f'%auc)

# Submit to Kaggle

In [None]:
# WRITE OOF TO DISK
# Attach the out-of-fold predictions and fold ids, then save and preview them
train['pred'] = oof
train['fold'] = folds
train[['image_name','target','pred','fold']].to_csv('oof.csv', index=False)
train[['image_name','target','pred','fold']].head()

In [None]:
# WRITE SUBMISSION TO DISK
# One row per test image, ordered by image name
submission = pd.DataFrame(
    {'image_name': names_test, 'target': preds}
).sort_values('image_name')
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
# PLOT TEST PREDICTION HISTOGRAM
# Distribution of the predicted test targets, in 50 bins
plt.hist(submission['target'], bins=50)
plt.show()

# APPENDIX
In this appendix, we will explore our embeddings

# RAPIDS cuML KMeans
Embeddings from EfficientNetB0 have dimension 1280, so each image is now represented by a point in 1280-dimensional space. Let's cluster these points into 20 clusters and display what the images look like in the different clusters.

In [None]:
# Number of KMeans clusters fitted on the train embeddings
CLUSTERS = 20

model = cuml.KMeans(n_clusters=CLUSTERS)
model.fit(embed)

# Store each train image's cluster label next to its metadata
train['cluster'] = model.labels_
train.head()

In [None]:
PATH_TRAIN = '../input/jpeg-melanoma-128x128/train/'

# Show up to 8 example images (2 rows x 4 columns) for every cluster
for k in range(CLUSTERS):
    print('#'*25);
    print('#### Cluster %i of similar train images'%k)
    print('#'*25)
    df = train.loc[train.cluster==k]
    plt.figure(figsize=(20,10))
    # A cluster may hold fewer than 8 images, so never index past its end
    for j in range(min(8, len(df))):
        row = df.index[j]
        plt.subplot(2,4,j+1)
        # row is used as a position into names as well as a label of df
        img = cv2.imread(PATH_TRAIN+names[row]+'.jpg')
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.axis('off')
        plt.title('%s, Target = %i'%(names[row],df.loc[row,'target']))
        plt.imshow(img)  
    plt.show()
    

# RAPIDS cuML TSNE
Embeddings from EfficientNetB0 have dimension 1280, so each image is now represented by a point in 1280-dimensional space. Let's map this 1280-dimensional space to 2 dimensions. Below we plot each image's new 2-dimensional representation. We color benign orange and malignant blue. Next we will find a random square in this 2-dimensional space and plot some of the images inside the square.

In [None]:
# Project the train embeddings to 2 dimensions and keep both coordinates on train
model = cuml.TSNE()
embed2D = model.fit_transform(embed)
train['x'], train['y'] = embed2D[:,0], embed2D[:,1]

In [None]:
# Lay an X_DIV x Y_DIV grid over the 2D points and keep the grid square with the
# highest mean target among squares that hold at least 16 images.
X_DIV = 10; Y_DIV = 10
x_min, x_max = train.x.min(), train.x.max()
y_min, y_max = train.y.min(), train.y.max()
x_step = (x_max - x_min)/X_DIV
y_step = (y_max - y_min)/Y_DIV

# Best mean target so far and the bounds of the square that achieved it
mx = 0; xa_mx = 0; xb_mx=0; ya_mx = 0; yb_mx = 0
for k in range(X_DIV+1):
    xa = k*x_step + x_min
    xb = (k+1)*x_step + x_min
    # Rows strictly inside this column of the grid
    in_column = (train.x>xa)&(train.x<xb)
    for j in range(Y_DIV+1):
        ya = j*y_step + y_min
        yb = (j+1)*y_step + y_min
        df = train.loc[in_column&(train.y>ya)&(train.y<yb)]
        t = df.target.mean()
        # An empty square has a NaN mean, which never compares greater than mx
        if len(df)>=16 and t>mx:
            mx = t
            xa_mx, xb_mx, ya_mx, yb_mx = xa, xb, ya, yb

In [None]:
# Scatter of all train images in the 2D space, colored by target
plt.figure(figsize=(10,10))
df1 = train.loc[train.target==0]
df2 = train.loc[train.target==1]
plt.scatter(df1.x, df1.y, color='orange', s=10, label='Benign')
plt.scatter(df2.x, df2.y, color='blue', s=10, label='Malignant')

# Outline the selected square with its four edges
for xs, ys in (([xa_mx,xa_mx],[ya_mx,yb_mx]),
               ([xa_mx,xb_mx],[ya_mx,ya_mx]),
               ([xb_mx,xb_mx],[ya_mx,yb_mx]),
               ([xa_mx,xb_mx],[yb_mx,yb_mx])):
    plt.plot(xs, ys, color='black')
plt.legend()
plt.show()

In [None]:
# Train images lying strictly inside the selected square
df = train.loc[(train.x>xa_mx)&(train.x<xb_mx)&(train.y>ya_mx)&(train.y<yb_mx)]
print('This region has %.2f %% malignant'%(100*df.target.mean())) 

# Show the first ROW*COL images of the square in a grid
ROW = 4
COL = 4
plt.figure(figsize=(20,20*ROW/COL))
for k in range(ROW):
    for j in range(COL):
        pos = k*COL+j
        row = df.index[pos]
        plt.subplot(ROW,COL,pos+1)
        img = cv2.cvtColor(cv2.imread(PATH_TRAIN+names[row]+'.jpg'), cv2.COLOR_BGR2RGB)
        plt.axis('off')
        plt.title('%s, Target = %i'%(names[row],df.loc[row,'target']))
        plt.imshow(img)  
plt.show()

In [None]:
# Five times: pick a random grid square holding at least 16 images, outline it
# on the scatter plot, then show its first 16 images.
for it in range(5):
    # Redraw random squares until one holds at least 16 images
    while True:
        k = np.random.randint(0,X_DIV)
        j = np.random.randint(0,Y_DIV)
        xa_mx = k*x_step + x_min
        xb_mx = (k+1)*x_step + x_min
        ya_mx = j*y_step + y_min
        yb_mx = (j+1)*y_step + y_min
        df = train.loc[(train.x>xa_mx)&(train.x<xb_mx)&(train.y>ya_mx)&(train.y<yb_mx)]
        i = len(df)
        if i >= 16:
            break

    plt.figure(figsize=(10,10))
    df1 = train.loc[train.target==0]
    df2 = train.loc[train.target==1]
    plt.scatter(df1.x, df1.y, color='orange', s=10, label='Benign')
    plt.scatter(df2.x, df2.y, color='blue', s=10, label='Malignant')
    # Outline the chosen square with its four edges
    for xs, ys in (([xa_mx,xa_mx],[ya_mx,yb_mx]),
                   ([xa_mx,xb_mx],[ya_mx,ya_mx]),
                   ([xb_mx,xb_mx],[ya_mx,yb_mx]),
                   ([xa_mx,xb_mx],[yb_mx,yb_mx])):
        plt.plot(xs, ys, color='black')
    plt.legend()
    plt.show()

    print('This region has %.2f %% malignant'%(100*df.target.mean()))    

    ROW = 4
    COL = 4
    plt.figure(figsize=(20,20*ROW/COL))
    for k in range(ROW):
        for j in range(COL):
            pos = k*COL+j
            row = df.index[pos]
            plt.subplot(ROW,COL,pos+1)
            img = cv2.cvtColor(cv2.imread(PATH_TRAIN+names[row]+'.jpg'), cv2.COLOR_BGR2RGB)
            plt.axis('off')
            plt.title('%s, Target = %i'%(names[row],df.loc[row,'target']))
            plt.imshow(img)  
    plt.show()