# SETI RAPIDS kNN

In this notebook, we extract image embeddings with a CNN (EfficientNet) and use them to train a `RAPIDS` cuML kNN classifier. I first saw this technique in [Chris Deotte](https://www.kaggle.com/cdeotte)'s notebook [here](https://www.kaggle.com/cdeotte/rapids-cuml-knn-find-duplicates) from the Melanoma competition; I encourage you to check it out.

Note that I am using [@awsaf49](https://www.kaggle.com/awsaf49)'s TFRecord datasets that can be found [here](https://www.kaggle.com/awsaf49/setibl-256x256-tfrec-dataset). 

In [None]:
import cuml
# Report the installed cuML (RAPIDS) version so runs can be reproduced.
print('RAPIDS version',cuml.__version__)

In [None]:
import sys
sys.path.append('../input/efficientnet-keras-dataset/efficientnet_kaggle')
from efficientnet.tfkeras import *

In [None]:
from matplotlib import pyplot as plt
import math, os, cv2, gc, re
import numpy as np, pandas as pd
from time import time
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import KFold
import gc
from operator import itemgetter

import tensorflow as tf
import tensorflow.keras.backend as K

# CFG

In [None]:
# Requested accelerator: "TPU" or "GPU". Any TPU failure falls back to the
# default (CPU / single-GPU) strategy so `strategy` is always defined below.
DEVICE = 'GPU'

if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception:
            # The previous `except _:` referenced an undefined name, so an
            # initialization failure was never handled. Fall back instead of
            # leaving `strategy` undefined.
            print("failed to initialize TPU")
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


# Shared by the dataset pipeline: autotuned parallelism and the global batch multiplier.
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

In [None]:
#basic
EFFNET = 0                 # index into the EfficientNet B0..B6 list used by efficientnet()
SEED = 34                  # random_state for the KFold split
IMAGE_SIZE = [256, 256]    # selects the TFRecord dataset folder and the per-channel image shape
BATCH_SIZE = 16            # per-replica batch size (get_dataset multiplies it by REPLICAS)
FOLDS = 5                  # number of KFold splits over the training TFRecord files
VERBOSE = 1                # NOTE(review): not referenced in the visible cells
TRANSFORM = False          # when True, prepare_image() calls `transform` during augmentation
FIRST_FOLD_ONLY = True     # stop the fold loop after the first fold

#for KNN training
FINE_TUNE = True           # fine-tune EfficientNet with a sigmoid head before extracting embeddings
FT_EPOCHS = 2              # number of fine-tuning epochs
N_NEIGH = 50               # n_neighbors for the cuML KNeighborsClassifier

#for t-sne
PERPLEXITY = 5             # perplexity passed to cuml.TSNE

#for kmeans
N_CLUSTERS = 5             # number of KMeans clusters (also the number of cluster galleries plotted)

# Helper Functions

In [None]:
def efficientnet(b, image_size, head=False, LR=5e-4):
    """Build an EfficientNet-based Keras model under the active distribution strategy.

    Args:
        b: Index 0..6 selecting EfficientNetB0..EfficientNetB6.
        image_size: ``[height, width]`` of a single channel. The network input
            is ``(height * 3, width, 3)`` because ``prepare_image`` stacks the
            three decoded channels vertically.
        head: If True, append dropout and a 1-unit sigmoid layer and compile the
            model for binary classification (Adam, binary cross-entropy, AUC).
            If False, return an uncompiled embedding model that ends at the
            global average pooling layer.
        LR: Learning rate for Adam; only used when ``head`` is True.

    Returns:
        A ``tf.keras.Sequential`` model. The pooling layer is named
        ``'pooling'`` in both variants so embeddings can be read off by name.
    """
    efns = [EfficientNetB0, EfficientNetB1, EfficientNetB2,
            EfficientNetB3, EfficientNetB4, EfficientNetB5,
            EfficientNetB6]
    with strategy.scope():
        efficient = efns[b](
            input_shape=(image_size[0]*3, image_size[1], 3),
            weights='noisy-student', #imagenet
            include_top=False
        )
        efficient.trainable = True

        layers = [efficient,
                  tf.keras.layers.GlobalAveragePooling2D(name='pooling')]
        if head:
            layers += [tf.keras.layers.Dropout(.2),
                       tf.keras.layers.Dense(1, activation='sigmoid')]
        model = tf.keras.Sequential(layers)

        # compile() creates optimizer and metric variables, so it has to run
        # inside the strategy scope together with model construction.
        if head:
            model.compile(optimizer=tf.keras.optimizers.Adam(LR),
                          loss=tf.keras.losses.BinaryCrossentropy(),
                          metrics=['AUC'])
    return model

def get_train_file_path(image_id):
    """Return the path of the training ``.npy`` file for ``image_id``.

    Files are grouped in sub-folders named after the first character of the id.
    """
    template = "../input/seti-breakthrough-listen/train/{}/{}.npy"
    folder = image_id[0]
    return template.format(folder, image_id)

def get_test_file_path(image_id):
    """Return the path of the test ``.npy`` file for ``image_id``.

    Files are grouped in sub-folders named after the first character of the id.
    """
    template = "../input/seti-breakthrough-listen/test/{}/{}.npy"
    folder = image_id[0]
    return template.format(folder, image_id)

# Matches the "-<count>." part of a TFRecord file name, e.g. "train00-1024.tfrec".
_TFREC_COUNT_RE = re.compile(r"-([0-9]+)\.")


def count_data_items(fileids):
    """Sum the record counts encoded in TFRecord file names.

    Each file name is expected to contain ``-<count>.``; the first such match
    in the name is used.

    Args:
        fileids: Iterable of file names or paths.

    Returns:
        The total number of records as a Python ``int`` (0 for an empty input).

    Raises:
        ValueError: If a file name does not contain a ``-<count>.`` part.
    """
    total = 0
    for fileid in fileids:
        match = _TFREC_COUNT_RE.search(fileid)
        if match is None:
            raise ValueError(f"no '-<count>.' record count found in file name: {fileid!r}")
        total += int(match.group(1))
    return total

def max_val(l, i):
    """Find the largest value in column ``i`` of a sequence of rows.

    Returns a ``(position, value)`` tuple, where ``position`` is the index of
    the winning row within ``l`` (not any field of that row). The first row
    wins on ties.
    """
    column = [row[i] for row in l]
    best_pos = max(range(len(column)), key=column.__getitem__)
    return best_pos, column[best_pos]

# Dataset Functions

In [None]:
def read_labeled_tfrecord(example, return_image_ids):
    """Parse one serialized labeled example.

    Returns ``(image, target, image_id)`` when ``return_image_ids`` is truthy,
    otherwise ``(image, target)``. ``image`` and ``image_id`` are scalar string
    tensors; ``target`` is a scalar int64 tensor.
    """
    string_scalar = tf.io.FixedLenFeature([], tf.string)
    schema = {
        'image': string_scalar,
        'image_id': string_scalar,
        'target': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(example, schema)

    outputs = (parsed['image'], parsed['target'])
    if return_image_ids:
        outputs += (parsed['image_id'],)
    return outputs


def read_unlabeled_tfrecord(example, return_image_ids):
    """Parse one serialized unlabeled example.

    Returns ``(image, image_id)`` when ``return_image_ids`` is truthy,
    otherwise just ``image``. Both are scalar string tensors.
    """
    string_scalar = tf.io.FixedLenFeature([], tf.string)
    schema = {'image': string_scalar, 'image_id': string_scalar}
    parsed = tf.io.parse_single_example(example, schema)

    if not return_image_ids:
        return parsed['image']
    return parsed['image'], parsed['image_id']


def prepare_image(img, augment=True, dim=IMAGE_SIZE,
                  sat=(0.7, 1.3), cont=(0.8, 1.2), bri=0.1):
    """Decode a PNG-encoded image and lay its channels out spatially.

    The three decoded channels are concatenated along the height axis, the
    resulting single-channel image is replicated into three identical channels,
    and values are scaled to ``[0, 1]``.

    Args:
        img: Scalar string tensor with PNG bytes.
        augment: If True, apply a random horizontal flip plus random
            saturation, contrast and brightness changes (and ``transform``
            when the global ``TRANSFORM`` flag is set).
        dim: ``[height, width]`` of one channel.
        sat: ``(lower, upper)`` bounds for ``tf.image.random_saturation``.
        cont: ``(lower, upper)`` bounds for ``tf.image.random_contrast``.
        bri: ``max_delta`` for ``tf.image.random_brightness``.

    Returns:
        A float32 tensor of shape ``[dim[0] * 3, dim[1], 3]``.
    """
    # NOTE(review): `sat`, `cont` and `bri` were previously read as globals that
    # are not defined in the visible notebook, so augment=True raised NameError.
    # The defaults above are placeholders to be tuned. If those globals are
    # defined in a cell not shown here, these parameters now take precedence.
    img = tf.image.decode_png(img, channels=3)

    # converting channel information to spatial information
    img = tf.concat([img[...,idx] for idx in range(3)], axis=0)
    img = tf.stack([img for _ in range(3)], axis=-1)
    img = tf.reshape(img, [dim[0]*3,dim[1], 3])

    img = tf.cast(img, tf.float32) / 255.0

    if augment:
        # NOTE(review): `transform` is not defined in the visible notebook; it
        # is only evaluated when TRANSFORM is True.
        img = transform(img,DIM=dim) if TRANSFORM else img
        img = tf.image.random_flip_left_right(img)
        #img = tf.image.random_hue(img, 0.01)
        img = tf.image.random_saturation(img, sat[0], sat[1])
        img = tf.image.random_contrast(img, cont[0], cont[1])
        img = tf.image.random_brightness(img, bri)

    # Re-assert the static shape after augmentation.
    img = tf.reshape(img, [dim[0]*3,dim[1], 3])

    return img

In [None]:
def get_dataset(files, augment=False, shuffle=False, repeat=False, 
                labeled=True, return_image_ids=True, batch_size=BATCH_SIZE, dim=IMAGE_SIZE):
    """Build a batched ``tf.data`` pipeline over TFRecord files.

    Args:
        files: TFRecord file name(s) to read.
        augment: Forwarded to ``prepare_image``.
        shuffle: Shuffle with an 8192-element buffer and allow
            non-deterministic ordering.
        repeat: Repeat the dataset indefinitely.
        labeled: Parse records with ``read_labeled_tfrecord`` (True) or
            ``read_unlabeled_tfrecord`` (False).
        return_image_ids: Whether parsed elements include the image id.
        batch_size: Per-replica batch size; the emitted batch size is
            ``batch_size * REPLICAS``.
        dim: ``[height, width]`` of one image channel.

    Returns:
        A prefetching dataset whose elements are, depending on the flags,
        ``(image, target, image_id)``, ``(image, target)``,
        ``(image, image_id)`` or just ``image``.
    """
    ds = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO)
    ds = ds.cache()

    if repeat:
        ds = ds.repeat()

    if shuffle:
        ds = ds.shuffle(1024*8)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        ds = ds.with_options(opt)

    parse = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    ds = ds.map(lambda example: parse(example, return_image_ids),
                num_parallel_calls=AUTO)

    def _prepare(img, *rest):
        # Tuple elements are unpacked into positional arguments by Dataset.map,
        # so one function covers the 1-, 2- and 3-component cases. This replaces
        # the nested bare `except:` blocks, which also hid genuine errors
        # raised while tracing prepare_image.
        img = prepare_image(img, augment=augment, dim=dim)
        return (img, *rest) if rest else img

    ds = ds.map(_prepare, num_parallel_calls=AUTO)

    ds = ds.batch(batch_size * REPLICAS)
    ds = ds.prefetch(AUTO)
    return ds

# Data Loading

In [None]:
# The folder name encodes height x width. The listing is sorted so that the
# KFold split, which works on file positions, assigns the same files to the
# same folds on every run.
FILENAMES = sorted(tf.io.gfile.glob(f'../input/setibl-{IMAGE_SIZE[0]}x{IMAGE_SIZE[1]}-tfrec-dataset' + '/*.tfrec'))
TRAINING_FILENAMES = [file for file in FILENAMES if 'train' in file]
TEST_FILENAMES = [file for file in FILENAMES if 'test' in file]

# Fail early with a clear message instead of an obscure error further down.
if not TRAINING_FILENAMES or not TEST_FILENAMES:
    raise FileNotFoundError(
        f'expected train and test .tfrec files, found {len(TRAINING_FILENAMES)} train '
        f'and {len(TEST_FILENAMES)} test files'
    )

# RAPIDS cuML kNN

In [None]:
# Cross-validated pipeline: (optionally) fine-tune EfficientNet on each fold,
# extract pooled embeddings, fit a cuML kNN classifier on them and predict the
# test set. The last fold's `effnet`, `embed`, `embed_val`, `train_labs` and
# `val_labs` stay defined because later cells use them.
test_ds = get_dataset(TEST_FILENAMES, labeled=False, return_image_ids=False, repeat=False, shuffle=False)
skf = KFold(n_splits=FOLDS,shuffle=True,random_state=SEED)
preds_all = []
preds_model = []   # per-fold test probabilities, each of shape (n_test, n_classes)
oof_pred = []      # per-fold validation probabilities of the positive class
oof_labels = []    # per-fold validation labels, aligned with oof_pred

for f, (train_index, val_index) in enumerate(skf.split(TRAINING_FILENAMES)):
    
    print('#'*30); print('#### FOLD',f+1); print('#'*30); print('')
    print('Getting datasets...'); print('')
    
    # Folds are formed over TFRecord files, not over individual images.
    train_files = [TRAINING_FILENAMES[i] for i in train_index]
    val_files = [TRAINING_FILENAMES[i] for i in val_index]
    
    train_ds = get_dataset(train_files, labeled=True, return_image_ids=False, repeat=False, shuffle=False)
    val_ds = get_dataset(val_files, labeled=True, return_image_ids=False, repeat=False, shuffle=False)
    train_labs = [target.numpy() for img, target in iter(train_ds.unbatch())]
    val_labs = [target.numpy() for img, target in iter(val_ds.unbatch())]
    
    effnet_ = efficientnet(b=EFFNET, image_size=IMAGE_SIZE, head=FINE_TUNE)
    
    if FINE_TUNE:
        train_ds_ = get_dataset(train_files, labeled=True, return_image_ids=False, repeat=True, shuffle=True)
        
        print('Fine tuning EfficientNet...'); print('')
        ct_train = count_data_items(train_files)
        # get_dataset emits batches of BATCH_SIZE * REPLICAS, so one epoch is
        # ct_train // (BATCH_SIZE * REPLICAS) steps, not ct_train // BATCH_SIZE.
        steps_per_epoch = max(1, int(ct_train) // (BATCH_SIZE * REPLICAS))
        effnet_.fit(train_ds_, 
                    validation_data=val_ds,
                    verbose=1, 
                    steps_per_epoch=steps_per_epoch,
                    epochs=FT_EPOCHS)
        print('')
        # Drop the dropout/sigmoid head: embeddings are the pooled features.
        effnet = tf.keras.Model(inputs=effnet_.input, 
               outputs=effnet_.get_layer('pooling').output)

    else: effnet = effnet_

    print('Getting embeddings...'); print('')
    embed = effnet.predict(train_ds, verbose=1)
    embed_val = effnet.predict(val_ds, verbose=1)
    embed_test = effnet.predict(test_ds, verbose=1)
    np.save(f'embed_{f}_{IMAGE_SIZE[0]}',embed.astype('float32'))
    np.save(f'embed_val_{f}_{IMAGE_SIZE[0]}',embed_val.astype('float32'))
    
    print('Training KNN...'); print('')
    model = cuml.neighbors.KNeighborsClassifier(n_neighbors=N_NEIGH)
    model.fit(embed, np.array(train_labs).astype('float32'))
    val_preds = model.predict_proba(embed_val)[:, 1]
    score = roc_auc_score(np.array(val_labs), val_preds)
    print(''); print(f'Fold {f+1} AUC: {score}'); print('')
    oof_pred.append(val_preds)
    oof_labels.append(np.array(val_labs))
    
    preds = model.predict_proba(embed_test)
    preds_model.append(preds) 
    
    del effnet_, train_ds, val_ds, embed_test, model; z = gc.collect()
    
    if FIRST_FOLD_ONLY:
        break
    
# Average the test probabilities over the folds that were run, then add a
# leading axis so preds_all has shape (1, n_test, n_classes).
preds_avg = np.stack(preds_model).mean(0)
preds_all.append(preds_avg)
preds_all = np.stack(preds_all)

In [None]:
# Sweep n_neighbors on the embeddings of the last fold that was run and report
# the value with the best validation AUC.
y_train = np.array(train_labs).astype('float32')
y_val = np.array(val_labs)

res = []
for k in range(8, 100, 2):
    model_tt = cuml.neighbors.KNeighborsClassifier(n_neighbors=k)
    model_tt.fit(embed, y_train)
    val_preds = model_tt.predict_proba(embed_val)[:, 1]
    score = roc_auc_score(y_val, val_preds)
    res.append([k, score])

# Take the [k, score] row with the highest score. max_val() returns the row's
# position in `res`, which was previously reported as if it were k.
best_k, best_score = max(res, key=itemgetter(1))
print(f"Fold {f+1} optimized K value: {best_k} with AUC: {best_score}")

# RAPIDS cuML KMeans

In [None]:
train_dummy = get_dataset(TRAINING_FILENAMES, labeled=True,
                         return_image_ids=True, repeat=False, 
                         shuffle=False)

# Collect ids and labels in a single pass. Iterating the dataset once per
# field decoded every training image twice.
names, labels = [], []
for _img, label, img_name in train_dummy.unbatch():
    names.append(img_name.numpy().decode("utf-8"))
    labels.append(label.numpy())
names = np.array(names)
labels = np.array(labels)

In [None]:
# Embed the whole training set twice: once with a freshly loaded pretrained
# backbone and once with `effnet`, the embedding model left over from the fold loop.
fresh_effnet = efficientnet(EFFNET, IMAGE_SIZE, head=False)
train_full = get_dataset(
    TRAINING_FILENAMES,
    labeled=True,
    return_image_ids=False,
    repeat=False,
    shuffle=False,
)
embed_full, embed_full_tuned = (
    net.predict(train_full, verbose=1) for net in (fresh_effnet, effnet)
)

In [None]:
# One row per training image: the path of its .npy file and its label.
train = pd.DataFrame({
    'image_id': [get_train_file_path(name) for name in names],
    'label': labels,
})

In [None]:
# Cluster the pretrained and the fine-tuned embeddings separately and store
# each image's cluster id next to its label.
model, model_ = cuml.KMeans(n_clusters=N_CLUSTERS), cuml.KMeans(n_clusters=N_CLUSTERS)
for estimator, features, column in (
    (model, embed_full, 'cluster'),
    (model_, embed_full_tuned, 'cluster_tuned'),
):
    estimator.fit(features)
    train[column] = estimator.labels_
train.head()

In [None]:
# Release the Keras models and KMeans estimators; the result of gc.collect()
# is assigned so the cell shows no output.
del effnet, fresh_effnet, model, model_
z = gc.collect()

### With Pretrained (Noisy Student) Weights

In [None]:
# Paths of the raw .npy files, aligned with `names` and with the rows of `train`.
names_ = np.array([f"../input/seti-breakthrough-listen/train/{name[0]}/{name}.npy" for name in names])
for k in range(N_CLUSTERS):
    print('#'*25);
    print(f'#### Cluster {k} of similar train images')
    print('#'*25)
    df = train.loc[train.cluster==k]
    plt.figure(figsize=(20,10))
    # Show at most 8 members. A cluster can hold fewer than 8 images, which
    # made the fixed range(8) raise IndexError.
    for j, idx in enumerate(df.index[:8]):
        plt.subplot(2,4,j+1)
        image = np.load(names_[idx])
        image = image.astype(np.float32)
        # Stack the leading axis vertically, then transpose for display.
        image = np.vstack(image).transpose((1, 0))
        image = cv2.resize(image, (256, 256))
        plt.axis('off')
        plt.title(f"{names[idx]}, Target = {df.loc[idx,'label']}")
        plt.imshow(image)  
    plt.show()

### After Fine Tuning

In [None]:
# Same gallery as for the pretrained embeddings, using the clusters found on
# the fine-tuned embeddings.
for k in range(N_CLUSTERS):
    print('#'*25);
    print(f'#### Cluster {k} of similar train images')
    print('#'*25)
    df = train.loc[train.cluster_tuned==k]
    plt.figure(figsize=(20,10))
    # Show at most 8 members. A cluster can hold fewer than 8 images, which
    # made the fixed range(8) raise IndexError.
    for j, idx in enumerate(df.index[:8]):
        plt.subplot(2,4,j+1)
        image = np.load(names_[idx])
        image = image.astype(np.float32)
        # Stack the leading axis vertically, then transpose for display.
        image = np.vstack(image).transpose((1, 0))
        image = cv2.resize(image, (256, 256))
        plt.axis('off')
        plt.title(f"{names[idx]}, Target = {df.loc[idx,'label']}")
        plt.imshow(image)  
    plt.show()

# RAPIDS cuML T-SNE

In [None]:
# 2-D t-SNE projection of the pretrained embeddings. The seed is set so that
# repeated runs produce comparable layouts.
model = cuml.TSNE(perplexity=PERPLEXITY, n_iter=500, random_state=SEED)
embed2D = model.fit_transform(embed_full)
train['x'] = embed2D[:,0]
train['y'] = embed2D[:,1]

In [None]:
# 2-D t-SNE projection of the fine-tuned embeddings. The seed is set so that
# repeated runs produce comparable layouts.
model_ = cuml.TSNE(perplexity=PERPLEXITY, n_iter=500, random_state=SEED)
embed2D_ = model_.fit_transform(embed_full_tuned)
train['x_tuned'] = embed2D_[:,0]
train['y_tuned'] = embed2D_[:,1]

In [None]:
# Release both t-SNE estimators; the result of gc.collect() is assigned so the
# cell shows no output.
del model, model_
z = gc.collect()

### With Pretrained (Noisy Student) Weights

In [None]:
# t-SNE of the pretrained embeddings, coloured by label. df1 and df2 are
# reused by the fine-tuned plot in a later cell.
plt.figure(figsize=(10,10))
df1 = train.loc[train.label==0]
df2 = train.loc[train.label==1]

for subset, colour, tag in ((df1, 'blue', '0'), (df2, 'red', '1')):
    plt.scatter(subset.x, subset.y, c=colour, s=10, label=tag)
plt.legend();

### After Fine Tuning

In [None]:
# t-SNE of the fine-tuned embeddings, coloured by label.
plt.figure(figsize=(10,10))

for subset, colour, tag in ((df1, 'blue', '0'), (df2, 'red', '1')):
    plt.scatter(subset.x_tuned, subset.y_tuned, c=colour, s=10, label=tag)
plt.legend();

# Submission

In [None]:
# Read the test ids in the same unshuffled order used for the test predictions.
test_dummy = get_dataset(
    TEST_FILENAMES,
    labeled=False,
    return_image_ids=True,
    repeat=False,
    shuffle=False,
)
test_ids = np.array([
    raw_id.numpy().decode("utf-8")
    for _img, raw_id in test_dummy.unbatch()
])

In [None]:
# Start the submission table with one row per test id.
submission = pd.DataFrame(data={'id': test_ids})

In [None]:
# preds_all is (1, n_test, n_classes): keep the class-1 probability and
# collapse the leading axis. Rows stay in TFRecord read order, not sorted by id.
submission["target"] = preds_all[..., 1].mean(axis=0)
submission.to_csv("submission.csv", index=False)
submission

In [None]:
# Distribution of the predicted probabilities.
plt.hist(submission["target"]);