Part 2 of [notebook](https://www.kaggle.com/keremt/cassava-eda-imagehash-cnn-dedup-old-and-new-data)

In this notebook we find duplicate images using RAPIDS (cuML nearest neighbors).

In [None]:
!pip install -qqU fastai==2.1.7

In [None]:
# Print the installed library versions so the results below can be tied to them.
import fastai; print("fastai:", fastai.__version__)
import torch; print("torch:", torch.__version__)

In [None]:
from fastai.vision.all import *
import torchvision 

In [None]:
# Input locations: the new competition data, the old competition data, and the
# precomputed dedup artifacts (embeddings and filename list) loaded further down.
new_data_path, old_data_path, dedup_path = map(Path, (
    "../input/cassava-leaf-disease-classification/",
    "../input/cassavaold/",
    "../input/cassavadedup/",
))

## Data

### a) New Data

In [None]:
# Names of the entries directly inside the new dataset directory.
L(entry.name for entry in new_data_path.ls())

In [None]:
# Collect the image files of the new dataset and read its label table.
train_images, test_images = (
    get_image_files(new_data_path/folder) for folder in ('train_images', 'test_images')
)
train_df = pd.read_csv(new_data_path/'train.csv')

In [None]:
# Number of train and test image files found for the new dataset.
len(train_images), len(test_images)

In [None]:
# The "new" image set used for deduplication is the training images only.
new_images = train_images

In [None]:
# Class frequencies of the new dataset (numeric labels).
train_df.loc[:, 'label'].value_counts()

In [None]:
# Numeric label -> disease name mapping shipped with the new dataset.
# read_text() opens and closes the file itself, so no file handle is left open.
# JSON object keys are strings, so they are converted to int to match train_df['label'].
labeldict = {
    int(label_num): disease
    for label_num, disease in json.loads(
        (new_data_path/'label_num_to_disease_map.json').read_text()
    ).items()
}

In [None]:
# Class frequencies of the new dataset, shown with disease names instead of numeric labels.
train_df.loc[:, 'label'].map(labeldict).value_counts()

In [None]:
%%timeit
# Time loading one training image through fastai: PILImage.create followed by ToTensor.
img1 = PILImage.create(train_images[0]) 
img1 = ToTensor()(img1)

In [None]:
%%timeit
# Time loading the same image with torchvision.io.read_image, for comparison with the fastai path.
img2 = torchvision.io.read_image(train_images[0].as_posix())

### b) Old Data

Please upvote if you use: https://www.kaggle.com/keremt/cassavaold

In [None]:
# Collect the image files of the old dataset, one list per top-level folder.
old_train_images, old_test_images, old_unsup_images = (
    get_image_files(old_data_path/folder) for folder in ('train', 'test', 'extraimages')
)

In [None]:
# Concatenate the old dataset's train, test and extraimages file lists into one collection.
old_images = old_train_images + old_test_images + old_unsup_images

In [None]:
# Total number of image files in the old dataset.
len(old_images)

## CNN Based Dedup (Part2)

### Normalize labels 

In [None]:
# Sizes of the two image collections that are merged below.
len(old_images), len(new_images)

In [None]:
# Old-dataset parent folder name -> label name used by the new dataset.
# Folders without a class label ('0' and 'extraimages') are mapped to 'Unsup'.
oldlabeldict = dict([
    ('cbsd', 'Cassava Brown Streak Disease (CBSD)'),
    ('healthy', 'Healthy'),
    ('cmd', 'Cassava Mosaic Disease (CMD)'),
    ('cgm', 'Cassava Green Mottle (CGM)'),
    ('cbb', 'Cassava Bacterial Blight (CBB)'),
    ('0', 'Unsup'),  # test
    ('extraimages', 'Unsup'),
])

In [None]:
# Show the new dataset's numeric-label -> disease-name mapping for reference.
labeldict

In [None]:
# Old dataset: the label comes from the name of the folder that contains each image.
old_images2labels = {path: oldlabeldict[path.parent.name] for path in old_images}

# New dataset: look up each file name in train.csv, translate the numeric label
# to its disease name, then re-key the mapping by the full image path.
new_images2labels = {
    image_id: labeldict[label_num]
    for image_id, label_num in zip(train_df['image_id'], train_df['label'])
}
new_images2labels = {path: new_images2labels[path.name] for path in new_images}

In [None]:
# Label distribution of the old dataset after normalising the label names.
Counter(old_images2labels.values())

In [None]:
# Label distribution of the new dataset, using the same label names.
Counter(new_images2labels.values())

In [None]:
# Single path -> label lookup covering both datasets (old entries first, then new ones).
all_images2label = dict(old_images2labels)
all_images2label.update(new_images2labels)

In [None]:
# Label distribution of the merged data.
Counter(all_images2label.values())

In [None]:
# Number of distinct image paths in the merged lookup.
len(all_images2label)

In [None]:
# Label name -> integer code; the position in the tuple is the code.
label_vocab = {
    name: code
    for code, name in enumerate((
        'Cassava Bacterial Blight (CBB)',
        'Cassava Brown Streak Disease (CBSD)',
        'Cassava Green Mottle (CGM)',
        'Cassava Mosaic Disease (CMD)',
        'Healthy',
        'Unsup',
    ))
}

In [None]:
# Build the merged table: one row per image with its path (as a string) and label.
fnames = [str(path) for path in all_images2label]
labels = tuple(all_images2label.values())
data_df = pd.DataFrame({'fnames': fnames, 'labels': labels})
# The third path component ("../input/<dataset>/...") identifies the source dataset.
data_df['source'] = data_df['fnames'].map(lambda fname: fname.split("/")[2])

In [None]:
# Save the merged table. The same file is written again at the end of the notebook,
# once the `knn_groups` column has been added.
data_df.to_csv("merged_training_data.csv", index=False)

In [None]:
# Display the merged table.
data_df

In [None]:
# Number of images per (source dataset, label) pair.
counts_df = data_df.groupby(['source', 'labels']).count()
counts_df

We can see that the label distributions differ between the two datasets.

In [None]:
# Share of each label within its source dataset.
# Dividing by the per-source totals broadcast back with transform('sum') keeps the
# (source, labels) index unchanged and avoids calling float() on a Series, which is
# deprecated for one-element Series and fails outright if counts_df has several columns.
counts_df / counts_df.groupby(level=0).transform('sum')

In [None]:
# Share of each (source, label) pair among all labelled images, i.e. with the 'Unsup' rows removed.
counts_df.drop(index='Unsup', level='labels').pipe(lambda frame: frame / frame.sum())

### 2) Get Embeddings 

In [None]:
# Every image to embed: old dataset first, then the new training images.
all_images = old_images + new_images
len(all_images)

In [None]:
# Fastai
# Build a Datasets/DataLoaders pair over all images (old + new).
size = (224,224)  # size passed to Resize below
bs = 64  # batch size passed to dataloaders()

# Two transform lists, one per element of each item: the first opens and resizes
# the image, the second looks up the image's label and categorizes it with label_vocab.
tfms = [[PILImage.create, ToTensor, 
         Resize(size, method='squish')], # We don't want to crop different parts of the same image if we are going to look for dedups!
        [lambda o: all_images2label[o], Categorize(label_vocab)]]

# splits=None: no train/validation split is passed.
dsets = Datasets(all_images, tfms=tfms, splits=None)

# Batch-level transforms: int -> float conversion, then normalization with the ImageNet statistics.
batch_tfms = [IntToFloatTensor, Normalize.from_stats(*imagenet_stats)]
dls = dsets.dataloaders(bs=bs, after_batch=batch_tfms)

In [None]:
# Show the image part of the first dataset item as a visual check of the item transforms.
show_image(dsets[0][0]);

In [None]:
%%time
# Display up to 25 samples from a batch and report how long that takes.
dls.show_batch(max_n=25)

In [None]:
# Load the precomputed embeddings from the dedup dataset.
# map_location="cpu" makes the load succeed regardless of the device the tensor was
# saved from, and keeps the later `.numpy()` conversion valid.
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
embeddings = torch.load(dedup_path/"embeddings.pth", map_location="cpu")
embeddings.shape, len(all_images)

In [None]:
# Replace the freshly listed files with the filename list saved with the embeddings.
# NOTE: read_pickle unpickles the file, so only load files from a trusted source.
all_images = pd.read_pickle(dedup_path/"all_images_filenames.pkl")
# Neighbor indices computed on the embedding rows are used to index `all_images` below,
# so the two must have the same length.
assert len(all_images) == embeddings.shape[0], (
    f"{len(all_images)} filenames vs {embeddings.shape[0]} embedding rows"
)

In [None]:
# Compare the embedding shape with the number of loaded filenames; the row indices
# returned by the KNN search below are used to index `all_images`.
embeddings.shape, len(all_images)

## 3) KNN

In [None]:
import cuml

In [None]:
# Convert to a NumPy array for cuML. detach() and cpu() make the conversion work even
# if the saved tensor tracks gradients or lives on the GPU; both are no-ops otherwise.
embeddings_np = embeddings.detach().cpu().numpy()

In [None]:
# Number of neighbors returned per query row.
KNN = 4
model = cuml.neighbors.NearestNeighbors(n_neighbors=KNN)
model.fit(embeddings_np)
# Query with the same matrix that was fitted, so each row gets KNN neighbors from the
# full image set. Later cells skip column 0 (`[:, 1:]`) when looking for duplicates,
# presumably because it is the query image itself.
distances, indices = model.kneighbors(embeddings_np)

In [None]:
# Histogram of each image's smallest distance to a neighbor, ignoring column 0.
plt.hist(distances[:, 1:].min(axis=1));

The nearest-neighbor distances show enough variability in the 30 - 35 range to indicate different samples, so we can probably set the threshold at < 30. You can also play with different `upper` and `lower` values to see how the nearest neighbors change.

Here I keep 30 to be conservative, but you may pick a lower value for the `upper` threshold.

Actually let's go with 25, as we can start seeing duplicates from that point :)

In [None]:
# Flag neighbor distances inside the half-open interval [lower, upper).
lower, upper = 0, 25
mask = (distances >= lower) & (distances < upper)

In [None]:
# An image is a potential duplicate if any neighbor other than column 0 passes the distance mask.
dup_idxs = mask[:, 1:].any(axis=1)
print(f"Total potential duplicates: {dup_idxs.sum()}")

In [None]:
# Keep only the potential-duplicate rows and order them by the distance in column 1,
# so the closest pairs come first.
sortidxs = np.argsort(distances[dup_idxs][:, 1])
dup_indices = indices[dup_idxs][sortidxs]
dup_mask = mask[dup_idxs][sortidxs]
dup_distances = distances[dup_idxs][sortidxs]

In [None]:
# Preview the first 20 candidate groups (closest first), each image titled with its label.
for idxs, m in zip(dup_indices[:20], dup_mask[:20]):
    masked_idxs = idxs[m]

    # Always show the row's first neighbor, even if it did not pass the distance mask.
    if masked_idxs[0] != idxs[0]:
        masked_idxs = [idxs[0], *masked_idxs]

    fnames = [all_images[img_idx] for img_idx in masked_idxs]
    titles = [all_images2label[fn] for fn in fnames]
    imgs = [PILImage.create(fn) for fn in fnames]
    show_images(imgs, imsize=5, titles=titles)

### Save groups

In [None]:
# Reminder of the merged table's columns before the group ids are attached.
data_df.head()

In [None]:
# Assign one group id per set of connected duplicates.
# Giving every row its own id and overwriting earlier assignments splits chains of
# duplicates (A~B, B~C) across groups and can leave a group with a single member.
# A small union-find merges all rows that share an image into the same group.
parent = {}  # image index -> parent image index (a root points to itself)

def _find(node):
    """Return the root of `node`'s group, registering `node` the first time it is seen."""
    parent.setdefault(node, node)
    root = node
    while parent[root] != root:
        root = parent[root]
    # Path compression: point every node on the walked path straight at the root.
    while parent[node] != root:
        parent[node], node = root, parent[node]
    return root

for idxs, m in zip(dup_indices, dup_mask):

    masked_idxs = idxs[m]

    # Always include the row's first neighbor, even if it did not pass the distance mask.
    if masked_idxs[0] != idxs[0]:
        masked_idxs = [idxs[0]] + list(masked_idxs)

    anchor = _find(int(masked_idxs[0]))
    for idx in masked_idxs[1:]:
        parent[_find(int(idx))] = anchor

# Number the groups 0, 1, 2, ... in order of first appearance (closest pairs first).
image2groupid = {}
root2groupid = {}
for node in list(parent):
    groupid = root2groupid.setdefault(_find(node), len(root2groupid))
    image2groupid[str(all_images[node])] = groupid

In [None]:
# Number of images that were assigned to a duplicate group.
len(image2groupid)

In [None]:
# Attach the group id to each row; images without a duplicate group get NaN.
data_df['knn_groups'] = data_df.fnames.map(image2groupid)

In [None]:
# Sorted distinct group ids, taken from the rows that have no missing values.
unique_groups = np.unique(data_df.dropna()['knn_groups'].to_numpy())

In [None]:
# Pick one duplicate group at random and list its rows.
k = np.random.choice(unique_groups)
group_df = data_df[data_df['knn_groups'] == k]
group_df

In [None]:
# Show every image of the sampled group.
# `open_image` is a fastai v1 name and is not exported by fastai v2's `fastai.vision.all`,
# so the images are opened with PILImage.create, as in the preview loop above.
show_images([PILImage.create(o) for o in group_df['fnames']], imsize=7)

In [None]:
# Overwrite the CSV saved earlier, now including the `knn_groups` column.
data_df.to_csv("merged_training_data.csv",index=False)