part 2 -> [notebook](https://www.kaggle.com/keremt/cassava-eda-part2-cnn-dedup-with-rapids/)

In [None]:
!pip install -qqU fastai==2.1.7

In [None]:
import fastai; print("fastai:", fastai.__version__)
import torch; print("torch:", torch.__version__)

In [None]:
from fastai.vision.all import *
import torchvision 

In [None]:
# Root folders of the two Kaggle input datasets used in this notebook:
# the new competition data and the old cassava data.
new_data_path = Path("../input/cassava-leaf-disease-classification//")
old_data_path = Path("../input/cassavaold/")

## Data

### a) New Data

In [None]:
# List the names of the entries in the new dataset folder.
new_data_path.ls().map(lambda o: o.name)

In [None]:
# Collect the train/test image paths and read the training label table.
train_images = get_image_files(new_data_path/'train_images')
test_images = get_image_files(new_data_path/'test_images')
train_df = pd.read_csv(new_data_path/'train.csv')

In [None]:
# Number of train and test image files found.
len(train_images), len(test_images)

In [None]:
# Alias: the train images are what the rest of the notebook calls the "new" images.
new_images = train_images

In [None]:
# Number of rows per numeric label.
train_df['label'].value_counts()

In [None]:
# Load the label-number -> disease-name mapping shipped with the new dataset.
# read_text() opens and closes the file itself, so no file handle is left open.
labeldict = json.loads((new_data_path/'label_num_to_disease_map.json').read_text())
# JSON object keys are strings; convert them to ints so they match train_df['label'].
labeldict = {int(k):v for k,v in labeldict.items()}

In [None]:
# Label counts with the numeric labels mapped through labeldict.
train_df['label'].map(labeldict).value_counts()

In [None]:
%%timeit
# Timing: open one image with PILImage.create and convert it with ToTensor.
img1 = PILImage.create(train_images[0]) 
img1 = ToTensor()(img1)

In [None]:
%%timeit
# Timing: read the same image file with torchvision.io.read_image.
img2 = torchvision.io.read_image(train_images[0].as_posix())

### b) Old Data

Please upvote if you use: https://www.kaggle.com/keremt/cassavaold

In [None]:
# Image paths from the three sub-folders of the old dataset.
old_train_images = get_image_files(old_data_path/'train')
old_test_images = get_image_files(old_data_path/'test')
old_unsup_images = get_image_files(old_data_path/'extraimages')

In [None]:
# Pool every old image (train, test and extra images) into one collection.
old_images = old_train_images + old_test_images + old_unsup_images

In [None]:
len(old_images)

## 1) Image Hash based Dedup

Let's see if we can use old competition data or not...

Note that below we do an exact match of hash codes, so we are not looking at soft similarity scores; for that, skip to the CNN-based dedup section.

In [None]:
import imagehash, PIL

In [None]:
def get_imagehash(path, hashfunc=imagehash.average_hash, hash_size=8):
    """Return the hash of the image at `path` as a string.

    `hashfunc` is called as `hashfunc(img, hash_size=hash_size)` and its
    result is converted with `str`.

    The image is opened in a `with` block so the underlying file is closed
    as soon as the hash is computed; this matters when hashing many files.
    """
    with PIL.Image.open(path) as img:
        return str(hashfunc(img, hash_size=hash_size))

In [None]:
# One hashing callable (phash with hash_size=8) shared by both image sets;
# `parallel` applies it to every path.
phash8 = partial(get_imagehash, hashfunc=imagehash.phash, hash_size=8)
new_image_hashes = parallel(phash8, train_images)
old_image_hashes = parallel(phash8, old_images)

In [None]:
# Number of hashes computed for the new and the old image sets.
len(new_image_hashes), len(old_image_hashes)

### Dups between old and new datasets

In [None]:
# Hash strings that occur in both the new and the old image sets (exact match).
common_hashes = list(set(new_image_hashes).intersection(set(old_image_hashes))); len(common_hashes)

In [None]:
print(f"Total of {len(common_hashes)}/{len(old_image_hashes)} pairs might be same images")

In [None]:
# Peek at a few of the shared hash strings.
common_hashes[:5]

In [None]:
def _group_paths_by_hash(hashes, paths):
    "Map each hash code to the list of image paths that produced it."
    groups = defaultdict(list)
    for code, path in zip(hashes, paths):
        groups[code].append(path)
    return groups

# hash -> list of image paths, built separately for the new and the old images.
hash2new_images = _group_paths_by_hash(new_image_hashes, train_images)
hash2old_images = _group_paths_by_hash(old_image_hashes, old_images)

In [None]:
# New and old image paths that share the first common hash.
hash2new_images[common_hashes[0]], hash2old_images[common_hashes[0]]

In [None]:
# Load the images behind each shared hash (new ones first, then old ones)
# and stop once more than 64 images have been collected.
imgs = []
for h in common_hashes:
    matched_paths = hash2new_images[h] + hash2old_images[h]
    imgs.extend(PILImage.create(p) for p in matched_paths)
    if len(imgs) > 64:
        break

Let's plot the first 64 samples because of the notebook's output limit. We can see that these are indeed the same images. But 4902 is not too bad if those are all the duplicates. I am not very experienced with image hash methods, and using a different hash function or different parameters might change the result. For that reason I will let you be the judge of whether to use the old competition data or not. Let me know what you think down in the comments, as I am also very interested to hear about it!

You may notice that the new images are all horizontal, and that the old ones are resized in the new dataset to make them horizontal too. This is probably due to how the photo was originally taken with the phone.

In [None]:
# Show the first 64 collected images in an 8x8 grid.
show_images(imgs[:64], nrows=8, ncols=8)

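You can check how average hash performs at this link: https://johannesbuchner.github.io/imagehash/art2.html. It looks like it has a high false positive rate, i.e. it flags non-duplicates as similar even though they are different. This is pretty good for us: a hash that errs on the side of flagging too many pairs is unlikely to miss real duplicates, which gives us confidence that there are probably no other duplicates. (Note that the hashes above were computed with `imagehash.phash`, not with average hash.)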

In [None]:
# For every shared hash, the list of old (resp. new) image paths with that hash.
# Both lists follow the order of common_hashes, so they are aligned by position.
duplicate_old_images = [hash2old_images[o] for o in common_hashes]
duplicate_new_images = [hash2new_images[o] for o in common_hashes]

In [None]:
len(duplicate_old_images), len(duplicate_new_images)

We might prefer to use either the old data or the new one for dups. It looks like old data is resized.

In [None]:
# One (old image paths, new image paths) tuple per shared hash.
# Keep the whole list: indexing it with [0] would retain only the first pair.
old_new_duplicate_pairs = list(zip(duplicate_old_images, duplicate_new_images))

In [None]:
# Save old_new_duplicate_pairs to a pickle file in the working directory.
pd.to_pickle(old_new_duplicate_pairs, "old_new_duplicate_pairs.pkl")

### Dups within new dataset or old dataset

In [None]:
# Groups of new images whose hash is shared by more than one new image.
new_image_dups = [v for k,v in hash2new_images.items() if len(v) > 1]

In [None]:
new_image_dups

In [None]:
# Groups of old images whose hash is shared by more than one old image.
old_image_dups = [v for k,v in hash2old_images.items() if len(v) > 1]

In [None]:
# Number of duplicate groups and total number of images inside them.
# Summing the group sizes also works when there are no groups at all,
# whereas np.concatenate raises on an empty list.
len(old_image_dups), sum(len(group) for group in old_image_dups)

In [None]:
# Pick one duplicate group at random by index. np.random.choice needs a 1-D
# array and cannot be relied on for a list of variable-length lists.
dup_group = old_image_dups[np.random.randint(len(old_image_dups))]
# Title each image with "<parent folder>/<file name>".
dups = [(Path(o.parent.name)/o.name, PILImage.create(o)) for o in dup_group]
titles, imgs = zip(*dups)
show_images(imgs, titles=titles)

In [None]:
# Save the old-dataset duplicate groups to a pickle file in the working directory.
pd.to_pickle(old_image_dups, "old_image_dups.pkl")

## 2) CNN Based Dedup

### Normalize labels 

In [None]:
len(old_images), len(new_images)

In [None]:
# Maps a key of the old dataset (looked up below via each image's parent folder
# name) to a label name. The '0' (test) and 'extraimages' entries are marked 'Unsup'.
# NOTE(review): the label names presumably match the values of labeldict - confirm.
oldlabeldict = {'cbsd': 'Cassava Brown Streak Disease (CBSD)',
                 'healthy': 'Healthy',
                 'cmd': 'Cassava Mosaic Disease (CMD)',
                 'cgm': 'Cassava Green Mottle (CGM)',
                 'cbb': 'Cassava Bacterial Blight (CBB)',
                 '0': 'Unsup', # test
                 'extraimages': 'Unsup'}

In [None]:
labeldict

In [None]:
# Old images: the label comes from the name of the folder holding the file.
old_images2labels = {p: oldlabeldict[p.parent.name] for p in old_images}

# New images: file name -> label name via train.csv, then re-keyed by full path.
name2label = {name: labeldict[num] for name, num in zip(train_df['image_id'], train_df['label'])}
new_images2labels = {p: name2label[p.name] for p in new_images}

In [None]:
# Label distribution of the old images.
Counter(old_images2labels.values())

In [None]:
# Label distribution of the new images.
Counter(new_images2labels.values())

In [None]:
# Merge both mappings; if a key appeared in both, the new-dataset entry would win.
all_images2label = {**old_images2labels, **new_images2labels}

In [None]:
Counter(all_images2label.values())

In [None]:
len(all_images2label)

In [None]:
# Label name -> integer index, with 'Unsup' as a sixth entry.
label_vocab = {'Cassava Bacterial Blight (CBB)':0,
             'Cassava Brown Streak Disease (CBSD)':1,
             'Cassava Green Mottle (CGM)':2,
             'Cassava Mosaic Disease (CMD)':3,
             'Healthy':4, 
             'Unsup':5}

### 2) Get Embeddings 

In [None]:
# Old images first, then new images; later index-based lookups rely on this order.
all_images = old_images + new_images; len(all_images)

In [None]:
# Torchvision
size = (224,224)
bs = 64
# Read an image file with torchvision and wrap the result as a fastai TensorImage.
def open_image(fn):    return TensorImage(torchvision.io.read_image(str(fn)))

# Two transform pipelines per item: x = read + resize to `size`,
# y = label name looked up by path, then categorized with label_vocab.
tfms = [[open_image, torchvision.transforms.Resize(size, )], 
        [lambda o: all_images2label[o], Categorize(label_vocab)]]
# Applied per batch: int -> float conversion, then normalization with imagenet_stats.
batch_tfms = [IntToFloatTensor, Normalize.from_stats(*imagenet_stats)]

# splits=None: no train/valid split is passed, all images go into one dataset.
dsets = Datasets(all_images, tfms=tfms, splits=None)
dls = dsets.dataloaders(bs=bs, after_batch=batch_tfms)

In [None]:
# Display the image of the first dataset item as a quick check of the item transforms.
show_image(dsets[0][0]);

In [None]:
%%time
# Render one batch (at most 25 items) and time it.
dls.show_batch(max_n=25)

In [None]:
# # Fastai
# size = (224,224)
# bs = 64

# tfms = [[PILImage.create, ToTensor, Resize(size, method='squish')], 
#         [lambda o: all_images2label[o], Categorize(label_vocab)]]

# dsets = Datasets(all_images, tfms=tfms, splits=None)

# batch_tfms = [IntToFloatTensor, Normalize.from_stats(*imagenet_stats)]
# dls = dsets.dataloaders(bs=bs, after_batch=batch_tfms)

In [None]:
# show_image(dsets[0][0]);

In [None]:
# %%time
# dls.show_batch(max_n=25)

In [None]:
# Pretrained resnet34 built with fastai's create_cnn_model; the output size (1)
# is irrelevant because most of the head is dropped on the next line.
model = create_cnn_model(resnet34, 1, pretrained=True)
# Keep the body (model[0]) plus only the first two layers of the head, so the
# model outputs feature vectors instead of class scores.
# NOTE(review): those two head layers are presumably pooling + flatten - confirm.
model = nn.Sequential(model[0], model[1][:2])
# loss_func must be an instance, not the class. It is not used for the
# embedding extraction here, but Learner expects a callable loss object.
learner = Learner(dls, model, loss_func=CrossEntropyLossFlat())

In [None]:
# generate embeddings
# Build a test dataloader over all images from the existing dls.
embedding_dl = dls.test_dl(all_images)
# act=noop is passed as the activation; the second return value is discarded.
embeddings, _ = learner.get_preds(dl=embedding_dl, act=noop)
# Save the embeddings together with the file list they were computed from.
# NOTE(review): rows of `embeddings` presumably follow the order of all_images - confirm.
torch.save(embeddings, "embeddings.pth")
pd.to_pickle(all_images, "all_images_filenames.pkl")

In [None]:
# load
# Reload the saved embeddings and compare their shape with the number of images.
embeddings = torch.load("embeddings.pth")
embeddings.shape, len(all_images)

CPU RAM on this GPU kernel is not enough, so we need to chunk the embeddings into rows and columns. It's also pretty slow, so let me know down in the comments if you have a better solution for this!

For demonstration, on a GPU kernel I will only look at the first 2000 images (the first two chunks of 1000) to see if there are any dups for them.

In [None]:
gpu_kernel = torch.cuda.is_available()
# Row indices split into chunks of 1000 so only one block of rows is computed at a time.
chunk_idxs = list(chunked(range(len(embeddings)), chunk_sz=1000))

# On a GPU kernel only the first two row chunks are compared against all images
# (demonstration mode); otherwise every chunk is processed. The limit is applied
# up front so no chunk is computed and then thrown away.
n_row_chunks = 2 if gpu_kernel else len(chunk_idxs)

# Cosine similarity is the dot product of L2-normalised vectors, so normalise
# once and use a matrix product per row chunk. This avoids the large broadcast
# intermediates of a pairwise F.cosine_similarity call.
normed = F.normalize(embeddings, dim=-1)
normed_t = normed.T

# Each entry of `sims` has shape (rows in chunk, number of images).
sims = []
for row_idxs in progress_bar(chunk_idxs[:n_row_chunks]):
    sims.append(normed[row_idxs] @ normed_t)

In [None]:
# Stack the per-chunk row blocks into a single similarity matrix.
sims = torch.cat(sims)

In [None]:
sims.shape

In [None]:
# Keep only entries strictly above the diagonal (column index > row index), so
# self-matches and mirrored pairs are zeroed out.
sims = torch.triu(sims, diagonal=1)
thresh = 0.95
# Evaluate the threshold mask once. With a positive threshold the zeroed
# entries can never pass, so no extra i != j filter is needed.
rows, cols = torch.where(sims > thresh)
similar_idxs = list(zip(rows.tolist(), cols.tolist()))

In [None]:
# len(sims) is the number of rows of the similarity matrix.
print(f"We found {len(similar_idxs)}/{len(sims)} similar pairs")

Let's plot a few random similar pairs

In [None]:
similar_files = []

# Show up to 20 randomly chosen similar pairs side by side and remember their paths.
for i,j in np.random.permutation(similar_idxs)[:20]:
    fn1, fn2 = all_images[i], all_images[j]
    similar_files.append((fn1, fn2))

    imgs = [open_image(fn1), open_image(fn2)]
    # Titles are the file paths. A label-based title list used to be built here
    # but was overwritten before use, so that dead assignment is gone.
    titles = [fn1, fn2]
    show_images(imgs, titles=titles, imsize=10)

### Up next: Rapids CuML clustering!