# About this notebook

When I saw [Chris Deotte's][1] wonderful notebook in Feedback Prize competition,

 https://www.kaggle.com/cdeotte/rapids-umap-tfidf-kmeans-discovers-15-topics 

I wondered whether images could be grouped in the same way by applying UMAP to image embeddings.

Thank you, Chris Deotte! I respect your work.

In this notebook, I show that dogs and cats can be separated easily by applying UMAP to image embeddings. 

[1]:https://www.kaggle.com/cdeotte

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import PIL

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# 0. Import timm

In [None]:
!pip install timm

In [None]:
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

In [None]:
import timm
from pprint import pprint
# Ask timm for the names of all architectures that have pretrained weights
# and pretty-print the resulting list.
model_names = timm.list_models(pretrained=True)
pprint(model_names)

# 1.Loading train data

In [None]:
# Read the competition's train.csv into a DataFrame.
train = pd.read_csv("../input/petfinder-pawpularity-score/train.csv")
# Bare expression: rendered as this cell's output.
train

In [None]:
# Directory that holds the training images.
path = "../input/petfinder-pawpularity-score/train"

# One file path per row, built as <path>/<Id>.jpg.
train["path"] = [f"{os.path.join(path, image_id)}.jpg" for image_id in train["Id"]]

In [None]:
# Show the first three rows to check the new "path" column.
train.head(3)

## show 1 image

In [None]:
# Load the first training image with OpenCV.
img = cv2.imread(train["path"].iat[0])
# Reverse the channel axis (BGR -> RGB) before handing the array to matplotlib.
plt.imshow(img[..., ::-1])

# 2. Making Image embedding

## 2.1 Dataset/Dataloader

For the basic structure of the code, I referred to the following notebook.


ref ) https://www.kaggle.com/titericz/imagenet-embeddings-plus-rapids-svr @GIBA. Thank you very much!

In [None]:
class timmDataset:
    """Map-style dataset: load an image from disk and apply a timm transform.

    Parameters
    ----------
    path : iterable of str
        Image file paths (list, pandas Series, ...). The paths are copied into
        a plain list so that ``dataset[i]`` is always a *positional* lookup;
        indexing a pandas Series with an integer is a label lookup and fails
        as soon as the Series does not carry a default 0..n-1 index.
    modelcfg : dict
        Keyword arguments forwarded to ``create_transform`` (typically the
        dict returned by ``resolve_data_config``). The ``None`` default is
        kept for signature compatibility, but a config is required.

    Raises
    ------
    TypeError
        If ``modelcfg`` is ``None``.
    """

    def __init__(self, path, modelcfg=None):
        if modelcfg is None:
            # The original failed here as well, with an opaque "argument after **" error.
            raise TypeError(
                "modelcfg is required: pass the dict returned by resolve_data_config"
            )
        self.path = list(path)
        self.transform = create_transform(**modelcfg)

    def __len__(self):
        """Return the number of image paths."""
        return len(self.path)

    def __getitem__(self, item):
        """Return the transformed image stored at position ``item``."""
        impath = self.path[item]
        # The context manager closes the file handle once the pixels are converted.
        with Image.open(impath) as raw:
            img = raw.convert('RGB')
        return self.transform(img)


In [None]:
# Name of the timm architecture used to produce the image embeddings.
modelname = 'efficientnet_b0'

In [None]:
# Build the network with pretrained weights.
# NOTE(review): no num_classes override is passed, so the model presumably keeps
# its classification head and the "embeddings" computed later are class logits —
# confirm whether pooled features were intended instead.
model = timm.create_model(modelname, pretrained=True)

In [None]:
# Move the network to the GPU.
model.to("cuda")
# Switch the module to evaluation mode for inference.
model.eval()

In [None]:
# Display the configuration attached to the model.
model.default_cfg

In [None]:
# Resolve the preprocessing settings from the model *instance*.
# resolve_data_config reads the configuration attached to the model object;
# given only the name string it finds no such attribute and silently falls back
# to library-wide defaults, which is wrong for any architecture whose input
# size / normalisation differs from those defaults.
train_dataset = timmDataset(
    train["path"],
    modelcfg=resolve_data_config({}, model=model),
)

In [None]:
# Fetch item 0 to check that loading and the transform work.
train_dataset[0]

In [None]:
# Shape of one transformed image.
train_dataset[0].shape

In [None]:
# shuffle=False keeps the batches in dataset order, so the stacked predictions
# line up row-for-row with `train`.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=256,
    num_workers=2,
    shuffle=False,
)

In [None]:
# Sanity-check a single batch. Only the shape and dtype are printed: dumping
# the full batch tensor floods the cell output without adding information.
for a in train_dataloader:
    print(a.shape, a.dtype)
    break

## 2.2 Making image embeddings from model predictions

In [None]:
from tqdm import tqdm

In [None]:
# Collect one NumPy array of model outputs per batch.
allpreds = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in tqdm(train_dataloader):
        batch_out = model(batch.to("cuda"))
        allpreds.append(batch_out.detach().cpu().numpy())
    

In [None]:
# Number of per-batch arrays collected.
len(allpreds)

In [None]:
# Join the per-batch arrays into one array; note this rebinds `allpreds`
# from a list to an array.
allpreds = np.concatenate(allpreds)
# Length of the joined array.
len(allpreds)

In [None]:
# Wrap the stacked model outputs in a DataFrame.
preddf = pd.DataFrame(allpreds)

In [None]:
# Display the prediction table.
preddf

# 3. UMAP
(ref : https://www.kaggle.com/cdeotte/rapids-umap-tfidf-kmeans-discovers-15-topics 
thank you for @chris deotte)


In [None]:
import cudf, cuml, cupy
from cuml import UMAP


In [None]:
# UMAP is stochastic. Fixing the seed makes the 2-D layout — and therefore the
# cluster ids derived from it in the following cells — reproducible across runs.
umap = UMAP(random_state=42)
embed_2d = umap.fit_transform(preddf.values)
# Make sure the result is held as a host (NumPy) array for matplotlib.
embed_2d = cupy.asnumpy( embed_2d )

In [None]:
# Scatter plot of the two UMAP components.
plt.scatter(embed_2d[:,0],embed_2d[:,1])

In [None]:
from cuml import KMeans

# Split the 2-D embedding into two clusters.
kmeans = KMeans(n_clusters=2)
kmeans.fit(embed_2d)

# Store each image's cluster id next to its metadata.
train['cluster'] = kmeans.labels_

In [None]:
# Same scatter plot, coloured by the k-means cluster labels.
plt.scatter(embed_2d[:,0],embed_2d[:,1],c = kmeans.labels_)

## Maybe dogs and cats? Let's check.

In [None]:
# Show the first three rows, now including the "cluster" column.
train.head(3)

## Cluster 0

In [None]:
# Rows assigned to cluster 0.
tmpdf = train[train["cluster"]==0]
plt.figure(figsize=(25,25))

# Show up to 25 images in a 5x5 grid. Slicing with .iloc[:25] (instead of
# indexing range(25)) cannot raise IndexError when the cluster is smaller.
for a, image_path in enumerate(tmpdf["path"].iloc[:25]):
    img = cv2.imread(image_path)
    img = img[:,:,::-1]  # BGR -> RGB for matplotlib
    plt.subplot(5,5,a+1)
    plt.axis("off")
    plt.imshow(img)


### All dogs !!

## Cluster 1

In [None]:
# Rows assigned to cluster 1.
tmpdf = train[train["cluster"]==1]
plt.figure(figsize=(25,25))

# Show up to 25 images in a 5x5 grid. Slicing with .iloc[:25] (instead of
# indexing range(25)) cannot raise IndexError when the cluster is smaller.
for a, image_path in enumerate(tmpdf["path"].iloc[:25]):
    img = cv2.imread(image_path)
    img = img[:,:,::-1]  # BGR -> RGB for matplotlib
    plt.subplot(5,5,a+1)
    plt.axis("off")
    plt.imshow(img)


## All cats !!

# 4. (Ref) t-SNE : no good case

In [None]:
# Display the prediction table again before feeding it to t-SNE.
preddf

In [None]:
from cuml.manifold import TSNE

# Project the same predictions to two dimensions with t-SNE for comparison.
tsne = TSNE(n_components=2)
X_embedded = tsne.fit_transform(preddf.to_numpy())

In [None]:
# Scatter plot of the two t-SNE components.
plt.scatter(X_embedded[:,0],X_embedded[:,1])

## This is not as well separated as the UMAP result (perhaps the settings are poor?)

# 5. Find similar images with kNN

This is also based on @chris deotte's notebook from the Shopee competition. Thank you so much!
https://www.kaggle.com/cdeotte/rapids-cuml-tfidfvectorizer-and-knn

In [None]:
from cuml.neighbors import NearestNeighbors

In [None]:
# Number of neighbours to retrieve per image.
KNN = 50
# Use a dedicated name for the neighbour index. Rebinding `model` here would
# replace the network created earlier, so re-running any earlier cell that
# uses `model` would silently operate on the wrong object.
knn_model = NearestNeighbors(n_neighbors=KNN)
knn_model.fit(preddf.values)
# Query with the training set itself.
distances, indices = knn_model.kneighbors(preddf.values)

In [None]:
# Display the neighbour distances returned by kneighbors.
distances

In [None]:
# Display the neighbour row indices returned by kneighbors.
indices

## Try to show the 5 most similar images to image no. 0

In [None]:
# First six neighbour indices returned for row 0.
indices[0][:6]

In [None]:
# Plot the first six neighbours of row 0 side by side, titled with their score.
neighbour_rows = indices[0][:6]
plt.figure(figsize=(20,5))
for slot, row in enumerate(neighbour_rows, start=1):
    img = cv2.imread(train["path"].iloc[row])
    title = train["Pawpularity"].iloc[row]

    plt.subplot(1, 6, slot)
    plt.imshow(img[:, :, ::-1])  # BGR -> RGB for matplotlib
    plt.axis("off")
    plt.title(f"Pawpularity : {title}")


## it looks good.

# 6. Cosine Similarity : extract duplicate images

## 6.1 normalization

In [None]:
# Display the prediction table that the similarity is computed from.
preddf

### pandas → cudf

In [None]:
# Convert the pandas DataFrame into a cuDF DataFrame.
predcudf = cudf.from_pandas(preddf)

## normalize each image embedding

In [None]:
# L2-normalise every row with a single vectorised call instead of a Python
# loop that launches one tiny computation per image. keepdims=True keeps the
# norms as a column so the division broadcasts row-wise.
embed_values = predcudf.values
im_embed_norm = embed_values / cupy.linalg.norm(embed_values, axis=1, keepdims=True)

## cudf → cupy

In [None]:
# Make sure the normalised embeddings are held as one CuPy array.
im_embed_norm = cupy.array(im_embed_norm)

## cosine similarity

In [None]:
# Rows are unit length, so their pairwise dot products are the cosine similarities.
gram = im_embed_norm @ im_embed_norm.T
cos_sim = gram.T

In [None]:
# Display the similarity matrix.
cos_sim

## check histogram

In [None]:
# Wrap the similarity matrix in a cuDF DataFrame and display it.
simdf = cudf.DataFrame(cos_sim)
simdf

## cudf → pandas

In [None]:
# Rebind `simdf` to its pandas counterpart.
# NOTE(review): because the name is reused, running this cell a second time
# would call to_pandas() on the pandas frame — presumably an error; confirm.
simdf = simdf.to_pandas()

In [None]:
# Histogram over every pairwise similarity (matrix flattened row by row).
plt.hist(simdf.values.ravel())

## np.where(condition) returns the indices that satisfy the condition

### you can change the threshold

In [None]:
# Cosine-similarity cutoff: pairs scoring above it are treated as similar below.
similarity_thres = 0.87

In [None]:
# For every image, collect the column positions whose similarity exceeds the threshold.
similar_index = []
for sim_row in cos_sim:
    above_threshold = cupy.where(sim_row > similarity_thres)
    similar_index.append(above_threshold[0])

In [None]:
# Inspect the matches of the first three images.
similar_index[:3]

## make combination id

In [None]:
# Translate the matched positions into the corresponding index labels of `train`.
row_labels = train.index.values
sim_combination = [row_labels[cupy.asnumpy(matches)] for matches in similar_index]

In [None]:
# Inspect the label groups of the first three images.
sim_combination[:3]

## judge whether each image has a duplicate or not

In [None]:
# An image is flagged when its group holds more than one entry.
duplicatejudge = []
for group in sim_combination:
    duplicatejudge.append(len(group) > 1)

In [None]:
# Attach each row's group of similar images and its duplicate flag to `train`.
train["combi"] = sim_combination
train["judge"] = duplicatejudge

In [None]:
# Store the groups as strings (the column is used as a sort key afterwards).
train["combi"] = train["combi"].astype("str")

## visualize duplicate image

In [None]:
# Keep only the flagged rows, ordered by their group string.
is_duplicate = train["judge"]
dupdf = train[is_duplicate].sort_values("combi")

In [None]:
# Preview the flagged rows.
dupdf.head()

In [None]:
# Build the set of unordered (left, right) pairs straight from the similarity
# groups. Pairing consecutive rows of the sorted `dupdf` only lines up when
# every duplicate group has exactly two members with identical "combi"
# strings; with a group of three or more, unrelated images end up side by side
# and an odd trailing row is silently dropped.
duplicate_pairs = sorted({
    (min(anchor, other), max(anchor, other))
    for anchor, group in zip(train.index, sim_combination)
    for other in np.asarray(group).tolist()
    if other != anchor
})

for left_id, right_id in duplicate_pairs:

    fig = plt.figure(figsize = (10,5))

    for slot, image_id in enumerate((left_id, right_id), start=1):
        img = cv2.imread(train.loc[image_id, "path"])

        plt.subplot(1,2,slot)
        plt.imshow(img[:,:,::-1])  # BGR -> RGB for matplotlib
        plt.axis("off")

        score = train.loc[image_id, "Pawpularity"]
        plt.title(f"id{image_id}. Pawpularity {score}")

    # Drop the local reference only, as the original did.
    del fig
    

In [None]:
# Save the flagged rows; index=False leaves the row index out of the file.
dupdf.to_csv("dupdf.csv",index=False)