In [1]:
import fiftyone as fo
import fiftyone.zoo as foz
import matplotlib.pyplot as plt
import cv2
import os
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
!ls

image-cleaning.ipynb
[1m[36mraw_data[m[m
[1m[36msign_language_translation[m[m
test_asl2.ipynb
test_asl_aren_utilsdatafromdirectory.ipynb


In [3]:
dir = 'raw_data/archive/asl_alphabet_train/asl_alphabet_train/C'

In [4]:
def get_images(directory): 
    """get images from Google drive if in same directory as collab notebook"""
    images = []
    labels = []

    directory_list = sorted(os.listdir(directory))
    for i in range(len(directory_list)):
        print(f"Getting images of {directory_list[i]}:")
        for image in os.listdir(directory + "/" + directory_list[i]):
            img = cv2.imread(directory + "/" + directory_list[i] + "/" + image)
            img = cv2.resize(img, (28, 28))
            images.append(img)
            labels.append(directory_list[i])

    return images, labels

In [5]:
dir

'raw_data/archive/asl_alphabet_train/asl_alphabet_train/C'

In [6]:
X, y = get_images(dir)

Getting images of C1.jpg:


NotADirectoryError: [Errno 20] Not a directory: 'raw_data/archive/asl_alphabet_train/asl_alphabet_train/C/C1.jpg'

In [7]:
name = "C"
dataset_dir = dir

# Create the dataset
dataset = fo.Dataset.from_dir(
    dataset_dir=dataset_dir,
    dataset_type=fo.types.ImageDirectory,
    name=name,
)

 100% |███████████████| 3000/3000 [297.9ms elapsed, 0s remaining, 10.1K samples/s]     


In [8]:
model = foz.load_zoo_model("mobilenet-v2-imagenet-torch")



In [9]:
embeddings = dataset.compute_embeddings(model)

print(embeddings.shape)

 100% |███████████████| 3000/3000 [2.5m elapsed, 0s remaining, 20.4 samples/s]      
(3000, 1280)


In [None]:
## Calculate Similarity


In [10]:
similarity_matrix = cosine_similarity(embeddings, dense_output=False)

print(similarity_matrix.shape)
print(similarity_matrix)

(3000, 3000)
[[1.         0.97419665 0.7719385  ... 0.8819284  0.86406594 0.87873314]
 [0.97419665 1.         0.7763415  ... 0.86757907 0.84924054 0.86339437]
 [0.7719385  0.7763415  1.         ... 0.73940137 0.74111279 0.74902461]
 ...
 [0.8819284  0.86757907 0.73940137 ... 1.         0.98982324 0.99032338]
 [0.86406594 0.84924054 0.74111279 ... 0.98982324 1.         0.98870075]
 [0.87873314 0.86339437 0.74902461 ... 0.99032338 0.98870075 1.        ]]


In [11]:
n = len(similarity_matrix)

similarity_matrix = similarity_matrix - np.identity(n)

In [12]:
id_map = [s.id for s in dataset.select_fields(["id"])]

for idx, sample in enumerate(dataset):
    sample["max_similarity"] = similarity_matrix[idx].max()
    sample.save()

In [13]:
from fiftyone import ViewField as F

dataset.match(F("max_similarity")>0.95)

Dataset:     C
Media type:  image
Num samples: 2980
Sample fields:
    id:             fiftyone.core.fields.ObjectIdField
    filepath:       fiftyone.core.fields.StringField
    tags:           fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:       fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    max_similarity: fiftyone.core.fields.FloatField
View stages:
    1. Match(filter={'$expr': {'$gt': [...]}})

In [14]:
id_map = [s.id for s in dataset.select_fields(["id"])]

In [15]:
thresh = 0.95
samples_to_remove = set()
samples_to_keep = set()

for idx, sample in enumerate(dataset):
    if sample.id not in samples_to_remove:
        # Keep the first instance of two duplicates
        samples_to_keep.add(sample.id)
        
        dup_idxs = np.where(similarity_matrix[idx] > thresh)[0]
        for dup in dup_idxs:
            # We kept the first instance so remove all other duplicates
            samples_to_remove.add(id_map[dup])

        if len(dup_idxs) > 0:
            sample.tags.append("has_duplicates")
            sample.save()

    else:
        sample.tags.append("duplicate")
        sample.save()

print(len(samples_to_remove) + len(samples_to_keep))

# If you want to remove the samples from the dataset entirely, uncomment the following line
# dataset.remove_samples(list(samples_to_remove))

3000


In [17]:
session = fo.launch_app(dataset)

In [16]:
session.show()

NameError: name 'session' is not defined

In [None]:
view = dataset.match_tags(["has_duplicates","duplicate"])
thresh = 0.92

for idx, sample in enumerate(dataset):
    if sample.id in view:
        dup_idxs = np.where(similarity_matrix[idx] > thresh)[0]
        dup_splits = []
        dup_labels = {sample.ground_truth.label}
        for dup in dup_idxs:
            dup_sample = dataset[id_map[dup]]
            dup_split = "test" if "test" in dup_sample.tags else "train"
            dup_splits.append(dup_split)
            dup_labels.add(dup_sample.ground_truth.label)
            
        sample["dup_splits"] = dup_splits
        sample["dup_labels"] = list(dup_labels)
        sample.save()

In [None]:
view.first()

In [None]:
from fiftyone import ViewField as F