## Filter out images using embeddings a.k.a. G&T


In [1]:
import glob
import os
import sys

import pandas as pd
import tensorflow as tf

sys.path.append(os.path.abspath('../'))
from gandt.data import analyse_labels_data
from gandt.data.filter_data import filter_by_similarity

  import sys


## Gather files with annotations
You would get them from S3, like so:

```
aws s3 sync s3://your-bucket/20200219-all-multicategory/labelled/20200219-multicategory-user-relabel .
```

Now we can list all the JSON files from Ground Truth.

In [2]:
# Collect the Ground Truth consolidation-request JSON files.
# The list is sorted because glob.glob() returns matches in arbitrary,
# filesystem-dependent order, and the de-duplication further down keeps the
# *first* response per worker per image - with an unsorted list the surviving
# duplicate could differ from run to run.
# NOTE(review): the pattern only matches job folders whose names start with
# "2020010" - confirm that this is the intended set of labelling jobs.
experiment_files = sorted(glob.glob(
        '../data/external/labelled/2020010*/annotations/'
        'consolidated-annotation/consolidation-request/iteration-1/*.json'))

Read in responses

In [3]:
# Parse each annotation file with read_responses, then stack the per-file
# results into a single object.
response_frames = [
    analyse_labels_data.read_responses(json_path)
    for json_path in experiment_files
]
raw_responses = pd.concat(response_frames)

In [4]:
# Remove empty responses - not sure how they got there!
# The label count is compared against zero explicitly. The earlier mask,
# `True & labels.apply(len)`, is a bitwise AND (i.e. `count & 1`), which is
# truthy only for odd counts - so responses carrying 2, 4, ... labels were
# silently discarded together with the empty ones.
empty_removed = raw_responses[raw_responses['labels'].apply(len) > 0]

# Sometimes there are duplicates in label files, so get rid of them,
# only accept the first response per worker per image.
# as_index=False keeps the grouping keys as ordinary columns in the result.
raw_responses = empty_removed.groupby(
    ['worker_id', 'image_filename', 'image_index'],
    as_index=False).first()

## Filter out responses from unreliable workers

This will add an `is_label_certain` column, which we can use to filter out
unreliable labels.

In [5]:
# filter_out_unreliable_workers returns a pair: the first element is kept as
# `unfiltered_responses`, the second as `filtered_responses`.
worker_split = analyse_labels_data.filter_out_unreliable_workers(raw_responses)
unfiltered_responses, filtered_responses = worker_split

# Tag every filtered response with the data source it belongs to.
filtered_responses['source'] = 'prints'

# Keep only the rows for which we are certain of the label(s).
labelled_data = filtered_responses.loc[filtered_responses['is_label_certain']]

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Merge with image filenames on disk, so that we can evaluate the feature
extraction model on them.

In [None]:
# Glob patterns for every image we may have on disk; matches from the first
# pattern are listed before matches from the second.
image_patterns = [
    '../data/external/print-data-resized/*/*',
    '../../utility.flickr-data-download/data/external/'
    'event_types/2019-07-18/images/*',
]

# One row per file found, identified by its absolute path.
images = pd.DataFrame({'full_path': [
    os.path.abspath(found)
    for pattern in image_patterns
    for found in glob.glob(pattern)
]})

# The bare file name is the key shared with the label data.
images['image_filename'] = images['full_path'].apply(os.path.basename)

# Inner join: labelled rows without a matching file on disk are dropped, and
# each surviving row gains a `full_path` column.
labelled_data = pd.merge(
    labelled_data, images, how='inner', on='image_filename')

## Load models

In [6]:
# ResNet152V2 with ImageNet weights, built with include_top=True.
embedding_model = tf.keras.applications.ResNet152V2(
    weights='imagenet', include_top=True)

# Feature extractor: shares the full network's input, but its output is taken
# from the layer named "avg_pool" instead of the network's final layer.
avg_pool_layer = embedding_model.get_layer("avg_pool")
feat_extractor = tf.keras.Model(
    inputs=embedding_model.input,
    outputs=avg_pool_layer.output)

## Get filenames of images to be labelled

In [7]:
# Every resized print image on disk, identified by its absolute path.
unknown_print_images = pd.DataFrame(
    {'full_path': [
        os.path.abspath(x) for x in
        glob.glob('../data/external/print-data-resized/*/*')]
    })

# Assign the placeholder label *before* filtering. Setting a column on the
# boolean-filtered frame (as was done previously) triggers pandas'
# SettingWithCopyWarning; assigning on the freshly built frame does not, and
# the resulting data is the same.
unknown_print_images['majority_label'] = 'Unknown'

# Keep only the images that do not already appear in the labelled data, and
# renumber the remaining rows from 0.
unknown_print_images = unknown_print_images[
    ~unknown_print_images['full_path'].isin(labelled_data['full_path'])
].reset_index(drop=True)

## Do the filtering!

This evaluates the images using the feature extraction model, and then filters
out images that are far away from all labelled categories.

In [None]:
# Run the similarity filter with the feature extractor, the labelled images
# and the not-yet-labelled images; it returns a (discarded, remaining) pair.
discarded, remaining = filter_by_similarity(
    feat_extractor,
    labelled_data,
    unknown_print_images,
)

Found 3198 validated image filenames.
  1/100 [..............................] - ETA: 12:14