# Inspect dataset with FiftyOne

First set the path and dataset name.

In [None]:
# Root directory of the dataset on disk; used as `dataset_dir` when the
# dataset is imported further down.
PATH = "/mnt/c/Users/sabri/Documents/github/thesis/datasets/auv"
# Alternative dataset location (swap with the line above to use it):
# PATH = "/home/sabf/thesis/thesis/datasets/rov/"
# Name under which the dataset is loaded from / created in FiftyOne.
DATASET_NAME = "auv"
# Safety switch: when True, the next cell deletes EVERY existing FiftyOne
# dataset, not just DATASET_NAME.
DELETE_ALL_DATASETS = False

In [None]:
import fiftyone as fo

# Names of all datasets FiftyOne currently reports.
existent_datasets = fo.list_datasets()
print(existent_datasets)

# Optionally wipe every dataset, controlled by the DELETE_ALL_DATASETS switch.
if not DELETE_ALL_DATASETS:
    print("Set to false. Nothing was deleted.")
elif not existent_datasets:
    print("No datasets existent, nothing was deleted.")
else:
    for d in existent_datasets:
        fo.load_dataset(d).delete()
        # Report only after the deletion went through, and with the real
        # dataset name (the message needs to be an f-string for that).
        print(f"{d} deleted.")
    print("All existent datasets deleted.")
    # Refresh the list so later cells that test membership in
    # `existent_datasets` do not see the names that were just removed.
    existent_datasets = fo.list_datasets()

In [1]:
import fiftyone as fo

# Query FiftyOne for the names of the available datasets and show them.
# The result is kept in `existent_datasets` because later cells read it.
existent_datasets = fo.list_datasets()
print(existent_datasets)

['auv', 'auv2', 'rov', 'rov2']


In [7]:
# One-off cleanup: remove the dataset named 'rov2'.
# The existence check makes the cell safe to re-run; without it the second
# run fails because the dataset is already gone.
stale_name = 'rov2'
if stale_name in fo.list_datasets():
    delete_this = fo.load_dataset(stale_name)
    delete_this.delete()
else:
    print(f"Dataset '{stale_name}' not found, nothing was deleted.")

# Refresh the list of dataset names and display it as the cell output.
existent_datasets = fo.list_datasets()
existent_datasets

[]

### Loading the dataset

In [None]:
# Load the dataset if FiftyOne already has it, otherwise import it from disk.
# The database is queried here directly rather than through the
# `existent_datasets` variable: that list comes from an earlier cell and may
# be out of date (e.g. after datasets were deleted or created in between),
# which would send this cell down the wrong branch.
if DATASET_NAME in fo.list_datasets():
    dataset = fo.load_dataset(DATASET_NAME)
    print("Dataset loaded.\n")
else:
    # The splits to load
    splits = ["train", "val", "test"]

    # Load the dataset, using tags to mark the samples in each split
    dataset = fo.Dataset(DATASET_NAME)
    for split in splits:
        dataset.add_dir(
            dataset_dir=PATH,
            dataset_type=fo.types.YOLOv5Dataset,
            split=split,
            tags=split,
        )
    print("Dataset created.\n")

# View summary info about the dataset
print(dataset)

In [None]:
# Pretty-print the dataset statistics, with media included in the report.
fo.pprint(dataset.stats(include_media=True))

In [None]:
# Make the dataset persistent
# NOTE(review): presumably this keeps the dataset in FiftyOne's database
# between sessions so the loading cell can find it by name next time —
# confirm against the FiftyOne documentation.
dataset.persistent = True

In [None]:
# Print the first few samples in the dataset as a quick sanity check that
# the import produced what was expected.
print(dataset.head())

## Label distribution 

In [None]:
import pandas as pd

splits = ["train", "val", "test"]


def make_df(splits, samples=None):
    """Count how many annotations of each label exist per split.

    Args:
        splits (list): Tags of the splits to count. E.g.: ['train', 'test']
        samples: Object providing ``match_tags`` (the views it returns must
            provide ``count_values``). Defaults to the notebook-level
            ``dataset`` when omitted.

    Returns:
        pandas DataFrame: label, count and split columns, one row per
        (split, label) pair, with a unique running index. When ``splits``
        is empty, an empty DataFrame with these three columns is returned.
    """
    if samples is None:
        # Fall back to the dataset loaded earlier in the notebook.
        samples = dataset

    columns = ['label', 'count', 'split']
    frames = []
    for split in splits:
        tagged = samples.match_tags(split)
        counts = tagged.count_values("ground_truth.detections.label")

        frame = pd.DataFrame(list(counts.items()), columns=columns[:2])
        frame["split"] = split
        frames.append(frame)

    # pd.concat refuses an empty list, so handle "no splits" explicitly.
    if not frames:
        return pd.DataFrame(columns=columns)

    # ignore_index avoids repeated index labels (0, 1, ... once per split),
    # which would make label-based row access ambiguous downstream.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Build the per-split label counts for all three splits and preview the
# first rows as the cell output.
label_count = make_df(["train", "val", "test"])
label_count.head()
# Optional: uncomment to save the label counts to a CSV file.
# label_count.to_csv("../results/label_count_split.csv", index=False)

### Visualising label counts

In [None]:
import seaborn as sns

# Order the rows by descending count so the most frequent labels come first.
sorted_df = label_count.sort_values(by='count', ascending=False)

# Plot label counts: one line per split, labels along the x axis.
# NOTE(review): seaborn's lineplot sorts by x by default (`sort=True`), so
# the count ordering prepared above may not carry over to the plot — confirm,
# and consider `sort=False` or a bar plot if the ordering matters.
label_dist = sns.lineplot(data=sorted_df, x="label", y="count", hue="split", palette="mako")
# Tilt the label names so they do not overlap; the trailing `;` keeps the
# notebook from echoing the returned list of tick labels.
label_dist.set_xticklabels(label_dist.get_xticklabels(), rotation=45, horizontalalignment='right');

## Launch app
Have a look at the actual dataset.

In [None]:
# Open the FiftyOne App on the full dataset. The returned session is kept
# because a later cell assigns a different view to it.
session = fo.launch_app(dataset)

In [None]:
# Close the App again once the visual inspection is done.
fo.close_app()

## Computing uniqueness

In [None]:
import fiftyone.brain as fob

In [None]:
# Compute a uniqueness score for the samples of the dataset; the cells below
# read it back through the "uniqueness" field.
fob.compute_uniqueness(dataset)

In [None]:
# NOTE(review): second close_app() call in this notebook; unless the App was
# relaunched in between it is presumably already closed — confirm whether
# this cell is still needed.
fo.close_app()

In [None]:
# Sort in increasing order of uniqueness (least unique first)
sorted_dataset = dataset.sort_by("uniqueness")

# Show the first sample of the sorted view, i.e. the one with the lowest
# uniqueness score.
print(sorted_dataset.first())

In [None]:
# Show the uniqueness-sorted view in the App session created earlier.
# NOTE(review): earlier cells call fo.close_app(); presumably the App has to
# be launched again before this assignment has a visible effect — confirm.
session.view = sorted_dataset

In [None]:
import pandas as pd

# Table with one row per sample: its file path (column 'filename') next to
# its uniqueness score.
df = pd.DataFrame(
    {
        'filename': sorted_dataset.values("filepath"),
        'uniqueness': sorted_dataset.values("uniqueness"),
    }
)
# Most unique samples first.
df = df.sort_values('uniqueness', ascending=False)
df

In [None]:
# save uniqueness df to csv
# df.to_csv("auv_uniqueness.csv", index=False)