# Setup

In [None]:
import collections
from pathlib import Path
#from pprint import pprint

import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.core.utils as fou
#import fiftyone.types
import fiftyone.utils.random as four
import fiftyone.zoo as foz
import numpy as np
import pandas as pd
import plotly.express as px
#from custom_plotly_templates import set_render_config, set_template
from fiftyone import ViewField as F
#from fiftyone.utils.image import transform_images
from fiftyone.utils.iou import compute_max_ious
#from omegaconf import OmegaConf
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Загрузка датасета

In [None]:
fo.list_datasets()

In [None]:
# # Download dataset

# foz.load_zoo_dataset(
#     name="road_detection",
#     dataset_name="road_detection",
#     dataset_dir=dataset_dir,
#     cleanup=False,
# )

In [None]:
# fo.delete_dataset('road_segmentation')

In [None]:
# dataset = fo.load_dataset('road_segmentation')
# print(dataset.view())
# print(dataset.stats(include_media=True))
# session = fo.launch_app(dataset)
# session.open_tab()

## Загрузка датасета детекции

In [None]:
dataset_name = "roads_detection3"
dataset_dir = r'/root/storage/3030/AkhmetzyanovD/datasets/roads/datasets/det'
overwrite = True
# The splits to load
splits = ["train", "val"]

In [None]:
### YOLO dataset
# Rebuild the dataset from the YOLOv5 directory when `overwrite` is set or
# nothing is stored under `dataset_name`; otherwise reuse the stored copy.
if overwrite or not fo.dataset_exists(dataset_name):
    # A stale dataset with the same name must be removed before re-creating it.
    if fo.dataset_exists(dataset_name):
        fo.delete_dataset(dataset_name)

    dataset = fo.Dataset(dataset_name)
    for split in splits:
        # One import per split; the split name is also used as the sample tag.
        dataset.add_dir(
            dataset_dir=dataset_dir,
            dataset_type=fo.types.YOLOv5Dataset,
            split=split,
            tags=split,
        )
    dataset.persistent = True
else:
    dataset = fo.load_dataset(dataset_name)

print(dataset.view())
print(dataset.stats())

In [None]:
session = fo.launch_app(dataset)
session.open_tab()

## Загрузка датасета сегментации

In [None]:
# Text files listing "<image path> <mask path>" pairs, one pair per line.
train_paths = r'/AkhmetzyanovD/datasets/roads2/dataset3/train.txt'
# NOTE(review): this is the same train.txt as `train_paths`, so every training
# sample is added a second time with the 'valid' tag — presumably a copy-paste
# slip; confirm the name of the validation list file.
valid_paths = r'/AkhmetzyanovD/datasets/roads2/dataset3/train.txt'

dataset_name = 'road_segmentation3'

In [None]:
dataset = fo.Dataset(dataset_name)

In [None]:
def _add_segmentation_samples(dataset, list_path, tag):
    """Add the image/mask pairs listed in a text file to a FiftyOne dataset.

    Every non-blank line of ``list_path`` must contain an image path and a
    mask path separated by whitespace.

    Args:
        dataset: the ``fo.Dataset`` that receives the samples.
        list_path: path of the text file with the pairs.
        tag: sample tag assigned to every sample read from this file.

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(list_path, 'r') as list_file:
        for line in list_file:
            if not line.strip():
                # A blank line (e.g. a trailing newline) would break the
                # two-value unpacking below.
                continue
            image_path, mask_path = line.split()

            sample = fo.Sample(filepath=image_path)
            sample["segmentation"] = fo.Segmentation(mask_path=mask_path)
            sample['tags'] = [tag]
            dataset.add_sample(sample)


_add_segmentation_samples(dataset, train_paths, 'train')
_add_segmentation_samples(dataset, valid_paths, 'valid')

print(dataset.view())
print(dataset.stats())

In [None]:
session = fo.launch_app(dataset)
session.open_tab()

## OTHER

In [None]:
dataset = fo.Dataset(dataset_name)

In [None]:
# Import each split of the YOLOv5 directory into the current dataset; the
# split name is also attached to the imported samples as a tag.
yolo_format = fo.types.YOLOv5Dataset
for split in splits:
    dataset.add_dir(dataset_dir=dataset_dir, dataset_type=yolo_format, split=split, tags=split)
dataset.persistent = True

In [None]:
view = dataset.view()
view

In [None]:
### COCO dataset
data_path = "/workspace/storage/3030/VoynovD/dump/data/extrabird/mva2023_sod4bird_pub_test/images"
labels_path = "/workspace/storage/3030/VoynovD/dump/data/extrabird/mva2023_sod4bird_pub_test/annotations/public_test_coco_empty_ann.json"

# Import from the COCO annotation file when `overwrite` is set or no dataset
# is stored under `dataset_name`; otherwise load the stored dataset.
if overwrite or not fo.dataset_exists(dataset_name):
    # Remove a stale dataset first so the name can be reused.
    if fo.dataset_exists(dataset_name):
        fo.delete_dataset(dataset_name)

    # Images and labels live in separate locations, so `data_path` and
    # `labels_path` are given instead of a single `dataset_dir`.
    dataset = fo.Dataset.from_dir(
        name=dataset_name,
        data_path=data_path,
        labels_path=labels_path,
        dataset_type=fo.types.COCODetectionDataset,
    )
    dataset.persistent = True
else:
    dataset = fo.load_dataset(dataset_name)

view = dataset.view()
view

In [None]:
dirs = ['DD', 'E', 'G', 'I', 'K', 'PB', 'PB_glasses', 'PY', 'S', 'test']

In [None]:
# Add every sub-directory of the eye keypoint dataset to the current dataset.
# The loop variable is named `subdir` so the builtin `dir()` is not shadowed
# for the rest of the notebook session.
for subdir in dirs:
    dataset.add_dir(
        data_path=f"/workspace/storage/3030/VoynovD/integral/Eye_keypoint_detector/EKD/images/{subdir}",
        labels_path=f"/workspace/storage/3030/VoynovD/integral/Eye_keypoint_detector/EKD/labels/{subdir}/annotations/person_keypoints_default.json",
        dataset_type=fo.types.COCODetectionDataset,
    )
dataset.persistent = True
view = dataset.view()
view

In [None]:
### KITTI dataset
# Import the KITTI-format directory when `overwrite` is set or no dataset is
# stored under `dataset_name`; otherwise load the stored dataset.
if overwrite or not fo.dataset_exists(dataset_name):
    # Remove a stale dataset first so the name can be reused.
    if fo.dataset_exists(dataset_name):
        fo.delete_dataset(dataset_name)

    kitti_format = fo.types.KITTIDetectionDataset
    dataset = fo.Dataset.from_dir(name=dataset_name, dataset_dir=dataset_dir, dataset_type=kitti_format)
    dataset.persistent = True
else:
    dataset = fo.load_dataset(dataset_name)

view = dataset.view()
view

In [None]:
### VOC
# Configuration for the VOC-format import; these names are notebook globals
# and replace the values set by earlier cells.
dataset_name = "heridal"
dataset_dir = "/workspace/storage/db/emergency-search/heridal/raw"
overwrite = True
# The splits to load
splits = ["train", "test"]

# Create the dataset
# dataset = fo.Dataset.from_dir(
#     dataset_dir=dataset_dir,
#     dataset_type=fo.types.VOCDetectionDataset,
#     name=name,
# )

# Reuse the stored dataset only when `overwrite` is off and it already exists;
# otherwise delete any stale copy and import again from disk.
if not overwrite and fo.dataset_exists(dataset_name):
    dataset = fo.load_dataset(dataset_name)
else:
    if fo.dataset_exists(dataset_name):
        fo.delete_dataset(dataset_name)
    dataset = fo.Dataset(dataset_name)
    for split in splits:
        # Each split is read from its own sub-directory of `dataset_dir` and
        # its samples are tagged with the split name.
        dataset.add_dir(
            dataset_dir=f'{dataset_dir}/{split}',
            dataset_type=fo.types.VOCDetectionDataset,
            tags=split,
    )
    dataset.persistent = True
    
view = dataset.view()
view

In [None]:
dataset.persistent = True

In [None]:
dataset.stats(include_media=True)

In [None]:
session = fo.launch_app(view, auto=False)
#session.open_tab()

In [None]:
view.filter_field("ground_truth", F("detections").length() == 0).tag_samples("no detections")
len(view.match_tags("no detections")) / len(view)

In [None]:
no_det = view.match_tags("no detections")

In [None]:
view = view.exclude(no_det)
# session.view = view

In [None]:
view

In [None]:
no_good = view.match_tags("no_good")
view = view.exclude(no_good)
session.view = view

In [None]:
dataset_train = fo.load_dataset('mva2023_sod4bird_train')

In [None]:
dataset_all = dataset_train.clone(name='mva2023_sod4bird')

In [None]:
dataset_all.merge_samples(dataset)

In [None]:
dataset.persistent = True

In [None]:
dataset = dataset_all

In [None]:
view = dataset.view()
view

In [None]:
view

# Предсказания

In [None]:
from ultralytics import YOLO

In [None]:
model = YOLO("yolov8n.pt")
#model.to("cuda:0")

pred_field = "predictions_person"
no_person = view.match_tags("no_person_detection")

In [None]:
def add_yolo_predictions(view: fo.DatasetView, model: YOLO) -> None:
    for sample in no_person.iter_samples(progress=True, autosave=True):
        results = model.predict(sample.filepath, augment=True, verbose=False, device=0, imgsz=416, conf=0.5, classes=0)[0]

        detections = []
        for box in results.boxes:
            x_min, y_min, x_max, y_max = box.xyxyn[0]
            detection = fo.Detection(
                label=model.names[box.cls.item()],
                bounding_box=[x_min, y_min, x_max - x_min, y_max - y_min],
                confidence=box.conf.item(),
            )
            detections.append(detection)
        sample[pred_field] = fo.Detections(detections=detections)

In [None]:
# if dataset.has_sample_field(pred_field):
#     dataset.delete_sample_field(pred_field)

if not dataset.has_field(pred_field):
    add_yolo_predictions(dataset, model)

In [None]:
dataset.delete_sample_field("predictions_person")

In [None]:
view.untag_samples("no_person_detection")

In [None]:
def add_yolo_pred_to_ground_truth(view: fo.DatasetView, model: YOLO) -> None:  
    for sample in no_person.iter_samples(progress=True, autosave=True):
        results = model.predict(sample.filepath, augment=True, verbose=False, device=0, imgsz=416, conf=0.5, classes=0)[0]

        detections = []
        for box in results.boxes:
            x_min, y_min, x_max, y_max = box.xyxyn[0]
            detection = fo.Detection(
                label=model.names[box.cls.item()],
                bounding_box=[x_min, y_min, x_max - x_min, y_max - y_min],
                confidence=box.conf.item(),
            )
            sample['ground_truth']['detections'].append(detection)

In [None]:
add_yolo_pred_to_ground_truth(dataset, model)

# Дубликаты

In [None]:
# Hash every file once, then tag all samples whose hash occurs more than once.
# The whole step is skipped when the `filehash` field already exists.
if not dataset.has_field("filehash"):
    for sample in dataset.iter_samples(progress=True, autosave=True):
        sample["filehash"] = fou.compute_filehash(sample.filepath)

    # Read only the hash values instead of loading every full sample.
    filehash_counts = collections.Counter(dataset.values("filehash"))
    duplicates_hashes = [filehash
                         for filehash, count in filehash_counts.items() if count > 1]
    dataset.match(F("filehash").is_in(duplicates_hashes)).tag_samples("duplicates")

duplicates = dataset.match_tags("duplicates")
# session.view = duplicates.sort_by("filehash")
len(duplicates)

In [None]:
len(duplicates)

In [None]:
view = view.exclude(duplicates)
session.view = view

In [None]:
view

In [None]:
view = dataset.view()
session.view = view

# Отношение сторон изображений

In [None]:
#!pip install ipywidgets==7.5

In [None]:
plot = fo.NumericalHistogram(F("metadata.width") / F("metadata.height"))
#session.plots.attach(plot)
plot.show(title="Отношение сторон изображений")

# Embeddings

In [None]:
# if dataset.has_field("image_embeddings"):
#     dataset.delete_sample_field("image_embeddings")

# Compute image embeddings once and keep them in the `image_embeddings` field.
if not dataset.has_field("image_embeddings"):
    model_name = "clip-vit-base32-torch"
    # model_name = "mobilenet-v2-imagenet-torch"

    # Use a dedicated name so the notebook-level `model` (the YOLO detector
    # passed to the prediction helpers) is not overwritten by the zoo model.
    embedding_model = foz.load_zoo_model(model_name)
    dataset.compute_embeddings(embedding_model, embeddings_field="image_embeddings", batch_size=8, num_workers=8)

In [None]:
dataset.has_field("image_embeddings")

# Похожие изображения

In [None]:
#if not dataset.get_field("uniqueness"):
fob.compute_uniqueness(dataset, embeddings="image_embeddings")

In [None]:
fig = px.ecdf(view.values("uniqueness"), ecdfnorm=None, title="Uniqueness CDF")
fig.update_layout(xaxis_title="uniqueness", yaxis_title="samples", showlegend=False)

In [None]:
similar_images_view = view.filter_field("uniqueness", F() < 0.3)
#session.view = similar_images_view.sort_by("uniqueness", reverse=True)
similar_images_view.count(), view.exclude(similar_images_view).count()

In [None]:
session.view = similar_images_view

In [None]:
view = view.exclude(similar_images_view)
session.view = view

In [None]:
view = dataset.view()

In [None]:
view

# Визуализация эмбеддингов

In [None]:
%pip show jupyterlab

In [None]:
!pip install umap

In [None]:
!pip show ipywidget

In [None]:
!pip install umap-learn

In [None]:
!pip install pip -U

In [None]:
!pip install importlib_metadata

In [None]:
from importlib.metadata import version, PackageNotFoundError

In [None]:
!pip install fiftyone -U

In [None]:
dataset.has_brain_run("image_embeddings")

In [None]:
# Compute the 2-D embedding visualization only when no brain run is stored
# under the "image_embeddings" key; afterwards the stored results are loaded.
# To force a recompute, delete the run first:
#     dataset.delete_brain_run("image_embeddings")
visualization_exists = dataset.has_brain_run("image_embeddings")
if not visualization_exists:
    fob.compute_visualization(
        view,
        embeddings="image_embeddings",
        brain_key="image_embeddings",
        num_dims=2,
        num_workers=8,
        seed=0,
    )
results = dataset.load_brain_results("image_embeddings")

In [None]:
plot = results.visualize(axis_equal=True)
session.plots.attach(plot)
plot.show()

In [None]:
dataset.save()

In [None]:
view.match_tags('yes')

# Обработка классов

In [None]:
dataset.distinct("ground_truth.detections.label")

In [None]:
classes = [
    "bird"
]

view = view.filter_labels("ground_truth", F("label").is_in(classes))
view.distinct("ground_truth.detections.label")

In [None]:
labels_map = {label: "vehicle" for label in view.distinct("ground_truth.detections.label")}
view = view.map_labels("ground_truth", labels_map)
session.view = view

# Перекрывающиеся детекции

In [None]:
if not any(dataset.values("ground_truth.detections.max_iou", unwind=True)):
    compute_max_ious(view, "ground_truth")

In [None]:
plot = fo.NumericalHistogram("ground_truth.detections.max_iou")
session.plots.attach(plot)
plot.show(title="Максимальное пересечение детекций")

In [None]:
overlapping_view = view.filter_labels("ground_truth", F("max_iou") > 0.6)
session.view = overlapping_view
len(overlapping_view)

In [None]:
# view = view.exclude(overlapping_view)
session.view = view

# Площадь объектов

In [None]:
# Declare the per-detection `bbox_area` attribute on the dataset if it is not
# there yet.
if not dataset.has_field("ground_truth.detections.bbox_area"):
    dataset.add_sample_field("ground_truth.detections.bbox_area", fo.FloatField)
# bounding_box[2] and bounding_box[3] are the box width and height; their
# product is scaled by 100, matching the "percent" axis label below.
# NOTE(review): `set_field` is applied to the view only — presumably the value
# is not written back to the dataset; confirm if persistence is expected.
view = view.set_field("ground_truth.detections.bbox_area", 100 * F("bounding_box")[2] * F("bounding_box")[3])
session.view = view.sort_by("ground_truth.detections.bbox_area", reverse=False)

# Histogram of the detection areas, attached to the App session.
plot = fo.NumericalHistogram("ground_truth.detections.bbox_area", bins=100, xlabel="percent")
session.plots.attach(plot)
plot.show(title="Площадь детекций")

# Центры детекций

In [None]:
# Path of the ground-truth bounding boxes inside each sample.
box_path = "ground_truth.detections.bounding_box"

# Box centre along each axis = box origin + half of the box extent.
center_x_expr = F(box_path)[0] + F(box_path)[2] / 2
center_y_expr = F(box_path)[1] + F(box_path)[3] / 2

center_x = view.values(center_x_expr, unwind=True)
center_y = view.values(center_y_expr, unwind=True)
points = list(zip(center_x, center_y))

# Per-detection area values, used as marker sizes in the scatter plot.
sizes = view.values(F("ground_truth.detections.bbox_area"), unwind=True)

In [None]:
plot = fo.scatterplot(points, sizes=sizes)
aspect_ratio = np.divide(dataset.values("metadata.width"), dataset.values("metadata.height")).mean()
plot.show(title="Центры детекций", width=800, height=800 / aspect_ratio)

# Разбивка датасета

In [None]:
#final_dataset_dir = Path("/workspace/storage_labs/3030/MukhametshinR/data/vehicle-analytics/final-detection/yolov8")

In [None]:
view = dataset.view()

In [None]:
session.view = view

In [None]:
sparse = view.match_tags("sparse")

In [None]:
sparse

In [None]:
train_split, extra_split = train_test_split(sparse.values("id"), test_size=0.5, random_state=0)

In [None]:
view = dataset.view()
sparse = view.match_tags("sparse2")
train_split, extra2_split = train_test_split(sparse.values("id"), test_size=0.7, random_state=0)

In [None]:
view = dataset.view()
sparse = view.match_tags("sparse3")
train_split, extra3_split = train_test_split(sparse.values("id"), test_size=0.8, random_state=0)

In [None]:
view.select(extra_split).tag_samples("extra")
view.select(extra2_split).tag_samples("extra")
view.select(extra3_split).tag_samples("extra")

In [None]:
# Clear any previous split tags before assigning new ones.
view.untag_samples(["train", "val", "test"])

# Split the sample ids 85 % / 15 % into train and val; the fixed
# `random_state` keeps the split reproducible.
train_split, val_split = train_test_split(view.values("id"), test_size=0.15, random_state=0)
#val_split, test_split = train_test_split(test_split, test_size=2 / 3, random_state=0)

# Tag the selected samples with the name of their split.
view.select(train_split).tag_samples("train")
#view.select(test_split).tag_samples("test")
view.select(val_split).tag_samples("val")

# assert not dataset.match_tags("train").match_tags("test")
# assert not dataset.match_tags("train").match_tags("val")
# assert not dataset.match_tags("val").match_tags("test")

# Show how many samples carry each tag.
view.count_sample_tags()

In [None]:
final_dataset_dir = Path('/workspace/storage/db/ppe/set13_v2')

In [None]:
# view.export(
#     export_dir=str(final_dataset_dir),
#     dataset_type=fo.types.YOLOv5Dataset,
#     split="kitti",
# )

In [None]:
session.view = view

In [None]:
no_det = view.match_tags("no detections")
bad = view.match_tags("bad")
extra = view.match_tags("extra")

In [None]:
my_set = view.exclude(no_det).exclude(bad).exclude(extra)

In [None]:
my_set

In [None]:
dataset_dir = '/workspace/storage/3030/VoynovD/dump/data/landfill_set_part15'

# Append the image paths of each split of `my_set` to `<split>_corr.txt`.
for split in ["train"]:
    # Listed paths have the form ./images/<parent folder>/<file name>.
    filepaths = ["./images/" + Path(path).parts[-2] + '/' + Path(path).name for path in my_set.match_tags(split).values("filepath")]

    split_file = Path(dataset_dir) / f"{split}_corr.txt"

    # Keep the entries already in the file and append the new ones. The table
    # gets its own name so the loop variable `split` stays the split string.
    frames = []
    if split_file.exists():
        frames.append(pd.read_csv(split_file, names=["filepath"]))
    frames.append(pd.DataFrame({"filepath": filepaths}))
    split_df = pd.concat(frames, axis=0)

    # One path per line, without index column or header row.
    split_df.to_csv(split_file, index=False, header=False)

In [None]:
# Append the image paths of each split of `view` to `<split>_clear.txt`.
for split in ["train", 'val']:
    # Listed paths have the form ./images/<parent folder>/<file name>.
    filepaths = ["./images/" + Path(path).parts[-2] + '/' + Path(path).name for path in view.match_tags(split).values("filepath")]

    split_file = Path(dataset_dir) / f"{split}_clear.txt"

    # Keep the entries already in the file and append the new ones. The table
    # gets its own name so the loop variable `split` stays the split string.
    frames = []
    if split_file.exists():
        frames.append(pd.read_csv(split_file, names=["filepath"]))
    frames.append(pd.DataFrame({"filepath": filepaths}))
    split_df = pd.concat(frames, axis=0)

    # One path per line, without index column or header row.
    split_df.to_csv(split_file, index=False, header=False)

In [None]:
# Append the image paths of the train split of `view` to `train_clear.txt`.
for split in ['train']:
    # Listed paths have the form ./images/<file name>.
    filepaths = ["./images/" + Path(path).name for path in view.match_tags(split).values("filepath")]

    split_file = Path(dataset_dir) / "train_clear.txt"

    # Keep the entries already in the file and append the new ones. The table
    # gets its own name so the loop variable `split` stays the split string.
    frames = []
    if split_file.exists():
        frames.append(pd.read_csv(split_file, names=["filepath"]))
    frames.append(pd.DataFrame({"filepath": filepaths}))
    split_df = pd.concat(frames, axis=0)

    # One path per line, without index column or header row.
    split_df.to_csv(split_file, index=False, header=False)

In [None]:
filepaths

In [None]:
for split in ["train", "val"]:
    filepaths = [f"./images/{split}/" + Path(path).name for path in view.match_tags(split).values("filepath")]

    # split_file = final_dataset_dir / f"{split}.txt"

    # split = pd.DataFrame()
    # if split_file.exists():
    #     split = pd.read_csv(split_file, names=["filepath"])
    # split = pd.concat((split, pd.DataFrame({"filepath": filepaths})), axis=0)

    # split.to_csv(split_file, index=None, header=None)

# new export

In [None]:
view

In [None]:
#classes = ['person']
# Export the current view as a YOLOv5 dataset into `export_dir`.
export_dir = "/workspace/storage/3030/VoynovD/dump/data/extrabird/mva2023_sod4bird_train_yolo"
# NOTE(review): the labels are read from the "detections" field here, while the
# other cells of this notebook use "ground_truth" — confirm the field name for
# the dataset being exported.
view.export(
    export_dir=export_dir,
    dataset_type=fo.types.YOLOv5Dataset,
    #split=split,
    label_field="detections",
    #classes=classes,
    )

In [None]:
high_resolution = view.match_tags("high_resolution")

In [None]:
view = view.exclude(high_resolution)
session.view = view

In [None]:
view.match_tags("test").tag_samples("train")

In [None]:
view.untag_samples("test")

In [None]:
session.view = view

In [None]:
new_dataset = view.clone(name="set13_v2")
new_dataset.persistent = True

In [None]:
#classes = ['person']
splits = ["train", "val"]
export_dir = "/workspace/storage/3030/VoynovD/dump/data/bird_set"

In [None]:
# Export each tagged split into the same YOLOv5 dataset under `export_dir`.
for split in splits:
    split_view = view.match_tags(split)
    print(f'{split} = {len(split_view)} samples')

    # Labels come from `ground_truth`, restricted to the names in `classes`.
    split_view.export(
        export_dir=export_dir,
        dataset_type=fo.types.YOLOv5Dataset,
        split=split,
        label_field="ground_truth",
        classes=classes,
    )

In [None]:
# Write the `ground_truth` labels of each split to a COCO JSON file.
# Only `labels_path` is given (no `export_dir`), and the `abs_paths=True`
# option is not passed.
for split in splits:
    split_view = view.match_tags(split)
    print(f'{split} = {len(split_view)} samples')

    annotations_file = f"{export_dir}/annotations/{split}.json"
    split_view.export(
        dataset_type=fo.types.COCODetectionDataset,
        labels_path=annotations_file,
        label_field="ground_truth",
    )

In [None]:
print(fo.list_datasets())

In [None]:
view = view.match_tags("good")
view

In [None]:
final_dataset_dir = Path('/workspace/storage/db/emergency-search/uzaodd_clear')

In [None]:
# Append the image paths of the current view to `train_good.txt`, keeping any
# entries the file already contains.
filepaths = ["./images/" + Path(path).name for path in view.values("filepath")]
split_file = final_dataset_dir / "train_good.txt"

new_rows = pd.DataFrame({"filepath": filepaths})
# Start from the existing file content when there is one, else from nothing.
split = pd.read_csv(split_file, names=["filepath"]) if split_file.exists() else pd.DataFrame()
split = pd.concat((split, new_rows), axis=0)

# One path per line, without index column or header row.
split.to_csv(split_file, index=None, header=None)

In [None]:
# Append the image paths of every split of `view` to a list file.
for split in ["train", "val", "test"]:
    # Listed paths have the form ./images/<split>/<file name>.
    filepaths = [f"./images/{split}/" + Path(path).name for path in view.match_tags(split).values("filepath")]

    # NOTE(review): all three splits are appended to the same train_clear.txt;
    # if one file per split was intended this should be f"{split}_clear.txt" —
    # confirm before changing.
    split_file = final_dataset_dir / "train_clear.txt"

    # Keep the entries already in the file and append the new ones. The table
    # gets its own name so the loop variable `split` stays the split string.
    frames = []
    if split_file.exists():
        frames.append(pd.read_csv(split_file, names=["filepath"]))
    frames.append(pd.DataFrame({"filepath": filepaths}))
    split_df = pd.concat(frames, axis=0)

    # One path per line, without index column or header row.
    split_df.to_csv(split_file, index=False, header=False)

In [None]:
fo.delete_dataset('WiSARDv1')

In [None]:
new_dataset.default_classes

In [None]:
view = new_dataset.view()