In [1]:
%matplotlib notebook
%load_ext autoreload

# Analysis

Notebook containing code to create our plots

In [2]:
import pandas as pd
from tqdm.autonotebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
from typing import Dict


plt.style.use(['science', 'notebook'])

  from tqdm.autonotebook import tqdm


## Preambula
We are going to run some computations, to save time let's create a decorator that stores and read from disk

In [3]:
def maybe_load_from_disk(location: Path):
    def decorator(func):
        def _inner(*args, **kwargs):
            if location.exists():
                print(f"[INFO] loading from {location}")
                with open(location, "rb") as f:
                    return pickle.load(f)
            res = func(*args, **kwargs)
            with open(location, "wb") as f:
                print(f"[INFO] saving to {location}")
                pickle.dump(res, f)
            return res
        return _inner
    return decorator
    

In [4]:
df = pd.read_csv("../metadata/categories.csv", index_col=0)

df.head()

Unnamed: 0_level_0,category
dataset,Unnamed: 1_level_1
hand-gestures-jps7z,real world
smoke-uvylj,real world
wall-damage,real world
corrosion-bi3q3,real world
excavators-czvg9,real world


# Sizes

Find out total dataset sizes, we have `rf100` download at `rf100`. We can use the index to iterate and get the size of each folder

In [5]:
from pathlib import Path
from functools import reduce
from collections import defaultdict

RF100_ROOT = Path('../rf100')

def count_num_files(dataset: str):
    dataset_path = RF100_ROOT / dataset
    sub_dirs = ["train", "valid", "test"]
    num_files = defaultdict(int)
    for sub_dir in sub_dirs:
        sub_dir_path = dataset_path / sub_dir / 'images'
        num_files[sub_dir] += sum([1 if curr.is_file() else 0 for curr in sub_dir_path.iterdir()])
    
    return pd.Series(num_files)

In [6]:
# @maybe_load_from_disk(Path('../temp/df.pkl'))
def apply_num_files(df):
    df[["train", "test", "valid"]] = df.apply(lambda row: count_num_files(row.name), axis=1)[["train", "test", "valid"]]
    df["size"] = df["train"] +  df["test"] +  df["valid"]
    return df

df = apply_num_files(df)

We now want to add the number of classes for each dataset, obtained before hand

In [7]:
import json
# obtained by running `export ROBOFLOW_API_KEY=.... && python ./scripts/get_labels_names.py` 
with open("../metadata/labels_names.json", 'r') as f: 
    labels_names = json.load(f)

In [8]:
df

Unnamed: 0_level_0,category,train,test,valid,size
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hand-gestures-jps7z,real world,642,94,178,914
smoke-uvylj,real world,522,76,148,746
wall-damage,real world,325,40,96,461
corrosion-bi3q3,real world,840,105,304,1249
excavators-czvg9,real world,2244,144,267,2655
...,...,...,...,...,...
axial-mri,electromagnetic,253,39,79,371
gynecology-mri,electromagnetic,2122,253,526,2901
brain-tumor-m2pbp,electromagnetic,6930,990,1980,9900
bone-fracture-7fylg,electromagnetic,326,44,88,458


In [9]:
def get_num_classes_per_dataset(labels_names: Dict) -> pd.DataFrame:
    records = []
    for item in labels_names:
        num_classes = len(item["classes"])
        records.append({
            "dataset" : item["name"],
            "num_classes": num_classes
        })
    return pd.DataFrame.from_records(records).set_index("dataset")

df = df.join(get_num_classes_per_dataset(labels_names))

Finally, we also want to add the yolov5/7 - glip results

In [10]:
results = pd.read_csv("../results.csv", index_col=0)
df = df.join(results)

In [11]:
df

Unnamed: 0_level_0,category,train,test,valid,size,num_classes,yolov5,yolov7,glip
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hand-gestures-jps7z,real world,642,94,178,914,14.0,0.9950,0.9950,
smoke-uvylj,real world,522,76,148,746,1.0,0.9590,0.9620,0.431
wall-damage,real world,325,40,96,461,3.0,0.5000,0.4340,
corrosion-bi3q3,real world,840,105,304,1249,3.0,0.7680,0.7640,0.003
excavators-czvg9,real world,2244,144,267,2655,3.0,0.9460,0.8950,0.274
...,...,...,...,...,...,...,...,...,...
axial-mri,electromagnetic,253,39,79,371,2.0,0.6380,0.5490,0.039
gynecology-mri,electromagnetic,2122,253,526,2901,3.0,0.3230,0.1710,0.000
brain-tumor-m2pbp,electromagnetic,6930,990,1980,9900,3.0,0.7680,0.8090,0.003
bone-fracture-7fylg,electromagnetic,326,44,88,458,4.0,0.0851,0.0896,0.000


Let's see how many of them there are for each category

In [12]:
df

Unnamed: 0_level_0,category,train,test,valid,size,num_classes,yolov5,yolov7,glip
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hand-gestures-jps7z,real world,642,94,178,914,14.0,0.9950,0.9950,
smoke-uvylj,real world,522,76,148,746,1.0,0.9590,0.9620,0.431
wall-damage,real world,325,40,96,461,3.0,0.5000,0.4340,
corrosion-bi3q3,real world,840,105,304,1249,3.0,0.7680,0.7640,0.003
excavators-czvg9,real world,2244,144,267,2655,3.0,0.9460,0.8950,0.274
...,...,...,...,...,...,...,...,...,...
axial-mri,electromagnetic,253,39,79,371,2.0,0.6380,0.5490,0.039
gynecology-mri,electromagnetic,2122,253,526,2901,3.0,0.3230,0.1710,0.000
brain-tumor-m2pbp,electromagnetic,6930,990,1980,9900,3.0,0.7680,0.8090,0.003
bone-fracture-7fylg,electromagnetic,326,44,88,458,4.0,0.0851,0.0896,0.000


In [13]:
df["num_datasets"] = 1
aggretations = {
    "train" : "sum", "test" : "sum", "valid" : "sum", "size" : "sum", "num_classes" : "sum",
    "yolov5": "mean", "yolov7": "mean",
    "num_datasets" : "sum"           
                }
grouped_df = df.groupby("category").agg(aggretations).reset_index()
grouped_df = grouped_df.sort_values("size")
grouped_df["perc"] = grouped_df["size"] / grouped_df["size"].sum()
grouped_df

Unnamed: 0,category,train,test,valid,size,num_classes,yolov5,yolov7,num_datasets,perc
0,aerial,6643,1100,1940,9683,24.0,0.636,0.504286,7,0.043141
6,videogames,8233,1127,2219,11579,88.0,0.859857,0.796286,7,0.051588
3,microscopic,9576,1273,2529,13378,28.0,0.650727,0.59166,11,0.059603
5,underwater,12633,1794,3576,18003,39.0,0.56,0.6624,5,0.080209
1,documents,17866,2350,4597,24813,90.0,0.716125,0.7225,8,0.110549
2,electromagnetic,25398,3669,7314,36381,41.0,0.689675,0.607383,12,0.162088
4,real world,78747,10331,21537,110615,495.0,0.748008,0.702581,50,0.492823


Now, we want to use the order of the categories to sort our original dataframe, till will make it easier to visualize them

In [14]:
df_with_ordered_categories = pd.DataFrame(index=grouped_df.index, data={"category": grouped_df.category})
df = df_with_ordered_categories.merge(df.reset_index("dataset"), on="category", how="inner")
df = df.set_index("dataset")
df

Unnamed: 0_level_0,category,train,test,valid,size,num_classes,yolov5,yolov7,glip,num_datasets
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
aerial-pool,aerial,673,96,177,946,7.0,0.513,0.791,0.013,1
secondary-chains,aerial,103,16,43,162,1.0,0.341,0.312,0.000,1
aerial-spheres,aerial,318,51,104,473,6.0,0.993,0.539,0.000,1
soccer-players-5fuqs,aerial,114,16,33,163,3.0,0.660,0.399,0.065,1
weed-crop-aerial,aerial,823,118,235,1176,2.0,0.820,0.615,0.027,1
...,...,...,...,...,...,...,...,...,...,...
bees-jt5in,real world,5640,836,1604,8080,1.0,0.891,0.680,0.009,1
sedimentary-features-9eosf,real world,156,21,45,222,5.0,0.327,0.244,0.000,1
currency-v4f8j,real world,576,82,155,813,10.0,0.583,0.514,0.099,1
trail-camera,real world,941,131,239,1311,2.0,0.966,0.969,0.512,1


Let's store it to disk

In [15]:
df.to_csv("../metadata/datasets_stats.csv")

## Bounding boxes stats

Cool, so we may also want to plot/show the mean size of bboxes for each dataset

Let's create something to read all the annotations. We can take advantage of PyTorch Dataloader to use multiple cores and make the computation go brum brum

In [14]:
from torch.utils.data import Dataset, DataLoader
import torch

IGNORE = -1
# all images are resized to 640
size = (640, 640)

class AnnotationDataset(Dataset):
    def __init__(self, root: Path, fmt: str = "txt"):
        super().__init__()
        self.annotations_paths = list(root.glob(f"**/*.{fmt}"))
    
    def maybe_convert_polygon_to_bbox(self, line: str):
        splitted = line.split(" ")
        label, rest = splitted[0], splitted[1:]
        label = torch.as_tensor(int(label))
        is_bbox = len(rest) == 4
        if is_bbox:
            return  label, torch.as_tensor([float(el) for el in rest])
        else:
            # must be a polygon
            poly = torch.as_tensor([float(el) for el in rest])
            poly = poly.view(-1, 2)
            xmax, ymax = torch.max(poly, dim=0).values
            xmin, ymin = torch.min(poly, dim=0).values
            width, heigh = xmax - xmin, ymax - ymin
            xcenter, ycenter =  xmin + width / 2, ymin + heigh / 2
            return label, torch.stack([xcenter, ycenter, width, heigh])
            
    def __getitem__(self, idx: int):
        with self.annotations_paths[idx].open('r') as f:
            for line in f.readlines():
                label, bbox = self.maybe_convert_polygon_to_bbox(line)
                return label, bbox 
            return  torch.tensor(IGNORE), torch.as_tensor([IGNORE, IGNORE, IGNORE, IGNORE], dtype=torch.float32)
        
    def __len__(self):
        return len(self.annotations_paths)

Let's try it out

In [15]:
ds = AnnotationDataset(RF100_ROOT / df.index[0] / 'test' / 'labels')
ds[0]

(tensor(-1), tensor([-1., -1., -1., -1.]))

gg. Now we can use a torch `DataLoader` to speed up stuff. Let's define a couple of functions to help us out

In [16]:
def get_areas_and_labels(dataset: str, split: str ="test"):
    ds = AnnotationDataset(RF100_ROOT / dataset / split / 'labels')
    dl = DataLoader(ds, 
#                     num_workers=1, 
                    batch_size=128)

    all_areas = None
    all_labels = None
    for (labels, bboxes) in dl:
        bboxes = bboxes[labels != IGNORE] 
        # area = w * h
        areas = bboxes[:,2] * bboxes[:,3]
        all_areas = torch.cat((all_areas, areas)) if all_areas is not None else areas
        all_labels = torch.cat((all_labels, labels)) if all_labels is not None else labels

    return all_areas, all_labels


def compute_stats(areas: torch.Tensor):
    # let's compute the number of small, medium and large bbox
    bins = torch.histc(areas, bins=3, min=0, max=0.3)
    return areas.mean(), areas.std(), *bins

In [17]:
@maybe_load_from_disk(Path("../temp/bbox.pkl"))
def create_bbox_df(df):
    records = []
    dataset_bar = tqdm(df.index)
    for dataset in dataset_bar:
        dataset_bar.set_postfix_str(dataset)
        split_bar = tqdm(["train", "test", "valid"], leave=False)
        for split in split_bar:
            split_bar.set_postfix_str(split)
            areas, labels = get_areas_and_labels(dataset, split)
            vals = compute_stats(areas)
            vals = [val.float().item() for val in vals]
            area_mean, area_std, num_small, num_medium, num_large = vals
            labels = labels[labels != IGNORE]
            records.append(dict(
                                num_classes=labels.unique().numpy().shape[0],
                                labels=labels.unique().numpy(),
                                areas=areas.numpy(),
                                area_mean=area_mean, 
                                area_std=area_std, 
                                num_small=num_small, 
                                num_medium=num_medium, 
                                num_large=num_large,
                                split=split,
                                dataset=dataset,
                            )
                          )

    return pd.DataFrame.from_records(records)

In [18]:
bbox_df = create_bbox_df(df)
bbox_df

[INFO] loading from ../temp/bbox.pkl


Unnamed: 0,num_classes,labels,areas,area_mean,area_std,num_small,num_medium,num_large,split,dataset
0,5,"[0, 1, 2, 3, 4]","[0.0002142334, 0.0066253664, 0.0006176758, 0.0...",0.006662,0.011051,663.0,2.0,0.0,train,aerial-pool
1,5,"[0, 1, 2, 3, 4]","[0.0030529783, 0.011803589, 0.001994629, 0.001...",0.009999,0.011440,93.0,0.0,0.0,test,aerial-pool
2,5,"[0, 1, 2, 3, 4]","[0.019512938, 0.008712158, 0.000324707, 0.0032...",0.009305,0.015495,171.0,1.0,0.0,valid,aerial-pool
3,1,[0],"[0.008886719, 0.031712037, 0.010894775, 0.0251...",0.029541,0.044291,95.0,7.0,1.0,train,secondary-chains
4,1,[0],"[0.002546997, 0.02175232, 0.05264099, 0.108736...",0.053974,0.071981,13.0,2.0,1.0,test,secondary-chains
...,...,...,...,...,...,...,...,...,...,...
295,2,"[0, 1]","[0.21078613, 0.24919434, 0.079537965, 0.013065...",0.120005,0.115028,78.0,19.0,22.0,test,trail-camera
296,2,"[0, 1]","[0.28198242, 0.01100586, 0.23198852, 0.0040466...",0.117687,0.134675,145.0,41.0,27.0,valid,trail-camera
297,2,"[0, 1]","[0.0015747071, 0.009613037, 0.020339966, 0.004...",0.011733,0.012671,679.0,0.0,0.0,train,cell-towers
298,2,"[0, 1]","[0.0068237307, 0.015996095, 0.04102173, 0.0031...",0.009002,0.007607,95.0,0.0,0.0,test,cell-towers


In [19]:
train_df = bbox_df[bbox_df["split"] == "train"].reset_index(drop=True)
valid_df = bbox_df[bbox_df["split"] == "valid"].reset_index(drop=True)
test_df = bbox_df[bbox_df["split"] == "test"].reset_index(drop=True)

In [20]:
train_df

Unnamed: 0,num_classes,labels,areas,area_mean,area_std,num_small,num_medium,num_large,split,dataset
0,5,"[0, 1, 2, 3, 4]","[0.0002142334, 0.0066253664, 0.0006176758, 0.0...",0.006662,0.011051,663.0,2.0,0.0,train,aerial-pool
1,1,[0],"[0.008886719, 0.031712037, 0.010894775, 0.0251...",0.029541,0.044291,95.0,7.0,1.0,train,secondary-chains
2,5,"[0, 2, 3, 4, 5]","[0.00065612793, 0.0007086182, 0.00065612793, 0...",0.000619,0.000080,318.0,0.0,0.0,train,aerial-spheres
3,1,[1],"[0.006867676, 0.006181641, 0.009504395, 0.0061...",0.005348,0.001906,114.0,0.0,0.0,train,soccer-players-5fuqs
4,2,"[0, 1]","[0.0053552245, 0.0007440186, 0.004049072, 0.00...",0.003095,0.008399,822.0,1.0,0.0,train,weed-crop-aerial
...,...,...,...,...,...,...,...,...,...,...
95,1,[0],"[0.0018530274, 0.009246826, 0.00057739264, 0.0...",0.005481,0.006934,4747.0,1.0,0.0,train,bees-jt5in
96,5,"[0, 1, 2, 3, 4]","[0.0069213873, 0.0050097657, 0.002090454, 0.00...",0.008032,0.005847,149.0,0.0,0.0,train,sedimentary-features-9eosf
97,10,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]","[0.052851565, 0.021710815, 0.17441045, 0.15909...",0.210679,0.279003,256.0,127.0,70.0,train,currency-v4f8j
98,2,"[0, 1]","[0.053240966, 0.014560546, 0.06569824, 0.02523...",0.112492,0.124287,579.0,153.0,117.0,train,trail-camera


check if we have all the labels in all splits

In [21]:
import numpy as np 

all_missing_labels = []
all_is_correct = []
for idx, (train_labels, valid_labels, test_labels) in tqdm(
                                zip(train_df.index,
                                    zip(train_df["labels"].values,  valid_df["labels"].values,  test_df["labels"].values
                                       )
                                   ),
                                total=len(train_df)):
    # see https://numpy.org/doc/stable/reference/generated/numpy.setdiff1d.html
    missing_from_valid = np.setdiff1d(valid_labels, train_labels)
    missing_from_test = np.setdiff1d(test_labels, train_labels)
    missing_labels = np.array([])
    
    if missing_from_valid.shape[0] > 0:
        missing_labels = np.concatenate((missing_labels, missing_from_valid))
    if missing_from_test.shape[0] > 0:
        missing_labels = np.concatenate((missing_labels, missing_from_test))
    
    is_correct = missing_labels.shape[0] == 0
    all_missing_labels.append(missing_labels)
    all_is_correct.append(is_correct)

train_df["missing_labels"] = all_missing_labels
train_df["is_correct"] = all_is_correct


  0%|          | 0/100 [00:00<?, ?it/s]

In [22]:
train_df

Unnamed: 0,num_classes,labels,areas,area_mean,area_std,num_small,num_medium,num_large,split,dataset,missing_labels,is_correct
0,5,"[0, 1, 2, 3, 4]","[0.0002142334, 0.0066253664, 0.0006176758, 0.0...",0.006662,0.011051,663.0,2.0,0.0,train,aerial-pool,[],True
1,1,[0],"[0.008886719, 0.031712037, 0.010894775, 0.0251...",0.029541,0.044291,95.0,7.0,1.0,train,secondary-chains,[],True
2,5,"[0, 2, 3, 4, 5]","[0.00065612793, 0.0007086182, 0.00065612793, 0...",0.000619,0.000080,318.0,0.0,0.0,train,aerial-spheres,[],True
3,1,[1],"[0.006867676, 0.006181641, 0.009504395, 0.0061...",0.005348,0.001906,114.0,0.0,0.0,train,soccer-players-5fuqs,[],True
4,2,"[0, 1]","[0.0053552245, 0.0007440186, 0.004049072, 0.00...",0.003095,0.008399,822.0,1.0,0.0,train,weed-crop-aerial,[],True
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,[0],"[0.0018530274, 0.009246826, 0.00057739264, 0.0...",0.005481,0.006934,4747.0,1.0,0.0,train,bees-jt5in,[],True
96,5,"[0, 1, 2, 3, 4]","[0.0069213873, 0.0050097657, 0.002090454, 0.00...",0.008032,0.005847,149.0,0.0,0.0,train,sedimentary-features-9eosf,[],True
97,10,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]","[0.052851565, 0.021710815, 0.17441045, 0.15909...",0.210679,0.279003,256.0,127.0,70.0,train,currency-v4f8j,[],True
98,2,"[0, 1]","[0.053240966, 0.014560546, 0.06569824, 0.02523...",0.112492,0.124287,579.0,153.0,117.0,train,trail-camera,[],True


In [23]:
wrong_df = train_df[~train_df["is_correct"]]

In [24]:
wrong_df[["dataset", "missing_labels"]].to_csv("missing.csv")

Let's add all the prev informations

merging areas

In [25]:
records = {}

for idx, train_area, valid_area, test_area in zip(train_df["dataset"], 
                                                  train_df["areas"].values, 
                                                  valid_df["areas"].values,
                                                  test_df["areas"].values):
    records[idx] = np.concatenate([train_area, valid_area, test_area])
    

areas_series = pd.Series(records)
areas_series

aerial-pool                   [0.0002142334, 0.0066253664, 0.0006176758, 0.0...
secondary-chains              [0.008886719, 0.031712037, 0.010894775, 0.0251...
aerial-spheres                [0.00065612793, 0.0007086182, 0.00065612793, 0...
soccer-players-5fuqs          [0.006867676, 0.006181641, 0.009504395, 0.0061...
weed-crop-aerial              [0.0053552245, 0.0007440186, 0.004049072, 0.00...
                                                    ...                        
bees-jt5in                    [0.0018530274, 0.009246826, 0.00057739264, 0.0...
sedimentary-features-9eosf    [0.0069213873, 0.0050097657, 0.002090454, 0.00...
currency-v4f8j                [0.052851565, 0.021710815, 0.17441045, 0.15909...
trail-camera                  [0.053240966, 0.014560546, 0.06569824, 0.02523...
cell-towers                   [0.0015747071, 0.009613037, 0.020339966, 0.004...
Length: 100, dtype: object

In [26]:
bbox_df_grouped["areas"] = areas_series
bbox_df_grouped.head(2)

NameError: name 'bbox_df_grouped' is not defined

In [None]:
df_with_bbox = df.join(bbox_df_grouped, how="inner")

In [None]:
df_with_bbox

In [None]:
fig = plt.figure(figsize=(25, 40))
plot = sns.boxplot(data=df_with_bbox["areas"], orient='h')
plt.xlabel("bbox")
plt.ylabel("dataset")
plot.set_yticklabels(df.index)
plt.gcf().savefig("plot_all_train.png")

In [None]:
plt.gcf().savefig("plot.png")

### Clip Embeddings

I have sampled 512 points per dataset and encoded them with CLIP. Let's load them, avg them and plot in 2D (after PCA). Let's do it! 

In [None]:
import clip
from typing import Callable
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
RF100_ROOT = Path("./rf100/")

class ImageDataset(Dataset):
    def __init__(self, root: Path, fmt: str = "jpg", transform: Callable = None):
        super().__init__()
        self.images_path = list(root.glob(f"**/*.{fmt}"))
        self.transform = transform or ToTensor()

    def __getitem__(self, idx: int):
        image = Image.open(self.images_path[idx]).convert("RGB")
        return self.transform(image), idx, str(self.images_path[idx])

    def __len__(self):
        return len(self.images_path)

def pca(x, k, center=True):
    if center:
        m = x.mean(0, keepdim=True)
        s = x.std(0, unbiased=False, keepdim=True)
        x -= m
        x /= s
    # why pca related to svd? https://www.cs.cmu.edu/~elaw/papers/pca.pdf chap VI
    U, S, V = torch.linalg.svd(x) 
    reduced = torch.mm(x, V[:k].T)

    return reduced

@maybe_load_from_disk(Path("./embeddings_means.pkl"))
def get_embeddings(df):
    MAX_BATCHES = 2
    model, preprocess = clip.load("ViT-B/32", device=device, jit=True)
    records = []
    for dataset in tqdm(df.index):
        ds = ImageDataset(RF100_ROOT / dataset / "train/images", transform=preprocess)
        dl = DataLoader(
            ds, batch_size=256, num_workers=8, pin_memory=True, shuffle=True
        )  # we shuffle and we sample MAX_BATCHES batches per dataset
        i = 0
        means = None
        for (x, _, _) in dl:
            with torch.no_grad():
                x = x.to("cuda")
                x = model.encode_image(x)
                means = torch.vstack((means, x.mean(0))) if means is not None else x.mean(0)
            i += 1
            if i >= MAX_BATCHES: break
        if len(means.shape) == 1: means = means.unsqueeze(0)
        
        records.append(dict(dataset=dataset, clip_mean=means.mean(0).squeeze().cpu().numpy()))
            
    return pd.DataFrame.from_records(records, index=df.index)




In [None]:
embed_df = get_embeddings(df)
embed_df.head(2)

In [None]:
clip_means = torch.stack([torch.from_numpy(el) for el in embed_df.clip_mean.tolist()])

In [None]:
clip_means_reduced = pca(clip_means.float(), k=2)
clip_means_reduced.shape

In [None]:
fig = plt.figure(figsize=(10, 8))
ax = sns.scatterplot(x=clip_means_reduced[:,0], y=clip_means_reduced[:,1], size=df["size"], sizes=(0, 500), hue=df.category, alpha=.66, legend="brief")
sns.move_legend(ax, bbox_to_anchor=(1.02, 1), loc='upper left')

In [None]:
plt.savefig("embedds.png",)

## Plotting

In [None]:
df_with_bbox

In [None]:
plt.savefig("datasets.png")

In [None]:
fig, axs = plt.subplots(2, 2,  figsize=(20,16))
import matplotlib as mpl
mpl.rcParams['axes.titlepad'] = 16
# fig.suptitle('Datasets Categories')
axs[0,0].set_title("Total datasets size/category")
ax = sns.barplot(data=grouped_df, x="size", y="category", linewidth=2,  edgecolor=".2", ax=axs[0,0])
ax.set_ylabel('category')
ax.set_xlabel('size')
secax = ax.secondary_xaxis('top', functions=(lambda x: x / df_with_bbox["size"].sum(), lambda x:x))
secax.set_xlabel('size (%)')
ax.minorticks_on()
secax.minorticks_on()

axs[0,1].set_title("Mean datasets size/category")
ax = sns.boxplot(data=df_with_bbox, x="size", y="category", ax=axs[0,1])
ax.set_xlabel('size')
ax.get_yaxis().set_visible(False)
secax = ax.secondary_xaxis('top', functions=(lambda x: x / df_with_bbox["size"].sum(), lambda x:x))
secax.set_xlabel('size (%)')
ax.minorticks_on()
secax.minorticks_on()

axs[1,0].set_title("Mean bbox area")
ax = sns.boxplot(data=df_with_bbox, x="area_mean", y="category", ax=axs[1,0])
ax.set_xlabel("bbox")

axs[1,1].set_title("Mean num_classes")
ax = sns.boxplot(data=df_with_bbox, x="num_classes", y="category", ax=axs[1,1])
ax.set_xlabel("labels")
ax.get_yaxis().set_visible(False)


In [None]:
plt.savefig("datasets_stats.png")