In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Dataset root. Defaults to the original local checkout; set the ANIMALS10_DS_PATH
# environment variable to run this notebook on another machine without editing it.
ds_path = os.environ.get('ANIMALS10_DS_PATH', '/home/ariya/workspace/datasets/animals10-dvc')
ds_images_path = os.path.join(ds_path, 'images')
# os.listdir returns entries in arbitrary order; sort them for a stable listing.
sorted(os.listdir(ds_images_path))

['butterfly',
 'cat',
 'chicken',
 'cow',
 'dog',
 'elephant',
 'horse',
 'sheep',
 'spider',
 'squirrel',
 'translate.py']

In [4]:
# Annotation columns: image_name, abs_path, split, GT (one 0/1 column per class).
# The order of this array fixes the order of the one-hot ground-truth columns.
classes = np.array([
    'butterfly', 'cat', 'chicken', 'cow', 'dog',
    'elephant', 'horse', 'sheep', 'spider', 'squirrel',
])

In [5]:
# Demo of the label encoding: comparing a class name against the whole array
# gives a boolean mask, which is cast to 0/1 to form the one-hot vector.
list((classes == 'cat').astype(int))

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [6]:
# Build one annotation row per image: [image_name, abs_path, *one-hot ground truth].
data = []
exts = ('.jpeg', '.jpg', '.png') # checked in check_ds_info.ipynb
for abs_path, subdirs, files in os.walk(ds_images_path):
    # os.walk visits entries in arbitrary filesystem order. Sorting the sub-directories
    # in place (and the files below) makes the row order deterministic, which the
    # seeded train/valid/test split needs in order to be reproducible across machines.
    subdirs.sort()
    # The class label is the name of the folder that directly contains the image.
    gt_class = os.path.split(abs_path)[-1]
    one_hot_gt = (gt_class == classes).astype(int)
    if not one_hot_gt.any():
        # Not a known class folder (e.g. the images root itself): skip it so that
        # no row with an all-zero label can end up in the table.
        continue
    for file in tqdm(sorted(files), desc=gt_class):
        if file.endswith(exts):
            data.append([file, os.path.join(abs_path, file), *one_hot_gt])
annot_df = pd.DataFrame(data, columns=['image_name', 'abs_path', *classes])

100%|██████████| 1/1 [00:00<00:00, 1205.26it/s]
100%|██████████| 2112/2112 [00:00<00:00, 14247.85it/s]
100%|██████████| 1668/1668 [00:00<00:00, 12155.69it/s]
100%|██████████| 3098/3098 [00:00<00:00, 13473.66it/s]
100%|██████████| 1866/1866 [00:00<00:00, 12751.74it/s]
100%|██████████| 4863/4863 [00:00<00:00, 12689.57it/s]
100%|██████████| 1446/1446 [00:00<00:00, 12291.76it/s]
100%|██████████| 2623/2623 [00:00<00:00, 12858.23it/s]
100%|██████████| 1820/1820 [00:00<00:00, 12448.18it/s]
100%|██████████| 4821/4821 [00:00<00:00, 13171.63it/s]
100%|██████████| 1862/1862 [00:00<00:00, 12543.62it/s]


In [7]:
# Display the assembled annotation table: one row per image, one 0/1 column per class.
annot_df

Unnamed: 0,image_name,abs_path,butterfly,cat,chicken,cow,dog,elephant,horse,sheep,spider,squirrel
0,e030b20928e90021d85a5854ee454296eb70e3c818b413...,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
1,e030b20929e90021d85a5854ee454296eb70e3c818b413...,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
2,e030b2092be90021d85a5854ee454296eb70e3c818b413...,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
3,e030b2092ce90021d85a5854ee454296eb70e3c818b413...,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
4,e030b2092de90021d85a5854ee454296eb70e3c818b413...,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
26174,OIP-_U7JiIoYjbWPqmmmmdsvJwHaF5.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1
26175,OIP-_VBkNQd_MZI4xoemUb-FtAHaE7.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1
26176,OIP-_WyHKgREia-4VijlL6DNswHaFj.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1
26177,OIP-_xFGMN0UbYduHdiXQ1maZAHaIF.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1


## Train/Valid/Test split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# 80/10/10 split, done in two steps: hold out 20 % of the rows first,
# then cut that hold-out in half to obtain the validation and test sets.
train_df, valid_test_df = train_test_split(
    annot_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
valid_df, test_df = train_test_split(
    valid_test_df,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

In [15]:
# Tag every partition with the name of its split.
# DataFrame.assign returns a new frame, so each name is rebound to an independent
# copy and the frames returned by train_test_split are never written to in place.
train_df = train_df.assign(split='train')
valid_df = valid_df.assign(split='valid')
test_df = test_df.assign(split='test')

In [16]:
# Stack the three tagged partitions back into a single annotation table.
final_annot_df = pd.concat((train_df, valid_df, test_df), axis=0)
# Sanity check: total number of rows, plus how many rows landed in each split.
len(final_annot_df), np.unique(final_annot_df['split'], return_counts=True)

(26179,
 (array(['test', 'train', 'valid'], dtype=object),
  array([ 2618, 20943,  2618])))

In [17]:
# Inspect the final column layout: image_name, abs_path, the class columns, then split.
final_annot_df.columns

Index(['image_name', 'abs_path', 'butterfly', 'cat', 'chicken', 'cow', 'dog',
       'elephant', 'horse', 'sheep', 'spider', 'squirrel', 'split'],
      dtype='object')

In [18]:
# Write the combined table to annotation_df.csv under the dataset root (ds_path);
# index=False keeps the DataFrame row index out of the file.
final_annot_df.to_csv(os.path.join(ds_path, 'annotation_df.csv'), index=False)