In [27]:
import os
import glob
import json
import shutil
import pandas as pd
import math

from sklearn.model_selection import train_test_split

In [28]:
# Root folder of the dataset; the file paths in later cells are joined onto it.
base_dir = 'dataset'

In [29]:
ls dataset

annotations-disease.json  annotations.json     images/
annotations-healed.json   count-dataset.csv
annotations-healing.json  dataset-healed.json


In [41]:
# Load the annotation export, store both timestamp columns as pandas 'string'
# dtype (presumably so the records stay JSON-serializable when they are dumped
# later -- confirm), and keep a fixed subset of columns.
annotations_path = os.path.join(base_dir, 'annotations.json')
kept_columns = ['id', 'class', 'annotations', 'file_upload', 'data', 'created_at', 'updated_at', 'project']

df = (
    pd.read_json(annotations_path)
    .astype({'created_at': 'string', 'updated_at': 'string'})
    .loc[:, kept_columns]
)
df.head(3)

Unnamed: 0,id,class,annotations,file_upload,data,created_at,updated_at,project
0,3072,healed,"[{'id': 3141, 'completed_by': 3, 'result': [{'...",083eb53a-0000000000001-X-20161227-102546-X3XSP...,{'image': '/data/upload/7/083eb53a-00000000000...,2023-03-23 08:57:54.374685+00:00,2023-06-01 09:55:43.540752+00:00,7
1,3073,healed,"[{'id': 3142, 'completed_by': 3, 'result': [{'...",1fc7bd90-001.jpg,{'image': '/data/upload/7/1fc7bd90-001.jpg'},2023-03-23 08:57:54.374789+00:00,2023-06-01 09:55:59.819153+00:00,7
2,3074,healed,"[{'id': 3143, 'completed_by': 3, 'result': [{'...",081af617-002.jpg,{'image': '/data/upload/7/081af617-002.jpg'},2023-03-23 08:57:54.374848+00:00,2023-06-01 09:56:15.997657+00:00,7


In [42]:
# Peek at the raw annotation payload stored on the row with index label 0.
df.at[0, 'annotations']

[{'id': 3141,
  'completed_by': 3,
  'result': [{'id': 'p3_iMF36X3',
    'type': 'polygonlabels',
    'value': {'closed': True,
     'points': [[7.08502024291498, 66.7185069984448],
      [10.931174089068826, 57.85381026438569],
      [15.182186234817813, 52.41057542768274],
      [17.408906882591094, 47.744945567651634],
      [19.635627530364374, 44.32348367029549],
      [20.850202429149796, 41.990668740279936],
      [23.279352226720647, 43.07931570762053],
      [25.708502024291498, 45.41213063763608],
      [25.910931174089068, 49.61119751166407],
      [23.279352226720647, 68.58475894245723]],
     'polygonlabels': ['healed']},
    'origin': 'manual',
    'to_name': 'image',
    'from_name': 'label',
    'image_rotation': 0,
    'original_width': 1484,
    'original_height': 1930}],
  'was_cancelled': False,
  'ground_truth': False,
  'created_at': '2023-06-01T09:55:43.477058Z',
  'updated_at': '2023-06-01T09:55:43.477085Z',
  'lead_time': 27.668,
  'prediction': {},
  'result_c

In [43]:
# Distinct class labels, in the order they first appear in the frame.
classes = df.loc[:, 'class'].unique()
classes

array(['healed', 'healing', 'disease'], dtype=object)

In [54]:
# Per-class split: 20% of each class is held out for test, then 12.5% of the
# remaining 80% (0.125 * 0.8 = 10% of the class) becomes the validation split,
# i.e. roughly 70/20/10 train/test/validate within every class.
TEST_FRACTION = 0.2
VALIDATE_FRACTION_OF_REMAINDER = 0.125
SPLIT_SEED = 1  # fixed seed so re-running the cell reproduces the same split

# Rebuilt from scratch on every run, so the cell is safe to re-execute.
dataset_splits = {split_name: [] for split_name in ['train', 'test', 'validate']}

for cls in classes:
    # One plain dict per row of this class; no copy or transpose of the full
    # frame is needed just to read rows out of it.
    class_records = df[df['class'] == cls].to_dict('records')

    train_records, test_records = train_test_split(
        class_records, test_size=TEST_FRACTION, random_state=SPLIT_SEED
    )
    train_records, validate_records = train_test_split(
        train_records, test_size=VALIDATE_FRACTION_OF_REMAINDER, random_state=SPLIT_SEED
    )

    assigned = (
        ('train', train_records),
        ('test', test_records),
        ('validate', validate_records),
    )
    for split_name, records in assigned:
        # Tag every record with the split it landed in, then collect it.
        for record in records:
            record['dataset'] = split_name
        dataset_splits[split_name].extend(records)

In [58]:
# Stack the three splits into one frame and count the non-null ids for every
# (dataset, class) pair.
split_frames = [pd.DataFrame(records) for records in dataset_splits.values()]

df_count = (
    pd.concat(split_frames)
    .groupby(['dataset', 'class'])
    .agg(numberOfDataset=('id', 'count'))
)
df_count

Unnamed: 0_level_0,Unnamed: 1_level_0,numberOfDataset
dataset,class,Unnamed: 2_level_1
test,disease,70
test,healed,90
test,healing,70
train,disease,245
train,healed,311
train,healing,245
validate,disease,35
validate,healed,45
validate,healing,35


In [59]:
# Turn the (dataset, class) index back into ordinary columns and export the
# counts table without a row-number column.
count_csv_path = os.path.join(base_dir, 'count-dataset.csv')
df_count.reset_index().to_csv(count_csv_path, index=False)

In [68]:
# NOTE: this rebinds `classes` -- it becomes a plain list taken from the
# counts table, in the order the labels appear there, instead of the array
# read from `df`.
count_table = df_count.reset_index()
classes = count_table['class'].unique().tolist()
classes

['disease', 'healed', 'healing']

In [69]:
# Write the list of class labels to dataset-meta.json under the 'class' key.
meta_path = os.path.join(base_dir, 'dataset-meta.json')
data = {'class': classes}
with open(meta_path, 'w') as f:
    json.dump(data, f)

In [61]:
# Sanity check: show the split names held in dataset_splits.
dataset_splits.keys()

dict_keys(['train', 'test', 'validate'])

In [62]:
# Dump every split to its own file: dataset-<split name>.json inside base_dir.
for dataset_name in dataset_splits:
    split_path = os.path.join(base_dir, f"dataset-{dataset_name}.json")
    with open(split_path, 'w') as f:
        json.dump(dataset_splits[dataset_name], f)