In [1]:
from glob import glob

In [2]:
# Total number of JPEG files found two and one directory levels below data/.
len(glob("data/*/*/*.jpg")) + len(glob("data/*/*.jpg"))

171374

In [3]:
# Gather every JPEG path: files nested two levels below data/ first,
# followed by files sitting one level below data/.
paths = [
    jpg_path
    for pattern in ("data/*/*/*.jpg", "data/*/*.jpg")
    for jpg_path in glob(pattern)
]

In [4]:
import random

# glob returns entries in arbitrary, OS-dependent order, so sort first to give
# the shuffle the same starting sequence on every machine.
paths.sort()

# Shuffle in place with a dedicated, fixed-seed generator: the resulting order
# is reproducible across kernel restarts, and re-running this cell yields the
# same order again (sort + seeded shuffle is idempotent).
random.Random(42).shuffle(paths)

In [5]:
# Peek at the first five entries of the path list.
paths[0:5]

['data\\NGS\\2001-3000\\NGS_02193.jpg',
 'data\\MMZ\\1001-2000\\MMZ_01222.jpg',
 'data\\NGB\\6001-7000\\NGB_06804.jpg',
 'data\\ART\\1001-2000\\ART_01587.jpg',
 'data\\PMO\\6001-7000\\PMO_06843.jpg']

In [6]:
import pandas as pd

In [7]:
# Read the semicolon-separated abbreviation table; its first column holds the
# class abbreviation and its second column the full class name.
abbr = pd.read_csv("data/abbreviations.csv", sep=";")

# Map each row position (used as the integer label) to its abbreviation and name.
class_dict = {
    row_idx: {"abbreviations": short_code, "class_name": full_name}
    for row_idx, (short_code, full_name) in enumerate(zip(abbr.iloc[:, 0], abbr.iloc[:, 1]))
}
class_dict

{0: {'abbreviations': 'ABE', 'class_name': 'Abnormal eosinophil'},
 1: {'abbreviations': 'ART', 'class_name': 'Artefact'},
 2: {'abbreviations': 'BAS', 'class_name': 'Basophil'},
 3: {'abbreviations': 'BLA', 'class_name': 'Blast'},
 4: {'abbreviations': 'EBO', 'class_name': 'Erythroblast'},
 5: {'abbreviations': 'EOS', 'class_name': 'Eosinophil'},
 6: {'abbreviations': 'FGC', 'class_name': 'Faggott cell'},
 7: {'abbreviations': 'HAC', 'class_name': 'Hairy cell'},
 8: {'abbreviations': 'KSC', 'class_name': 'Smudge cell'},
 9: {'abbreviations': 'LYI', 'class_name': 'Immature lymphocyte'},
 10: {'abbreviations': 'LYT', 'class_name': 'Lymphocyte'},
 11: {'abbreviations': 'MMZ', 'class_name': 'Metamyelocyte'},
 12: {'abbreviations': 'MON', 'class_name': 'Monocyte'},
 13: {'abbreviations': 'MYB', 'class_name': 'Myelocyte'},
 14: {'abbreviations': 'NGB', 'class_name': 'Band neutrophil'},
 15: {'abbreviations': 'NGS', 'class_name': 'Segmented neutrophil'},
 16: {'abbreviations': 'NIF', 'class_

In [8]:
# Invert class_dict: look up the integer index and class name by abbreviation.
rev_class_dict = dict(
    (entry["abbreviations"], {"index": idx, "class_name": entry["class_name"]})
    for idx, entry in class_dict.items()
)
rev_class_dict

{'ABE': {'index': 0, 'class_name': 'Abnormal eosinophil'},
 'ART': {'index': 1, 'class_name': 'Artefact'},
 'BAS': {'index': 2, 'class_name': 'Basophil'},
 'BLA': {'index': 3, 'class_name': 'Blast'},
 'EBO': {'index': 4, 'class_name': 'Erythroblast'},
 'EOS': {'index': 5, 'class_name': 'Eosinophil'},
 'FGC': {'index': 6, 'class_name': 'Faggott cell'},
 'HAC': {'index': 7, 'class_name': 'Hairy cell'},
 'KSC': {'index': 8, 'class_name': 'Smudge cell'},
 'LYI': {'index': 9, 'class_name': 'Immature lymphocyte'},
 'LYT': {'index': 10, 'class_name': 'Lymphocyte'},
 'MMZ': {'index': 11, 'class_name': 'Metamyelocyte'},
 'MON': {'index': 12, 'class_name': 'Monocyte'},
 'MYB': {'index': 13, 'class_name': 'Myelocyte'},
 'NGB': {'index': 14, 'class_name': 'Band neutrophil'},
 'NGS': {'index': 15, 'class_name': 'Segmented neutrophil'},
 'NIF': {'index': 16, 'class_name': 'Not identifiable'},
 'OTH': {'index': 17, 'class_name': 'Other cell'},
 'PEB': {'index': 18, 'class_name': 'Proerythroblast'},
 

In [9]:
# One row per image; the single "path" column holds the file path string.
meta_df = pd.DataFrame(data=paths, columns=["path"])
meta_df.head(5)

Unnamed: 0,path
0,data\NGS\2001-3000\NGS_02193.jpg
1,data\MMZ\1001-2000\MMZ_01222.jpg
2,data\NGB\6001-7000\NGB_06804.jpg
3,data\ART\1001-2000\ART_01587.jpg
4,data\PMO\6001-7000\PMO_06843.jpg


In [10]:
def _class_abbreviation(path):
    """Return the second component of an image path (the class folder name).

    Backslashes are normalised to forward slashes before splitting, so the
    lookup works for paths produced by glob on Windows as well as on POSIX
    systems (splitting on a backslash alone raises IndexError on POSIX paths).
    """
    return path.replace("\\", "/").split("/")[1]


# Extract the class abbreviation once and reuse it for both derived columns.
class_abbr = meta_df["path"].map(_class_abbreviation)

# An abbreviation missing from rev_class_dict raises KeyError, as before.
meta_df["class_name"] = class_abbr.map(lambda code: rev_class_dict[code]["class_name"])
meta_df["label"] = class_abbr.map(lambda code: rev_class_dict[code]["index"])
meta_df.head()

Unnamed: 0,path,class_name,label
0,data\NGS\2001-3000\NGS_02193.jpg,Segmented neutrophil,15
1,data\MMZ\1001-2000\MMZ_01222.jpg,Metamyelocyte,11
2,data\NGB\6001-7000\NGB_06804.jpg,Band neutrophil,14
3,data\ART\1001-2000\ART_01587.jpg,Artefact,1
4,data\PMO\6001-7000\PMO_06843.jpg,Promyelocyte,20


In [11]:
# Distinct integer labels present in the metadata, in order of first appearance.
meta_df["label"].unique()

array([15, 11, 14,  1, 20, 10,  3, 13, 18,  5, 16, 19,  2,  4, 12,  8,  7,
        9, 17,  6,  0], dtype=int64)

In [12]:
# Number of images per label, most frequent first (shows the class imbalance).
meta_df["label"].value_counts()

15    29424
4     27395
10    26242
1     19630
20    11994
3     11973
14     9968
19     7629
13     6557
5      5883
12     4040
16     3538
11     3055
18     2740
2       441
7       409
17      294
9        65
6        47
8        42
0         8
Name: label, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

In [15]:

# Hold out 30% of the images, stratified on the label so every class keeps its
# proportion in both parts. random_state pins the split so that the same rows
# land in train/held-out on every run.
train_df, test_df = train_test_split(
    meta_df,
    test_size=0.3,
    stratify=meta_df["label"].tolist(),
    random_state=42,
)

In [16]:
# (rows, columns) of the training part and of the held-out part.
tuple(part.shape for part in (train_df, test_df))

((119961, 3), (51413, 3))

In [17]:
# Split the held-out part again: 33% becomes the validation set, the rest stays
# as the test set, stratified on the label. random_state pins the split so the
# exported CSVs are reproducible.
# NOTE: this rebinds test_df, so re-running this cell on its own would split the
# already-reduced test set; re-run the previous split cell first.
test_df, val_df = train_test_split(
    test_df,
    test_size=0.33,
    stratify=test_df["label"].tolist(),
    random_state=42,
)
test_df.shape, val_df.shape

((34446, 3), (16967, 3))

In [18]:
# Persist each split as a CSV without the index column.
for split_df, csv_path in (
    (train_df, "data/train.csv"),
    (val_df, "data/val.csv"),
    (test_df, "data/test.csv"),
):
    split_df.to_csv(csv_path, index=False)