## load library

In [1]:
import pickle
import os 
import shutil
import pandas as pd
import re
import numpy as np

## load data

In [2]:
# Root of the project's data directory and the dataset that gets re-organised below.
datadir = "/Users/saskia/unibe19/master_thesis/TKI_project/data/"
ds_dir = os.path.join(datadir, 'dataset_PC_HN')

# Load the metadata dictionary. The context manager closes the file handle
# deterministically instead of leaving an open handle to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open(os.path.join(ds_dir, "info.pckl"), "rb") as info_file:
    info = pickle.load(info_file)
info

{0: {'exp': 'LM',
  'stim': 'IL-1b High',
  'stim_class': 1,
  'stim_class_simple': 1,
  'bn': '1_il1b_1_s1c1.tif'},
 1: {'exp': 'LM',
  'stim': 'TNF',
  'stim_class': 3,
  'stim_class_simple': 2,
  'bn': '2_TNF_1_s1c1.tif'},
 2: {'exp': 'LM',
  'stim': 'IL-1b High',
  'stim_class': 1,
  'stim_class_simple': 1,
  'bn': '3_IL1B_1_s1.tif'},
 3: {'exp': 'LM',
  'stim': 'NS',
  'stim_class': 0,
  'stim_class_simple': 0,
  'bn': '4_NS_1_s1c1.tif'},
 4: {'exp': 'LM',
  'stim': 'NS',
  'stim_class': 0,
  'stim_class_simple': 0,
  'bn': '5_NS_1_s1c1.tif'},
 5: {'exp': 'LM',
  'stim': 'IL-1b High',
  'stim_class': 1,
  'stim_class_simple': 1,
  'bn': '6_IL1B_1_s1c1.tif'},
 6: {'exp': 'SS',
  'stim': 'IL-1b Low',
  'stim_class': 2,
  'stim_class_simple': 1,
  'bn': 'FC2_Il1b_low_1_s1c1.tif'},
 7: {'exp': 'SS',
  'stim': 'IL-1b High',
  'stim_class': 1,
  'stim_class_simple': 1,
  'bn': 'FC3_Il1b_high_1_s1c1.tif'},
 8: {'exp': 'SS',
  'stim': 'IL-1b Low',
  'stim_class': 2,
  'stim_class_simple':

### get list of all folders and all image files 

In [3]:
# Paths of every sub-directory of the dataset directory, in sorted name order.
folder_list = []
for folder_name in sorted(os.listdir(ds_dir)):
    folder_path = os.path.join(ds_dir, folder_name)
    if os.path.isdir(folder_path):
        folder_list.append(folder_path)

In [4]:
# Two parallel lists: the full path of every directory entry and, at the same
# position, the folder that entry lives in. Folders and entries are both
# visited in sorted order.
img_names = []
folder_names = []
for folder in folder_list:
    entries = sorted(os.listdir(folder))
    img_names.extend(os.path.join(folder, entry) for entry in entries)
    folder_names.extend([folder] * len(entries))

#### build dataframe 

In [5]:
# Path table: one row per listed directory entry, paired with its parent folder.
paths_Df = pd.DataFrame(dict(path_to_img=img_names, path_to_folder=folder_names))
# Metadata table: one row per key of `info`, one column per metadata field.
dict_Df = pd.DataFrame.from_dict(info, orient='index')

add image id: the 3-digit folder name followed by the first 3 digits of the image file name (6 characters in total)

In [6]:
# Build the image id: the 3-digit folder name followed by the first 3 digits of
# the file name. The pattern is matched once per path with the vectorised
# string accessor instead of running the same regex twice per row.
_id_parts = paths_Df['path_to_img'].str.extract(r'/(\d{3})/(\d{3})')
_unmatched = paths_Df.loc[_id_parts.isna().any(axis=1), 'path_to_img']
if not _unmatched.empty:
    # Fail with a readable message instead of an AttributeError on None
    # (or silently storing NaN ids) when a file breaks the naming scheme.
    raise ValueError("cannot derive img_id from: {}".format(_unmatched.tolist()))
paths_Df['img_id'] = _id_parts[0] + _id_parts[1]

- add group to ``paths_Df`` 
- add info of ``dict_Df`` to ``paths_Df`` by merging using group

In [7]:
# Group id: the 3 digits that end the folder path, converted to int.
_group_pattern = re.compile(r'/(\d{3})$')
paths_Df['group'] = paths_Df.apply(
    lambda row: int(_group_pattern.search(row.path_to_folder).group(1)),
    axis=1,
)

In [8]:
# Expose the metadata index (the keys of `info`) as a regular 'group' column so it can serve as the merge key.
dict_Df = dict_Df.assign(group=dict_Df.index)

In [9]:
# Attach the metadata of its group to every image row (default inner join on 'group').
df = paths_Df.merge(dict_Df, on='group')

for each group:

- shuffle indices (fixed seed, so the split is reproducible)
- last 20% (rounded down, but at least one image) for test data 
- remaining images for train data 


create lists of indices for test and train

In [10]:
# Per-group hold-out split: shuffle the row labels of each group and reserve
# the tail of the shuffled order (20 %, but never fewer than one row) for the
# test set; everything before the tail goes to the train set.
# NOTE(review): a fresh RandomState(seed=18) is created for every group, so
# groups of equal size are shuffled with the same permutation pattern -
# confirm this is intended before changing it (it would alter the split).
test_chunks = []
train_chunks = []

groups = df['group'].unique()

for group_id in groups:
    group_rows = df.index[(df['group'] == group_id).to_numpy()]
    shuffled = np.random.RandomState(seed=18).permutation(group_rows)
    n_held_out = max(1, int(len(shuffled) * 0.2))
    test_chunks.append(shuffled[-n_held_out:])
    train_chunks.append(shuffled[:-n_held_out])

test_indices = np.concatenate(test_chunks, axis=0)
train_indices = np.concatenate(train_chunks, axis=0)

In [11]:
# Show the row labels selected for the test split.
test_indices

array([  2,  10,  18,  26,  34,  42,  50,  54,  58,  62,  70,  78,  86,
        94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190,
       198, 204, 210])

create train and test dataframe with indices

In [15]:
# Materialise both splits by label-based row lookup on the merged frame.
train_df = df.loc[train_indices, :]
test_df = df.loc[test_indices, :]

In [16]:
# Show the rows of the test split.
test_df

Unnamed: 0,path_to_img,path_to_folder,img_id,group,exp,stim,stim_class,stim_class_simple,bn
2,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,2,0,LM,IL-1b High,1,1,1_il1b_1_s1c1.tif
10,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,1002,1,LM,TNF,3,2,2_TNF_1_s1c1.tif
18,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,2002,2,LM,IL-1b High,1,1,3_IL1B_1_s1.tif
26,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,3002,3,LM,NS,0,0,4_NS_1_s1c1.tif
34,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,4002,4,LM,NS,0,0,5_NS_1_s1c1.tif
42,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,5002,5,LM,IL-1b High,1,1,6_IL1B_1_s1c1.tif
50,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,6002,6,SS,IL-1b Low,2,1,FC2_Il1b_low_1_s1c1.tif
54,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,7002,7,SS,IL-1b High,1,1,FC3_Il1b_high_1_s1c1.tif
58,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,8002,8,SS,IL-1b Low,2,1,FC4_Il1b_low_1_s1c1.tif
62,/Users/saskia/unibe19/master_thesis/TKI_projec...,/Users/saskia/unibe19/master_thesis/TKI_projec...,9002,9,SS,IL-1b High,1,1,FC1_Il1b_1_to_13_1_s1c1.tif


### creating new directories

In [17]:
# Output layout: <datadir>/dataset5classes/{train,test}.
folder = os.path.join(datadir, 'dataset5classes')
train_folder = os.path.join(folder, 'train')
test_folder = os.path.join(folder, 'test')

# os.makedirs(..., exist_ok=True) replaces the check-then-create pairs: it is
# safe to re-run, leaves no window between the existence check and the
# creation, and also creates missing parent directories.
for new_dir in (folder, train_folder, test_folder):
    os.makedirs(new_dir, exist_ok=True)


create folder for classes:

In [18]:
# Show the resolved path of the test directory.
test_folder

'/Users/saskia/unibe19/master_thesis/TKI_project/data/dataset5classes/test'

In [19]:
# One sub-folder per distinct stim_class value inside both split folders.
classes = df['stim_class'].unique()

# The loop variable is not called `folder`, so the dataset root stored in
# `folder` is not overwritten. makedirs(exist_ok=True) makes the cell safe to
# re-run without a separate existence check.
for split_folder in [test_folder, train_folder]:
    for a_class in classes:
        os.makedirs(os.path.join(split_folder, str(a_class)), exist_ok=True)

copy each image into the folder of its class (`<train|test>/<stim_class>/<img_id>.png`) - *copy images*

In [20]:

# Copy every image to <split folder>/<stim_class>/<img_id>.png.
# The loop variables must not be named `df` / `folder`: reusing `df` here would
# overwrite the merged frame, so re-running an earlier cell afterwards would
# silently operate on the test split only.
for split_df, split_folder in zip([train_df, test_df], [train_folder, test_folder]):
    for original_path, img_id, label in zip(split_df['path_to_img'], split_df['img_id'], split_df['stim_class']):
        # The copy is renamed to its image id.
        # NOTE(review): '.png' is appended regardless of the source file's extension - confirm the inputs are PNG files.
        dest_path = os.path.join(split_folder, str(label), str(img_id) + '.png')
        shutil.copy(original_path, dest_path)
