Goals: preprocessing images for DL train, val, and test
1. synchrotronize x-ray images and corresponding laser absorptivity, this has to be done individually for each folder, 4+1 in total, we also need to select the frames we need to avoide too many 0 absorption frames
2. mimic CUB_200_2011, creat images.txt, labels.txt, train_test_split.txt

In [105]:
import glob
import pandas as pd
import matplotlib.pyplot as plt

### step 1: synchrotronize to generate labels

In [122]:
path = '/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images'
# All files and directories ending with .csv and that don't begin with a dot:
all_csv_path = sorted(glob.glob(path+"/*Data.csv"))
all_csv_path

['/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/014_CW22_CalibratedAbsorptionData.csv',
 '/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/018_CW24_CalibratedAbsorptionData.csv',
 '/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/021_CW28_CalibratedAbsorptionData.csv',
 '/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/022_CW35_CalibratedAbsorptionData.csv']

In [123]:
def synchrotronize(one_csv_path, out_name):
    df = pd.read_csv(one_csv_path)
    # drop first row of units
    df = df.iloc[1: , :] 
    
    df = df.astype({'Time':'float',
                    'InputLaser':'float',
                    'AbsoluteAbsorption':'float', 
                    'RelativeAbsorption':'float', 
                    'FrameNumber':'int'})
    
    df = df[df['FrameNumber']!=0]
    print('nonzero row', len(df))
    # output frame number (start from 1) and relative absorption
    df = df[df['FrameNumber']!=0].reset_index()
    # output frame number (start from 1) and relative absorption
    df_out = df[['FrameNumber','RelativeAbsorption']]
    df_out.to_csv(out_name+'.csv', index=False)
    return df_out

In [125]:
out_name_list =['CW_P22_labels','CW_P24_labels', 'CW_P28_labels','CW_P35_labels']

for i in range(4):
    synchrotronize(all_csv_path[i], out_name_list[i])

nonzero row 227
nonzero row 233
nonzero row 233
nonzero row 233


### Step 2: for each folder of images, first 233 images and divided by the first image for bkg removal

In [69]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt
import os
from skimage import io, img_as_float, img_as_ubyte, exposure
from os.path import isfile, join
import os.path, sys
import re

import glob

%matplotlib notebook

In [76]:
from ipywidgets import interact

%matplotlib notebook

def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    return [ atoi(c) for c in re.split(r'(\d+)', text) ]

def autoscale(array, percentile):
    value = np.percentile(np.ndarray.flatten(array), percentile)
    return value

In [70]:
my_list = os.listdir(path)

In [71]:
my_list

['.DS_Store',
 '022_Ti64_P35t2047us_S2F2.8mm_U18G12',
 '014_Ti64_P22t2080us_S2F2.8mm_U18G12',
 '022_CW35_CalibratedAbsorptionData.csv',
 '021_CW28_CalibratedAbsorptionData.csv',
 '018_Ti64_P24t2075us_S2F2.8mm_U18G12',
 '021_Ti64_P28t2065us_S2F2.8mm_U18G12',
 '018_CW24_CalibratedAbsorptionData.csv',
 '014_CW22_CalibratedAbsorptionData.csv',
 '220530_Data for CMU.txt']

In [127]:
all_image_path = sorted([path+'/'+p for p in my_list if 'U18G12' in p])
all_image_path

['/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/014_Ti64_P22t2080us_S2F2.8mm_U18G12',
 '/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/018_Ti64_P24t2075us_S2F2.8mm_U18G12',
 '/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/021_Ti64_P28t2065us_S2F2.8mm_U18G12',
 '/Users/rubyjiang/Desktop/KA_raw_data/NIST_SW_raw_images/022_Ti64_P35t2047us_S2F2.8mm_U18G12']

In [128]:
folder_name_list = ['CW_P22_processed','CW_P24_processed', 'CW_P28_processed','CW_P35_processed']


for i in range(4):
    folder_name = folder_name_list[i]
    image_path = all_image_path[i]
    generate_process_images(image_path, folder_name)

In [126]:

def generate_process_images(image_path, folder_name):
    image_list = []
    for filename in sorted(glob.glob(image_path+'/*.tif'), key=natural_keys): 
        #print(filename)
        im = cv.imread(filename, cv.IMREAD_UNCHANGED)
        image_list.append(im)
    # crop to 300 x 300
    image_list = [im[0:350, 90:390+50] for im in image_list]
    # remove background
    bkg = image_list[0]
    image_list = [im/bkg for im in image_list]
    # norm back to [0,255]
    image_list = [cv.normalize(im,None, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype = cv.CV_8U) for im in image_list]
    #take the first 233
    if folder_name == 'CW_P22_processed':
        image_list = image_list[:227]
    else: 
        image_list = image_list[:233]
    # mkdr and save images: folder name: CW22_processed
    # Create a folder
    processed_path = path + '/' + folder_name + '/'
    try:
        os.makedirs(processed_path)
    except FileExistsError:
        pass
    # name conventions: CW22_frame_001.tif
    i = 1
    for image in image_list:   
        if i<10:
            name = folder_name + '_frame00' + str(i) + '.tif'
        elif i<100: 
            name = folder_name + '_frame0' + str(i) + '.tif'
        else:
            name = folder_name + '_frame' + str(i) + '.tif'
        i +=1   
        cv.imwrite(processed_path + '/' + name, image)

In [83]:
image_list[0].shape

(600, 640)

In [94]:
image_list[0][0:300, 90:390].shape

(300, 300)

## Step 3: merge all the four files into one 

## Step 4: split train and val and record the index



In [137]:
labels_path = "/Users/rubyjiang/Desktop/KA_raw_data/merged_labels.csv"
df = pd.read_csv(labels_path, header=None)
df.columns = [['relative_absorption']]
df.head()

Unnamed: 0,relative_absorption
0,0.40285
1,0.446961
2,-0.968159
3,-0.282654
4,0.371854


In [138]:
len(df)

926

#### 4.1 Generate train (1) and val (0) index

In [164]:
import random
random.seed(2022)
df['train_val_idx'] = [int(np.where(random.random()<=0.8, 1, 0)) for i in range(len(df))]

# train_idx = np.sort(random.sample(range(0, 227), 200)) # pay attentio to the second number
# #test_idx = np.sort(random.sample(range(0, 227), 27))
# test_idx = np.sort([i for i in range(227) if i not in train_idx])
# split_idx = (train_idx, test_idx)

In [173]:
df['train_val_idx'].value_counts()

(train_val_idx,)
1                   758
0                   168
dtype: int64

In [175]:
168 * 100 / len(df)

18.14254859611231

In [176]:
df.head()

Unnamed: 0,relative_absorption,train_val_idx
0,0.40285,1
1,0.446961,1
2,-0.968159,1
3,-0.282654,1
4,0.371854,1


#### 4.2 to check the balance of the dataset

In [180]:
balance_df = df.copy()
balance_df['balance'] = np.where(balance_df['relative_absorption']<5, 0, 1)
balance_df['balance'] 

Unnamed: 0,balance
0,0
1,0
2,0
3,0
4,0
...,...
921,0
922,0
923,0
924,0


In [181]:
balance_df['balance'].value_counts()

(balance,)
0             522
1             404
dtype: int64

In [182]:
df.head()

Unnamed: 0,relative_absorption,train_val_idx
0,0.40285,1
1,0.446961,1
2,-0.968159,1
3,-0.282654,1
4,0.371854,1


In [183]:
df.to_csv('final_labels_and_split.csv', index=False)