In [None]:
import os
import cv2
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy
from scipy import ndimage
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import pickle

In [None]:
def find_jpg_files(search_dir):
    """Recursively collect the paths of all ``.jpg`` files under a directory.

    Args:
        search_dir: Root directory to walk.

    Returns:
        list[str]: One entry per file whose name ends with ``'.jpg'``
        (case-sensitive match), built by joining the containing directory
        with the file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(search_dir)
        for name in names
        if name.endswith('.jpg')
    ]

In [None]:
def autocropmin(image, threshold=100, kernsel_size = 10):
    """Crop an image to its longest run of bright rows and bright columns.

    A minimum filter of size ``kernsel_size`` (applied along every axis of
    the array) is run first, then every value below ``threshold`` is treated
    as dark. Only channel 0 of that thresholded result is inspected. Rows and
    columns are handled independently: for each, the longest consecutive run
    that still contains at least one non-dark pixel is kept.

    Args:
        image: Array indexed as ``[row, column, channel]``.
        threshold: Filtered values below this count as dark.
        kernsel_size: Size passed to ``ndimage.minimum_filter``.

    Returns:
        The cropped region of a copy of ``image``; the full copy is returned
        when no bright run is found along rows or along columns.
    """
    img = image.copy()

    eroded = ndimage.minimum_filter(img, size=kernsel_size)
    binary = np.where(eroded < threshold, 0, 255)[:, :, 0]

    def _widest_bright_span(profile):
        # Pad with zeros so a bright run touching the border is still
        # delimited by a zero on both sides.
        padded = np.concatenate(([0], profile, [0]))
        dark_positions = np.where(padded == 0)[0]
        # The largest gap between consecutive dark positions brackets the
        # longest bright run. Because of the one-element padding, the left
        # dark position is already the first bright index of the unpadded
        # profile, and (right dark position - 1) is its exclusive end.
        widest = np.argmax(np.diff(dark_positions))
        return dark_positions[widest], dark_positions[widest + 1] - 1

    row_start, row_stop = _widest_bright_span(binary.sum(axis=1))
    col_start, col_stop = _widest_bright_span(binary.sum(axis=0))

    # Equal bounds mean an empty run along that axis: leave the image as is.
    if row_start == row_stop or col_start == col_stop:
        return img
    return img[row_start:row_stop, col_start:col_stop, :]

In [None]:
# Root directory of the Challenge 1 source images (input to the cropping step).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
search_dir = '/home/Data/rmf3mc/Challenge/challenge1/Code/Train_Valid_dataset/challenge1_dataset/OriginalDatauncompressed'

# Paths of every .jpg file found recursively under search_dir.
jpg_files = find_jpg_files(search_dir)


In [None]:
# NOTE(review): neither of these two variables is referenced again in the
# visible part of this notebook - confirm nothing else needs them.
image_number=0
len_jpg=len(jpg_files)

def process_image(str1):
    """Crop one source image and save the result under the ``preprocessed`` tree.

    The destination path is the source path with the
    ``/OriginalDatauncompressed/`` component replaced by ``/preprocessed/``;
    missing destination directories are created.

    Args:
        str1: Path of the source image, using ``/`` as separator.

    Returns:
        tuple: ``(ct_scan, False, shape)`` on success, where ``ct_scan`` is
        the five path components preceding the file name joined with ``/``
        and ``shape`` is the image shape *before* cropping.
        ``(str1, True, None)`` when the image cannot be read, when no
        destination distinct from the source can be derived, or when the
        cropped image cannot be written.
    """
    img = cv2.imread(str1)
    if img is None:
        # cv2.imread reports an unreadable file by returning None.
        return str1, True, None

    out_path = str1.replace("/OriginalDatauncompressed/", "/preprocessed/")
    if out_path == str1:
        # Without the expected directory component the "destination" would be
        # the source itself; refuse rather than overwrite the original image.
        return str1, True, None

    ct_scan = "/".join(str1.split("/")[-6:-1])
    original_shape = img.shape

    cropped = autocropmin(img)

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # cv2.imwrite returns False when the file could not be written; report
    # that as a failure instead of silently claiming success.
    if not cv2.imwrite(out_path, cropped):
        return str1, True, None

    return ct_scan, False, original_shape


# Bookkeeping filled in while the images are processed.
shape_dict = {}   # ct_scan -> set of image shapes reported for that scan
diff_shape = []   # scans for which more than one distinct shape was reported
bug_files = []    # paths of images that process_image flagged as failed


with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_image, str1) for str1 in jpg_files]

    # Results are consumed in completion order, not submission order.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing Images"):
        ct_scan, is_bug_file, shape = future.result()
        if is_bug_file:
            # On failure the first element is the image path, not a scan id.
            bug_files.append(ct_scan)
            continue

        shapes = shape_dict.setdefault(ct_scan, set())
        if shape not in shapes:
            shapes.add(shape)
            # Record the scan exactly once, at the moment its second distinct
            # shape shows up, instead of once per subsequent image.
            if len(shapes) == 2:
                diff_shape.append(ct_scan)

In [None]:
# Root of the cropped images: the source tree path with
# "OriginalDatauncompressed" swapped for "preprocessed".
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
search_dir = '/home/Data/rmf3mc/Challenge/challenge1/Code/Train_Valid_dataset/challenge1_dataset/preprocessed'

challenge1_preprocessed=find_jpg_files(search_dir)
# Number of .jpg files found under the preprocessed tree.
print(len(challenge1_preprocessed))

In [None]:
def process_image(path):
    """Count the dark pixels that are fully enclosed by bright pixels.

    The image is passed through a size-5 minimum filter (applied along every
    axis of the array), values of at least 100 are treated as bright, and
    only channel 0 of that result is used. Holes in the bright mask are
    filled with ``binary_fill_holes``; the returned area is the number of
    pixels that filling added.

    NOTE: this definition reuses the name ``process_image`` and therefore
    replaces the cropping function of the same name defined earlier in the
    notebook; run the cells in order.

    Args:
        path: Path of the image to measure.

    Returns:
        float: Number of enclosed dark pixels.

    Raises:
        ValueError: If the image at ``path`` cannot be read.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None for a missing or undecodable file; fail
        # here with the offending path rather than deep inside scipy.
        raise ValueError(f"Could not read image: {path}")

    eroded = ndimage.minimum_filter(img, 5)
    bright = eroded[:, :, 0] >= 100
    filled = ndimage.binary_fill_holes(bright)
    # Pixels set by the fill but not bright beforehand are the enclosed holes.
    return float(np.count_nonzero(filled & ~bright))

In [None]:
def process_images_multithreaded_ordered(image_paths):
    """Apply ``process_image`` to every path on a thread pool, keeping order.

    Args:
        image_paths: Iterable of paths; each one is passed to
            ``process_image``.

    Returns:
        list: The ``process_image`` result for each path, in the same order
        as ``image_paths``.
    """
    with ThreadPoolExecutor() as pool:
        # Executor.map yields results in input order regardless of which
        # worker finishes first.
        return list(pool.map(process_image, image_paths))

In [None]:
# Despite its name, this list holds one area per entry of
# challenge1_preprocessed (train and valid alike), in the same order.
challenge1_train_area = process_images_multithreaded_ordered(challenge1_preprocessed)

In [None]:
# Positions (into challenge1_preprocessed) of the images of each split.
# A path is assigned by a case-sensitive substring test on the whole path.
# NOTE(review): a path containing both 'train' and 'valid' would land in both
# lists - confirm the directory layout rules that out.
challenge1_preprocessed_train_indices = [
    position
    for position, image_path in enumerate(challenge1_preprocessed)
    if 'train' in image_path
]
challenge1_preprocessed_valid_indices = [
    position
    for position, image_path in enumerate(challenge1_preprocessed)
    if 'valid' in image_path
]

In [None]:
# Select, for each split, the image paths and their matching areas.
# challenge1_train_area is indexed by position in challenge1_preprocessed,
# so the same index lists pick matching path/area pairs.
challenge1_preprocessed_train_paths = [
    challenge1_preprocessed[position]
    for position in challenge1_preprocessed_train_indices
]
challenge1_train_areas = [
    challenge1_train_area[position]
    for position in challenge1_preprocessed_train_indices
]
challenge1_preprocessed_valid_paths = [
    challenge1_preprocessed[position]
    for position in challenge1_preprocessed_valid_indices
]
challenge1_valid_areas = [
    challenge1_train_area[position]
    for position in challenge1_preprocessed_valid_indices
]

In [None]:
# One row per image: its path and its measured area.
challenge1_train_path_area = pd.DataFrame(
    list(zip(challenge1_preprocessed_train_paths, challenge1_train_areas)),
    columns=['path', 'area'],
)
challenge1_valid_path_area = pd.DataFrame(
    list(zip(challenge1_preprocessed_valid_paths, challenge1_valid_areas)),
    columns=['path', 'area'],
)

In [None]:
# Directory that receives the CSV/pickle artefacts of this notebook.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path_to_files='/home/Data/rmf3mc/Challenge/challenge1/Code/Refined/files'
# Create the output directory up front so to_csv does not fail when it is missing.
os.makedirs(path_to_files, exist_ok=True)

# The DataFrame index is written as well (to_csv default), so each file
# carries an unnamed leading column in addition to 'path' and 'area'.
challenge1_train_path_area.to_csv(os.path.join(path_to_files,'challenge1_train_path_area.csv'))
challenge1_valid_path_area.to_csv(os.path.join(path_to_files,'challenge1_valid_path_area.csv'))

In [None]:
# Reload the validation path/area table written earlier.
challenge1_valid_area_path = pd.read_csv(
    os.path.join(path_to_files, 'challenge1_valid_path_area.csv')
)

# ct_path: everything before the last "/" of the image path.
challenge1_valid_area_path["ct_path"] = challenge1_valid_area_path["path"].map(
    lambda image_path: "/".join(image_path.split("/")[:-1])
)

# ct_slice: the file name up to its first ".", parsed as an integer.
challenge1_valid_area_path["ct_slice"] = challenge1_valid_area_path["path"].map(
    lambda image_path: int(image_path.split("/")[-1].split(".")[0])
)

# Order rows by scan, then numerically by slice within each scan.
challenge1_valid_area_path = challenge1_valid_area_path.sort_values(
    by=['ct_path', 'ct_slice']
)

In [None]:
# Reload the training path/area table written earlier.
challenge1_train_area_path = pd.read_csv(
    os.path.join(path_to_files, 'challenge1_train_path_area.csv')
)

# ct_path: everything before the last "/" of the image path.
challenge1_train_area_path["ct_path"] = challenge1_train_area_path["path"].map(
    lambda image_path: "/".join(image_path.split("/")[:-1])
)

# ct_slice: the file name up to its first ".", parsed as an integer.
challenge1_train_area_path["ct_slice"] = challenge1_train_area_path["path"].map(
    lambda image_path: int(image_path.split("/")[-1].split(".")[0])
)

# Order rows by scan, then numerically by slice within each scan.
challenge1_train_area_path = challenge1_train_area_path.sort_values(
    by=['ct_path', 'ct_slice']
)

In [None]:
def sum_max(a,w=0.4):
    """Locate the contiguous window of ``a`` with the largest sum.

    The window length is ``ceil(len(a) * w)``. Windows are scanned left to
    right and a window only replaces the current best when its sum is
    strictly greater, so ties keep the leftmost window. The running best
    starts at 0, so if no window sums to more than 0 the first window is
    reported.

    Args:
        a: Sequence of numbers supporting ``len`` and slicing.
        w: Fraction of ``len(a)`` used as the window length.

    Returns:
        tuple: ``(start, stop)`` with ``stop = start + window length``,
        suitable for slicing ``a[start:stop]``.
    """
    n_items = len(a)
    width = int(np.ceil(n_items * w))

    best_start = 0
    best_total = 0
    for start in range(n_items - width + 1):
        total = np.sum(a[start:start + width])
        if total > best_total:
            best_start, best_total = start, total

    return best_start, best_start + width

In [None]:
# For every validation scan, find the window covering half of its slices with
# the largest summed area. The stored [start, stop] pair indexes the scan's
# rows in their current order (row positions, not ct_slice values).
challenge1_ct_path_list = challenge1_valid_area_path["ct_path"].unique()
challenge1_valid_dic = {}
# A single groupby pass replaces re-filtering the whole frame once per scan;
# sort=False keeps the groups in first-appearance order, like unique().
for ct_path, scan_rows in tqdm(
    challenge1_valid_area_path.groupby("ct_path", sort=False),
    total=len(challenge1_ct_path_list),
):
    challenge1_valid_dic[ct_path] = list(sum_max(scan_rows["area"].values, 0.5))

In [None]:
# For every training scan, find the window covering half of its slices with
# the largest summed area. The stored [start, stop] pair indexes the scan's
# rows in their current order (row positions, not ct_slice values).
challenge1_ct_path_list = challenge1_train_area_path["ct_path"].unique()
challenge1_train_dic = {}
# A single groupby pass replaces re-filtering the whole frame once per scan;
# sort=False keeps the groups in first-appearance order, like unique().
for ct_path, scan_rows in tqdm(
    challenge1_train_area_path.groupby("ct_path", sort=False),
    total=len(challenge1_ct_path_list),
):
    challenge1_train_dic[ct_path] = list(sum_max(scan_rows["area"].values, 0.5))

In [None]:
# Persist the per-scan [start, stop] windows of the training scans
# (dict keyed by ct_path) for later use.
with open(os.path.join(path_to_files,'challenge1_train_range.pickle'), 'wb') as handle:
    pickle.dump(challenge1_train_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the per-scan [start, stop] windows of the validation scans
# (dict keyed by ct_path) for later use.
with open(os.path.join(path_to_files,'challenge1_valid_range.pickle'), 'wb') as handle:
    pickle.dump(challenge1_valid_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)