In [None]:
# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
import os
import cv2
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy
from scipy import ndimage
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import pickle

In [None]:
#The function find_jpg_files searches for all .jpg files within a specified directory and its subdirectories. 
#It returns a list containing the paths of these .jpg files.

def find_jpg_files(search_dir):
    """
    Recursively collect every file whose name ends in '.jpg'.

    Parameters:
    - search_dir: Root directory of the search; all of its subdirectories are visited.

    Returns:
    - A list of full paths (containing directory joined with the file name), in the
      order os.walk yields them. The suffix check is case-sensitive.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(search_dir)
        for name in names
        if name.endswith('.jpg')
    ]

In [None]:
#The function autocropmin performs cropping of an image based on a minimum intensity threshold. 
#It reduces the image size by removing regions that are below a specified intensity threshold, 
#likely focusing on a central object or area of interest. 

def _longest_nonzero_span(profile):
    """Return (start, stop) of the longest run of non-zero entries in a 1-D profile.

    The profile is padded with a zero on each side, so `start` is the index of the
    first element of the run in the unpadded profile and `stop` is one past its
    last element. An all-zero profile yields start == stop.
    """
    padded = np.concatenate(([0], profile, [0]))
    zero_idx = np.where(padded == 0)[0]
    # The widest gap between consecutive zeros brackets the longest non-zero run.
    widest = np.argmax(np.diff(zero_idx))
    return zero_idx[widest], zero_idx[widest + 1] - 1


def autocropmin(image, threshold=100, kernsel_size = 10):
    """
    Crop an image to the largest bright band found along each axis.

    The image is eroded with a minimum filter, binarized against `threshold`, and
    the longest run of rows (and, independently, of columns) that still contain
    bright pixels defines the crop window. The crop is taken from an unfiltered
    copy of the input.

    Parameters:
    - image: 2-D (H, W) or 3-D (H, W, C) array. For 3-D input only channel 0 of the
      binarized image is used to locate the crop window.
    - threshold: Filtered pixels below this value count as background.
    - kernsel_size: Size of the minimum filter. A scalar size is applied along
      every axis of the array, so for 3-D input that includes the channel axis.

    Returns:
    - The cropped copy, or the full copy when no bright run is found along one of
      the axes.
    """
    img = image.copy()

    # Erode bright regions so thin bright structures do not extend the crop window.
    imgfilt = ndimage.minimum_filter(img, size=kernsel_size)

    # Binarize: below threshold -> 0, everything else -> 255.
    img_b = np.where(imgfilt < threshold, 0, 255)
    plane = img_b if img_b.ndim == 2 else img_b[:, :, 0]

    mina, maxa = _longest_nonzero_span(plane.sum(axis=1))
    minb, maxb = _longest_nonzero_span(plane.sum(axis=0))

    if mina != maxa and minb != maxb:
        # Slicing only the first two axes keeps any channel axis intact.
        return img[mina:maxa, minb:maxb]
    return img

In [None]:
# Specify the path of your data (adjust to your environment).

# Root directory of the raw Challenge 1 images; searched recursively for .jpg files below.
search_dir = '/mnt/ephemeral/challenge_data/challenge1_data/OriginalDatauncompressed'

In [None]:
# Paths of all .jpg files found under search_dir (Challenge 1 dataset).
jpg_files = find_jpg_files(search_dir)

In [None]:
# Sanity check: number of .jpg files that were found.
print(len(jpg_files))

In [None]:
# NOTE(review): neither of these names is read anywhere else in the visible notebook.
image_number=0
len_jpg=len(jpg_files)

#The process_image function processes a given image by reading it, cropping it based on intensity thresholds,
#and saving the processed image to a new location

def process_image(str1, src_marker="/OriginalDatauncompressed/", dst_marker="/preprocessed/"):
    """
    Read one image, crop it with autocropmin and save it under the output tree.

    Parameters:
    - str1: Path of the source image.
    - src_marker: Path fragment identifying the raw-data tree.
    - dst_marker: Fragment that replaces `src_marker` to build the output path.

    Returns:
    - (ct_scan, False, shape) on success, where ct_scan is the five path components
      preceding the file name joined with "/", and shape is the image shape
      *before* cropping.
    - (str1, True, None) when the image cannot be read or the result cannot be written.

    Raises:
    - ValueError: if replacing `src_marker` leaves the path unchanged, because
      writing would then overwrite the source image.
    """
    img = cv2.imread(str1)
    if img is None:
        return str1, True, None  # Indicate that the file is a bug file

    ct_scan = "/".join(str1.split("/")[-6:-1])
    original_shape = img.shape

    # Build the output path and refuse to clobber the raw data.
    out_path = str1.replace(src_marker, dst_marker)
    if out_path == str1:
        raise ValueError(
            "Output path equals input path (marker {!r} not replaced): {}".format(src_marker, str1)
        )

    # Process the image
    cropped = autocropmin(img)

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # cv2.imwrite reports failure through its return value rather than an exception.
    if not cv2.imwrite(out_path, cropped):
        return str1, True, None

    return ct_scan, False, original_shape

In [None]:
####################################### Challenge 1 Dataset ###############################
#The following processes the list of .jpg files of the challenge 1 dataset concurrently. 
##########################################################################################

shape_dict = {}   # scan identifier -> set of image shapes seen for that scan
diff_shape = []   # scan identifier appended each time a result arrives for a scan with more than one shape
bug_files = []    # first tuple element of every result flagged as a bug file


with concurrent.futures.ThreadPoolExecutor() as executor:
    # Queue one task per image.
    futures = [executor.submit(process_image, str1) for str1 in jpg_files]

    # Consume results in completion order, with a progress bar.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing Images"):
        ct_scan, is_bug_file, shape = future.result()
        if is_bug_file:
            bug_files.append(ct_scan)
            continue
        shape_dict.setdefault(ct_scan, set()).add(shape)
        if len(shape_dict[ct_scan]) != 1:
            diff_shape.append(ct_scan)

In [None]:
################## Challenge 2 dataset ##################
## Repeat the steps above for the Challenge 2 dataset ###

# Root directory of the raw Challenge 2 images.
# Note: this rebinds search_dir, replacing the Challenge 1 value.
search_dir = '/mnt/ephemeral/challenge_data/challenge2_data/dataset'

# Paths of all .jpg files found under search_dir (rebinds jpg_files as well).
jpg_files = find_jpg_files(search_dir)

In [None]:
# NOTE(review): neither of these names is read anywhere else in the visible notebook.
image_number=0
len_jpg=len(jpg_files)

def process_image(str1, src_marker="/dataset/", dst_marker="/preprocessed/"):
    """
    Read one image, crop it with autocropmin and save it under the output tree.

    Parameters:
    - str1: Path of the source image.
    - src_marker: Path fragment identifying the raw-data tree.
    - dst_marker: Fragment that replaces `src_marker` to build the output path.

    Returns:
    - (ct_scan, False, shape) on success, where ct_scan is the five path components
      preceding the file name joined with "/", and shape is the image shape
      *before* cropping.
    - (str1, True, None) when the image cannot be read or the result cannot be written.

    Raises:
    - ValueError: if replacing `src_marker` leaves the path unchanged, because
      writing would then overwrite the source image.
    """
    img = cv2.imread(str1)
    if img is None:
        return str1, True, None  # Indicate that the file is a bug file

    ct_scan = "/".join(str1.split("/")[-6:-1])
    original_shape = img.shape

    # Build the output path and refuse to clobber the raw data.
    out_path = str1.replace(src_marker, dst_marker)
    if out_path == str1:
        raise ValueError(
            "Output path equals input path (marker {!r} not replaced): {}".format(src_marker, str1)
        )

    # Process the image
    cropped = autocropmin(img)

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # cv2.imwrite reports failure through its return value rather than an exception.
    if not cv2.imwrite(out_path, cropped):
        return str1, True, None

    return ct_scan, False, original_shape


shape_dict = {}   # scan identifier -> set of image shapes seen for that scan
diff_shape = []   # scan identifier appended each time a result arrives for a scan with more than one shape
bug_files = []    # first tuple element of every result flagged as a bug file


with concurrent.futures.ThreadPoolExecutor() as executor:
    # Queue one task per image.
    futures = [executor.submit(process_image, str1) for str1 in jpg_files]

    # Consume results in completion order, with a progress bar.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing Images"):
        ct_scan, is_bug_file, shape = future.result()
        if is_bug_file:
            bug_files.append(ct_scan)
            continue
        shape_dict.setdefault(ct_scan, set()).add(shape)
        if len(shape_dict[ct_scan]) != 1:
            diff_shape.append(ct_scan)

######   The following computes the lung area of each slice in the Challenge 1 dataset  #######

# Get Area of Each Slice

In [None]:
# Output tree written by the Challenge 1 preprocessing step (rebinds search_dir).
search_dir = '/mnt/ephemeral/challenge_data/challenge1_data/preprocessed'

# Paths of every preprocessed .jpg, followed by a count as a sanity check.
challenge1_preprocessed=find_jpg_files(search_dir)
print(len(challenge1_preprocessed))

In [None]:
# Function to compute the area of a single image
def process_image(path):
    """
    Count the pixels enclosed by bright structures in one image.

    The image is eroded with a size-5 minimum filter and binarized at 100 (below
    -> 0, otherwise 255). Holes in channel 0 of the binary image are filled with
    binary_fill_holes, and the number of pixels that filling added is returned.
    (The surrounding notebook refers to this value as the lung area.)

    Parameters:
    - path: Path of the image to read.

    Returns:
    - The number of filled pixels, as a float.

    Raises:
    - ValueError: if the image cannot be read.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None; fail with the offending path.
        raise ValueError("Could not read image: {}".format(path))

    img2 = ndimage.minimum_filter(img, 5)
    img_b = np.where(img2 < 100, 0, 255)
    first_channel = img_b[:, :, 0]

    mask = ndimage.binary_fill_holes(first_channel)
    # 255 where filling turned a background pixel into foreground, 0 elsewhere.
    filled_only = mask * 255 - first_channel
    return filled_only.sum() / 255

In [None]:
#### Compute the area using multi threading ########
def process_images_multithreaded_ordered(image_paths):
    """
    Apply process_image to every path using a thread pool.

    Parameters:
    - image_paths: Iterable of image paths.

    Returns:
    - A list of process_image results in the same order as image_paths
      (executor.map preserves input order).
    """
    with ThreadPoolExecutor() as pool:
        # Materialize inside the `with` so all work finishes before the pool shuts down.
        return list(pool.map(process_image, image_paths))

In [None]:
# One area per entry of challenge1_preprocessed, in the same order.
# Despite the name, this covers every preprocessed Challenge 1 image; it is split
# into train and valid subsets further down.
challenge1_train_area = process_images_multithreaded_ordered(challenge1_preprocessed)

######   The following computes the lung area of each slice in the Challenge 2 dataset  #######

In [None]:
# Output tree written by the Challenge 2 preprocessing step (rebinds search_dir).
search_dir = '/mnt/ephemeral/challenge_data/challenge2_data/preprocessed'

# Paths of every preprocessed .jpg, followed by a count as a sanity check.
challenge2_preprocessed=find_jpg_files(search_dir)
print(len(challenge2_preprocessed))

In [None]:
# One area per entry of challenge2_preprocessed, in the same order.
# Despite the name, this covers every preprocessed Challenge 2 image; it is split
# into valid / annotated / non-annotated subsets further down.
challenge2_train_area = process_images_multithreaded_ordered(challenge2_preprocessed)

# Saving the processed information ################

In [None]:
# Positions (into challenge1_preprocessed) of the paths containing 'train' / 'valid'.
challenge1_preprocessed_train_indices = [
    idx for idx, img_path in enumerate(challenge1_preprocessed) if 'train' in img_path
]
challenge1_preprocessed_valid_indices = [
    idx for idx, img_path in enumerate(challenge1_preprocessed) if 'valid' in img_path
]

In [None]:
# Extract 'train' paths using challenge1_preprocessed_train_indices
# Train subset: paths and their areas, selected with the same indices so they stay aligned.
challenge1_preprocessed_train_paths = [
    challenge1_preprocessed[idx] for idx in challenge1_preprocessed_train_indices
]
challenge1_train_areas = [
    challenge1_train_area[idx] for idx in challenge1_preprocessed_train_indices
]

# Valid subset: same selection, using the valid indices.
challenge1_preprocessed_valid_paths = [
    challenge1_preprocessed[idx] for idx in challenge1_preprocessed_valid_indices
]
challenge1_valid_areas = [
    challenge1_train_area[idx] for idx in challenge1_preprocessed_valid_indices
]

In [None]:
# Two-column tables ('path', 'area'), one row per image.
challenge1_train_path_area = pd.DataFrame(
    list(zip(challenge1_preprocessed_train_paths, challenge1_train_areas)),
    columns=['path', 'area'],
)
challenge1_valid_path_area = pd.DataFrame(
    list(zip(challenge1_preprocessed_valid_paths, challenge1_valid_areas)),
    columns=['path', 'area'],
)

In [None]:
# Directory that receives every CSV / pickle written by this notebook.
path_to_files='/mnt/challenge_preprocessing/files'
# Create it up front so the writes below do not fail after the expensive
# area computation when the notebook runs on a fresh machine.
os.makedirs(path_to_files, exist_ok=True)
challenge1_train_path_area.to_csv(os.path.join(path_to_files,'challenge1_train_path_area.csv'))
challenge1_valid_path_area.to_csv(os.path.join(path_to_files,'challenge1_valid_path_area.csv'))

In [None]:
# Positions (into challenge2_preprocessed) of each subset, selected by path substrings.
challenge2_preprocessed_valid_indices = [
    idx for idx, img_path in enumerate(challenge2_preprocessed) if 'Valid' in img_path
]
challenge2_indices_with_train_and_annotated = [
    idx
    for idx, img_path in enumerate(challenge2_preprocessed)
    if 'Train' in img_path and '/annotated' in img_path
]
challenge2_indices_with_train_and_non_annotated = [
    idx
    for idx, img_path in enumerate(challenge2_preprocessed)
    if 'Train' in img_path and '/non-annotated' in img_path
]

In [None]:
# Valid subset: paths and areas picked with the same indices so they stay aligned.
challenge2_preprocessed_valid_paths = [
    challenge2_preprocessed[idx] for idx in challenge2_preprocessed_valid_indices
]
challenge2_valid_areas = [
    challenge2_train_area[idx] for idx in challenge2_preprocessed_valid_indices
]

In [None]:
# Annotated train subset: paths and areas picked with the same indices.
challenge2_preprocessed_train_annotated_paths = [
    challenge2_preprocessed[idx] for idx in challenge2_indices_with_train_and_annotated
]
challenge2_train_annotated_area = [
    challenge2_train_area[idx] for idx in challenge2_indices_with_train_and_annotated
]

In [None]:
# Non-annotated train subset: paths and areas picked with the same indices.
challenge2_preprocessed_train_non_annotated_paths = [
    challenge2_preprocessed[idx] for idx in challenge2_indices_with_train_and_non_annotated
]
challenge2_train_non_annotated_area = [
    challenge2_train_area[idx] for idx in challenge2_indices_with_train_and_non_annotated
]

In [None]:
# Two-column tables ('path', 'area'), one row per image, for each Challenge 2 subset.
challenge2_train_path_area = pd.DataFrame(
    list(zip(challenge2_preprocessed_train_annotated_paths, challenge2_train_annotated_area)),
    columns=['path', 'area'],
)
challenge2_non_annotated_path_area = pd.DataFrame(
    list(zip(challenge2_preprocessed_train_non_annotated_paths, challenge2_train_non_annotated_area)),
    columns=['path', 'area'],
)
challenge2_valid_path_area = pd.DataFrame(
    list(zip(challenge2_preprocessed_valid_paths, challenge2_valid_areas)),
    columns=['path', 'area'],
)

In [None]:
# Persist the Challenge 2 path/area tables. to_csv is called without index=False,
# so the DataFrame index is written as an extra first column.
challenge2_train_path_area.to_csv(os.path.join(path_to_files,'challenge2_train_path_area.csv'))
challenge2_valid_path_area.to_csv(os.path.join(path_to_files,'challenge2_valid_path_area.csv'))
challenge2_non_annotated_path_area.to_csv(os.path.join(path_to_files,'challenge2_non_annotated_path_area.csv'))

# Sort the slices to group the slices of the same scan ##############

In [None]:

# Reload the per-slice areas of the Challenge 1 validation split.
challenge1_valid_area_path = pd.read_csv(
    os.path.join(path_to_files, 'challenge1_valid_path_area.csv')
)

# ct_path: everything before the last "/" of the slice path.
challenge1_valid_area_path["ct_path"] = challenge1_valid_area_path["path"].map(
    lambda slice_path: slice_path.rpartition("/")[0]
)
# ct_slice: file name up to its first ".", parsed as an integer.
challenge1_valid_area_path["ct_slice"] = challenge1_valid_area_path["path"].map(
    lambda slice_path: int(slice_path.rpartition("/")[2].partition(".")[0])
)

# Group the slices of each scan together, in numeric slice order.
challenge1_valid_area_path.sort_values(by=['ct_path', 'ct_slice'], inplace=True)

In [None]:
# Reload the per-slice areas of the Challenge 1 training split.
challenge1_train_area_path = pd.read_csv(
    os.path.join(path_to_files, 'challenge1_train_path_area.csv')
)

# ct_path: everything before the last "/" of the slice path.
challenge1_train_area_path["ct_path"] = challenge1_train_area_path["path"].map(
    lambda slice_path: slice_path.rpartition("/")[0]
)
# ct_slice: file name up to its first ".", parsed as an integer.
challenge1_train_area_path["ct_slice"] = challenge1_train_area_path["path"].map(
    lambda slice_path: int(slice_path.rpartition("/")[2].partition(".")[0])
)

# Group the slices of each scan together, in numeric slice order.
challenge1_train_area_path.sort_values(by=['ct_path', 'ct_slice'], inplace=True)


In [None]:
#the following function finds a contiguous subarray within an array a of length k,
#such that the sum of the subarray's elements is maximized. 
#It returns the starting and ending indices of this subarray.

def sum_max(a,w=0.4):
    """
    Locate the fixed-length window of `a` with the largest sum.

    Parameters:
    - a: Sequence of numbers (list or 1-D array).
    - w: Window length as a fraction of len(a); the length is ceil(len(a) * w).

    Returns:
    - (start, end) such that a[start:end] is the first window whose sum is
      strictly greater than every earlier window's sum. If no window sums to
      more than 0, start stays 0.
    """
    n_items = len(a)
    window = int(np.ceil(n_items * w))
    best_start = 0
    best_total = 0
    for start in range(n_items - window + 1):
        total = np.sum(a[start:start + window])
        if total > best_total:
            best_total = total
            best_start = start
    return best_start, best_start + window


In [None]:
#### Validation of challenge 1 Dataset ############

# The following finds and store the subarray within the "area" values of each unique CT path 
#that has the maximum sum for a specified window size (50% of the length of the "area" values)

challenge1_ct_path_list = challenge1_valid_area_path["ct_path"].unique()
challenge1_valid_dic = {}

for ct_path in tqdm(challenge1_ct_path_list):
    # Rows of this scan, already ordered by slice number.
    scan_rows = challenge1_valid_area_path[challenge1_valid_area_path["ct_path"] == ct_path]
    # [start, end] of the half-length window with the largest total area.
    challenge1_valid_dic[ct_path] = list(sum_max(scan_rows["area"].values, 0.5))


In [None]:
#### Training of challenge 1 Dataset ############

challenge1_ct_path_list = challenge1_train_area_path["ct_path"].unique()
challenge1_train_dic = {}

for ct_path in tqdm(challenge1_ct_path_list):
    # Rows of this scan, already ordered by slice number.
    scan_rows = challenge1_train_area_path[challenge1_train_area_path["ct_path"] == ct_path]
    # [start, end] of the half-length window with the largest total area.
    challenge1_train_dic[ct_path] = list(sum_max(scan_rows["area"].values, 0.5))
    

In [None]:
# Display the scan path -> [start, end] mapping as the cell output.
challenge1_train_dic

In [None]:
# Number of scans in the train and valid mappings.
print(len(challenge1_train_dic),len(challenge1_valid_dic))

In [None]:
#### Saving the processed Info. #############

# Persist the validation scan -> [start, end] mapping as a pickle.
with open(os.path.join(path_to_files,'challenge1_valid_range.pickle'), 'wb') as handle:
    pickle.dump(challenge1_valid_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the training scan -> [start, end] mapping as a pickle.
with open(os.path.join(path_to_files,'challenge1_train_range.pickle'), 'wb') as handle:
    pickle.dump(challenge1_train_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

##### Doing the same for challenge 2 dataset ####################

In [None]:
# Read the CSV file for 'train' in challenge2
# Reload the per-slice areas of the annotated Challenge 2 training split.
challenge2_train_area_path = pd.read_csv(
    os.path.join(path_to_files, 'challenge2_train_path_area.csv')
)

# ct_path: everything before the last "/" of the slice path.
challenge2_train_area_path["ct_path"] = challenge2_train_area_path["path"].map(
    lambda slice_path: slice_path.rpartition("/")[0]
)
# ct_slice: file name up to its first ".", parsed as an integer.
challenge2_train_area_path["ct_slice"] = challenge2_train_area_path["path"].map(
    lambda slice_path: int(slice_path.rpartition("/")[2].partition(".")[0])
)

# Group the slices of each scan together, in numeric slice order.
challenge2_train_area_path.sort_values(by=['ct_path', 'ct_slice'], inplace=True)

In [None]:
# Read the CSV file for 'valid' in challenge2
# Reload the per-slice areas of the Challenge 2 validation split.
challenge2_valid_area_path = pd.read_csv(
    os.path.join(path_to_files, 'challenge2_valid_path_area.csv')
)

# ct_path: everything before the last "/" of the slice path.
challenge2_valid_area_path["ct_path"] = challenge2_valid_area_path["path"].map(
    lambda slice_path: slice_path.rpartition("/")[0]
)
# ct_slice: file name up to its first ".", parsed as an integer.
challenge2_valid_area_path["ct_slice"] = challenge2_valid_area_path["path"].map(
    lambda slice_path: int(slice_path.rpartition("/")[2].partition(".")[0])
)

# Group the slices of each scan together, in numeric slice order.
challenge2_valid_area_path.sort_values(by=['ct_path', 'ct_slice'], inplace=True)

In [None]:
# Read the CSV file for 'non_annotated' in challenge2
# Reload the per-slice areas of the non-annotated Challenge 2 split.
challenge2_non_annotated_area_path = pd.read_csv(
    os.path.join(path_to_files, 'challenge2_non_annotated_path_area.csv')
)

# ct_path: everything before the last "/" of the slice path.
challenge2_non_annotated_area_path["ct_path"] = challenge2_non_annotated_area_path["path"].map(
    lambda slice_path: slice_path.rpartition("/")[0]
)
# ct_slice: file name up to its first ".", parsed as an integer.
challenge2_non_annotated_area_path["ct_slice"] = challenge2_non_annotated_area_path["path"].map(
    lambda slice_path: int(slice_path.rpartition("/")[2].partition(".")[0])
)

# Group the slices of each scan together, in numeric slice order.
challenge2_non_annotated_area_path.sort_values(by=['ct_path', 'ct_slice'], inplace=True)

In [None]:
# Assuming challenge2_train_area_path is previously defined and has the 'ct_path' and 'area' columns
challenge2_ct_path_list = challenge2_train_area_path["ct_path"].unique()
challenge2_train_dic = {}

for ct_path in tqdm(challenge2_ct_path_list):
    # Rows of this scan, already ordered by slice number.
    scan_rows = challenge2_train_area_path[challenge2_train_area_path["ct_path"] == ct_path]
    # [start, end] of the half-length window with the largest total area.
    challenge2_train_dic[ct_path] = list(sum_max(scan_rows["area"].values, 0.5))

In [None]:
challenge2_ct_path_list = challenge2_valid_area_path["ct_path"].unique()
challenge2_valid_dic = {}

for ct_path in tqdm(challenge2_ct_path_list):
    # Rows of this scan, already ordered by slice number.
    scan_rows = challenge2_valid_area_path[challenge2_valid_area_path["ct_path"] == ct_path]
    # [start, end] of the half-length window with the largest total area.
    challenge2_valid_dic[ct_path] = list(sum_max(scan_rows["area"].values, 0.5))


In [None]:
challenge2_ct_path_list = challenge2_non_annotated_area_path["ct_path"].unique()
challenge2_non_annotated_dic = {}

for ct_path in tqdm(challenge2_ct_path_list):
    # Rows of this scan, already ordered by slice number.
    scan_rows = challenge2_non_annotated_area_path[challenge2_non_annotated_area_path["ct_path"] == ct_path]
    # [start, end] of the half-length window with the largest total area.
    challenge2_non_annotated_dic[ct_path] = list(sum_max(scan_rows["area"].values, 0.5))

In [None]:
# Persist the Challenge 2 validation scan -> [start, end] mapping as a pickle.
with open(os.path.join(path_to_files,'challenge2_valid_range.pickle'), 'wb') as handle:
    pickle.dump(challenge2_valid_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the Challenge 2 non-annotated scan -> [start, end] mapping as a pickle.
with open(os.path.join(path_to_files,'challenge2_non_annotated_range.pickle'), 'wb') as handle:
    pickle.dump(challenge2_non_annotated_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the Challenge 2 training scan -> [start, end] mapping as a pickle.
with open(os.path.join(path_to_files,'challenge2_train_range.pickle'), 'wb') as handle:
    pickle.dump(challenge2_train_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# One row per dictionary entry: 'Path' holds the key, 'Range' the stored [start, end] list.
challenge2_train_scan_path_range = pd.DataFrame(list(challenge2_train_dic.items()), columns=['Path', 'Range'])

In [None]:
# One row per dictionary entry: 'Path' holds the key, 'Range' the stored [start, end] list.
challenge2_valid_scan_path_range = pd.DataFrame(list(challenge2_valid_dic.items()), columns=['Path', 'Range'])

In [None]:
# One row per dictionary entry: 'Path' holds the key, 'Range' the stored [start, end] list.
challenge2_non_annotated_scan_path_range = pd.DataFrame(list(challenge2_non_annotated_dic.items()), columns=['Path', 'Range'])

In [None]:
# One row per dictionary entry: 'Path' holds the key, 'Range' the stored [start, end] list.
challenge1_train_scan_path_range = pd.DataFrame(list(challenge1_train_dic.items()), columns=['Path', 'Range'])
challenge1_valid_scan_path_range = pd.DataFrame(list(challenge1_valid_dic.items()), columns=['Path', 'Range'])

#### The following labels each CT scan #######

In [None]:
# For the 'train' data scenario
challenge1_train_scan_path_range['Label'] = 0


# challenge1_train_scan_path_range_label = challenge1_train_scan_path_range
# However, to create a separate DataFrame for modifications, use .copy()
challenge1_train_scan_path_range_label = challenge1_train_scan_path_range.copy()

# Update 'Label' based on a condition in the copied or original DataFrame
challenge1_train_scan_path_range_label['Label'] = challenge1_train_scan_path_range_label['Path'].apply(lambda x: 1 if '/positive' in x else 0)

# For the 'valid' data scenario

# Assuming challenge1_valid_scan_path_range is your original DataFrame for 'valid' data
# Make a copy of the DataFrame to work with
challenge1_valid_scan_path_range_label = challenge1_valid_scan_path_range.copy()

# Initialize the 'Label' column to 0 in the new DataFrame
challenge1_valid_scan_path_range_label['Label'] = 0

# Use .apply() to update 'Label' based on a condition in the new DataFrame
challenge1_valid_scan_path_range_label['Label'] = challenge1_valid_scan_path_range_label['Path'].apply(lambda x: 1 if '/positive' in x else 0)

In [None]:
# --- Challenge 2, train ---
# Placeholder value; overwritten below through the alias.
challenge2_train_scan_path_range['Label'] = 0

# NOTE(review): this is an alias, not a copy (the other label cells use .copy()),
# so the assignment below also changes challenge2_train_scan_path_range.
challenge2_train_scan_path_range_label=challenge2_train_scan_path_range
# Label is 1 when the scan path contains '/cov_1', otherwise 0.
challenge2_train_scan_path_range_label['Label'] = challenge2_train_scan_path_range['Path'].apply(lambda x: 1 if '/cov_1' in x else 0)



# --- Challenge 2, valid ---
# Independent copy, so the original validation frame is left untouched.
challenge2_valid_scan_path_range_label = challenge2_valid_scan_path_range.copy()

# Placeholder value; fully overwritten by the next statement.
challenge2_valid_scan_path_range_label['Label'] = 0

# Label is 1 when the scan path contains '/cov_1', otherwise 0.
challenge2_valid_scan_path_range_label['Label'] = challenge2_valid_scan_path_range_label['Path'].apply(lambda x: 1 if '/cov_1' in x else 0)

Saving the files

In [None]:
# Write the per-scan tables (Path, Range, Label) for Challenge 1.
challenge1_train_scan_path_range_label.to_csv(os.path.join(path_to_files,'la-challenge1_train_path_range_label.csv'))
challenge1_valid_scan_path_range_label.to_csv(os.path.join(path_to_files,'la-challenge1_valid_path_range_label.csv'))

# Same for Challenge 2; the non-annotated table is the frame built from the range dictionary, not a *_label frame.
# NOTE(review): the valid / non-annotated file names contain 'scan_path_range' while
# the others use 'path_range' — confirm downstream readers expect these exact names.
challenge2_train_scan_path_range_label.to_csv(os.path.join(path_to_files,'la-challenge2_train_path_range_label.csv'))
challenge2_valid_scan_path_range_label.to_csv(os.path.join(path_to_files,'la-challenge2_valid_scan_path_range_label.csv'))
challenge2_non_annotated_scan_path_range.to_csv(os.path.join(path_to_files,'la-challenge2_non_annotated_scan_path_range.csv'))

In [None]:
# Marker output; presumably signals that the notebook ran to the end — TODO confirm.
print(1)