In [15]:
import os
import csv
import pandas as pd

def extract_patient_id(file_name):
    patterns = [
        r'^\d+[A-Z]?_',           # Pattern for 44201451737A_24.9.png
        r'^[\d-]+ -',             # Pattern for 44-22-230-0278 - 1.5.png and 44230631043 - 1.1.png
        r'^Image_\d+',            # Pattern for Image_102.1.png
        r'^[\d-]+\s+-',           # Pattern for 44-23-118-0434  - 6.4.png
        r'^\d+[A-Z]?\.\d+'        # Pattern for 44221600351A.3.png
    ]
    
    for pattern in patterns:
        match = re.match(pattern, file_name)
        if match:
            return match.group(0).strip('_ -.')
    
    return None

def create_csv(directory, output_file='patch_train.csv', entry_type="train"):
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    current_label = 0

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])
        
        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
                    location = os.path.join(dirpath, filename)
                    patient_id = extract_patient_id(filename)
                    if subtype not in label_encodings:
                        label_encodings[subtype] = current_label
                        current_label += 1
                    label = label_encodings[subtype]
                    writer.writerow([filename, location, subtype, patient_id, label, entry_type])

# Example usage:
directory_path_train = '/l/users/dawlat.akaila/DATA_MASKS/patched_train'
create_csv(directory_path_train)
directory_path_val = '/l/users/dawlat.akaila/DATA_MASKS/patched_val'
create_csv(directory_path_val, output_file='patch_test.csv', entry_type="test")

# Load CSV files
df_train = pd.read_csv('patch_train.csv')
df_test = pd.read_csv('patch_test.csv')

# Concatenate train and test dataframes
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Identify patient IDs that appear only once
patient_id_counts = df_combined['Patient ID'].value_counts()
single_patient_ids = patient_id_counts[patient_id_counts == 1].index

# Duplicate rows for patient IDs that appear only once
df_single_patients = df_combined[df_combined['Patient ID'].isin(single_patient_ids)]
df_combined = pd.concat([df_combined, df_single_patients], ignore_index=True)

# Optional: Save the modified dataframe to a new CSV file
df_combined.to_csv('patch_with_duplicates.csv', index=False)

In [4]:
import os
import csv
import re
import pandas as pd

# Define the range mappings for each subtype
range_mappings = {
    "ALL": {
        #train
        range(32, 52): "ALL_P1",
        range(519, 573): "ALL_P2",
        range(2249, 2341): "ALL_P3",
        range(4120, 4208): "ALL_P4",
    },

    "AML": {
        #train
        range(52, 168): "AML_P1",
        range(207, 658): "AML_P2",
        range(3862, 4054): "AML_P3",
        #val
        range(4, 16): "AML_P4",
        range(3800, 3970): "AML_P5",
    },

    "CML": {
        #train
        range(242,286): "CML_P1",
        #val
        range(168,195): "CML_P2",
    }, 
    "CLL": {
        #train
        range(4055,4094): "CLL_P1",
        range(4094,4119): "CLL_P2",
        #val
        range(424,518): "CLL_P3",

    },
    "NORMAL": {
        #train
        range(711,856): "NORMAL_P1",
        range(3504,3542): "NORMAL_P2",
        range(4253,4283): "NORMAL_P3",
        range(4253,4590): "NORMAL_P4",
        #val
        range(658,769): "NORMAL_P5",
        range(3542,3688): "NORMAL_P6",
        range(3688,4250): "NORMAL_P7",
    }
}

def get_patient_id_from_range(subtype, number):
    if subtype in range_mappings:
        for range_key, patient_id in range_mappings[subtype].items():
            if number in range_key:
                return patient_id
    return f"{subtype}_P_{number}"

def extract_patient_id(file_name, subtype):
    patterns = [
        r'^\d+[A-Z]?_',            # Pattern for 44201451737A_24.9.png
        r'^[\d-]+ -',              # Pattern for 44-22-230-0278 - 1.5.png and 44230631043 - 1.1.png
        r'^Image_\d+',             # Pattern for Image_102.1.png
        r'^[\d-]+\s+-',            # Pattern for 44-23-118-0434  - 6.4.png
        r'^\d+[A-Z]?\.\d+'         # Pattern for 44221600351A.3.png
    ]
    
    for pattern in patterns:
        match = re.match(pattern, file_name)
        if match:
            patient_id = match.group(0).strip('_ -.')
            if patient_id.startswith('Image_'):
                number = int(re.search(r'\d+', patient_id).group())
                patient_id = get_patient_id_from_range(subtype, number)
            return patient_id
    
    return None

def create_csv(directory, output_file='patch_train.csv', entry_type="train"):
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    current_label = 0

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])
        
        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
                    location = os.path.join(dirpath, filename)
                    patient_id = extract_patient_id(filename, subtype)
                    if subtype not in label_encodings:
                        label_encodings[subtype] = current_label
                        current_label += 1
                    label = label_encodings[subtype]
                    writer.writerow([filename, location, subtype, patient_id, label, entry_type])

# Example usage:
directory_path_train = '/l/users/dawlat.akaila/DATA_MASKS/patched_train'
create_csv(directory_path_train)
directory_path_val = '/l/users/dawlat.akaila/DATA_MASKS/patched_val'
create_csv(directory_path_val, output_file='patch_test.csv', entry_type="test")

# Load CSV files
df_train = pd.read_csv('patch_train.csv')
df_test = pd.read_csv('patch_test.csv')

# Concatenate train and test dataframes
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Identify patient IDs that appear only once
patient_id_counts = df_combined['Patient ID'].value_counts()
single_patient_ids = patient_id_counts[patient_id_counts == 1].index

# Duplicate rows for patient IDs that appear only once
df_single_patients = df_combined[df_combined['Patient ID'].isin(single_patient_ids)]
df_combined = pd.concat([df_combined, df_single_patients], ignore_index=True)

# Optional: Save the modified dataframe to a new CSV file
df_combined.to_csv('combined_with_duplicates.csv', index=False)

print(df_combined)


                       Filename  \
0               Image_649.1.png   
1              Image_4048.3.png   
2      44-22-230-0278 - 9.4.png   
3              Image_3899.4.png   
4      44-23-067-0398 - 7.1.png   
...                         ...   
12883           Image_410.1.png   
12884           Image_387.1.png   
12885        44221600351A.3.png   
12886        44221600351A.2.png   
12887        44221600351A.1.png   

                                                Location Subtype  \
0      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
1      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
2      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
3      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
4      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
...                                                  ...     ...   
12883  /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
12884  /l/users/dawlat.akaila/DATA_