In [15]:
import os
import csv
import pandas as pd

def extract_patient_id(file_name):
    patterns = [
        r'^\d+[A-Z]?_',           # Pattern for 44201451737A_24.9.png
        r'^[\d-]+ -',             # Pattern for 44-22-230-0278 - 1.5.png and 44230631043 - 1.1.png
        r'^Image_\d+',            # Pattern for Image_102.1.png
        r'^[\d-]+\s+-',           # Pattern for 44-23-118-0434  - 6.4.png
        r'^\d+[A-Z]?\.\d+'        # Pattern for 44221600351A.3.png
    ]
    
    for pattern in patterns:
        match = re.match(pattern, file_name)
        if match:
            return match.group(0).strip('_ -.')
    
    return None

def create_csv(directory, output_file='patch_train.csv', entry_type="train"):
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    current_label = 0

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])
        
        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
                    location = os.path.join(dirpath, filename)
                    patient_id = extract_patient_id(filename)
                    if subtype not in label_encodings:
                        label_encodings[subtype] = current_label
                        current_label += 1
                    label = label_encodings[subtype]
                    writer.writerow([filename, location, subtype, patient_id, label, entry_type])

# Example usage:
directory_path_train = '/l/users/dawlat.akaila/DATA_MASKS/patched_train'
create_csv(directory_path_train)
directory_path_val = '/l/users/dawlat.akaila/DATA_MASKS/patched_val'
create_csv(directory_path_val, output_file='patch_test.csv', entry_type="test")

# Load CSV files
df_train = pd.read_csv('patch_train.csv')
df_test = pd.read_csv('patch_test.csv')

# Concatenate train and test dataframes
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Identify patient IDs that appear only once
patient_id_counts = df_combined['Patient ID'].value_counts()
single_patient_ids = patient_id_counts[patient_id_counts == 1].index

# Duplicate rows for patient IDs that appear only once
df_single_patients = df_combined[df_combined['Patient ID'].isin(single_patient_ids)]
df_combined = pd.concat([df_combined, df_single_patients], ignore_index=True)

# Optional: Save the modified dataframe to a new CSV file
df_combined.to_csv('patch_with_duplicates.csv', index=False)

In [4]:
import os
import csv
import re
import pandas as pd

# Define the range mappings for each subtype
range_mappings = {
    "ALL": {
        #train
        range(32, 52): "ALL_P1",
        range(519, 573): "ALL_P2",
        range(2249, 2341): "ALL_P3",
        range(4120, 4208): "ALL_P4",
    },

    "AML": {
        #train
        range(52, 168): "AML_P1",
        range(207, 44): "AML_P2",
        range(581, 658): "AML_P3",
        range(3862, 4054): "AML_P4",
        #val
        range(4, 16): "AML_P5",
        range(3800, 3970): "AML_P6",
    },

    "CML": {
        #train
        range(242,286): "CML_P1",
        #val
        range(168,195): "CML_P2",
    }, 
    "CLL": {
        #train
        range(4055,4120): "CLL_P1",
        #val
        range(424,518): "CLL_P2",

    },
    "NORMAL": {
        #train
        range(711,856): "NORMAL_P1",
        range(3504,3542): "NORMAL_P2",
        range(4253,4283): "NORMAL_P3",
        range(4253,4590): "NORMAL_P4",
        #val
        range(658,769): "NORMAL_P5",
        range(3542,3688): "NORMAL_P6",
        range(3688,4250): "NORMAL_P7",
    }
}

def get_patient_id_from_range(subtype, number):
    if subtype in range_mappings:
        for range_key, patient_id in range_mappings[subtype].items():
            if number in range_key:
                return patient_id
    return f"{subtype}_P_{number}"

def extract_patient_id(file_name, subtype):
    patterns = [
        r'^\d+[A-Z]?_',            # Pattern for 44201451737A_24.9.png
        r'^[\d-]+ -',              # Pattern for 44-22-230-0278 - 1.5.png and 44230631043 - 1.1.png
        r'^Image_\d+',             # Pattern for Image_102.1.png
        r'^[\d-]+\s+-',            # Pattern for 44-23-118-0434  - 6.4.png
        r'^\d+[A-Z]?\.\d+'         # Pattern for 44221600351A.3.png
    ]
    
    for pattern in patterns:
        match = re.match(pattern, file_name)
        if match:
            patient_id = match.group(0).strip('_ -.')
            if patient_id.startswith('Image_'):
                number = int(re.search(r'\d+', patient_id).group())
                patient_id = get_patient_id_from_range(subtype, number)
            return patient_id
    
    return None

def create_csv(directory, output_file='patch_train.csv', entry_type="train"):
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    current_label = 0

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])
        
        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
                    location = os.path.join(dirpath, filename)
                    patient_id = extract_patient_id(filename, subtype)
                    if subtype not in label_encodings:
                        label_encodings[subtype] = current_label
                        current_label += 1
                    label = label_encodings[subtype]
                    writer.writerow([filename, location, subtype, patient_id, label, entry_type])

# Example usage:
directory_path_train = '/l/users/dawlat.akaila/DATA_MASKS/patched_train'
create_csv(directory_path_train)
directory_path_val = '/l/users/dawlat.akaila/DATA_MASKS/patched_val'
create_csv(directory_path_val, output_file='patch_test.csv', entry_type="test")

# Load CSV files
df_train = pd.read_csv('patch_train.csv')
df_test = pd.read_csv('patch_test.csv')

# Concatenate train and test dataframes
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Identify patient IDs that appear only once
patient_id_counts = df_combined['Patient ID'].value_counts()
single_patient_ids = patient_id_counts[patient_id_counts == 1].index

# Duplicate rows for patient IDs that appear only once
df_single_patients = df_combined[df_combined['Patient ID'].isin(single_patient_ids)]
df_combined = pd.concat([df_combined, df_single_patients], ignore_index=True)

# Optional: Save the modified dataframe to a new CSV file
df_combined.to_csv('combined_with_duplicates.csv', index=False)

print(df_combined)


                       Filename  \
0               Image_649.1.png   
1              Image_4048.3.png   
2      44-22-230-0278 - 9.4.png   
3              Image_3899.4.png   
4      44-23-067-0398 - 7.1.png   
...                         ...   
12883           Image_410.1.png   
12884           Image_387.1.png   
12885        44221600351A.3.png   
12886        44221600351A.2.png   
12887        44221600351A.1.png   

                                                Location Subtype  \
0      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
1      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
2      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
3      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
4      /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
...                                                  ...     ...   
12883  /l/users/dawlat.akaila/DATA_MASKS/patched_trai...     AML   
12884  /l/users/dawlat.akaila/DATA_

In [None]:
import os
import csv
import re
import pandas as pd

def create_csv(directory, output_file='patch_train.csv', entry_type="train"):
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    current_label = 0

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])
        
        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
                    location = os.path.join(dirpath, filename)
                    patient_id = "P1"
                    if subtype not in label_encodings:
                        label_encodings[subtype] = current_label
                        current_label += 1
                    label = label_encodings[subtype]
                    writer.writerow([filename, location, subtype, patient_id, label, entry_type])

# Example usage:

directory_path_val = '/l/users/dawlat.akaila/DATA_MASKS/patched_val'
create_csv(directory_path_val, output_file='patch_test.csv', entry_type="test")

# Load CSV files
df = pd.read_csv('patch_test.csv')

# Identify patient IDs that appear only once
patient_id_counts = df['Patient ID'].value_counts()
single_patient_ids = patient_id_counts[patient_id_counts == 1].index

# Duplicate rows for patient IDs that appear only once
df_single_patients = df[df['Patient ID'].isin(single_patient_ids)]
df = pd.concat([df, df_single_patients], ignore_index=True)

# Optional: Save the modified dataframe to a new CSV file
df.to_csv('combined_with_duplicates.csv', index=False)

print(df_combined)

In [None]:
def create_csv(directory, output_file='patch_test.csv', entry_type="test"):
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    current_label = 0

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])
        
        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
                    location = os.path.join(dirpath, filename)
                    patient_id = "P1"
                    if subtype not in label_encodings:
                        label_encodings[subtype] = current_label
                        current_label += 1
                    label = label_encodings[subtype]
                    writer.writerow([filename, location, subtype, patient_id, label, entry_type])

    # Load CSV file
    df = pd.read_csv(output_file)
    df = df.sort_values(by='Filename').reset_index(drop=True)

    # Add initial subgroup assignments
    patient_id = 
    count = len(df)
    subgroup_numbers = determine_subgroup(count)
    df['Patient ID_Subgroup'] = [f"{patient_id}_{subgroup}" for subgroup in subgroup_numbers]

    # Apply the consolidation function
    df = consolidate_subgroups(df)

    #rename 'Patient ID' column
    df.rename(columns={'Patient ID': 'Patient ID_Original'}, inplace=True)
    df.rename(columns={'Patient ID_Subgroup': 'Patient ID'}, inplace=True)

    # Save the modified dataframe to a new CSV file
    df.to_csv(output_file, index=False)

### NEW 10 INSTANCES IN ONE BAG

In [1]:
import os
import csv
import pandas as pd

In [2]:
def determine_subgroup(count):
    if count <= 5:
        return [1] * count
    else:
        subgroups = [(i // 5) + 1 for i in range(count)]
        # Handle the remainder
        remainder = count % 5
        if remainder != 0:
            last_full_group = (count // 5) - 1
            for i in range(remainder):
                subgroups[-(i + 1)] = last_full_group + 1
        return subgroups
    
# Ensure each subgroup has at least 3 instances
def consolidate_subgroups(df):
    current_subgroup = None
    current_count = 0
    previous_subgroup = None

    for index, row in df.iterrows():
        if current_subgroup is None:
            current_subgroup = row['Patient ID_Subgroup']
            current_count = 1
        elif current_subgroup == row['Patient ID_Subgroup']:
            current_count += 1
        else:
            if current_count < 3:
                df.loc[df['Patient ID_Subgroup'] == current_subgroup, 'Patient ID_Subgroup'] = previous_subgroup
            previous_subgroup = current_subgroup
            current_subgroup = row['Patient ID_Subgroup']
            current_count = 1
    if current_count < 3 and previous_subgroup:
        df.loc[df['Patient ID_Subgroup'] == current_subgroup, 'Patient ID_Subgroup'] = previous_subgroup
    return df

# Add initial subgroup assignments
mapping = pd.read_csv('info.csv')
#get patient id based on mapping
def get_patient_id(filename):
    # Remove the extension from the filename
    base_filename = os.path.splitext(filename)[0]
    base_filename = base_filename.split('.')[0]
    print(base_filename)
    
    # Assuming 'mapping' is a DataFrame with a 'filename' column that also has extensions
    return mapping[mapping['filename'].str.split('.').str[0] == base_filename]['patient'].values[0]
    
def create_csv(directory, output_file='patch_train.csv', entry_type="train"):
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    current_label = 0

    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])
        
        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
                    location = os.path.join(dirpath, filename)
                    patient_id = get_patient_id(filename)

                    if subtype not in label_encodings:
                        label_encodings[subtype] = current_label
                        current_label += 1
                    label = label_encodings[subtype]
                    writer.writerow([filename, location, subtype, patient_id, label, entry_type])

    df = pd.read_csv(output_file)
    df = df.sort_values(by='Filename').reset_index(drop=True)

    patient_ids = df['Patient ID'].unique().tolist()

    for patient_id in patient_ids:
        count = len(df[df['Patient ID'] == patient_id])
        subgroup_numbers = determine_subgroup(count)
        df.loc[df['Patient ID'] == patient_id, 'Patient ID_Subgroup'] = [f"{patient_id}_{subgroup}" for subgroup in subgroup_numbers]

    #rename 'Patient ID' column
    df.rename(columns={'Patient ID': 'Patient ID_Original'}, inplace=True)
    df.rename(columns={'Patient ID_Subgroup': 'Patient ID'}, inplace=True)

    # Save the modified dataframe to a new CSV file
    df.to_csv(output_file, index=False)

In [3]:
directory_path_train = '/l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML/patched_train'
create_csv(directory_path_train)

44213572511A_103
44221442027A_10
44213572511A_108
44213572511A_38
44213572511A_48
44213572511A_57
Image_4099
Image_4064
44213572511A_2
44213572511A_93
44213572511A_65
44211490506A_19
44213572511A_100
44213572511A_98
44213572511A_9
44211490506A_33
44213572511A_76
44213572511A_146
44221590363A_5
44213572511A_44
44212271083A_20
44213572511A_122
44213572511A_19
44211490506A_65
Image_4108
44213572511A_20
44213572511A_146
44213572511A_76
44221590363A_5
44213572511A_80
44211490506A_5
44213572511A_44
44212812284A_55
44211490506A_38
44213572511A_104
44221442027A_31
44211490506A_57
Image_4103
44213572511A_19
44213572511A_129
44213572511A_69
44211490506A_65
44213572511A_110
44212271083A_5
44213572511A_20
44213572511A_103
44212812284A_10
44213572511A_38
44213572511A_108
44213572511A_57
44211490506A_12
44213572511A_93
44213572511A_2
44213572511A_65
44213572511A_9
Image_4092
44213572511A_136
44211490506A_71
44213572511A_80
44213572511A_34
44213572511A_104
Image_4068
44211490506A_27
44213572511A_62
4

In [4]:
directory_path_val = '/l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML/patched_val'
create_csv(directory_path_val, output_file='patch_test.csv', entry_type="test")

44221290379A_85
44221290379A_73
44221290379A_25
44221290379A_17
44-23-016-1755 - 2
Image_183
44221290379A_36
Image_188
Image_175
44221290379A_108
44221290379A_59
44221290379A_96
Image_183
Image_188
Image_175
44-23-016-1755 - 9
44221290379A_108
44221290379A_52
44221290379A_96
44221290379A_78
44221290379A_85
44221290379A_73
44221290379A_25
Image_190
44221290379A_17
44221290379A_110
44221290379A_74
44221290379A_89
44221290379A_46
Image_188
44221290379A_10
44221290379A_29
44221290379A_22
44221290379A_6
Image_172
44221290379A_31
Image_184
44-23-016-1755 - 5
44221290379A_104
44221290379A_67
Image_172
44221290379A_6
Image_184
Image_179
44221290379A_104
44221290379A_55
44221290379A_67
44221290379A_74
44221290379A_46
44221290379A_82
44221290379A_10
44221290379A_29
44-23-016-1755 - 11
44221290379A_20
Image_168
44221290379A_12
44221290379A_80
44221290379A_44
44221290379A_143
44221290379A_98
44221290379A_65
Image_192
44221290379A_93
44221290379A_106
Image_186
44221290379A_33
Image_170
44221290379A

In [5]:
# Load CSV files
df_train = pd.read_csv('patch_train.csv')
df_test = pd.read_csv('patch_test.csv')

# Concatenate train and test dataframes
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Identify patient IDs that appear only once
patient_id_counts = df_combined['Patient ID'].value_counts()
single_patient_ids = patient_id_counts[patient_id_counts == 1].index

# Duplicate rows for patient IDs that appear only once
df_single_patients = df_combined[df_combined['Patient ID'].isin(single_patient_ids)]
df_combined = pd.concat([df_combined, df_single_patients], ignore_index=True)

# Optional: Save the modified dataframe to a new CSV file
df_combined.to_csv('patches_no_apml_reactive.csv', index=False)

print(df_combined)

                       Filename  \
0      44-22-230-0278 - 1.1.png   
1      44-22-230-0278 - 1.2.png   
2      44-22-230-0278 - 1.3.png   
3      44-22-230-0278 - 1.4.png   
4      44-22-230-0278 - 1.5.png   
...                         ...   
11087           Image_705.1.png   
11088           Image_706.1.png   
11089           Image_708.1.png   
11090           Image_709.1.png   
11091           Image_710.1.png   

                                                Location Subtype  \
0      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
1      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
2      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
3      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
4      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
...                                                  ...     ...   
11087  /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...  NORMAL   
11088  /l/users/dawlat.akaila/DATA_