### NEW SLIDE LEVEL

In [2]:
import os
import csv
import pandas as pd

# Load the lookup table from info.csv. The code below reads its 'filename'
# and 'patient' columns to attach a patient ID to every image file.
mapping = pd.read_csv('info.csv')
# Resolve a file name to its patient ID through `mapping`.
def get_patient_id(filename):
    """Return the patient ID that the module-level ``mapping`` table assigns to a file.

    The given name and the names in ``mapping['filename']`` are both compared
    by their stem, i.e. the text before the first ``.``. A patch named
    ``"Image_705.1.png"`` therefore matches a mapping entry such as
    ``"Image_705.png"``.

    Args:
        filename: File name of an image, with or without extension(s).

    Returns:
        The ``patient`` value of the first row in ``mapping`` whose file-name
        stem equals the stem of ``filename``.

    Raises:
        IndexError: If no row in ``mapping`` matches. The exception type is
            the one the bare ``.values[0]`` lookup raised before; the message
            now names the offending file.
    """
    # Everything before the first dot. Stripping the last extension first is
    # unnecessary, because it can never alter the text before the first dot.
    stem = filename.split('.')[0]

    mapping_stems = mapping['filename'].str.split('.').str[0]
    matches = mapping.loc[mapping_stems == stem, 'patient']
    if matches.empty:
        raise IndexError(
            f"no patient found in mapping for file {filename!r} (stem {stem!r})"
        )
    return matches.values[0]
    
def create_csv(directory, output_file='patch_train.csv', entry_type="train"):
    """Walk ``directory`` and write one CSV row per image file found.

    The name of the folder that directly contains an image is used as its
    subtype. Each row holds the file name, its full path, the subtype, the
    patient ID returned by ``get_patient_id``, an integer label and
    ``entry_type``.

    Args:
        directory: Root folder to scan recursively.
        output_file: Path of the CSV file to create (overwritten if present).
        entry_type: Value written to the ``train/test`` column of every row.

    Returns:
        None. The result is the file written to ``output_file``.

    Raises:
        IndexError: Propagated from ``get_patient_id`` when a file has no
            entry in the patient mapping.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
    label_encodings = {"ALL": 4, "AML": 1, "CLL": 0, "CML": 3, "NORMAL": 2}
    # Folders outside the predefined set get fresh labels. Start after the
    # largest predefined value so a new label can never collide with an
    # existing class (starting at 0 would reuse the label of "CLL").
    next_label = max(label_encodings.values()) + 1

    # newline='' is what the csv module expects; UTF-8 matches the default
    # encoding pandas uses when the file is read back.
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Filename', 'Location', 'Subtype', 'Patient ID', 'label', 'train/test'])

        for dirpath, _, filenames in os.walk(directory):
            subtype = os.path.basename(dirpath)
            for filename in filenames:
                # Skip anything that is not an image (case-insensitive match).
                if not filename.lower().endswith(image_extensions):
                    continue

                location = os.path.join(dirpath, filename)
                patient_id = get_patient_id(filename)

                if subtype not in label_encodings:
                    label_encodings[subtype] = next_label
                    next_label += 1
                label = label_encodings[subtype]
                writer.writerow([filename, location, subtype, patient_id, label, entry_type])

In [None]:
# Index the training patches. create_csv is called with its defaults here,
# i.e. output file 'patch_train.csv' and entry type "train".
directory_path_train = '/l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML/patched_train'
create_csv(directory_path_train)

In [None]:
# Index the validation patches. Their rows go to 'patch_test.csv' and are
# tagged "test" in the train/test column.
directory_path_val = '/l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML/patched_val'
create_csv(directory_path_val, output_file='patch_test.csv', entry_type="test")

In [5]:
# Load CSV files
# Read back the two per-split CSV files written by create_csv
df_train = pd.read_csv('patch_train.csv')
df_test = pd.read_csv('patch_test.csv')

# Stack train and test rows into one table with a fresh 0..n-1 index
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Count rows per patient and keep the IDs that occur in exactly one row
patient_id_counts = df_combined['Patient ID'].value_counts()
single_patient_ids = patient_id_counts[patient_id_counts == 1].index

# Append a second copy of each such row, so every patient ID ends up with
# at least two rows.
# NOTE(review): presumably needed by a later per-patient split -- confirm.
df_single_patients = df_combined[df_combined['Patient ID'].isin(single_patient_ids)]
df_combined = pd.concat([df_combined, df_single_patients], ignore_index=True)

# Write the combined table, including the duplicated rows, to a new CSV file
# (this always runs; the row index is not written)
df_combined.to_csv('patches_no_apml_reactive.csv', index=False)

# Show the resulting table
print(df_combined)

                       Filename  \
0      44-22-230-0278 - 1.1.png   
1      44-22-230-0278 - 1.2.png   
2      44-22-230-0278 - 1.3.png   
3      44-22-230-0278 - 1.4.png   
4      44-22-230-0278 - 1.5.png   
...                         ...   
11087           Image_705.1.png   
11088           Image_706.1.png   
11089           Image_708.1.png   
11090           Image_709.1.png   
11091           Image_710.1.png   

                                                Location Subtype  \
0      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
1      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
2      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
3      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
4      /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...     AML   
...                                                  ...     ...   
11087  /l/users/dawlat.akaila/DATA_MASKS/NO_REAC_APML...  NORMAL   
11088  /l/users/dawlat.akaila/DATA_