## Check if the extracted pdbs have the same ids of the csv final file

In [None]:
import pandas as pd
import os

# ==========================================
# 1. CONFIGURATION
# ==========================================
CSV_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/datasets/dataset_final_merged.csv"
ROOT_IMG_DIR = "/home/phd2/Documenti/embryo/marilena_videos/extracted_equatorial_frames"

# ==========================================
# 2. SCAN DIRECTORIES
# ==========================================
print(f"Scanning directory: {ROOT_IMG_DIR} ...")

# Set to store names of all folders found in the directory tree
found_folders = set()
folder_paths = {} # Dictionary to store ID -> Full Path mapping (optional but useful)

# os.walk yields a 3-tuple (dirpath, dirnames, filenames)
for root, dirs, files in os.walk(ROOT_IMG_DIR):
    for dirname in dirs:
        # We assume the directory name is the ID (e.g., D2017.09.15_S0770_I631_1)
        clean_dirname = dirname.strip()
        found_folders.add(clean_dirname)
        folder_paths[clean_dirname] = os.path.join(root, dirname)

print(f"Total folders found in filesystem: {len(found_folders)}")

# ==========================================
# 3. LOAD DATASET & MATCH
# ==========================================
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
    
    # Create a new column to indicate availability
    # We use boolean indexing to check membership in the set
    df['has_images'] = df['dish_well'].apply(lambda x: str(x).strip() in found_folders)
    
    # Optional: Store the path if found
    df['image_path'] = df['dish_well'].apply(lambda x: folder_paths.get(str(x).strip(), None))

    # ==========================================
    # 4. REPORT STATISTICS
    # ==========================================
    total_ids = len(df)
    found_count = df['has_images'].sum()
    missing_count = total_ids - found_count
    
    print("\n" + "="*40)
    print("MATCHING REPORT")
    print("="*40)
    print(f"Total IDs in CSV:       {total_ids}")
    print(f"IDs found in Folders:   {found_count}  ({(found_count/total_ids)*100:.1f}%)")
    print(f"IDs MISSING images:     {missing_count}  ({(missing_count/total_ids)*100:.1f}%)")
    print("="*40)
    
    # Show breakdown by Note (Outcome/Group)
    print("\nBreakdown by Group (Note):")
    print(df.groupby('Note')['has_images'].value_counts().unstack().fillna(0))
    
    # Show missing examples (first 5)
    if missing_count > 0:
        print("\n--- Example Missing IDs (First 5) ---")
        print(df[~df['has_images']]['dish_well'].head(5).to_string(index=False))
        
    # Show found examples (first 5)
    if found_count > 0:
        print("\n--- Example Found IDs (First 5) ---")
        print(df[df['has_images']]['dish_well'].head(5).to_string(index=False))

    # Optional: Save the report/updated CSV
    # df.to_csv(CSV_PATH.replace(".csv", "_checked.csv"), index=False)
    
else:
    print(f"Error: CSV file not found at {CSV_PATH}")

Scanning directory: /home/phd2/Documenti/embryo/marilena_videos/extracted_equatorial_frames ...
Total folders found in filesystem: 259

MATCHING REPORT
Total IDs in CSV:       204
IDs found in Folders:   204  (100.0%)
IDs MISSING images:     0  (0.0%)

Breakdown by Group (Note):
has_images        True
Note                  
GV_to_GV            36
GV_to_M1            12
GV_to_M2            47
M2_to_blasto        40
M2_to_no_blasto     20
rM2_to_blasto        8
rM2_to_no_blasto    41

--- Example Found IDs (First 5) ---
D2016.11.14_S1895_I106_6
D2016.11.14_S1895_I106_7
D2017.09.15_S0770_I631_2
D2017.09.15_S0770_I631_3
D2017.09.15_S0770_I631_4


In [10]:
# list all the folders not found
missing_folders = found_folders - set(df['dish_well'].astype(str).str.strip())
if missing_folders:
    print("\nFolders in filesystem not listed in CSV:")
    for folder in missing_folders:
        print(folder)
else:
    print("\nAll folders in filesystem are listed in the CSV.")

print("\n" + "="*40)

# list all the folders in the csv not found
missing_in_filesystem = set(df['dish_well'].astype(str).str.strip()) - found_folders
if missing_in_filesystem:
    print("\nIDs in CSV not found in filesystem:")
    for folder in missing_in_filesystem:
        print(folder)
else:
    print("\nAll IDs in CSV were found in the filesystem.\n")


Folders in filesystem not listed in CSV:
D2024.07.13_S01059_I4587_P_10
2016
D2020.06.01_S01605_I0631_D_12
D2020.06.01_S01605_I0631_D_4
D2024.07.10_S04123_I0057_P_4
D2021.09.22_S00966_I0758_D_11
D2021.09.22_S00966_I0758_D_10
D2017.09.16_S0772_I631_7
2024
D2021.09.22_S00966_I0758_D_9
D2024.07.13_S01059_I4587_P_9
2019
D2018.06.12_S01044_I0631_D_1
D2024.07.13_S01059_I4587_P_6
D2024.07.13_S01059_I4587_P_5
D2021.07.10_S00925_I3026_P_6
D2024.12.20_S04300_I0057_P_5
2021
D2021.09.21_S01044_I3026_P_7
D2021.09.21_S01044_I3026_P_12
D2020.06.01_S01605_I0631_D_9
2017
D2018.06.11_S01043_I0631_D_10
D2017.09.16_S0772_I631_2
D2018.06.11_S01043_I0631_D_9
D2021.07.10_S00925_I3026_P_8
D2020.06.01_S01605_I0631_D_3
D2024.07.13_S01059_I4587_P_8
D2021.07.10_S00925_I3026_P_7
D2021.09.22_S00966_I0758_D_1
D2021.09.22_S00966_I0758_D_2
D2016.11.15_S0516_I631_3
2020
D2020.06.01_S01605_I0631_D_11
D2021.09.22_S00966_I0758_D_3
D2018.06.12_S01044_I0631_D_2
D2021.07.10_S00925_I3026_P_11
2018
D2024.12.20_S04300_I0057_P_6