## Check if the extracted pdbs have the same ids of the csv final file

In [3]:
import pandas as pd
import os

# ==========================================
# 1. CONFIGURATION
# ==========================================
CSV_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged.csv"
ROOT_IMG_DIR = "/home/phd2/Documenti/embryo/valencia/extracted_equatorial_frames"

# ==========================================
# 2. SCAN DIRECTORIES
# ==========================================
print(f"Scanning directory: {ROOT_IMG_DIR} ...")

# Set to store names of all folders found in the directory tree
found_folders = set()
folder_paths = {} # Dictionary to store ID -> Full Path mapping (optional but useful)

# os.walk yields a 3-tuple (dirpath, dirnames, filenames)
for root, dirs, files in os.walk(ROOT_IMG_DIR):
    for dirname in dirs:
        # We assume the directory name is the ID (e.g., D2017.09.15_S0770_I631_1)
        clean_dirname = dirname.strip()
        found_folders.add(clean_dirname)
        folder_paths[clean_dirname] = os.path.join(root, dirname)

print(f"Total folders found in filesystem: {len(found_folders)}")

# ==========================================
# 3. LOAD DATASET & MATCH
# ==========================================
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
    
    # Create a new column to indicate availability
    # We use boolean indexing to check membership in the set
    df['has_images'] = df['dish_well'].apply(lambda x: str(x).strip() in found_folders)
    
    # Optional: Store the path if found
    df['image_path'] = df['dish_well'].apply(lambda x: folder_paths.get(str(x).strip(), None))

    # ==========================================
    # 4. REPORT STATISTICS
    # ==========================================
    total_ids = len(df)
    found_count = df['has_images'].sum()
    missing_count = total_ids - found_count
    
    print("\n" + "="*40)
    print("MATCHING REPORT")
    print("="*40)
    print(f"Total IDs in CSV:       {total_ids}")
    print(f"IDs found in Folders:   {found_count}  ({(found_count/total_ids)*100:.1f}%)")
    print(f"IDs MISSING images:     {missing_count}  ({(missing_count/total_ids)*100:.1f}%)")
    print("="*40)
    
    # Show breakdown by Note (Outcome/Group)
    print("\nBreakdown by Group (Note):")
    print(df.groupby('Note')['has_images'].value_counts().unstack().fillna(0))
    
    # Show missing examples (first 5)
    if missing_count > 0:
        print("\n--- Example Missing IDs (First 5) ---")
        print(df[~df['has_images']]['dish_well'].head(5).to_string(index=False))
        
    # Show found examples (first 5)
    if found_count > 0:
        print("\n--- Example Found IDs (First 5) ---")
        print(df[df['has_images']]['dish_well'].head(5).to_string(index=False))

    # Optional: Save the report/updated CSV
    # df.to_csv(CSV_PATH.replace(".csv", "_checked.csv"), index=False)
    
else:
    print(f"Error: CSV file not found at {CSV_PATH}")

Scanning directory: /home/phd2/Documenti/embryo/valencia/extracted_equatorial_frames ...
Total folders found in filesystem: 211

MATCHING REPORT
Total IDs in CSV:       204
IDs found in Folders:   159  (77.9%)
IDs MISSING images:     45  (22.1%)

Breakdown by Group (Note):
has_images        False  True 
Note                          
GV_to_GV              7     29
GV_to_M1              4      8
GV_to_M2             11     36
M2_to_blasto         10     30
M2_to_no_blasto       4     16
rM2_to_blasto         2      6
rM2_to_no_blasto      7     34

--- Example Missing IDs (First 5) ---
     D2016.11.14_S1895_I106_6
     D2016.11.14_S1895_I106_7
 D2018.12.21_S02510_I0106_D_1
 D2018.12.21_S02510_I0106_D_2
D2019.02.15_S012161_I0631_D_1

--- Example Found IDs (First 5) ---
    D2017.09.15_S0770_I631_2
    D2017.09.15_S0770_I631_3
    D2017.09.15_S0770_I631_4
D2018.09.18_S01710_I0406_D_1
D2018.09.18_S01710_I0406_D_2


In [4]:
import pandas as pd
import os

# ==========================================
# 1. SETUP
# ==========================================
CSV_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged.csv"
ROOT_IMG_DIR = "/home/phd2/Documenti/embryo/valencia/extracted_equatorial_frames"

print("--- DIAGNOSTIC MODE ---")

# 1. Load Folder Names (Raw)
disk_folders = set()
for root, dirs, files in os.walk(ROOT_IMG_DIR):
    for d in dirs:
        disk_folders.add(d)

# 2. Load CSV IDs
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
    csv_ids = df['dish_well'].dropna().astype(str).tolist()
    
    # 3. Find Missing
    # We check for EXACT match first
    missing_ids = [i for i in csv_ids if i not in disk_folders]
    
    print(f"\nTotal Folders on Disk: {len(disk_folders)}")
    print(f"Total IDs in CSV:      {len(csv_ids)}")
    print(f"Missing IDs:           {len(missing_ids)}")
    
    if len(missing_ids) > 0:
        print("\n" + "="*50)
        print("MISMATCH ANALYSIS")
        print("="*50)
        
        # Take the first missing ID to analyze
        target_id = missing_ids[0]
        print(f"analyzing first missing ID: '{target_id}'")
        print(f"(Length: {len(target_id)} chars)")
        
        # 4. Search for 'Close Suspects' on Disk
        # We try to find this ID inside the folder names, or vice versa, or by sample code
        # Extract a core chunk (e.g., the Sample ID part 'S1895')
        import re
        # Try to find 'S' followed by 4 digits
        match = re.search(r'(S\d{4,5})', target_id)
        if match:
            core_pattern = match.group(1)
            print(f"\nSearching disk for folders containing '{core_pattern}'...")
            suspects = [f for f in disk_folders if core_pattern in f]
            
            if suspects:
                print(f"Found {len(suspects)} potential matches on disk. Comparing:")
                print(f"{'CSV EXPECTS':<40} | {'ACTUAL FOLDER ON DISK'}")
                print("-" * 80)
                for s in suspects[:5]: # Show top 5
                    print(f"'{target_id}'\n   vs\n'{s}'")
                    print("-" * 30)
                    
                    # Check for common invisible issues
                    if target_id.strip() == s.strip() and target_id != s:
                        print("   -> DETECTED: Whitespace mismatch (check spaces at start/end)")
                    elif target_id.lower() == s.lower() and target_id != s:
                        print("   -> DETECTED: Case mismatch (Upper/Lower case)")
                    elif target_id.replace('.', '-') == s or target_id == s.replace('.', '-'):
                        print("   -> DETECTED: Date separator mismatch (. vs -)")
            else:
                print("No folders found with that Sample ID. The folder might use a completely different naming convention.")
        
        print("\n" + "="*50)
        print("RAW STRING DUMP (Check for spaces)")
        print("="*50)
        print("First 3 Missing IDs from CSV:")
        for i in missing_ids[:3]:
            print(f"  '{i}'")
            
        print("\nFirst 3 Folders found on Disk:")
        for i in list(disk_folders)[:3]:
            print(f"  '{i}'")
            
else:
    print("CSV not found.")

--- DIAGNOSTIC MODE ---

Total Folders on Disk: 211
Total IDs in CSV:      204
Missing IDs:           45

MISMATCH ANALYSIS
analyzing first missing ID: 'D2016.11.14_S1895_I106_6'
(Length: 24 chars)

Searching disk for folders containing 'S1895'...
No folders found with that Sample ID. The folder might use a completely different naming convention.

RAW STRING DUMP (Check for spaces)
First 3 Missing IDs from CSV:
  'D2016.11.14_S1895_I106_6'
  'D2016.11.14_S1895_I106_7'
  'D2018.12.21_S02510_I0106_D_1'

First 3 Folders found on Disk:
  'D2021.09.21_S01044_I3026_P_2'
  'D2020.06.01_S01605_I0631_D_5'
  '2022'


In [8]:
import pandas as pd
import os

# ==========================================
# CONFIG
# ==========================================
CSV_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged.csv"
TARGET_DIR_PARENT = "/home/phd2/Documenti/embryo/valencia/extracted_equatorial_frames/2018"
TARGET_FOLDER_NAME = "D2018.12.21_S02510_I0106_D_1"

print("--- DEEP DEBUGGING: D2018.12.21_S02510_I0106_D_1 ---")

# 1. CHECK FILESYSTEM VISIBILITY
full_target_path = os.path.join(TARGET_DIR_PARENT, TARGET_FOLDER_NAME)
print(f"\n1. Checking Filesystem Path: {full_target_path}")
if os.path.exists(full_target_path):
    print("   [OK] Path exists on disk.")
    
    # Get the actual string from the OS to compare bytes
    try:
        actual_folders = os.listdir(TARGET_DIR_PARENT)
        # Find the one that looks like our target
        fs_name = next(f for f in actual_folders if TARGET_FOLDER_NAME in f)
        print(f"   [INFO] Name read from disk: '{fs_name}'")
    except StopIteration:
        print("   [ERROR] Path exists but os.listdir() couldn't find the entry. Permission issue?")
        fs_name = "ERROR"
else:
    print("   [FAIL] Path does NOT exist. Check typos in parent path.")
    fs_name = "ERROR"

# 2. CHECK CSV DATA
print(f"\n2. Checking CSV Entry in: {CSV_PATH}")
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
    # Filter for the target
    # We use 'str.contains' to be loose first, just to find the row
    subset = df[df['dish_well'].astype(str).str.contains("S0770_I631_3", na=False)]
    
    if len(subset) > 0:
        csv_name = subset.iloc[0]['dish_well']
        print(f"   [OK] Found row in CSV. ID: '{csv_name}'")
    else:
        print("   [FAIL] Could not find this ID in the CSV at all.")
        csv_name = "ERROR"
else:
    print("   [FAIL] CSV file missing.")
    csv_name = "ERROR"

# 3. BYTE-LEVEL COMPARISON
if fs_name != "ERROR" and csv_name != "ERROR":
    print("\n3. BYTE-LEVEL COMPARISON")
    print("   We will print the ASCII/Unicode value of every character.")
    print("   If they differ, you will see it here.")
    
    print(f"\n   {'Index':<5} | {'CSV Char':<10} {'Code':<10} || {'DISK Char':<10} {'Code':<10} | {'Match?'}")
    print("   " + "-"*70)
    
    max_len = max(len(csv_name), len(fs_name))
    
    for i in range(max_len):
        # CSV Char
        if i < len(csv_name):
            c_char = csv_name[i]
            c_code = ord(c_char)
            c_disp = repr(c_char)
        else:
            c_char, c_code, c_disp = "END", "", ""
            
        # Disk Char
        if i < len(fs_name):
            d_char = fs_name[i]
            d_code = ord(d_char)
            d_disp = repr(d_char)
        else:
            d_char, d_code, d_disp = "END", "", ""
            
        match = "OK" if c_char == d_char else "MISMATCH <---"
        print(f"   {i:<5} | {c_disp:<10} {c_code:<10} || {d_disp:<10} {d_code:<10} | {match}")

    if csv_name == fs_name:
        print("\n   [RESULT] Strings are IDENTICAL. If logic failed, it was the search algorithm, not the strings.")
    else:
        print("\n   [RESULT] Strings are DIFFERENT. See the mismatch arrow above.")
        print("   Common causes: 160 (Non-breaking space) vs 32 (Space), or hidden BOM markers.")

--- DEEP DEBUGGING: D2018.12.21_S02510_I0106_D_1 ---

1. Checking Filesystem Path: /home/phd2/Documenti/embryo/valencia/extracted_equatorial_frames/2018/D2018.12.21_S02510_I0106_D_1
   [FAIL] Path does NOT exist. Check typos in parent path.

2. Checking CSV Entry in: /home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged.csv
   [OK] Found row in CSV. ID: 'D2017.09.15_S0770_I631_3'


In [10]:
import pandas as pd
import os

# ==========================================
# 1. CONFIGURATION
# ==========================================
CSV_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged.csv"
ROOT_IMG_DIR = "/home/phd2/Documenti/embryo/valencia/extracted_equatorial_frames"

print(f"Scanning directory: {ROOT_IMG_DIR}")

# ==========================================
# 2. ROBUST FILESYSTEM SCAN
# ==========================================
# We collect specific details to debug matches
found_data = []

for root, dirs, files in os.walk(ROOT_IMG_DIR):
    for dirname in dirs:
        # We store the cleaned name AND the full path
        clean_name = dirname.strip()
        full_path = os.path.join(root, dirname)
        found_data.append({
            'filesystem_id': clean_name,
            'image_path': full_path
        })

# Create a DataFrame of what is actually on the disk
df_files = pd.DataFrame(found_data)
print(f"Folders found on disk: {len(df_files)}")

# ==========================================
# 3. LOAD & MERGE
# ==========================================
if os.path.exists(CSV_PATH):
    df_data = pd.read_csv(CSV_PATH)
    
    # Ensure ID column is string and stripped of whitespace
    df_data['dish_well'] = df_data['dish_well'].astype(str).str.strip()
    
    # Check for duplicates in CSV (which might skew counts)
    if df_data['dish_well'].duplicated().any():
        print(f"Note: {df_data['dish_well'].duplicated().sum()} duplicate IDs found in CSV.")

    # --- THE MERGE ---
    # Left join: Keep all CSV rows, add path if found
    df_merged = pd.merge(
        df_data, 
        df_files, 
        left_on='dish_well', 
        right_on='filesystem_id', 
        how='left'
    )
    
    # Define 'has_images' based on whether the path is NaN or not
    df_merged['has_images'] = df_merged['image_path'].notna()

    # ==========================================
    # 4. VERIFICATION OF TARGET ID
    # ==========================================
    target_id = "D2018.12.21_S02510_I0106_D_1"
    print("\n" + "="*40)
    print(f"VERIFYING TARGET: {target_id}")
    print("="*40)
    
    # Check if this ID exists in the FILES dataframe
    file_check = df_files[df_files['filesystem_id'] == target_id]
    if not file_check.empty:
        print(f"[DISK] Found in df_files! Path: {file_check.iloc[0]['image_path']}")
    else:
        print(f"[DISK] NOT found in df_files. (This would be very strange given the previous debug)")

    # Check the merge result
    row = df_merged[df_merged['dish_well'] == target_id]
    if not row.empty:
        status = row.iloc[0]['has_images']
        path = row.iloc[0]['image_path']
        print(f"[MERGE] Final Status: {'FOUND' if status else 'MISSING'}")
        print(f"[MERGE] Path mapped: {path}")
    else:
        print("[MERGE] ID not found in dataset dataframe.")

    # ==========================================
    # 5. FINAL REPORT
    # ==========================================
    found_count = df_merged['has_images'].sum()
    missing_count = len(df_merged) - found_count
    
    print("\n" + "="*40)
    print("FINAL ACCURATE REPORT")
    print("="*40)
    print(f"Total IDs:      {len(df_merged)}")
    print(f"Found:          {found_count} ({(found_count/len(df_merged))*100:.1f}%)")
    print(f"Missing:        {missing_count}")
    
    if missing_count > 0:
        print("\n--- Genuine Missing IDs (First 5) ---")
        print(df_merged[~df_merged['has_images']]['dish_well'].head(5).to_string(index=False))

    # Save
    save_path = CSV_PATH.replace(".csv", "_with_paths.csv")
    
    # Clean up output (remove the helper column 'filesystem_id')
    df_final = df_merged.drop(columns=['filesystem_id'])
    df_final.to_csv(save_path, index=False)
    print(f"\nSaved updated dataset to: {save_path}")

else:
    print("CSV not found.")

Scanning directory: /home/phd2/Documenti/embryo/valencia/extracted_equatorial_frames
Folders found on disk: 211

VERIFYING TARGET: D2018.12.21_S02510_I0106_D_1
[DISK] NOT found in df_files. (This would be very strange given the previous debug)
[MERGE] Final Status: MISSING
[MERGE] Path mapped: nan

FINAL ACCURATE REPORT
Total IDs:      204
Found:          159 (77.9%)
Missing:        45

--- Genuine Missing IDs (First 5) ---
     D2016.11.14_S1895_I106_6
     D2016.11.14_S1895_I106_7
 D2018.12.21_S02510_I0106_D_1
 D2018.12.21_S02510_I0106_D_2
D2019.02.15_S012161_I0631_D_1

Saved updated dataset to: /home/phd2/Scrivania/CorsoRepo/embryo_valencia/dataset_final_merged_with_paths.csv


In [11]:
# print all missing IDs from the previous analysis
if 'df_merged' in locals():
    missing_ids = df_merged[~df_merged['has_images']]['dish_well'].tolist()
    print("\n--- COMPLETE LIST OF MISSING IDs ---")
    for mid in missing_ids:
        print(f"  '{mid}'")


--- COMPLETE LIST OF MISSING IDs ---
  'D2016.11.14_S1895_I106_6'
  'D2016.11.14_S1895_I106_7'
  'D2018.12.21_S02510_I0106_D_1'
  'D2018.12.21_S02510_I0106_D_2'
  'D2019.02.15_S012161_I0631_D_1'
  'D2019.02.15_S012161_I0631_D_2'
  'D2019.02.15_S012161_I0631_D_5'
  'D2019.02.15_S012161_I0631_D_6'
  'D2020.08.03_S02162_I0406_D_1'
  'D2020.08.03_S02162_I0406_D_5'
  'D2020.08.03_S02162_I0406_D_7'
  'D2018.12.21_S02510_I0106_D_4'
  'D2019.02.15_S012161_I0631_D_3'
  'D2020.08.03_S02162_I0406_D_3'
  'D2020.08.03_S02162_I0406_D_8'
  'D2016.11.14_S1895_I106_8'
  'D2016.11.14_S1895_I106_9'
  'D2018.12.21_S02510_I0106_D_3'
  'D2019.02.15_S012161_I0631_D_4'
  'D2020.08.03_S02162_I0406_D_2'
  'D2020.08.03_S02162_I0406_D_4'
  'D2020.08.03_S02162_I0406_D_6'
  'D2016.11.15_S0516_I631_1'
  'D2016.11.15_S0516_I631_2'
  'D2020.08.03_S00395_I3026_P_1'
  'D2020.08.03_S00395_I3026_P_2'
  'D2020.08.03_S00395_I3026_P_3'
  'D2020.08.03_S00395_I3026_P_4'
  'D2020.08.03_S00395_I3026_P_5'
  'D2024.07.13_S01059_I