# Preprocessing Immagini Blastocisti 

**Caratteristiche principali:**
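- Rilevamento della root `cellPIV` e import di `config`: se l'import fallisce, `conf` viene impostato a `None`, ma la cella dei parametri richiede comunque i percorsi definiti in `config` e solleva un errore se mancano.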
- Funzione per organizzare i video per classificazione (`blasto` / `no_blasto`).
- Verifica immagini corrotte con log e riepilogo.
- Funzione per tentare di correggere JPEG troncati.
- Opzioni `DRY_RUN` e `FIX_IMAGES` per evitare modifiche involontarie ai file.

**Istruzioni d'uso:** esegui le celle nell'ordine. Imposta le opzioni (SRC_DIR, DEST_DIR, EXCEL_PATH, DRY_RUN, FIX_IMAGES) nella cella dei parametri prima di lanciare le operazioni.

In [27]:
%matplotlib inline
import os, sys, shutil, logging, time
from pathlib import Path
import pandas as pd
from PIL import Image, ImageFile

# Configure root logging for the notebook: timestamp - level - message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Walk up from the working directory (at most 10 levels) looking for the
# repository folder named 'cellPIV'; stop early at the filesystem root.
cwd = Path().resolve()
root = cwd
for _ in range(10):
    if root.name == 'cellPIV' or root.parent == root:
        break
    root = root.parent

# Machine-specific fallback used when the notebook is not run from inside
# the repository tree.
fallback_root = Path('/home/phd2/Scrivania/CorsoRepo/cellPIV')
if root.name != 'cellPIV':
    root = fallback_root

if not root.is_dir():
    logging.warning("Repository root %s does not exist; the config import will likely fail", root)

# Make the repository importable so that `config` can be found.
if str(root) not in sys.path:
    sys.path.append(str(root))

try:
    from config import Config_00_preprocessing as conf
except Exception as e:
    # Keep the notebook usable without the config module, but report why the
    # import failed so that a broken config is not mistaken for a missing one.
    conf = None
    print("Could not import conf (ok). Please set paths manually if needed.")
    print(f"Import failure reason: {type(e).__name__}: {e}")
else:
    print("Imported conf from config.Config_00_preprocessing")

print("Using root:", root)


Imported conf from config.Config_00_preprocessing
Using root: /home/phd2/Scrivania/CorsoRepo/cellPIV


In [28]:
from pathlib import Path
def _conf_path(attr_name):
    """Return ``conf.<attr_name>`` as a Path.

    Raises ValueError when ``conf`` is unavailable or lacks the attribute.
    """
    if not conf or not hasattr(conf, attr_name):
        raise ValueError(f"conf.{attr_name} not set")
    return Path(getattr(conf, attr_name))


# All three paths come from the project configuration; fail early with one
# clear error if any of them cannot be resolved.
try:
    SRC_DIR = _conf_path('dest_dir_time_conversion')
    DEST_DIR = _conf_path('dest_dir_blastoData')
    EXCEL_PATH = _conf_path('filtered_blasto_dataset')
except Exception as e:
    raise RuntimeError(f"Error setting SRC_DIR, DEST_DIR or EXCEL_PATH: {e}") from e

# Run options read by the cells below.
DRY_RUN = False        # passed as dry_run to organize_videos_by_classification
FIX_IMAGES = True      # gates the image-fixing cell
SAVE_LOGS = True       # gates writing of log/summary files
MAX_FIX_FILES = None   # passed as max_files to fix_images_in_directory (None = no limit)
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.gif')

print("SRC_DIR =", SRC_DIR)
print("DEST_DIR =", DEST_DIR)
print("EXCEL_PATH =", EXCEL_PATH)
print("DRY_RUN =", DRY_RUN, "FIX_IMAGES =", FIX_IMAGES, "SAVE_LOGS =", SAVE_LOGS)


SRC_DIR = /home/phd2/Scrivania/CorsoData/ScopeData_time_conversion
DEST_DIR = /home/phd2/Scrivania/CorsoData/blastocisti
EXCEL_PATH = /home/phd2/Scrivania/CorsoRepo/cellPIV/datasets/filtered_blasto_dataset.csv
DRY_RUN = False FIX_IMAGES = True SAVE_LOGS = True


In [29]:
from pathlib import Path
import shutil
import logging
import pandas as pd

def _first_free_path(dest_path, keep_suffix):
    """Return ``dest_path`` if unused, else the first unused ``_dupN`` variant.

    With ``keep_suffix`` the marker is inserted before the file extension
    (``name_dup1.ext``); otherwise it is appended to the full name.
    """
    if not dest_path.exists():
        return dest_path
    n = 1
    while True:
        if keep_suffix:
            candidate = dest_path.with_name(f"{dest_path.stem}_dup{n}{dest_path.suffix}")
        else:
            candidate = dest_path.with_name(f"{dest_path.name}_dup{n}")
        if not candidate.exists():
            return candidate
        n += 1


def _locate_video(src_dir, video_name):
    """Return ``(path, found)`` for a video, trying ``src/<year>/<name>`` then ``src/<name>``.

    The year is taken from characters 1-4 of the first ``_``-separated token
    (e.g. ``D2013.02.19_...`` -> ``2013``). When neither layout exists, the
    year-based path is returned so it can be reported.
    """
    year = video_name.split("_")[0][1:5]
    by_year = Path(src_dir) / year / video_name
    if by_year.exists():
        return by_year, True
    flat = Path(src_dir) / video_name
    if flat.exists():
        return flat, True
    return by_year, False


def organize_videos_by_classification(src_dir, dest_dir, excel_path, dry_run=True, save_logs=True):
    """Copy each listed video into ``dest_dir/blasto`` or ``dest_dir/no_blasto``.

    The table at ``excel_path`` (read with ``read_excel`` for ``.xls``/``.xlsx``,
    otherwise ``read_csv``) must contain the columns ``dish_well`` and
    ``blasto ny`` (matched case-insensitively). A label equal to 1 goes to
    ``blasto``; any other integer goes to ``no_blasto``.

    Args:
        src_dir: Root of the source videos.
        dest_dir: Root under which ``blasto`` / ``no_blasto`` are created.
        excel_path: Path to the label table.
        dry_run: When True nothing is copied and no class folder is created;
            the planned operations are only logged.
        save_logs: When True, write ``videos_not_moved.txt`` into ``dest_dir``
            (this creates ``dest_dir`` even in dry-run mode).

    Returns:
        dict with ``moved_count`` (entries copied, or that would be copied in
        dry-run mode), ``not_moved`` (reasons for skipped entries) and
        ``errors`` (messages of copy failures).

    Raises:
        RuntimeError: the table is missing or cannot be read.
        ValueError: a required column is missing.
    """
    stats = {'moved_count': 0, 'not_moved': [], 'errors': []}
    blasto_dir = Path(dest_dir) / "blasto"
    no_blasto_dir = Path(dest_dir) / "no_blasto"
    if not dry_run:
        blasto_dir.mkdir(parents=True, exist_ok=True)
        no_blasto_dir.mkdir(parents=True, exist_ok=True)

    try:
        if not Path(excel_path).exists():
            raise FileNotFoundError(f"Excel file {excel_path} does not exist")
        if str(excel_path).lower().endswith(('.xls', '.xlsx')):
            df = pd.read_excel(excel_path)
        else:
            df = pd.read_csv(excel_path)
    except Exception as e:
        raise RuntimeError(f"Could not read excel {excel_path}: {e}") from e

    # Normalise column names to lower case before looking them up.
    df.columns = df.columns.str.lower()
    required_columns = ['dish_well', 'blasto ny']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Excel must contain all of the following columns: {required_columns}")

    relevant = df[required_columns]

    for raw_name, raw_label in zip(relevant['dish_well'], relevant['blasto ny']):
        video_name = str(raw_name).strip()

        try:
            classification = int(raw_label)
        except (TypeError, ValueError, OverflowError):
            stats['not_moved'].append(f"{video_name}: invalid classification")
            continue

        # An empty name would resolve to src_dir itself and copy the whole tree.
        if not video_name:
            stats['not_moved'].append("<empty dish_well>: no video name")
            continue

        video_path, found = _locate_video(src_dir, video_name)
        if not found:
            stats['not_moved'].append(f"{video_name} not found at {video_path}")
            continue

        # NOTE(review): every label other than 1 is treated as no_blasto -
        # confirm that the column only ever holds 0/1.
        target_dir = blasto_dir if classification == 1 else no_blasto_dir
        dest_path = target_dir / video_path.name

        try:
            if video_path.is_dir():
                dest_path = _first_free_path(dest_path, keep_suffix=False)
                if dry_run:
                    logging.info("[DRY_RUN] Would copytree: %s -> %s", video_path, dest_path)
                else:
                    shutil.copytree(str(video_path), str(dest_path))
                    logging.info("Copied directory: %s -> %s", video_path, dest_path)
            elif video_path.is_file():
                dest_path = _first_free_path(dest_path, keep_suffix=True)
                if dry_run:
                    logging.info("[DRY_RUN] Would copy2: %s -> %s", video_path, dest_path)
                else:
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(str(video_path), str(dest_path))
                    logging.info("Copied file: %s -> %s", video_path, dest_path)
            else:
                # Neither a regular file nor a directory: report it and do
                # not count it as copied.
                stats['not_moved'].append(f"{video_name}: exists but not file/dir ({video_path})")
                continue

            stats['moved_count'] += 1

        except (IsADirectoryError, PermissionError) as e:
            msg = f"{type(e).__name__} processing {video_path}: {e}"
            logging.warning(msg)
            stats['errors'].append(msg)
        except Exception as e:
            # Best effort: one failing entry must not abort the whole run.
            msg = f"Error processing {video_name}: {e}"
            logging.warning(msg)
            stats['errors'].append(msg)

    if save_logs:
        dest_dirp = Path(dest_dir)
        dest_dirp.mkdir(parents=True, exist_ok=True)
        log_path = dest_dirp / "videos_not_moved.txt"
        with open(log_path, 'w', encoding='utf-8') as f:
            f.write("Video non spostati e motivazioni:\n\n")
            f.write("\n".join(stats['not_moved']))
            f.write(f"\n\nTotale video processati (tentativi): {len(relevant)}\n")
            f.write(f"Totale video copiati (counted moves): {stats['moved_count']}\n")
            if stats['errors']:
                f.write("\nErrors:\n")
                f.write("\n".join(stats['errors']))
    return stats


def check_for_corrupted_images(root_dir, log_file_path=None, image_extensions=IMAGE_EXTENSIONS):
    """Verify every image below each first-level subfolder of ``root_dir``.

    Files lying directly in ``root_dir`` are not scanned; only its
    subdirectories are walked (recursively). Each image is opened with Pillow
    and checked with ``Image.verify()``; any exception marks it as corrupted.

    NOTE(review): ``verify()`` does not decode the pixel data, so a truncated
    JPEG may still pass this check - confirm whether a full ``load()`` is
    needed for this dataset.

    Args:
        root_dir: Directory whose subfolders are scanned.
        log_file_path: Optional path of a text report to write.
        image_extensions: Lower-case suffixes (tuple) identifying image files.

    Returns:
        dict with ``total_images``, ``corrupted_images``,
        ``corrupted_folders_count``, ``corrupted_folders``, ``per_folder``
        (``{folder: {'total', 'corrupted'}}``) and ``corrupted_files``
        (paths of the files that failed verification).
    """
    root_dir = Path(root_dir)
    corrupted_folders = []
    corrupted_files = []
    total_images = 0
    per_folder = {}

    for folder_name in sorted(os.listdir(root_dir)):
        folder_path = root_dir / folder_name
        if not folder_path.is_dir():
            continue
        folder_total = 0
        folder_corrupted_count = 0
        for dirpath, _, files in os.walk(folder_path):
            for file_name in files:
                if not file_name.lower().endswith(image_extensions):
                    continue
                folder_total += 1
                image_path = Path(dirpath) / file_name
                try:
                    with Image.open(image_path) as img:
                        img.verify()
                except Exception as e:
                    # Record which file failed and why, so the report is
                    # actionable instead of a bare count.
                    folder_corrupted_count += 1
                    corrupted_files.append(str(image_path))
                    logging.warning("Corrupted image %s: %s", image_path, e)
        total_images += folder_total
        per_folder[folder_name] = {'total': folder_total, 'corrupted': folder_corrupted_count}
        if folder_corrupted_count:
            corrupted_folders.append(folder_name)

    corrupted_images = len(corrupted_files)
    summary = {
        'total_images': total_images,
        'corrupted_images': corrupted_images,
        'corrupted_folders_count': len(corrupted_folders),
        'corrupted_folders': corrupted_folders,
        'per_folder': per_folder,
        'corrupted_files': corrupted_files,
    }
    if log_file_path:
        with open(log_file_path, 'w', encoding='utf-8') as lf:
            lf.write("Verifica immagini corrotte:\n\n")
            for folder, vals in per_folder.items():
                lf.write(f"Folder: {folder} -> total: {vals['total']} corrupted: {vals['corrupted']}\n")
            lf.write("\n--- Riassunto ---\n")
            lf.write(f"Totale immagini controllate: {total_images}\n")
            lf.write(f"Totale immagini corrotte: {corrupted_images}\n")
            lf.write(f"Cartelle con immagini corrotte: {len(corrupted_folders)}\n")
            lf.write(f"Cartelle corrotte: {', '.join(corrupted_folders) if corrupted_folders else 'Nessuna'}\n")
            if corrupted_files:
                lf.write("\nFile corrotti:\n")
                lf.write("\n".join(corrupted_files))
                lf.write("\n")
    return summary

def fix_truncated_jpeg(file_path):
    """Repair a truncated JPEG in place, leaving intact files untouched.

    The file is first decoded strictly. If that succeeds it is already usable
    and is NOT rewritten, which avoids a needless lossy re-encode. Otherwise
    it is decoded with ``ImageFile.LOAD_TRUNCATED_IMAGES`` enabled, re-saved
    as JPEG to a temporary sibling file and swapped over the original, so a
    failed save never destroys the source file.

    The global ``ImageFile.LOAD_TRUNCATED_IMAGES`` flag is restored to its
    previous value before returning.

    Args:
        file_path: Path (str or Path) of the JPEG file.

    Returns:
        True if the file is readable after the call (already intact or
        successfully repaired), False if it could not be repaired.
    """
    target = Path(file_path)
    tmp_path = target.with_name(target.name + ".fixing")
    previous_flag = ImageFile.LOAD_TRUNCATED_IMAGES
    try:
        # Strict pass: decode the whole image without tolerating truncation.
        ImageFile.LOAD_TRUNCATED_IMAGES = False
        needs_repair = False
        try:
            with Image.open(target) as img:
                img.load()
        except Exception:
            needs_repair = True
        if not needs_repair:
            return True

        # Tolerant pass: decode what is available and write a complete file.
        ImageFile.LOAD_TRUNCATED_IMAGES = True
        with Image.open(target) as img:
            img.load()
            img.save(tmp_path, "JPEG")
        tmp_path.replace(target)
        return True
    except Exception as e:
        logging.warning(f"Could not fix {file_path}: {e}")
        return False
    finally:
        ImageFile.LOAD_TRUNCATED_IMAGES = previous_flag
        # Remove a leftover temporary file from a failed save, if any.
        try:
            tmp_path.unlink()
        except OSError:
            pass

def fix_images_in_directory(root_path, image_extensions=('.jpg',), max_files=None):
    """Run ``fix_truncated_jpeg`` on every matching file below ``root_path``.

    Args:
        root_path: Directory walked recursively.
        image_extensions: Suffix or iterable of suffixes, matched
            case-insensitively against file names.
        max_files: Optional cap on the number of files attempted
            (None means no limit).

    Returns:
        dict with ``attempted`` (files passed to the fixer), ``fixed`` (files
        for which the fixer returned True) and ``errors`` (paths of the files
        for which it returned False).
    """
    # str.endswith only accepts a str or a tuple; normalise other iterables.
    if isinstance(image_extensions, str):
        suffixes = (image_extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in image_extensions)

    fixed = 0
    attempted = 0
    errors = []
    for dirpath, _, files in os.walk(root_path):
        for file in files:
            if max_files is not None and attempted >= max_files:
                return {'attempted': attempted, 'fixed': fixed, 'errors': errors}
            if not file.lower().endswith(suffixes):
                continue
            attempted += 1
            fp = Path(dirpath) / file
            if fix_truncated_jpeg(fp):
                fixed += 1
            else:
                # Keep track of the failures so the caller can inspect them.
                errors.append(str(fp))
    return {'attempted': attempted, 'fixed': fixed, 'errors': errors}


In [30]:
# Run the copy step with the options from the parameters cell, then report.
print("--- Organizzazione video per classificazione ---")
print("DRY_RUN =", DRY_RUN)
stats = organize_videos_by_classification(
    SRC_DIR,
    DEST_DIR,
    EXCEL_PATH,
    dry_run=DRY_RUN,
    save_logs=SAVE_LOGS,
)
print("Risultati organizzazione:")
print(f"  Moved count (attempts counted): {stats.get('moved_count')}")
print(f"  Not moved examples (first 20):\n{stats.get('not_moved')[:20]}")
# Show at most the first ten error messages, if there are any.
error_messages = stats.get('errors')
if error_messages:
    print("  Errors:")
    for e in error_messages[:10]:
        print("   ", e)


--- Organizzazione video per classificazione ---
DRY_RUN = False


2025-09-02 13:42:35,744 - INFO - Copied directory: /home/phd2/Scrivania/CorsoData/ScopeData_time_conversion/2013/D2013.02.19_S0675_I141_1 -> /home/phd2/Scrivania/CorsoData/blastocisti/blasto/D2013.02.19_S0675_I141_1
2025-09-02 13:42:35,823 - INFO - Copied directory: /home/phd2/Scrivania/CorsoData/ScopeData_time_conversion/2013/D2013.02.19_S0675_I141_2 -> /home/phd2/Scrivania/CorsoData/blastocisti/blasto/D2013.02.19_S0675_I141_2
2025-09-02 13:42:35,912 - INFO - Copied directory: /home/phd2/Scrivania/CorsoData/ScopeData_time_conversion/2013/D2013.02.19_S0675_I141_3 -> /home/phd2/Scrivania/CorsoData/blastocisti/no_blasto/D2013.02.19_S0675_I141_3
2025-09-02 13:42:36,023 - INFO - Copied directory: /home/phd2/Scrivania/CorsoData/ScopeData_time_conversion/2013/D2013.02.19_S0675_I141_4 -> /home/phd2/Scrivania/CorsoData/blastocisti/no_blasto/D2013.02.19_S0675_I141_4
2025-09-02 13:42:36,114 - INFO - Copied directory: /home/phd2/Scrivania/CorsoData/ScopeData_time_conversion/2013/D2013.02.19_S0675

Risultati organizzazione:
  Moved count (attempts counted): 6002
  Not moved examples (first 20):
[]


In [31]:
# Number of entries recorded in stats['not_moved'] by the organise step.
len(stats.get('not_moved'))

0

In [32]:
# Scan the organised dataset when it exists; otherwise scan the source tree
# and keep the report in the current working directory.
if DEST_DIR.exists():
    target_scan_dir = DEST_DIR
    log_file = DEST_DIR / "corrupted_images_log.txt"
else:
    target_scan_dir = SRC_DIR
    log_file = Path('.') / "corrupted_images_log.txt"
print("Scanning directory:", target_scan_dir)
summary = check_for_corrupted_images(
    target_scan_dir,
    log_file_path=log_file if SAVE_LOGS else None,
)
print("\n--- Summary ---")
print(f"Total images scanned: {summary['total_images']}")
print(f"Total corrupted images: {summary['corrupted_images']}")
print(f"Corrupted folders count: {summary['corrupted_folders_count']}")
print("Some per-folder counts (sample):")
import itertools
# Only the first ten folders are printed.
for k, v in itertools.islice(summary['per_folder'].items(), 10):
    print(k, "->", v)
if SAVE_LOGS:
    print("Log saved to:", log_file)


Scanning directory: /home/phd2/Scrivania/CorsoData/blastocisti

--- Summary ---
Total images scanned: 3779067
Total corrupted images: 0
Corrupted folders count: 0
Some per-folder counts (sample):
blasto -> {'total': 1586912, 'corrupted': 0}
no_blasto -> {'total': 2192155, 'corrupted': 0}
Log saved to: /home/phd2/Scrivania/CorsoData/blastocisti/corrupted_images_log.txt


In [33]:
# The fixing pass rewrites files on disk, so it only runs when FIX_IMAGES is set.
if FIX_IMAGES:
    target_fix_dir = DEST_DIR if DEST_DIR.exists() else SRC_DIR
    print("Attempting to fix images under:", target_fix_dir)
    res = fix_images_in_directory(
        target_fix_dir,
        image_extensions=('.jpg',),
        max_files=MAX_FIX_FILES,
    )
    print("Fix results:", res)
    # The result dict is stored as plain text next to the dataset.
    if SAVE_LOGS and DEST_DIR.exists():
        with open(DEST_DIR / 'fix_images_log.txt', 'w') as f:
            f.write(str(res))
        print("Fix log saved to:", DEST_DIR / 'fix_images_log.txt')
else:
    print("FIX_IMAGES is False. Set FIX_IMAGES=True to run fixes (slow).")


Attempting to fix images under: /home/phd2/Scrivania/CorsoData/blastocisti
Fix results: {'attempted': 3779067, 'fixed': 3779067, 'errors': []}
Fix log saved to: /home/phd2/Scrivania/CorsoData/blastocisti/fix_images_log.txt


In [34]:

from IPython.display import display
# Collect the headline numbers of the copy step and of the corruption scan.
summary_items = {
    'moved_count': stats.get('moved_count'),
    'not_moved_count': len(stats.get('not_moved', [])),
    'corrupted_total': summary.get('corrupted_images'),
    'corrupted_folders': summary.get('corrupted_folders_count'),
}
summary_df = pd.DataFrame([summary_items])
# Show one metric per row, with the single column labelled 'value'.
display(summary_df.transpose().rename(columns={0: 'value'}))
if SAVE_LOGS and DEST_DIR.exists():
    summary_path = DEST_DIR / 'preprocessing_summary.csv'
    summary_df.to_csv(summary_path, index=False)
    print("Summary saved to:", summary_path)


Unnamed: 0,value
moved_count,6002
not_moved_count,0
corrupted_total,0
corrupted_folders,0


Summary saved to: /home/phd2/Scrivania/CorsoData/blastocisti/preprocessing_summary.csv


In [35]:
# Informational cell: it only prints the current DRY_RUN value and a
# reminder; no pipeline step is invoked from here.
print("Full pipeline cell (only executes changes if DRY_RUN=False).")
print("Current DRY_RUN:", DRY_RUN)
print("To run the full pipeline with file moves set DRY_RUN=False and re-run this cell.")

Full pipeline cell (only executes changes if DRY_RUN=False).
Current DRY_RUN: False
To run the full pipeline with file moves set DRY_RUN=False and re-run this cell.
