## Select videos in the dataset

In [7]:
import math
import os
import re
import shutil

import numpy as np
import pandas as pd

# ================= CONFIGURATION =================
# Path to the CSV dataset. main() requires the columns 'dish_well', 'Note',
# 'start frame' and 'end frame' and aborts if any of them is missing.
CSV_PATH = "/home/phd2/Scrivania/CorsoRepo/embryo_valencia/datasets/dataset_final_merged.csv"

# Root directory containing the year folders (2016, 2017...) and frames.
# main() walks it recursively and matches sub-folder names against 'dish_well'.
SOURCE_ROOT = "/home/phd2/Documenti/embryo/marilena_videos/extracted_equatorial_frames"

# Output root: selected frames are copied to DEST_ROOT/<Note>/<dish_well>/.
DEST_ROOT = "/home/phd2/Documenti/embryo/marilena_videos/final_videos"

# ================= HELPER FUNCTIONS =================

def parse_filename(filename):
    """
    Parse a frame filename into its frame number and acquisition time.

    Expected format: D..._Frame_Z_Timeh.jpg
    Example: D2021..._P_1_37_0_6.85h.jpg  (frame 37, time 6.85)

    After the extension is removed the name is split on '_'. The third field
    from the end is the frame number and the last field is the time, with an
    optional trailing 'h' and either '.' or ',' as decimal separator.

    Args:
        filename: Name of the file to parse.

    Returns:
        dict: {'filename': str, 'frame': int, 'time': float, 'parts': list,
        'ext': str}, where 'parts' holds the '_'-separated fields of the name
        without extension and 'ext' is the extension including the dot.
        None if the name does not follow the expected format (such files are
        meant to be skipped by the caller).
    """
    try:
        name_no_ext, ext = os.path.splitext(filename)
        parts = name_no_ext.split('_')
    except (AttributeError, TypeError):
        # Not a usable text name (e.g. None or bytes): treat as unparseable.
        return None

    if len(parts) < 3:
        return None

    # Frame number is the 3rd field from the end.
    frame_str = parts[-3]
    if not frame_str.isdigit():
        return None

    # Time is the last field; it usually ends with 'h', e.g. "6.85h".
    time_str = parts[-1]
    if time_str.lower().endswith('h'):
        time_val_str = time_str[:-1]
    else:
        time_val_str = time_str

    try:
        # isdigit() also accepts characters such as superscripts that int()
        # rejects, so the conversion itself is still guarded.
        frame_num = int(frame_str)
        # Accept a decimal comma as well as a decimal point.
        time_val = float(time_val_str.replace(',', '.'))
    except ValueError:
        return None

    # float() happily parses "nan" and "inf"; such a time cannot be shifted
    # or formatted into a meaningful new filename, so reject it.
    if not math.isfinite(time_val):
        return None

    return {
        'filename': filename,
        'frame': frame_num,
        'time': time_val,
        'parts': parts,
        'ext': ext
    }

def construct_new_filename(info, t0):
    """
    Build the output filename for a frame, with its time re-based on t0.

    The last '_'-separated field of the original name (the time) is replaced
    by "<time - t0>h" formatted with two decimals; every other field and the
    extension are kept as they were.
    """
    shifted = info['time'] - t0
    # Clamp tiny negative results so the name never carries a minus sign.
    shifted = 0.0 if shifted < 0 else shifted

    # Everything before the time field is carried over untouched.
    leading_fields = info['parts'][:-1]
    stem = "_".join(leading_fields)

    return f"{stem}_{shifted:.2f}h{info['ext']}"

# ================= MAIN SCRIPT =================

def _frame_limit(value, default):
    """Return value as an int, or default when it is missing or not convertible."""
    if pd.isna(value):
        return default
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _index_source_folders(source_root):
    """Map every directory name found under source_root to its full path."""
    source_map = {}
    for root, dirs, _files in os.walk(source_root):
        for dirname in dirs:
            # If the same name occurs more than once, the last one visited wins.
            source_map[dirname] = os.path.join(root, dirname)
    return source_map


def _copy_video_frames(src_folder, dest_folder, start_lim, end_lim):
    """
    Copy the frames of one video whose frame number lies in
    [start_lim, end_lim] into dest_folder, renaming them so that the time of
    the first kept frame becomes zero.

    Returns:
        tuple: (number of files copied, t0), or None when no frame qualifies
        (in which case dest_folder is not created).

    Raises:
        Whatever os.listdir / os.makedirs / shutil.copy2 raise. If copying
        fails part-way, dest_folder is removed before the error propagates.
    """
    # --- STEP 1: SCAN & FILTER ---
    valid_frames = []
    for name in sorted(os.listdir(src_folder)):
        if not name.lower().endswith('.jpg'):
            continue
        info = parse_filename(name)
        if info is not None and start_lim <= info['frame'] <= end_lim:
            valid_frames.append(info)

    # Sort by frame number (crucial for time alignment)
    valid_frames.sort(key=lambda x: x['frame'])
    if not valid_frames:
        return None

    # --- STEP 2: DETERMINE T0 ---
    # The first frame in our kept sequence defines the new time zero
    t0 = valid_frames[0]['time']

    # --- STEP 3: COPY & RENAME ---
    # The folder is created only now, so a video without usable frames never
    # leaves an empty directory behind.
    os.makedirs(dest_folder, exist_ok=True)
    try:
        for info in valid_frames:
            src_file = os.path.join(src_folder, info['filename'])
            dest_file = os.path.join(dest_folder, construct_new_filename(info, t0))
            shutil.copy2(src_file, dest_file)
    except BaseException:
        # main() treats an existing destination folder as "already done".
        # A half-filled folder would therefore never be completed on a
        # re-run, so remove it (it did not exist before this call).
        shutil.rmtree(dest_folder, ignore_errors=True)
        raise

    return len(valid_frames), t0


def main():
    """
    Copy the selected frame range of every video listed in the CSV.

    For each CSV row, the source folder named after 'dish_well' is looked up
    under SOURCE_ROOT and its frames between 'start frame' and 'end frame'
    (defaults 1 and 999999) are copied to DEST_ROOT/<Note>/<dish_well>/ with
    their time re-based on the first kept frame. Rows whose destination
    folder already exists are skipped. A summary is printed at the end.
    """
    # 1. Load the Dataset
    try:
        df = pd.read_csv(CSV_PATH, sep=',')
        print(f"Loaded dataset with {len(df)} rows.")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # Check columns
    required_cols = ['dish_well', 'Note', 'start frame', 'end frame']
    for col in required_cols:
        if col not in df.columns:
            print(f"Error: CSV is missing column '{col}'")
            return

    # 2. Index the Source Directory
    print(f"Indexing folders in {SOURCE_ROOT}...")
    source_map = _index_source_folders(SOURCE_ROOT)
    print(f"Found {len(source_map)} source folders.")

    # 3. Process
    success_count = 0
    missing_count = 0
    skipped_count = 0
    empty_count = 0
    failed_count = 0

    print("\nStarting processing...")

    for _index, row in df.iterrows():
        dish_id = str(row['dish_well']).strip()
        category = str(row['Note']).strip()

        # Parse start/end frames (missing or unparseable values fall back
        # to the widest range)
        start_lim = _frame_limit(row['start frame'], 1)
        end_lim = _frame_limit(row['end frame'], 999999)

        # Handle bad categories
        if category.lower() in ['nan', 'none', '']:
            category = "Uncategorized"

        dest_folder = os.path.join(DEST_ROOT, category, dish_id)

        src_folder = source_map.get(dish_id)
        if src_folder is None:
            missing_count += 1
            continue

        if os.path.exists(dest_folder):
            skipped_count += 1
            continue

        try:
            result = _copy_video_frames(src_folder, dest_folder, start_lim, end_lim)
        except Exception as e:
            # Batch boundary: report the failure and carry on with the next video.
            print(f"Error processing {dish_id}: {e}")
            failed_count += 1
            continue

        if result is None:
            print(f"Warning: No valid frames found for {dish_id} in range {start_lim}-{end_lim}")
            empty_count += 1
            continue

        files_copied_count, t0 = result
        success_count += 1
        if success_count % 10 == 0:
            print(f"Processed {success_count} videos (Last: {dish_id} -> {files_copied_count} frames, t0={t0}h)")

    # ================= SUMMARY =================
    print("-" * 30)
    print("Process Complete.")
    print(f"Successfully Processed: {success_count}")
    print(f"Skipped (Dest Exists):  {skipped_count}")
    print(f"Missing Source Folders: {missing_count}")
    print(f"No Valid Frames:        {empty_count}")
    print(f"Failed (Errors):        {failed_count}")
    print("-" * 30)

# Run the copy pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded dataset with 168 rows.
Indexing folders in /home/phd2/Documenti/embryo/marilena_videos/extracted_equatorial_frames...
Found 259 source folders.

Starting processing...
Processed 10 videos (Last: D2018.12.21_S02510_I0106_D_2 -> 96 frames, t0=0.0h)
Processed 20 videos (Last: D2019.05.26_S01883_I0406_D_11 -> 93 frames, t0=0.0h)
Processed 30 videos (Last: D2021.09.20_S02522_I0406_D_1 -> 89 frames, t0=0.0h)
Processed 40 videos (Last: D2021.09.22_S00966_I0758_D_6 -> 93 frames, t0=0.0h)
Processed 50 videos (Last: D2019.06.20_S01903_I0406_D_2 -> 93 frames, t0=0.0h)
Processed 60 videos (Last: D2017.09.15_S0770_I631_5 -> 91 frames, t0=0.0h)
Processed 70 videos (Last: D2019.03.14_S00126_I0758_D_5 -> 88 frames, t0=0.0h)
Processed 80 videos (Last: D2020.08.03_S02162_I0406_D_4 -> 89 frames, t0=0.0h)
Processed 90 videos (Last: D2016.11.15_S1896_I106_7 -> 466 frames, t0=0.0h)
Processed 100 videos (Last: D2019.02.15_S01533_I0057_P_4 -> 575 frames, t0=0.0h)
Processed 110 videos (Last: D2021.07.10