In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import time
import os # For checking file existence

# --- Configuration Parameters ---
# Tunable thresholds for the Synthesia key-press detector defined in this cell.
# Every *_RATIO value is a fraction of a frame or ROI dimension; adjust them to
# match the layout of your video.
CALIBRATION_FRAMES = 30  # Number of initial frames averaged to learn each key's unpressed color
KEY_PRESS_COLOR_THRESHOLD = 30 # Euclidean BGR distance from the baseline above which a key counts as pressed

# Parameters for piano ROI detection (edge-density based, not reliant on "white" keys)
MIN_PIANO_ROI_HEIGHT_RATIO = 0.1 # Minimum accepted ROI height as a fraction of frame height
MAX_PIANO_ROI_HEIGHT_RATIO = 0.5 # Maximum accepted ROI height as a fraction of frame height
PIANO_ROI_HORIZONTAL_MARGIN_RATIO = 0.02 # Fraction of the frame width trimmed from each side of the ROI
VERTICAL_EDGE_THRESHOLD = 50 # Low Canny threshold for ROI finding (the high threshold is twice this)
VERTICAL_EDGE_MIN_LINE_LENGTH_RATIO = 0.05 # Minimum Hough line length as a fraction of ROI height (key segmentation)

# Parameters for key segmentation
MIN_KEY_WIDTH_RATIO = 0.005 # A key must be wider than this fraction of the ROI width
MAX_KEY_WIDTH_RATIO = 0.05  # A key must be narrower than this fraction of the ROI width
KEY_SEGMENTATION_CANNY_LOW = 50 # Low Canny threshold used on the blurred ROI
KEY_SEGMENTATION_CANNY_HIGH = 150 # High Canny threshold used on the blurred ROI
KEY_SEGMENTATION_HOUGH_THRESHOLD = 30 # Hough accumulator threshold; lower values yield more lines

# --- Helper Functions ---

def get_average_color(image_patch):
    """Return the per-channel mean of a patch, averaged over its rows and columns."""
    spatial_axes = (0, 1)  # collapse height and width, keep the channel axis
    channel_means = np.mean(image_patch, axis=spatial_axes)
    return channel_means

def color_distance(color1, color2):
    """
    Return the Euclidean distance between two colors.

    Both inputs are converted to float64 before subtracting. Without that,
    integer inputs such as raw uint8 pixels wrap around on subtraction and
    again on squaring, which silently produces a wrong (too small) distance.

    Args:
        color1: Sequence or array of channel values (e.g. a BGR triple).
        color2: Sequence or array with the same shape as ``color1``.

    Returns:
        The distance as a NumPy float64 scalar.
    """
    delta = np.asarray(color1, dtype=np.float64) - np.asarray(color2, dtype=np.float64)
    return np.sqrt(np.sum(delta ** 2))

def map_key_to_note(key_index):
    """
    Turn a 0-based key index (counted left to right) into a placeholder label.

    No real pitch is derived here: the label is just "Key <n>" with n being the
    1-based position. Producing actual note names would require knowing which
    regions are white or black keys and where the C notes sit.
    """
    one_based_position = key_index + 1
    return "Key {}".format(one_based_position)

def display_frame_with_detections(frame, piano_roi_coords, key_regions, title="Detected Piano and Keys"):
    """
    Show a frame with the piano ROI and the individual key regions overlaid.

    Intended for inline display in a Jupyter notebook.

    Args:
        frame: BGR image as returned by OpenCV.
        piano_roi_coords: ``(x, y, w, h)`` of the piano ROI, or a falsy value
            to skip drawing it.
        key_regions: Iterable of dicts with ``id``, ``x``, ``y``, ``w``, ``h``
            keys, or a falsy value to skip drawing keys.
        title: Figure title.
    """
    plt.figure(figsize=(12, 8))
    plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # matplotlib expects RGB
    axes = plt.gca()

    # The ROI rectangle is the only labelled artist, so it alone decides
    # whether a legend can be drawn.
    roi_drawn = False
    if piano_roi_coords:
        x, y, w, h = piano_roi_coords
        axes.add_patch(plt.Rectangle((x, y), w, h,
                                     edgecolor='lime', facecolor='none', lw=2, label='Piano ROI'))
        roi_drawn = True

    if key_regions:
        for key in key_regions:
            axes.add_patch(plt.Rectangle((key['x'], key['y']), key['w'], key['h'],
                                         edgecolor='cyan', facecolor='none', lw=1))
            # Key id, centred inside the key rectangle
            plt.text(key['x'] + key['w'] / 2, key['y'] + key['h'] / 2, str(key['id']),
                     color='white', fontsize=8, ha='center', va='center',
                     bbox=dict(facecolor='black', alpha=0.5, edgecolor='none', pad=1))

    plt.title(title)
    plt.axis('off')
    if roi_drawn:
        # Calling legend() with no labelled artists only produces a warning.
        plt.legend()
    plt.show()

# --- Main Detection Functions ---

def find_piano_roi(frame):
    """
    Heuristically locate the piano region (ROI) in a video frame.

    The frame is run through Canny edge detection and the edge pixels are
    summed per row. That row profile is smoothed with a moving average, and
    the longest contiguous run of rows whose smoothed value exceeds half of
    the peak is taken as the vertical extent of the piano. Note that all Canny
    edges are counted, not only vertical ones; the densely packed key borders
    are what is expected to make the piano rows stand out.

    The horizontal extent is not detected: it is always the full frame width
    minus PIANO_ROI_HORIZONTAL_MARGIN_RATIO on each side.

    Args:
        frame: BGR image with shape (height, width, channels).

    Returns:
        ``(x, y, w, h)`` as plain Python ints, or ``None`` when no usable
        band was found.
    """
    height, width, _ = frame.shape
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    edges = cv2.Canny(gray, VERTICAL_EDGE_THRESHOLD, VERTICAL_EDGE_THRESHOLD * 2, apertureSize=3)

    # Edge density per row (sum over the columns of the edge map).
    edge_strength_per_row = np.sum(edges, axis=1)

    # Moving average over ~3% of the frame height so that whole bands,
    # rather than single rows, dominate the profile.
    kernel_size = max(1, height // 30)
    smoothed_projection = np.convolve(edge_strength_per_row, np.ones(kernel_size) / kernel_size, mode='valid')

    if len(smoothed_projection) == 0:
        print("Warning: No significant vertical edge projection found for piano ROI.")
        return None

    # Rows whose smoothed edge density is above half of the peak.
    max_val = np.max(smoothed_projection)
    dense_rows = np.where(smoothed_projection > max_val * 0.5)[0]

    if len(dense_rows) == 0:
        # Happens when the frame has no edges at all (peak is zero).
        print("Warning: No strong piano ROI detected based on vertical edge projection.")
        return None

    # Split the row indices wherever they stop being consecutive and keep the
    # longest run. dense_rows is non-empty here, so every run is non-empty too.
    run_breaks = np.where(np.diff(dense_rows) > 1)[0] + 1
    longest_segment = max(np.split(dense_rows, run_breaks), key=len)

    # Index i of the 'valid' convolution covers rows i .. i + kernel_size - 1,
    # hence the kernel_size added to the last index. Converted to plain ints
    # so NumPy scalars do not leak into the returned tuple.
    y_start_segment = int(longest_segment[0])
    y_end_segment = int(longest_segment[-1]) + kernel_size

    detected_height = y_end_segment - y_start_segment
    min_allowed_height = int(height * MIN_PIANO_ROI_HEIGHT_RATIO)
    max_allowed_height = int(height * MAX_PIANO_ROI_HEIGHT_RATIO)

    if detected_height < min_allowed_height or detected_height > max_allowed_height:
        print(f"Warning: Detected piano ROI height ({detected_height}) is outside expected range ({min_allowed_height}-{max_allowed_height}). Adjusting.")
        # Replace the band by one of mid-range height centred on the detection,
        # clamped to the frame.
        center_y = (y_start_segment + y_end_segment) // 2
        piano_height = int(height * (MIN_PIANO_ROI_HEIGHT_RATIO + MAX_PIANO_ROI_HEIGHT_RATIO) / 2)
        y_start = max(0, center_y - piano_height // 2)
        y_end = min(height, center_y + piano_height // 2)
    else:
        y_start = y_start_segment
        y_end = y_end_segment

    # Fixed horizontal extent: frame width minus a margin on each side.
    x_start = int(width * PIANO_ROI_HORIZONTAL_MARGIN_RATIO)
    x_end = int(width * (1 - PIANO_ROI_HORIZONTAL_MARGIN_RATIO))

    roi_width = x_end - x_start
    roi_height = y_end - y_start

    # Clamping at the frame border can still leave a band that is too short.
    if roi_width < (width * 0.5) or roi_height < min_allowed_height:
        print("Warning: Final piano ROI is too small or narrow after adjustments. Check parameters.")
        return None

    print(f"Detected Piano ROI: x={x_start}, y={y_start}, w={roi_width}, h={roi_height}")
    return (x_start, y_start, roi_width, roi_height)


def segment_keys(piano_roi_frame, piano_roi_coords):
    """
    Split the piano ROI into per-key rectangles.

    Near-vertical line segments found by a probabilistic Hough transform are
    treated as key dividers. Each gap between neighbouring dividers whose
    width lies strictly between MIN_KEY_WIDTH_RATIO and MAX_KEY_WIDTH_RATIO of
    the ROI width becomes a key. If fewer than 10 keys result, the ROI is
    divided into a fixed grid of 20 equal-width keys instead.

    Args:
        piano_roi_frame: BGR crop of the frame covering the piano ROI.
        piano_roi_coords: ``(x, y, w, h)`` of that crop in frame coordinates.

    Returns:
        List of dicts with keys ``id`` (0-based, left to right), ``x``, ``y``,
        ``w``, ``h``. Coordinates are absolute frame coordinates and plain
        Python ints; every key spans the full ROI height. The list is empty
        when the ROI is narrower than 20 pixels and the fallback grid cannot
        be built.
    """
    x_roi, y_roi, w_roi, h_roi = (int(v) for v in piano_roi_coords)

    gray_roi = cv2.cvtColor(piano_roi_frame, cv2.COLOR_BGR2GRAY)
    blurred_roi = cv2.GaussianBlur(gray_roi, (5, 5), 0)
    edges = cv2.Canny(blurred_roi, KEY_SEGMENTATION_CANNY_LOW, KEY_SEGMENTATION_CANNY_HIGH, apertureSize=3)

    min_line_length = int(h_roi * VERTICAL_EDGE_MIN_LINE_LENGTH_RATIO)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=KEY_SEGMENTATION_HOUGH_THRESHOLD,
                            minLineLength=min_line_length, maxLineGap=10)

    # The ROI borders always count as dividers; a set removes duplicates.
    divider_xs = {0, w_roi}
    if lines is not None:
        for line in lines:
            x1, y1, x2, y2 = line[0]
            # Keep segments that are close to vertical and long enough.
            if abs(x1 - x2) < 5 and abs(y1 - y2) > min_line_length:
                # HoughLinesP yields NumPy integers; store plain ints.
                divider_xs.add(int(x1))
    key_dividers_x = sorted(divider_xs)

    key_regions = []
    for left, right in zip(key_dividers_x, key_dividers_x[1:]):
        k_width = right - left
        # Reject gaps too narrow (duplicate edges) or too wide (missed dividers).
        if w_roi * MIN_KEY_WIDTH_RATIO < k_width < w_roi * MAX_KEY_WIDTH_RATIO:
            key_regions.append({
                'id': len(key_regions),
                'x': x_roi + left,   # absolute X coordinate
                'y': y_roi,          # top of the piano ROI
                'w': k_width,
                'h': h_roi,          # full height of the piano ROI
            })

    # Too few keys means the divider detection was unreliable.
    if len(key_regions) < 10:
        print("Warning: Could not reliably segment keys. Falling back to fixed grid.")
        num_estimated_keys = 20  # a rough guess for the number of visible keys
        key_width_estimate = w_roi // num_estimated_keys
        key_regions = []
        # With integer division the grid never exceeds the ROI width; any
        # remainder pixels at the right edge stay unassigned.
        if key_width_estimate > 0:
            key_regions = [
                {
                    'id': i,
                    'x': x_roi + i * key_width_estimate,
                    'y': y_roi,
                    'w': key_width_estimate,
                    'h': h_roi,
                }
                for i in range(num_estimated_keys)
            ]

    print(f"Segmented {len(key_regions)} potential key regions.")
    return key_regions


def calibrate_baselines(video_path, piano_roi_coords, key_regions):
    """
    Learn the 'unpressed' color of every key from the start of the video.

    The mean color of each key region is averaged over the first
    CALIBRATION_FRAMES frames (fewer if the video is shorter). This assumes
    no key is pressed during those frames.

    Args:
        video_path: Path of the video file.
        piano_roi_coords: ``(x, y, w, h)`` of the piano ROI in frame coordinates.
        key_regions: Key dicts (``id``, ``x``, ``y``, ``w``, ``h``) in absolute
            frame coordinates.

    Returns:
        Dict mapping key id to its mean color as a 3-element float array.
        Keys whose region never fit inside the ROI are left out. The dict is
        empty when no frame could be read. ``None`` is returned when the video
        cannot be opened or the ROI does not fit inside a frame.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return None

    x_roi, y_roi, w_roi, h_roi = piano_roi_coords
    color_sums = defaultdict(lambda: np.zeros(3))  # running sum of mean colors per key
    frame_counts = defaultdict(int)                # number of frames contributing per key

    print(f"Calibrating baselines over {CALIBRATION_FRAMES} frames...")
    try:
        for _ in range(CALIBRATION_FRAMES):
            ret, frame = cap.read()
            if not ret:
                print("Warning: Could not read enough frames for calibration. Using available frames.")
                break

            frame_height, frame_width = frame.shape[:2]
            if not (0 <= y_roi < frame_height and 0 <= x_roi < frame_width and
                    y_roi + h_roi <= frame_height and x_roi + w_roi <= frame_width):
                print(f"Error: Piano ROI {piano_roi_coords} is out of frame bounds {frame.shape}. Skipping calibration.")
                return None

            piano_roi_frame = frame[y_roi:y_roi+h_roi, x_roi:x_roi+w_roi]
            roi_height, roi_width = piano_roi_frame.shape[:2]

            for key in key_regions:
                # Key coordinates are absolute; make them relative to the ROI crop.
                kx_relative = key['x'] - x_roi
                ky_relative = key['y'] - y_roi
                kw = key['w']
                kh = key['h']

                # Skip keys that do not lie fully inside the ROI crop.
                if not (0 <= ky_relative < roi_height and 0 <= kx_relative < roi_width and
                        ky_relative + kh <= roi_height and kx_relative + kw <= roi_width):
                    continue

                key_patch = piano_roi_frame[ky_relative:ky_relative+kh, kx_relative:kx_relative+kw]
                if key_patch.shape[0] > 0 and key_patch.shape[1] > 0:
                    color_sums[key['id']] += get_average_color(key_patch)
                    frame_counts[key['id']] += 1
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()

    # frame_counts only contains keys that were sampled at least once,
    # so the division is always well defined.
    final_baselines = {key_id: color_sums[key_id] / count
                       for key_id, count in frame_counts.items()}

    print("Calibration complete.")
    return final_baselines


def _draw_key_state(preview, key, color, thickness, label=None):
    """Outline one key on the preview image and optionally tag it with a one-letter state label."""
    left, top = int(key['x']), int(key['y'])
    right, bottom = left + int(key['w']), top + int(key['h'])
    cv2.rectangle(preview, (left, top), (right, bottom), color, thickness)
    if label is not None:
        cv2.putText(preview, label, (left + 5, top + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)


def detect_key_presses(video_path, piano_roi_coords, key_regions, baseline_colors, *, show_preview=False):
    """
    Detect key presses from color changes inside the key regions.

    The first CALIBRATION_FRAMES frames are skipped. For every later frame the
    mean color of each key region is compared with its baseline; a key counts
    as pressed while the Euclidean distance exceeds KEY_PRESS_COLOR_THRESHOLD.
    A note is recorded when a key goes from pressed to unpressed, or at the
    end of the video if it is still pressed.

    Color measurements are always taken from the unmodified frame. Annotations
    are drawn only on a separate copy, so they cannot bleed into the
    measurements of the ROI or of neighbouring keys.

    Timestamps are measured from the start of the video: the skipped
    calibration frames are included in the frame count.

    Args:
        video_path: Path of the video file.
        piano_roi_coords: ``(x, y, w, h)`` of the piano ROI in frame coordinates.
        key_regions: Key dicts (``id``, ``x``, ``y``, ``w``, ``h``) in absolute
            frame coordinates.
        baseline_colors: Mapping of key id to its unpressed color. Keys that
            are missing from it are ignored.
        show_preview: When true, show an annotated copy of every frame in an
            OpenCV window (press 'q' to stop early). Off by default because
            such windows may not work in all Jupyter setups.

    Returns:
        List of ``(note_name, start_time, end_time)`` tuples with times in
        seconds. Empty if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return []

    detected_notes = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:  # also catches NaN and negative values
            print("Warning: Could not get FPS, defaulting to 30.")
            fps = 30.0

        x_roi, y_roi, w_roi, h_roi = piano_roi_coords
        current_key_states = {key['id']: False for key in key_regions}  # True while pressed
        note_start_times = {}  # key id -> time at which the current press began

        print("Detecting key presses...")

        # Skip the calibration frames without decoding them, and count how
        # many were actually skipped so timestamps stay aligned with the video.
        frame_idx = 0
        for _ in range(CALIBRATION_FRAMES):
            if not cap.grab():
                break
            frame_idx += 1

        current_time = frame_idx / fps

        while True:
            ret, frame = cap.read()
            if not ret:
                break  # end of video

            current_time = frame_idx / fps
            frame_height, frame_width = frame.shape[:2]

            if not (0 <= y_roi < frame_height and 0 <= x_roi < frame_width and
                    y_roi + h_roi <= frame_height and x_roi + w_roi <= frame_width):
                print(f"Error: Piano ROI {piano_roi_coords} is out of frame bounds {frame.shape}. Stopping detection.")
                break

            # View into the untouched frame; nothing is ever drawn on it.
            piano_roi_frame = frame[y_roi:y_roi+h_roi, x_roi:x_roi+w_roi]
            roi_height, roi_width = piano_roi_frame.shape[:2]

            preview = None
            if show_preview:
                preview = frame.copy()
                cv2.rectangle(preview, (int(x_roi), int(y_roi)),
                              (int(x_roi + w_roi), int(y_roi + h_roi)), (0, 255, 0), 2)

            for key in key_regions:
                key_id = key['id']
                if key_id not in baseline_colors:
                    continue  # never calibrated, nothing to compare against

                # Key coordinates are absolute; make them relative to the ROI crop.
                kx_relative = key['x'] - x_roi
                ky_relative = key['y'] - y_roi
                kw = key['w']
                kh = key['h']

                if not (0 <= ky_relative < roi_height and 0 <= kx_relative < roi_width and
                        ky_relative + kh <= roi_height and kx_relative + kw <= roi_width):
                    continue

                key_patch = piano_roi_frame[ky_relative:ky_relative+kh, kx_relative:kx_relative+kw]
                if key_patch.shape[0] == 0 or key_patch.shape[1] == 0:
                    continue

                distance = color_distance(get_average_color(key_patch), baseline_colors[key_id])
                is_pressed = distance > KEY_PRESS_COLOR_THRESHOLD
                was_pressed = current_key_states[key_id]

                if is_pressed and not was_pressed:
                    # Key just pressed
                    current_key_states[key_id] = True
                    note_start_times[key_id] = current_time
                    if preview is not None:
                        _draw_key_state(preview, key, (0, 0, 255), 2, "P")    # red
                elif was_pressed and not is_pressed:
                    # Key just released
                    current_key_states[key_id] = False
                    start_time = note_start_times.get(key_id, current_time)
                    detected_notes.append((map_key_to_note(key_id), start_time, current_time))
                    if preview is not None:
                        _draw_key_state(preview, key, (255, 0, 0), 1, "R")    # blue
                elif preview is not None:
                    if is_pressed:
                        _draw_key_state(preview, key, (0, 255, 255), 1, "H")  # yellow: held
                    else:
                        _draw_key_state(preview, key, (0, 255, 0), 1)         # green: unpressed

            if preview is not None:
                cv2.imshow("Synthesia Key Press Detection", preview)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break

            frame_idx += 1

        # Close notes that were still held when the video ended.
        for key_id, is_pressed in current_key_states.items():
            if is_pressed:
                start_time = note_start_times.get(key_id, current_time)
                detected_notes.append((map_key_to_note(key_id), start_time, current_time))
    finally:
        # Release the capture (and any preview window) on every exit path.
        cap.release()
        if show_preview:
            cv2.destroyAllWindows()

    print("Detection complete.")
    return detected_notes

# --- Main Execution Block for Jupyter ---
# This block is designed to be run directly in a Jupyter Notebook cell.
# Pipeline: locate the piano ROI on the first frame, segment keys, calibrate
# the unpressed colors, then scan the video for key presses.

# Define your video path here
video_path = "../../Data/video/Twinkle_Twinkle_Little_Star_12_Variations_-_Mozart_Piano_Tutorial_Synthesia.mp4"  # <--- IMPORTANT: Change this to your video file

if not os.path.exists(video_path):
    print(f"Error: Video file not found at '{video_path}'. Please update the 'video_path' variable.")
else:
    # Step 1: Find Piano ROI
    print("Attempting to find piano ROI...")
    first_frame = None
    cap_initial = cv2.VideoCapture(video_path)
    try:
        if not cap_initial.isOpened():
            print(f"Error: Could not open video {video_path}")
        else:
            ret, first_frame = cap_initial.read()
            if not ret:
                print("Error: Could not read first frame.")
                first_frame = None
    finally:
        # Only the first frame is needed here. Release the capture right away
        # instead of holding it open while the later steps reopen the file.
        cap_initial.release()

    if first_frame is not None:
        piano_roi_coords = find_piano_roi(first_frame)

        if piano_roi_coords is None:
            print("Failed to automatically detect piano ROI. Please adjust parameters or consider manual selection.")
        else:
            # Step 2: Segment Keys within ROI
            x, y, w, h = piano_roi_coords
            piano_roi_frame_for_segmentation = first_frame[y:y+h, x:x+w]
            key_regions = segment_keys(piano_roi_frame_for_segmentation, piano_roi_coords)

            if not key_regions:
                print("Failed to segment individual keys within the detected piano ROI. Exiting.")
            else:
                # Display the first frame with detected ROI and keys in Jupyter
                print("\nDisplaying detected piano ROI and key regions on the first frame:")
                display_frame_with_detections(first_frame.copy(), piano_roi_coords, key_regions,
                                              title="Detected Piano and Keys on First Frame")

                # Step 3: Calibrate Baselines
                baseline_colors = calibrate_baselines(video_path, piano_roi_coords, key_regions)
                if not baseline_colors:
                    print("Failed to calibrate baseline colors. Exiting.")
                else:
                    # Step 4: Detect Key Presses
                    print("\nStarting key press detection...")
                    detected_notes = detect_key_presses(video_path, piano_roi_coords, key_regions, baseline_colors)

                    print("\n--- Detected Notes ---")
                    if detected_notes:
                        for note_name, start_time, end_time in detected_notes:
                            duration = end_time - start_time
                            print(f"Note: {note_name}, Start: {start_time:.2f}s, End: {end_time:.2f}s, Duration: {duration:.2f}s")
                    else:
                        print("No notes detected.")

                    print("\n--- Next Steps ---")
                    print("This data (note name, start time, end time) can now be used to generate a music sheet.")
                    print("You would typically use a music notation library (e.g., `music21` in Python) to convert")
                    print("this structured data into a visual music score, or export it as a MIDI file.")

In [None]:
import cv2
import numpy as np
import os
from IPython.display import Image, display

# ---------- CONFIG ----------
# Settings for the whiteness/black-key based piano finder in this cell.
VIDEO_PATH = '../../Data/video/Twinkle_Twinkle_Little_Star_12_Variations_-_Mozart_Piano_Tutorial_Synthesia.mp4'
MAX_FRAMES = 500  # upper bound on the number of frames scanned for a piano
MIN_BLACK_KEYS = 10  # presumably the minimum black-key-like contours for a band to count as a piano
WHITE_PIXEL_RATIO = 0.4  # presumably the minimum share of bright pixels for a band to count as a piano
SCAN_STRIDE = 10  # vertical step, in pixels, between successive candidate ROI bands

# ---------- OUTPUT DIR ----------
# Created up front so the cv2.imwrite calls at the end of the cell have a target.
os.makedirs("output_SYNtoSHEET", exist_ok=True)

# ---------- Function: check whether an ROI looks like a piano ----------
def is_piano_like_frame(roi, min_black_keys=10, white_pixel_ratio=0.5):
    """
    Decide whether an image band looks like a piano keyboard.

    Two cues are combined: the share of bright pixels (gray value above 200)
    and the number of dark contours (gray value of 50 or less) that are taller
    than half the band and narrower than 5% of its width, i.e. shaped like
    black keys.

    Args:
        roi: BGR image band to test.
        min_black_keys: Minimum number of black-key-like contours required.
        white_pixel_ratio: The bright-pixel share must exceed this value.

    Returns:
        ``(is_piano, white_ratio, black_key_count)``. An empty band yields
        ``(False, 0, 0)`` instead of failing on a division by zero.
    """
    if roi.shape[0] == 0 or roi.shape[1] == 0:
        return False, 0, 0

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    white_ratio = np.count_nonzero(gray > 200) / gray.size

    # Inverse threshold: dark pixels become foreground so black keys form contours.
    _, thresh = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    min_key_height = roi.shape[0] * 0.5
    max_key_width = roi.shape[1] * 0.05
    black_key_count = 0
    for cnt in contours:
        _, _, w, h = cv2.boundingRect(cnt)
        if h > min_key_height and w < max_key_width:
            black_key_count += 1

    is_piano = white_ratio > white_pixel_ratio and black_key_count >= min_black_keys
    return is_piano, white_ratio, black_key_count

# ---------- Find the first frame that contains a piano pattern ----------
def find_frame_with_piano_by_whiteness(video_path, max_frames=500, debug=False):
    """
    Scan the start of a video for the first frame containing a piano-like band.

    Each frame is scanned top to bottom with a window one quarter of the frame
    height, moved in steps of SCAN_STRIDE pixels. The bottom-aligned window is
    always included, since the stride alone may not reach it. Every window is
    tested with is_piano_like_frame using the configured MIN_BLACK_KEYS and
    WHITE_PIXEL_RATIO.

    Args:
        video_path: Path of the video file.
        max_frames: Maximum number of frames to examine.
        debug: Print the measurements of every tested window.

    Returns:
        ``(frame, frame_index, roi)`` for the first match, where ``roi`` is
        the matching band (a view into ``frame``). ``(None, None, None)`` if
        nothing matched.

    Raises:
        Exception: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise Exception(f"❌ Gagal membuka video: {video_path}")

    try:
        for frame_index in range(max_frames):
            ret, frame = cap.read()
            if not ret:
                break

            height = frame.shape[0]
            band_height = height // 4
            if band_height < 10:
                continue  # frame too small for a meaningful band

            # range() stops short of the last valid start position, so add the
            # bottom-aligned band explicitly.
            last_start = height - band_height
            band_starts = list(range(0, last_start, SCAN_STRIDE))
            band_starts.append(last_start)

            for y_start in band_starts:
                roi = frame[y_start:y_start + band_height, :]

                is_piano, white_ratio, num_black = is_piano_like_frame(
                    roi, min_black_keys=MIN_BLACK_KEYS, white_pixel_ratio=WHITE_PIXEL_RATIO)

                if debug:
                    print(f"Frame {frame_index}, ROI y={y_start}: Putih={white_ratio:.2f}, Hitam={num_black} → {'✅' if is_piano else '❌'}")

                if is_piano:
                    return frame, frame_index, roi
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()

    return None, None, None

# ---------- Detect white & black keys ----------
def detect_keys_from_black_spacing(image, debug=False):
    """
    Locate black keys by contour and estimate white keys from their spacing.

    Black keys are dark contours (gray value of 50 or less) whose bounding box
    is taller than half the image and narrower than 5% of its width. White
    keys are not detected directly: their positions are inferred from the gaps
    between neighbouring black-key centres, and every estimated white key
    spans the full image height.

    Args:
        image: BGR image of the piano band.
        debug: Print the number of black keys and estimated white keys.

    Returns:
        ``(vis, white_keys, black_keys)``: an annotated copy of ``image`` and
        two lists of ``(x, y, w, h)`` tuples. ``white_keys`` is empty when
        fewer than two black keys were found.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverse threshold: dark pixels become foreground so black keys form contours.
    _, thresh = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY_INV)

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    black_keys = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Tall and narrow bounding boxes are treated as black keys.
        if h > image.shape[0] * 0.5 and w < image.shape[1] * 0.05:
            black_keys.append((x, y, w, h))

    # Order left to right and reduce each key to its horizontal centre.
    black_keys = sorted(black_keys, key=lambda b: b[0])
    black_x_centers = [x + w // 2 for x, y, w, h in black_keys]

    white_keys = []
    if len(black_x_centers) >= 2:
        # Median centre-to-centre distance; despite its name avg_gap is a median.
        dists = [black_x_centers[i+1] - black_x_centers[i] for i in range(len(black_x_centers)-1)]
        avg_gap = np.median(dists)
        # White key width is taken as 75% of that distance (empirical factor).
        est_white_width = int(avg_gap * 0.75)

        # One white key to the left of the first black key, if it fits in the image.
        first_black_center = black_x_centers[0]
        if first_black_center - est_white_width > 0:
            white_keys.append((first_black_center - est_white_width, 0, est_white_width, image.shape[0]))

        for i in range(len(black_x_centers) - 1):
            x1 = black_x_centers[i]
            x2 = black_x_centers[i+1]
            gap = x2 - x1

            if gap < avg_gap * 1.5:
                # Normal spacing: one white key centred between the two black keys.
                white_keys.append((x1 + (gap // 2) - (est_white_width // 2), 0, est_white_width, image.shape[0]))
            else:
                # Wide spacing: two white keys, one centred in each half of the gap.
                # NOTE(review): presumably the E-F / B-C positions without a black key - confirm.
                half_gap = gap // 2
                white_keys.append((x1 + (half_gap // 2) - (est_white_width // 2), 0, est_white_width, image.shape[0]))
                white_keys.append((x1 + half_gap + (half_gap // 2) - (est_white_width // 2), 0, est_white_width, image.shape[0]))

        # If the last estimated key ends more than half a key width before the
        # right image edge, add a key starting just after the last black-key centre.
        last_black_center = black_x_centers[-1]
        last_white_right = white_keys[-1][0] + white_keys[-1][2]
        if last_white_right < image.shape[1] - est_white_width * 0.5:
            white_keys.append((last_black_center + 1, 0, est_white_width, image.shape[0]))

        # Re-evaluate with the (possibly new) last key and add one more adjacent
        # key if at least half a key width of room remains.
        last_white_right = white_keys[-1][0] + white_keys[-1][2]
        if last_white_right + est_white_width * 0.5 < image.shape[1]:
            white_keys.append((last_white_right, 0, est_white_width, image.shape[0]))

    # Annotated copy: white keys in green, black keys in blue (colors are BGR).
    vis = image.copy()
    for x, y, w, h in white_keys:
        cv2.rectangle(vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
    for x, y, w, h in black_keys:
        cv2.rectangle(vis, (x, y), (x + w, y + h), (255, 0, 0), 2)

    if debug:
        print(f"Hitam: {len(black_keys)}, Putih estimasi: {len(white_keys)}")

    return vis, white_keys, black_keys

# ---------- MAIN ----------
# Scan the video for the first frame that contains a band passing the piano check.
frame, found_index, roi = find_frame_with_piano_by_whiteness(VIDEO_PATH, max_frames=MAX_FRAMES, debug=True)

# All three results are None when no band passed; abort the cell in that case.
if frame is None:
    raise Exception("‚ùå Tidak ditemukan frame dengan pola piano.")

print(f"\n‚úÖ Frame piano ditemukan di index ke-{found_index}\n")

# Save the matching band and show it inline in the notebook.
cv2.imwrite("output_SYNtoSHEET/piano_frame.png", roi)
display(Image(filename="output_SYNtoSHEET/piano_frame.png"))

# Estimate key positions on that band, then save and show the annotated image.
detected_img, white_keys, black_keys = detect_keys_from_black_spacing(roi, debug=True)
cv2.imwrite("output_SYNtoSHEET/detected_from_spacing.png", detected_img)
display(Image(filename="output_SYNtoSHEET/detected_from_spacing.png"))


In [None]:
import cv2
import numpy as np
import os
from IPython.display import Image, display

# ---------- CONFIG ----------
# Settings for the combined (edge-projection + whiteness) piano finder in this cell.
VIDEO_PATH = '../../Data/video/piano_visualizer.mp4'  # <--- IMPORTANT: Update this path to your video
MAX_FRAMES_TO_SCAN = 500 # Max frames to scan to find the initial piano frame
MIN_BLACK_KEYS = 10 # Minimum number of black keys expected in the piano ROI
WHITE_PIXEL_RATIO = 0.4 # Minimum share of bright pixels expected in the piano ROI

# Parameters for dynamic piano ROI detection (edge-density based, not reliant on "white" keys)
MIN_PIANO_ROI_HEIGHT_RATIO = 0.1 # Minimum height of the detected piano region as a ratio of frame height
MAX_PIANO_ROI_HEIGHT_RATIO = 0.5 # Maximum height of the detected piano region as a ratio of frame height
PIANO_ROI_HORIZONTAL_MARGIN_RATIO = 0.02 # Horizontal margin of the piano ROI as a ratio of frame width
VERTICAL_EDGE_THRESHOLD = 50 # Low Canny threshold in ROI finding (the high threshold is twice this)
VERTICAL_EDGE_MIN_LINE_LENGTH_RATIO = 0.05 # Minimum length of vertical lines, as a ratio of ROI height

# Parameters for key segmentation within the detected ROI
MIN_KEY_WIDTH_RATIO = 0.005 # Minimum width of a detected key as a ratio of piano ROI width
MAX_KEY_WIDTH_RATIO = 0.05  # Maximum width of a detected key as a ratio of piano ROI width
KEY_SEGMENTATION_CANNY_LOW = 50 # Low Canny threshold for key segmentation
KEY_SEGMENTATION_CANNY_HIGH = 150 # High Canny threshold for key segmentation
KEY_SEGMENTATION_HOUGH_THRESHOLD = 30 # Hough accumulator threshold; lower values yield more lines

# ---------- OUTPUT DIR ----------
# Created up front; exist_ok avoids an error when the cell is re-run.
os.makedirs("output_SYNtoSHEET", exist_ok=True)

# ---------- Helper Functions ----------

def get_average_color(image_patch):
    """Return the per-channel mean of a patch, averaged over its rows and columns."""
    spatial_axes = (0, 1)  # collapse height and width, keep the channel axis
    channel_means = np.mean(image_patch, axis=spatial_axes)
    return channel_means

def color_distance(color1, color2):
    """
    Return the Euclidean distance between two colors.

    Both inputs are converted to float64 before subtracting. Without that,
    integer inputs such as raw uint8 pixels wrap around on subtraction and
    again on squaring, which silently produces a wrong (too small) distance.

    Args:
        color1: Sequence or array of channel values (e.g. a BGR triple).
        color2: Sequence or array with the same shape as ``color1``.

    Returns:
        The distance as a NumPy float64 scalar.
    """
    delta = np.asarray(color1, dtype=np.float64) - np.asarray(color2, dtype=np.float64)
    return np.sqrt(np.sum(delta ** 2))

# ---------- Your Original Function (Adapted for general ROI) ----------
def is_piano_like_frame(roi_frame, min_black_keys=10, white_pixel_ratio=0.5, debug=False):
    """
    Decide whether an ROI looks like a piano keyboard.

    The ROI passes when its share of bright pixels (gray value above 200)
    exceeds ``white_pixel_ratio`` and it contains at least ``min_black_keys``
    dark contours shaped like black keys: taller than half the ROI, wider than
    5 pixels and narrower than 5% of the ROI width.

    Returns ``(is_piano, white_ratio, black_keys_count)``; an empty ROI gives
    ``(False, 0, 0)``.
    """
    roi_height, roi_width = roi_frame.shape[0], roi_frame.shape[1]
    if roi_height == 0 or roi_width == 0:
        if debug:
            print("ROI frame is empty.")
        return False, 0, 0

    gray = cv2.cvtColor(roi_frame, cv2.COLOR_BGR2GRAY)

    # Share of bright pixels in the ROI
    pixel_total = gray.shape[0] * gray.shape[1]
    white_ratio = np.sum(gray > 200) / pixel_total

    # Dark regions become foreground so that black keys show up as contours
    _, dark_mask = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(dark_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def _looks_like_black_key(contour):
        # Tall (over half the ROI height) and narrow (between 5 px and 5% of ROI width)
        _, _, box_w, box_h = cv2.boundingRect(contour)
        return box_h > roi_height * 0.5 and box_w < roi_width * 0.05 and box_w > 5

    black_keys_count = sum(1 for contour in contours if _looks_like_black_key(contour))

    is_piano = white_ratio > white_pixel_ratio and black_keys_count >= min_black_keys
    if debug:
        print(f"  ROI Check: White Ratio={white_ratio:.2f}, Black Keys={black_keys_count} -> {'‚úÖ' if is_piano else '‚ùå'}")
    return is_piano, white_ratio, black_keys_count

# ---------- New/Improved Function: Find Piano ROI Anywhere in Frame ----------
def find_dynamic_piano_roi(frame, debug=False):
    """Propose a piano region from the row-wise density of Canny edges.

    Edge pixels are summed per row, the resulting profile is smoothed with
    a box filter, and the longest contiguous run of rows whose smoothed
    value exceeds half of the peak is taken as the keyboard band. If the
    band height falls outside the configured ratio limits, a band of
    "typical" height is centered on it instead. The horizontal extent is
    the frame width minus a fixed margin on each side.

    Args:
        frame: Three-dimensional image array (rows, columns, channels).
        debug: When True, print why detection failed or what was found.

    Returns:
        (x, y, w, h) as Python ints, or None when no usable band is found.
    """
    def _give_up(message):
        # Report the reason only in debug mode; callers just receive None.
        if debug:
            print(message)
        return None

    height, width, _ = frame.shape
    edges = cv2.Canny(
        cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY),
        VERTICAL_EDGE_THRESHOLD,
        VERTICAL_EDGE_THRESHOLD * 2,
        apertureSize=3,
    )

    # Edge strength per row, smoothed over a window of about 3% of the height.
    row_strength = np.sum(edges, axis=1)
    kernel_size = max(1, height // 30)
    box_filter = np.ones(kernel_size) / kernel_size
    smoothed_projection = np.convolve(row_strength, box_filter, mode='valid')
    if len(smoothed_projection) == 0:
        return _give_up("No significant vertical edge projection found.")

    # Rows whose smoothed strength exceeds half of the peak value.
    half_peak = np.max(smoothed_projection) * 0.5
    strong_rows = np.where(smoothed_projection > half_peak)[0]
    if len(strong_rows) == 0:
        return _give_up("No strong potential y-centers from projection.")

    # Split the strong rows wherever consecutive indices are not adjacent.
    split_points = np.where(np.diff(strong_rows) > 1)[0] + 1
    runs = np.split(strong_rows, split_points)
    if not runs or all(len(run) == 0 for run in runs):
        return _give_up("No continuous segments found in vertical edge projection.")

    best_run = max(runs, key=len)
    if len(best_run) == 0:
        return _give_up("Longest segment is empty.")

    # With mode='valid', output index i covers input rows i .. i+kernel_size-1,
    # so the band ends kernel_size rows after the last selected index.
    band_top = best_run[0]
    band_bottom = best_run[-1] + kernel_size
    detected_height = band_bottom - band_top

    min_allowed_height = int(height * MIN_PIANO_ROI_HEIGHT_RATIO)
    max_allowed_height = int(height * MAX_PIANO_ROI_HEIGHT_RATIO)

    if min_allowed_height <= detected_height <= max_allowed_height:
        y_start, y_end = band_top, band_bottom
    else:
        if debug:
            print(f"Detected height {detected_height} outside bounds [{min_allowed_height}-{max_allowed_height}], adjusting.")
        # Replace the band with one of mid-range height around its center.
        center_y = (band_top + band_bottom) // 2
        half_typical = int(height * (MIN_PIANO_ROI_HEIGHT_RATIO + MAX_PIANO_ROI_HEIGHT_RATIO) / 2) // 2
        y_start = max(0, center_y - half_typical)
        y_end = min(height, center_y + half_typical)

    # Horizontal extent: full width minus the configured side margins.
    x_start = int(width * PIANO_ROI_HORIZONTAL_MARGIN_RATIO)
    x_end = int(width * (1 - PIANO_ROI_HORIZONTAL_MARGIN_RATIO))

    too_narrow = (x_end - x_start) < (width * 0.5)
    too_short = (y_end - y_start) < min_allowed_height
    if too_narrow or too_short:
        return _give_up("Final piano ROI is too small or narrow after adjustments.")

    # Plain ints clamped to the frame.
    x_start = max(0, int(x_start))
    y_start = max(0, int(y_start))
    x_end = min(width, int(x_end))
    y_end = min(height, int(y_end))

    roi_width = x_end - x_start
    roi_height = y_end - y_start
    if roi_width <= 0 or roi_height <= 0:
        return _give_up("Calculated ROI dimensions are non-positive.")

    if debug:
        print(f"Candidate Piano ROI: x={x_start}, y={y_start}, w={roi_width}, h={roi_height}")
    return (x_start, y_start, roi_width, roi_height)


# ---------- Function: Find the first frame that looks like a piano ----------
def find_first_piano_frame_and_roi(video_path, max_frames_to_scan=500, debug=False):
    """Scan the start of a video for the first frame with a piano-like region.

    Each frame is passed to find_dynamic_piano_roi to obtain a candidate
    region, which is then validated by is_piano_like_frame using the
    module-level MIN_BLACK_KEYS and WHITE_PIXEL_RATIO settings.

    Args:
        video_path: Path handed to cv2.VideoCapture.
        max_frames_to_scan: Upper bound on the number of frames read.
        debug: When True, print per-frame diagnostics.

    Returns:
        Tuple (frame, roi_coords, frame_index). On success, frame is the
        matching frame, roi_coords is its (x, y, w, h) region and
        frame_index is that frame's 0-based index. When nothing matches,
        frame and roi_coords are None and frame_index is the number of
        frames that were read.

    Raises:
        Exception: If the video cannot be opened.
    """
    frame_found = None
    piano_roi_coords = None
    frame_index = 0

    cap = cv2.VideoCapture(video_path)
    # try/finally guarantees the capture is released even if one of the
    # detection helpers raises while a frame is being analysed.
    try:
        if not cap.isOpened():
            raise Exception(f"‚ùå Gagal membuka video: {video_path}")

        print(f"Scanning up to {max_frames_to_scan} frames to find piano...")
        while frame_index < max_frames_to_scan:
            ret, frame = cap.read()
            if not ret:
                break  # End of video or read error

            if debug:
                print(f"Processing Frame {frame_index}...")

            # Step 1: propose a candidate ROI from the edge profile.
            candidate_roi_coords = find_dynamic_piano_roi(frame, debug=debug)

            if candidate_roi_coords:
                x, y, w, h = candidate_roi_coords
                candidate_roi_frame = frame[y:y+h, x:x+w]

                # Step 2: validate the candidate with the piano-like heuristics.
                is_piano, _white_ratio, _num_black = is_piano_like_frame(
                    candidate_roi_frame,
                    min_black_keys=MIN_BLACK_KEYS,
                    white_pixel_ratio=WHITE_PIXEL_RATIO,
                    debug=debug
                )

                if is_piano:
                    frame_found = frame
                    piano_roi_coords = candidate_roi_coords
                    break  # Piano frame found

            frame_index += 1
    finally:
        cap.release()

    return frame_found, piano_roi_coords, frame_index

# ---------- Function: Detect white & black keys (from your original code) ----------
def detect_keys_from_black_spacing(image, debug=False):
    """Detect black keys and estimate white keys inside a cropped piano ROI.

    Black keys are found as dark contours that are taller than half the
    image, narrower than 5% of its width and wider than 5 pixels. White
    keys are not detected directly; they are estimated from the spacing of
    the black-key centers.

    Args:
        image: Cropped piano ROI; converted to grayscale with COLOR_BGR2GRAY.
        debug: When True, print how many keys of each kind were found.

    Returns:
        Tuple (vis, white_keys, black_keys). vis is a copy of image with
        the key rectangles drawn on it; white_keys and black_keys are lists
        of (x, y, w, h) tuples relative to image. Black keys are sorted by x.
    """
    img_h, img_w = image.shape[0], image.shape[1]

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverted threshold so that dark areas (black keys) become foreground.
    _, thresh = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    black_keys = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Tall and narrow within the ROI, but wider than 5 pixels.
        if h > img_h * 0.5 and w < img_w * 0.05 and w > 5:
            black_keys.append((x, y, w, h))

    black_keys.sort(key=lambda b: b[0])
    black_x_centers = [x + w // 2 for x, _, w, _ in black_keys]

    white_keys = []
    if len(black_x_centers) >= 2:
        # Typical center-to-center distance between neighbouring black keys.
        dists = [right - left for left, right in zip(black_x_centers, black_x_centers[1:])]
        avg_gap = np.median(dists)
        if avg_gap == 0:
            avg_gap = img_w / 10  # Fallback when the median gap is zero

        est_white_width = int(avg_gap * 0.75)  # Estimated white key width
        half_white = est_white_width // 2

        # 1. Leftmost white key: from the left edge up to the first black key.
        if black_x_centers[0] - est_white_width > 0:
            white_keys.append((0, 0, black_x_centers[0] - 1, img_h))

        # 2. White keys between neighbouring black keys.
        for x1, x2 in zip(black_x_centers, black_x_centers[1:]):
            gap = x2 - x1
            if gap < avg_gap * 1.5:
                # Typical gap: one white key centered between the two black keys.
                white_keys.append((x1 + (gap // 2) - half_white, 0, est_white_width, img_h))
            else:
                # Larger gap: assume two white keys (E-F or B-C transition),
                # one centered in each half of the gap.
                half_gap = gap // 2
                white_keys.append((x1 + (half_gap // 2) - half_white, 0, est_white_width, img_h))
                white_keys.append((x1 + half_gap + (half_gap // 2) - half_white, 0, est_white_width, img_h))

        # 3. Rightmost white key: from the last black key to the right edge.
        last_black_center = black_x_centers[-1]
        if last_black_center + est_white_width < img_w:
            white_keys.append((last_black_center + 1, 0, img_w - (last_black_center + 1), img_h))
    elif black_x_centers:
        # A single black key gives no spacing information; treat the whole
        # area as one white key block.
        white_keys = [(0, 0, img_w, img_h)]

    # Drop white keys too narrow to be real keys.
    white_keys = [wk for wk in white_keys if wk[2] > img_w * MIN_KEY_WIDTH_RATIO]

    # Draw the results for visualization (OpenCV colors are BGR).
    vis = image.copy()
    for x, y, w, h in white_keys:
        cv2.rectangle(vis, (x, y), (x + w, y + h), (0, 255, 0), 2)  # Green for white keys
    for x, y, w, h in black_keys:
        cv2.rectangle(vis, (x, y), (x + w, y + h), (255, 0, 0), 2)  # Blue (BGR) for black keys

    if debug:
        print(f"Detected Black Keys: {len(black_keys)}, Estimated White Keys: {len(white_keys)}")

    return vis, white_keys, black_keys


# ---------- MAIN EXECUTION ----------
# Script entry point: locate the piano in the video, detect its keys, and
# save/display visualizations of the result.
if __name__ == "__main__":
    if not os.path.exists(VIDEO_PATH):
        print(f"Error: Video file not found at '{VIDEO_PATH}'. Please update the 'VIDEO_PATH' variable.")
    else:
        # Step 1: Find the first piano-like frame and its ROI
        print("Starting video analysis to find piano region...")
        first_piano_frame, piano_roi_coords, found_index = find_first_piano_frame_and_roi(
            VIDEO_PATH, max_frames_to_scan=MAX_FRAMES_TO_SCAN, debug=True
        )

        if first_piano_frame is None or piano_roi_coords is None:
            print("‚ùå Tidak ditemukan frame dengan pola piano yang valid dalam rentang yang ditentukan.")
        else:
            print(f"\n‚úÖ Frame piano ditemukan di index ke-{found_index}")
            print(f"   Piano ROI: x={piano_roi_coords[0]}, y={piano_roi_coords[1]}, "
                  f"w={piano_roi_coords[2]}, h={piano_roi_coords[3]}")

            # Extract the detected piano ROI from the found frame
            x_roi, y_roi, w_roi, h_roi = piano_roi_coords
            # Clip the ROI end coordinates to the frame before slicing
            x_roi_end = min(x_roi + w_roi, first_piano_frame.shape[1])
            y_roi_end = min(y_roi + h_roi, first_piano_frame.shape[0])

            # Re-calculate w and h based on potentially clipped ends
            w_roi_clipped = x_roi_end - x_roi
            h_roi_clipped = y_roi_end - y_roi

            if w_roi_clipped <= 0 or h_roi_clipped <= 0:
                print("Error: Clipped ROI has zero or negative dimensions. Cannot proceed.")
            else:
                piano_roi_image = first_piano_frame[y_roi:y_roi_end, x_roi:x_roi_end]

                # cv2.imwrite does not create missing directories, so make
                # sure the output directory exists before saving.
                os.makedirs("output_SYNtoSHEET", exist_ok=True)

                # Save and display the detected piano ROI
                cv2.imwrite("output_SYNtoSHEET/detected_piano_roi.png", piano_roi_image)
                print("\nDetected Piano ROI (cropped):")
                display(Image(filename="output_SYNtoSHEET/detected_piano_roi.png"))

                # Step 2: Detect individual keys within the piano ROI
                detected_img_relative, white_keys_relative, black_keys_relative = \
                    detect_keys_from_black_spacing(piano_roi_image, debug=True)

                # Convert relative key coordinates to absolute coordinates in the original frame.
                # Ids are filled in after sorting so they follow left-to-right order.
                all_keys_absolute = []
                for x, y, w, h in white_keys_relative:
                    all_keys_absolute.append({'type': 'white', 'id': None,
                                              'x': x_roi + x, 'y': y_roi + y, 'w': w, 'h': h})
                for x, y, w, h in black_keys_relative:
                    all_keys_absolute.append({'type': 'black', 'id': None,
                                              'x': x_roi + x, 'y': y_roi + y, 'w': w, 'h': h})

                # Sort all keys by their absolute x-coordinate, then number them
                # so that id 0 is the leftmost key.
                all_keys_absolute.sort(key=lambda k: k['x'])
                for key_id, key in enumerate(all_keys_absolute):
                    key['id'] = key_id

                # Create a visualization of the full frame with detected piano ROI and keys
                full_frame_vis = first_piano_frame.copy()
                cv2.rectangle(full_frame_vis, (x_roi, y_roi), (x_roi + w_roi_clipped, y_roi + h_roi_clipped), (0, 255, 255), 2) # Yellow for overall ROI

                for key in all_keys_absolute:
                    # OpenCV colors are BGR: green for white keys, blue for black keys
                    color = (0, 255, 0) if key['type'] == 'white' else (255, 0, 0)
                    cv2.rectangle(full_frame_vis, (key['x'], key['y']), (key['x'] + key['w'], key['y'] + key['h']), color, 2)
                    cv2.putText(full_frame_vis, str(key['id']), (key['x'] + 5, key['y'] + 20),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

                cv2.imwrite("output_SYNtoSHEET/full_frame_with_detected_keys.png", full_frame_vis)
                print("\nFull frame with detected piano ROI and individual keys:")
                display(Image(filename="output_SYNtoSHEET/full_frame_with_detected_keys.png"))

                print("\n--- Next Steps ---")
                print("The `all_keys_absolute` list now contains the absolute coordinates of each detected key.")
                print("You can use this list to define the regions for color change detection in the full video.")
                print("The next step would be to implement the 'calibrate_baselines' and 'detect_key_presses'")
                print("functions (from my previous response) using these `all_keys_absolute` coordinates.")


In [1]:

import cv2
import numpy as np
import os
import sys
from IPython.display import Image, display
import matplotlib.pyplot as plt

# ---------- CONFIG ----------
VIDEO_PATH = '../../Data/video/Twinkle_Twinkle_Little_Star_12_Variations_-_Mozart_Piano_Tutorial_Synthesia.mp4'  # <--- IMPORTANT: update this path to your video file
MAX_FRAMES_TO_SCAN = 500 # Maximum number of frames scanned when looking for the first piano frame
MIN_BLACK_KEYS = 10 # Minimum number of black keys expected in the piano ROI
WHITE_PIXEL_RATIO = 0.5 # Minimum ratio of white pixels in the piano ROI
ROI_HEIGHT_RATIO = 0.25 # Height ratio of the initial ROI (bottom part of the frame only)

# Parameters for key-press detection (from the previous code, for completeness)
CALIBRATION_FRAMES = 30  # Number of initial frames used to calibrate unpressed key colors
KEY_PRESS_COLOR_THRESHOLD = 30 # Threshold for detecting a color change (adjust as needed)

# ---------- OUTPUT DIR ----------
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs("output_SYNtoSHEET", exist_ok=True)

# ---------- Function: Check whether a frame looks like a piano ----------
def is_piano_like_frame(roi, min_black_keys=10, white_pixel_ratio=0.5, debug=False):
    """Check whether the given ROI looks like a piano keyboard.

    The decision combines the share of bright pixels (gray level above 200)
    with the number of dark contours whose bounding box resembles a black
    key.

    Args:
        roi: Image region; converted to grayscale with COLOR_BGR2GRAY.
        min_black_keys: Minimum number of black-key-like contours required.
        white_pixel_ratio: Bright-pixel share that must be exceeded.
        debug: When True, print the measured values.

    Returns:
        Tuple (is_piano, white_ratio, black_keys_count). An ROI with zero
        height or width yields (False, 0, 0).
    """
    if roi.shape[0] == 0 or roi.shape[1] == 0:
        if debug:
            print("  ROI frame kosong.")
        return False, 0, 0

    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

    # Dominance of white: bright pixels divided by all pixels.
    bright_count = (gray > 200).sum()
    white_ratio = bright_count / (gray.shape[0] * gray.shape[1])

    # Dark areas (candidate black keys) become foreground after the
    # inverted threshold, then their outer contours are extracted.
    _, dark_mask = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(dark_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    height_limit = roi.shape[0] * 0.5   # at least half the ROI height
    width_limit = roi.shape[1] * 0.05   # less than 5% of the ROI width

    black_keys_count = 0
    for box in map(cv2.boundingRect, contours):
        box_w, box_h = box[2], box[3]
        # Tall, narrow, and wider than 5 pixels.
        if box_h > height_limit and box_w < width_limit and box_w > 5:
            black_keys_count += 1

    is_piano = white_ratio > white_pixel_ratio and black_keys_count >= min_black_keys
    if debug:
        print(f"  Pemeriksaan ROI: Rasio Putih={white_ratio:.2f}, Tuts Hitam={black_keys_count} -> {'‚úÖ' if is_piano else '‚ùå'}")
    return is_piano, white_ratio, black_keys_count

# ---------- Function: Find the first piano-like frame (using your initial logic) ----------
def find_frame_with_piano_by_whiteness(video_path, max_frames=500, debug=False):
    """Scan the first frames for one whose bottom strip looks like a piano.

    The region tested is always the bottom ROI_HEIGHT_RATIO share of the
    frame, at full width. It is validated by is_piano_like_frame using the
    module-level MIN_BLACK_KEYS and WHITE_PIXEL_RATIO settings.

    Args:
        video_path: Path handed to cv2.VideoCapture.
        max_frames: Upper bound on the number of frames read.
        debug: When True, print per-frame diagnostics.

    Returns:
        Tuple (frame, roi_coords, frame_index). On success, frame is the
        matching frame, roi_coords is its (x, y, w, h) region and
        frame_index is that frame's 0-based index. When nothing matches,
        frame and roi_coords are None and frame_index is the number of
        frames that were read.

    Raises:
        Exception: If the video cannot be opened.
    """
    frame_found = None
    initial_roi_coords = None
    frame_index = 0

    cap = cv2.VideoCapture(video_path)
    # try/finally guarantees the capture is released even if frame
    # analysis raises part-way through the scan.
    try:
        if not cap.isOpened():
            raise Exception(f"‚ùå Gagal membuka video: {video_path}")

        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h_roi = int(frame_height * ROI_HEIGHT_RATIO)

        # The ROI is always the bottom strip of the frame, so its
        # coordinates are the same for every frame.
        roi_y_start = frame_height - h_roi
        bottom_roi_coords = (0, roi_y_start, frame_width, h_roi)

        print(f"Memindai hingga {max_frames} frame untuk menemukan piano di bagian bawah video...")
        while frame_index < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            roi_segment = frame[roi_y_start:frame_height, 0:frame_width]

            is_piano, white_ratio, num_black = is_piano_like_frame(
                roi_segment,
                min_black_keys=MIN_BLACK_KEYS,
                white_pixel_ratio=WHITE_PIXEL_RATIO,
                debug=debug
            )

            if debug:
                print(f"Frame {frame_index}: Putih={white_ratio:.2f}, Hitam={num_black} ‚Üí {'‚úÖ' if is_piano else '‚ùå'}")

            if is_piano:
                frame_found = frame
                initial_roi_coords = bottom_roi_coords
                break

            frame_index += 1
    finally:
        cap.release()

    return frame_found, initial_roi_coords, frame_index

# ---------- Function: Detect white & black keys ----------
def detect_keys_from_black_spacing(image, debug=False, min_key_width_ratio=0.005):
    """Detect black keys and estimate white keys inside a cropped piano ROI.

    Black keys are found as dark contours that are taller than half the
    image, narrower than 5% of its width and wider than 5 pixels. White
    keys are not detected directly; they are estimated from the spacing of
    the black-key centers.

    Args:
        image: Cropped piano ROI; converted to grayscale with COLOR_BGR2GRAY.
        debug: When True, print how many keys of each kind were found.
        min_key_width_ratio: White keys no wider than this fraction of the
            image width are discarded as noise.

    Returns:
        Tuple (vis, white_keys, black_keys). vis is a copy of image with
        the key rectangles drawn on it; white_keys and black_keys are lists
        of (x, y, w, h) tuples relative to image. Black keys are sorted by x.
    """
    img_h, img_w = image.shape[0], image.shape[1]

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Inverted threshold so that dark areas (black keys) become foreground.
    _, thresh = cv2.threshold(gray, 50, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    black_keys = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Tall and narrow within the ROI, but wider than 5 pixels.
        if h > img_h * 0.5 and w < img_w * 0.05 and w > 5:
            black_keys.append((x, y, w, h))

    black_keys.sort(key=lambda b: b[0])
    black_x_centers = [x + w // 2 for x, _, w, _ in black_keys]

    white_keys = []
    if len(black_x_centers) >= 2:
        # Typical center-to-center distance between neighbouring black keys.
        dists = [right - left for left, right in zip(black_x_centers, black_x_centers[1:])]
        avg_gap = np.median(dists)
        if avg_gap == 0:
            avg_gap = img_w / 10  # Fallback when the median gap is zero

        est_white_width = int(avg_gap * 0.75)  # Estimated white key width
        half_white = est_white_width // 2

        # 1. Leftmost white key: from the left edge up to the first black key.
        if black_x_centers[0] - est_white_width > 0:
            white_keys.append((0, 0, black_x_centers[0] - 1, img_h))

        # 2. White keys between neighbouring black keys.
        for x1, x2 in zip(black_x_centers, black_x_centers[1:]):
            gap = x2 - x1
            if gap < avg_gap * 1.5:
                # Typical gap: one white key centered between the two black keys.
                white_keys.append((x1 + (gap // 2) - half_white, 0, est_white_width, img_h))
            else:
                # Larger gap: assume two white keys (E-F or B-C transition),
                # one centered in each half of the gap.
                half_gap = gap // 2
                white_keys.append((x1 + (half_gap // 2) - half_white, 0, est_white_width, img_h))
                white_keys.append((x1 + half_gap + (half_gap // 2) - half_white, 0, est_white_width, img_h))

        # 3. Rightmost white key: from the last black key to the right edge.
        last_black_center = black_x_centers[-1]
        if last_black_center + est_white_width < img_w:
            white_keys.append((last_black_center + 1, 0, img_w - (last_black_center + 1), img_h))
    elif black_x_centers:
        # A single black key gives no spacing information; treat the whole
        # area as one white key block.
        white_keys = [(0, 0, img_w, img_h)]

    # Drop white keys too narrow to be real keys.
    white_keys = [wk for wk in white_keys if wk[2] > img_w * min_key_width_ratio]

    # Draw the results for visualization (OpenCV colors are BGR).
    vis = image.copy()
    for x, y, w, h in white_keys:
        cv2.rectangle(vis, (x, y), (x + w, y + h), (0, 255, 0), 2)  # Green for white keys
    for x, y, w, h in black_keys:
        cv2.rectangle(vis, (x, y), (x + w, y + h), (255, 0, 0), 2)  # Blue (BGR) for black keys

    if debug:
        print(f"Tuts Hitam Terdeteksi: {len(black_keys)}, Tuts Putih Estimasi: {len(white_keys)}")

    return vis, white_keys, black_keys

# ---------- Key-press detection functions (from the previous code) ----------
def get_average_color(image_patch):
    """Return the mean of an image patch taken over its first two axes.

    For an H x W x C patch this yields one mean value per channel.
    """
    rows_and_columns = (0, 1)
    mean_color = np.mean(image_patch, axis=rows_and_columns)
    return mean_color

def color_distance(color1, color2):
    """Return the Euclidean distance between two colors.

    Both inputs are converted to float64 before subtracting, so unsigned
    integer inputs (for example uint8 pixel values) cannot wrap around
    during the subtraction or the squaring.

    Args:
        color1: Array-like of channel values.
        color2: Array-like of channel values with a shape compatible
            with color1.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    delta = np.array(color1, dtype=np.float64) - np.array(color2, dtype=np.float64)
    return np.sqrt(np.sum(delta * delta))

def map_key_to_note(key_index):
    """Map a 0-based key index to a generic label.

    This is a simplified mapping: the label is "Key N", where N is the
    1-based position, not an actual note name.
    """
    position = key_index + 1
    return f"Key {position}"

def calibrate_baselines(video_path, piano_roi_coords, key_regions):
    """Calibrate the 'unpressed' baseline color of every key region.

    The first CALIBRATION_FRAMES frames of the video are read and the mean
    color of each key patch is averaged across those frames.
    NOTE(review): calibration always starts at frame 0 of the video,
    regardless of where the piano first appears — confirm this is intended.

    Args:
        video_path: Path handed to cv2.VideoCapture.
        piano_roi_coords: (x, y, w, h) of the piano ROI in frame coordinates.
        key_regions: Iterable of dicts with keys 'id', 'x', 'y', 'w', 'h',
            where x and y are in frame coordinates.

    Returns:
        Dict mapping key id to its averaged baseline color, or None when
        the video cannot be opened or the ROI lies outside the frame. Keys
        for which no patch could be sampled are reported with a warning
        and given a black baseline, as the warning message states.
    """
    # Accumulators are pre-filled for every key so that keys which are
    # never sampled are still reported after the loop.
    color_sums = {key['id']: np.zeros(3) for key in key_regions}
    sample_counts = {key['id']: 0 for key in key_regions}

    cap = cv2.VideoCapture(video_path)
    # try/finally releases the capture on every exit path, including errors.
    try:
        if not cap.isOpened():
            print(f"Error: Tidak dapat membuka video {video_path}")
            return None

        print(f"Mengkalibrasi baseline selama {CALIBRATION_FRAMES} frame...")
        for _ in range(CALIBRATION_FRAMES):
            ret, frame = cap.read()
            if not ret:
                print("Peringatan: Tidak dapat membaca cukup frame untuk kalibrasi. Menggunakan frame yang tersedia.")
                break

            x_roi, y_roi, w_roi, h_roi = piano_roi_coords

            roi_inside_frame = (
                0 <= y_roi < frame.shape[0] and 0 <= x_roi < frame.shape[1] and
                y_roi + h_roi <= frame.shape[0] and x_roi + w_roi <= frame.shape[1]
            )
            if not roi_inside_frame:
                print(f"Error: ROI Piano {piano_roi_coords} di luar batas frame {frame.shape}. Melewatkan kalibrasi.")
                return None

            piano_roi_frame = frame[y_roi:y_roi+h_roi, x_roi:x_roi+w_roi]
            roi_rows, roi_cols = piano_roi_frame.shape[0], piano_roi_frame.shape[1]

            for key in key_regions:
                key_id = key['id']
                # Key coordinates are absolute; convert to ROI-relative.
                kx_relative = key['x'] - x_roi
                ky_relative = key['y'] - y_roi
                kw = key['w']
                kh = key['h']

                key_inside_roi = (
                    0 <= ky_relative < roi_rows and 0 <= kx_relative < roi_cols and
                    ky_relative + kh <= roi_rows and kx_relative + kw <= roi_cols
                )
                if not key_inside_roi:
                    continue

                key_patch = piano_roi_frame[ky_relative:ky_relative+kh, kx_relative:kx_relative+kw]

                if key_patch.shape[0] > 0 and key_patch.shape[1] > 0:
                    color_sums[key_id] += get_average_color(key_patch)
                    sample_counts[key_id] += 1
    finally:
        cap.release()

    final_baselines = {}
    for key_id, color_sum in color_sums.items():
        if sample_counts[key_id] > 0:
            final_baselines[key_id] = color_sum / sample_counts[key_id]
        else:
            print(f"Peringatan: Tidak ada frame yang diproses untuk kunci {key_id} selama kalibrasi. Default ke hitam.")
            final_baselines[key_id] = np.array([0., 0., 0.])

    print("Kalibrasi selesai.")
    return final_baselines


def detect_key_presses(video_path, piano_roi_coords, key_regions, baseline_colors):
    """Detect key presses from color changes inside each key region.

    The first CALIBRATION_FRAMES frames are skipped. For every later frame
    the mean color of each key patch is compared with its baseline; a key
    counts as pressed while the distance exceeds KEY_PRESS_COLOR_THRESHOLD.
    A note is recorded when a pressed key is released, and keys still held
    when the video ends are closed at the last processed timestamp.

    Args:
        video_path: Path handed to cv2.VideoCapture.
        piano_roi_coords: (x, y, w, h) of the piano ROI in frame coordinates.
        key_regions: Iterable of dicts with keys 'id', 'x', 'y', 'w', 'h',
            where x and y are in frame coordinates.
        baseline_colors: Mapping from key id to its unpressed baseline
            color; keys without an entry are ignored.

    Returns:
        List of (note_name, start_time, end_time) tuples. Times are in
        seconds from the start of the video (the skipped calibration
        frames are included in the frame count). An empty list is returned
        when the video cannot be opened.
    """
    detected_notes = []

    cap = cv2.VideoCapture(video_path)
    # try/finally releases the capture on every exit path, including errors.
    try:
        if not cap.isOpened():
            print(f"Error: Tidak dapat membuka video {video_path}")
            return []

        fps = cap.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also catches NaN and negative values, not just 0.
        if not fps > 0:
            print("Peringatan: Tidak dapat mendapatkan FPS, default ke 30.")
            fps = 30.0

        x_roi, y_roi, w_roi, h_roi = piano_roi_coords
        current_key_states = {key['id']: False for key in key_regions}
        note_start_times = {}

        print("Mendeteksi penekanan tuts...")

        # Skip the calibration frames, but keep counting them so that the
        # timestamps below refer to the position in the video itself.
        frame_idx = 0
        for _ in range(CALIBRATION_FRAMES):
            grabbed, _skipped = cap.read()
            if not grabbed:
                break
            frame_idx += 1

        current_time = frame_idx / fps

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            current_time = frame_idx / fps

            roi_inside_frame = (
                0 <= y_roi < frame.shape[0] and 0 <= x_roi < frame.shape[1] and
                y_roi + h_roi <= frame.shape[0] and x_roi + w_roi <= frame.shape[1]
            )
            if not roi_inside_frame:
                print(f"Error: ROI Piano {piano_roi_coords} di luar batas frame {frame.shape}. Menghentikan deteksi.")
                break

            piano_roi_frame = frame[y_roi:y_roi+h_roi, x_roi:x_roi+w_roi]
            roi_rows, roi_cols = piano_roi_frame.shape[0], piano_roi_frame.shape[1]

            # Annotations are drawn on `frame` so it can be shown in a
            # preview window (e.g. cv2.imshow) while debugging; the frame
            # itself is not stored or returned.
            cv2.rectangle(frame, (x_roi, y_roi), (x_roi + w_roi, y_roi + h_roi), (0, 255, 0), 2)

            for key in key_regions:
                key_id = key['id']
                # Key coordinates are absolute; convert to ROI-relative.
                kx_relative = key['x'] - x_roi
                ky_relative = key['y'] - y_roi
                kw = key['w']
                kh = key['h']

                key_inside_roi = (
                    0 <= ky_relative < roi_rows and 0 <= kx_relative < roi_cols and
                    ky_relative + kh <= roi_rows and kx_relative + kw <= roi_cols
                )
                if not key_inside_roi:
                    continue

                key_patch = piano_roi_frame[ky_relative:ky_relative+kh, kx_relative:kx_relative+kw]
                if key_patch.shape[0] == 0 or key_patch.shape[1] == 0:
                    continue

                current_color = get_average_color(key_patch)

                if key_id not in baseline_colors:
                    continue

                distance = color_distance(current_color, baseline_colors[key_id])
                is_pressed = distance > KEY_PRESS_COLOR_THRESHOLD
                was_pressed = current_key_states[key_id]

                top_left = (key['x'], key['y'])
                bottom_right = (key['x'] + key['w'], key['y'] + key['h'])
                label_pos = (key['x'] + 5, key['y'] + 20)

                if is_pressed and not was_pressed:
                    # Press: remember when the note started.
                    current_key_states[key_id] = True
                    note_start_times[key_id] = current_time
                    cv2.rectangle(frame, top_left, bottom_right, (0, 0, 255), 2)  # Red (BGR): pressed
                    cv2.putText(frame, "P", label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
                elif not is_pressed and was_pressed:
                    # Release: close the note and record it.
                    current_key_states[key_id] = False
                    note_name = map_key_to_note(key_id)
                    start_time = note_start_times.get(key_id, current_time)
                    detected_notes.append((note_name, start_time, current_time))
                    cv2.rectangle(frame, top_left, bottom_right, (255, 0, 0), 1)  # Blue (BGR): released
                    cv2.putText(frame, "R", label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
                elif is_pressed and was_pressed:
                    cv2.rectangle(frame, top_left, bottom_right, (0, 255, 255), 1)  # Yellow (BGR): held
                    cv2.putText(frame, "H", label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 255), 1)
                else:
                    cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 1)  # Green (BGR): not pressed

            frame_idx += 1

        # Close any notes that are still held when the video ends.
        for key_id, still_pressed in current_key_states.items():
            if still_pressed:
                note_name = map_key_to_note(key_id)
                start_time = note_start_times.get(key_id, current_time)
                detected_notes.append((note_name, start_time, current_time))
    finally:
        cap.release()

    print("Deteksi selesai.")
    return detected_notes

# ---------- MAIN EXECUTION BLOCK ----------
# This block is designed to be run directly in a Jupyter Notebook cell.

if not os.path.exists(VIDEO_PATH):
    print(f"Error: File video tidak ditemukan di '{VIDEO_PATH}'. Harap perbarui variabel 'VIDEO_PATH'.")
    sys.exit(1)
else:
    # Langkah 1: Temukan frame mirip piano pertama dan ROI awalnya (dari logika Anda)
    first_piano_frame, initial_roi_coords, found_index = find_frame_with_piano_by_whiteness(
        VIDEO_PATH, max_frames=MAX_FRAMES_TO_SCAN, debug=True
    )

    if first_piano_frame is None:
        print("‚ùå Tidak ditemukan frame dengan pola piano di bagian bawah video. Menghentikan.")
    else:
        print(f"\n‚úÖ Frame piano ditemukan di index ke-{found_index}")
        print(f"   Deteksi awal ROI Piano: x={initial_roi_coords[0]}, y={initial_roi_coords[1]}, "
              f"w={initial_roi_coords[2]}, h={initial_roi_coords[3]}")

        # Buat salinan frame untuk digambar dan ditampilkan
        frame_to_display = first_piano_frame.copy()
        x_init, y_init, w_init, h_init = initial_roi_coords
        cv2.rectangle(frame_to_display, (x_init, y_init), (x_init + w_init, y_init + h_init), (0, 255, 255), 2) # Kuning untuk deteksi awal

        print("\nDeteksi awal ROI Piano di frame pertama:")
        plt.figure(figsize=(12, 8))
        plt.imshow(cv2.cvtColor(frame_to_display, cv2.COLOR_BGR2RGB))
        plt.title(f"Frame {found_index} dengan Deteksi Awal ROI Piano (Kuning)")
        plt.axis('off')
        plt.show()

        print("\nüí° Sekarang, silakan sesuaikan ROI piano secara manual.")
        print("   Di jendela yang muncul, **klik pada sudut kiri atas piano, lalu seret mouse Anda ke sudut kanan bawah piano untuk membuat kotak.**")
        print("   Tekan ENTER atau SPACE untuk mengonfirmasi pilihan Anda.")
        print("   Tekan 'c' untuk membatalkan.")

        # Tampilkan frame untuk pemilihan manual
        cv2.namedWindow("Sesuaikan ROI Piano Manual", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("Sesuaikan ROI Piano Manual", 800, 600)

        # cv2.selectROI mengembalikan (x, y, w, h)
        manual_roi = cv2.selectROI("Sesuaikan ROI Piano Manual", first_piano_frame, fromCenter=False, showCrosshair=True)
        cv2.destroyWindow("Sesuaikan ROI Piano Manual")

        x_manual, y_manual, w_manual, h_manual = [int(val) for val in manual_roi]

        if w_manual > 0 and h_manual > 0:
            corrected_piano_roi_coords = (x_manual, y_manual, w_manual, h_manual)
            print(f"‚úÖ ROI piano berhasil disesuaikan secara manual: x={x_manual}, y={y_manual}, w={w_manual}, h={h_manual}")
        else:
            print("‚ùå Penyesuaian ROI manual dibatalkan atau tidak valid. Menghentikan.")
            sys.exit(1)

        # Ekstrak ROI piano yang telah dikoreksi dari frame yang ditemukan
        x_roi, y_roi, w_roi, h_roi = corrected_piano_roi_coords
        x_roi_end = min(x_roi + w_roi, first_piano_frame.shape[1])
        y_roi_end = min(y_roi + h_roi, first_piano_frame.shape[0])

        w_roi_clipped = x_roi_end - x_roi
        h_roi_clipped = y_roi_end - y_roi

        if w_roi_clipped <= 0 or h_roi_clipped <= 0:
            print("Error: ROI yang dikoreksi memiliki dimensi nol atau negatif. Tidak dapat melanjutkan.")
            sys.exit(1)
        else:
            piano_roi_image = first_piano_frame[y_roi:y_roi_end, x_roi:x_roi_end]

            # Simpan dan tampilkan ROI piano yang terdeteksi (setelah koreksi)
            cv2.imwrite("output_SYNtoSHEET/corrected_piano_roi.png", piano_roi_image)
            print("\nROI Piano yang Dikoreksi (terpotong):")
            display(Image(filename="output_SYNtoSHEET/corrected_piano_roi.png"))

            # Langkah 2: Deteksi tuts individual di dalam ROI piano yang telah dikoreksi
            detected_img_relative, white_keys_relative, black_keys_relative = \
                detect_keys_from_black_spacing(piano_roi_image, debug=True)

            # Konversi koordinat tuts relatif ke koordinat absolut dalam frame asli
            all_keys_absolute = []
            for x, y, w, h in white_keys_relative:
                all_keys_absolute.append({'type': 'white', 'id': len(all_keys_absolute),
                                          'x': x_roi + x, 'y': y_roi + y, 'w': w, 'h': h})
            for x, y, w, h in black_keys_relative:
                all_keys_absolute.append({'type': 'black', 'id': len(all_keys_absolute),
                                          'x': x_roi + x, 'y': y_roi + y, 'w': w, 'h': h})

            # Urutkan semua tuts berdasarkan koordinat x absolutnya untuk urutan yang konsisten
            all_keys_absolute.sort(key=lambda k: k['x'])

            # Buat visualisasi frame penuh dengan ROI piano yang dikoreksi dan tuts
            full_frame_vis = first_piano_frame.copy()
            cv2.rectangle(full_frame_vis, (x_roi, y_roi), (x_roi + w_roi_clipped, y_roi + h_roi_clipped), (0, 255, 0), 3) # Hijau untuk ROI yang dikoreksi

            for key in all_keys_absolute:
                color = (0, 255, 0) if key['type'] == 'white' else (255, 0, 0)
                cv2.rectangle(full_frame_vis, (key['x'], key['y']), (key['x'] + key['w'], key['y'] + key['h']), color, 2)
                cv2.putText(full_frame_vis, str(key['id']), (key['x'] + 5, key['y'] + 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

            cv2.imwrite("output_SYNtoSHEET/full_frame_with_corrected_keys.png", full_frame_vis)
            print("\nFrame penuh dengan ROI piano yang dikoreksi dan tuts individual:")
            display(Image(filename="output_SYNtoSHEET/full_frame_with_corrected_keys.png"))

            # Langkah 3: Kalibrasi Baseline
            # Pastikan kalibrasi dimulai dari awal video, jadi kita perlu membuka ulang video
            baseline_colors = calibrate_baselines(VIDEO_PATH, corrected_piano_roi_coords, all_keys_absolute)
            if not baseline_colors:
                print("Gagal mengkalibrasi warna baseline. Menghentikan.")
            else:
                # Langkah 4: Deteksi Tekan Tuts
                print("\nMemulai deteksi penekanan tuts...")
                detected_notes = detect_key_presses(VIDEO_PATH, corrected_piano_roi_coords, all_keys_absolute, baseline_colors)

                print("\n--- Not Terdeteksi ---")
                if detected_notes:
                    for note_name, start_time, end_time in detected_notes:
                        duration = end_time - start_time
                        print(f"Not: {note_name}, Mulai: {start_time:.2f}s, Akhir: {end_time:.2f}s, Durasi: {duration:.2f}s")
                else:
                    print("Tidak ada not terdeteksi.")

                print("\n--- Langkah Selanjutnya ---")
                print("Data ini (nama not, waktu mulai, waktu akhir) sekarang dapat digunakan untuk membuat lembaran musik.")
                print("Anda biasanya akan menggunakan pustaka notasi musik (misalnya, `music21` di Python) untuk mengkonversi")
                print("data terstruktur ini menjadi skor musik visual, atau mengekspornya sebagai file MIDI.")


Memindai hingga 500 frame untuk menemukan piano di bagian bawah video...
  Pemeriksaan ROI: Rasio Putih=0.00, Tuts Hitam=0 -> ‚ùå
Frame 0: Putih=0.00, Hitam=0 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.00, Tuts Hitam=0 -> ‚ùå
Frame 1: Putih=0.00, Hitam=0 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 2: Putih=0.62, Hitam=7 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 3: Putih=0.62, Hitam=7 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 4: Putih=0.62, Hitam=7 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 5: Putih=0.62, Hitam=7 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 6: Putih=0.62, Hitam=7 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 7: Putih=0.62, Hitam=7 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 8: Putih=0.62, Hitam=7 ‚Üí ‚ùå
  Pemeriksaan ROI: Rasio Putih=0.62, Tuts Hitam=7 -> ‚ùå
Frame 9: Putih=0.62, Hit

In [13]:
import cv2
import numpy as np
import os
import subprocess
from collections import defaultdict
import music21
from sklearn.cluster import KMeans

class SynthesiaToSheetMusic:
    def __init__(self, video_path):
        """Open the video and set up empty detection state.

        Args:
            video_path: Path of the Synthesia video handed to ``cv2.VideoCapture``.

        Raises:
            ValueError: If OpenCV reports that the capture could not be opened.
        """
        self.video_path = video_path
        capture = cv2.VideoCapture(video_path)
        self.cap = capture
        if not capture.isOpened():
            raise ValueError("Could not open video file")

        # Video metadata as reported by OpenCV.
        self.total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        self.fps = capture.get(cv2.CAP_PROP_FPS)

        # Keyboard geometry, filled in by the detection steps.
        self.piano_region = None  # (x, y, w, h) once found
        self.white_keys_pos = []
        self.black_keys_pos = []
        self.key_mapping = {}  # x position -> note name

        # Colour clustering state.
        self.color_clusters = None
        self.color_centers = []
        self.min_color_area = 50  # Minimum pixels to consider a valid note

        # Note tracking: finished events and notes that are still sounding.
        self.note_events = []
        self.current_notes = defaultdict(list)

    def find_piano_region(self):
        """Find the piano region by scanning frames until piano pattern is detected"""
        max_frames_to_scan = int(self.fps * 10)  # Scan up to 10 seconds
        scan_step = max(1, int(self.fps / 2))  # Check 2 frames per second

        for frame_num in range(0, min(max_frames_to_scan, self.total_frames), scan_step):
            self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = self.cap.read()
            if not ret:
                continue

            # Try multiple detection methods
            detected = self._detect_piano_by_edges(frame) or \
                      self._detect_piano_by_contours(frame) or \
                      self._detect_piano_by_pattern(frame)

            if detected:
                print(f"Found piano at frame {frame_num} ({(frame_num/self.fps):.1f} seconds)")
                return True

        # Fallback: if not found, use bottom 20% of last checked frame
        print("Warning: Piano not detected automatically, using fallback position")
        h, w = frame.shape[:2]
        self.piano_region = (0, int(h*0.8), w, int(h*0.2))
        return True

    def _detect_piano_by_edges(self, frame):
        """Detect the piano from long horizontal edges in the lower half of the frame.

        Runs Canny + probabilistic Hough on the bottom half of ``frame`` and
        keeps segments within 5 degrees of horizontal.  When more than three
        such segments exist, the most frequent y coordinate among them is
        taken as the reference edge and ``self.piano_region`` is set to a
        full-width band starting 20 px above it, nominally 140 px tall and
        clipped to the frame.

        Args:
            frame: BGR image as returned by ``cv2.VideoCapture.read``.

        Returns:
            bool: True if a region was set, False otherwise.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray, 50, 150)

        # Look for horizontal lines in bottom half of frame
        h, w = frame.shape[:2]
        roi_top = int(h*0.5)
        roi = edges[roi_top:h, 0:w]

        lines = cv2.HoughLinesP(roi, 1, np.pi/180, threshold=50,
                              minLineLength=w*0.3, maxLineGap=10)
        if lines is None:
            return False

        # Collect the y coordinate of every near-horizontal segment.
        # arctan2 yields angles in [-180, 180], so a segment stored right-to-left
        # shows up near +180 *or* -180; fold the sign away before comparing.
        horizontal_ys = []
        for line in lines:
            x1, y1, x2, y2 = line[0]
            angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
            if abs(angle) < 5 or abs(abs(angle) - 180) < 5:
                horizontal_ys.append(int(y1))

        if len(horizontal_ys) <= 3:  # Need several horizontal lines
            return False

        # Most common y among the horizontal segments only (ties -> smallest y),
        # so slanted or vertical segments cannot pull the band away.
        values, counts = np.unique(horizontal_ys, return_counts=True)
        y_mode = int(values[np.argmax(counts)])

        # Shift back to full-frame coordinates and keep the band inside the frame.
        top = max(0, y_mode + roi_top - 20)
        height = min(140, h - top)
        self.piano_region = (0, top, w, height)
        return True

    def _detect_piano_by_contours(self, frame):
        """Detect the piano from bright, wide blobs in the bottom 40% of the frame.

        The frame is binarised at grey level 200 and external contours are
        extracted from the rows below 60% of the frame height.  Bounding
        boxes that are more than five times wider than tall, wider than
        100 px and taller than 10 px are merged into a single rectangle,
        which becomes ``self.piano_region``.

        Args:
            frame: BGR image as returned by ``cv2.VideoCapture.read``.

        Returns:
            bool: True if at least one qualifying box was found, else False.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

        frame_h, frame_w = frame.shape[:2]
        y_offset = int(frame_h * 0.6)  # top row of the searched strip
        strip = binary[y_offset:frame_h, 0:frame_w]

        contours, _ = cv2.findContours(strip, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Keep only wide, flat boxes.
        boxes = []
        for contour in contours:
            bx, by, bw, bh = cv2.boundingRect(contour)
            if bw / float(bh) > 5 and bw > 100 and bh > 10:
                boxes.append((bx, by, bw, bh))

        if not boxes:
            return False

        # Union of all candidate boxes, in strip coordinates.
        left = min(bx for bx, _, _, _ in boxes)
        right = max(bx + bw for bx, _, bw, _ in boxes)
        top = min(by for _, by, _, _ in boxes)
        bottom = max(by + bh for _, by, _, bh in boxes)

        # Only the top edge needs shifting to full-frame coordinates; the
        # height is a difference and is unaffected by the offset.
        self.piano_region = (left, top + y_offset, right - left, bottom - top)
        return True

    def _detect_piano_by_pattern(self, frame):
        """Detect the piano from light/dark alternation near the bottom of the frame.

        Up to ten adjacent 20 px wide strips, starting a quarter of the way
        across the frame and covering the bottom 20% of its height, are
        classified as light (mean grey > 128) or dark.  If at least five
        strips fit and at least three light/dark changes occur between
        neighbours, the bottom 30% of the frame becomes ``self.piano_region``.

        Args:
            frame: BGR image as returned by ``cv2.VideoCapture.read``.

        Returns:
            bool: True if the alternation test passed, else False.
        """
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        h, w = frame.shape[:2]

        strip_width = 20
        strip_count = 10
        first_left = w // 4  # Avoid edges where UI might be
        band_top = int(h*0.8)

        # Left edges increase monotonically, so filtering on the right-edge
        # bound keeps exactly the leading strips that fit inside the frame.
        lefts = [
            left
            for left in range(first_left, first_left + strip_count * strip_width, strip_width)
            if left + strip_width < w
        ]

        # 1 = light strip, 0 = dark strip
        brightness_flags = [
            1 if np.mean(gray[band_top:h, left:left + strip_width]) > 128 else 0
            for left in lefts
        ]

        if len(brightness_flags) < 5:
            return False

        # Count changes between neighbouring strips.
        transitions = sum(
            1 for current, following in zip(brightness_flags, brightness_flags[1:])
            if current != following
        )
        if transitions < 3:
            return False

        self.piano_region = (0, int(h*0.7), w, int(h*0.3))
        return True

    def detect_piano_keys(self):
        """Detect black and white piano keys in the region"""
        if not self.piano_region:
            return False

        # Get a clean frame (first frame is usually clean)
        self.cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        ret, frame = self.cap.read()
        if not ret:
            return False

        x, y, w, h = self.piano_region
        piano_img = frame[y:y+h, x:x+w]
        gray = cv2.cvtColor(piano_img, cv2.COLOR_BGR2GRAY)

        # Adaptive threshold to handle varying lighting
        thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY_INV, 11, 2)

        # Morphological operations to clean up the image
        kernel = np.ones((3,3), np.uint8)
        thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

        # Find vertical lines (key separators)
        lines = cv2.HoughLinesP(thresh, 1, np.pi/180, threshold=50,minLineLength=h//2, maxLineGap=10)

        if lines is None:
            return False

        # Group vertical lines by x position
        x_positions = []
        for line in lines:
            x1, y1, x2, y2 = line[0]
            if abs(x1 - x2) < 5 and abs(y1 - y2) > h//3:  # Vertical line
                x_positions.append(x1)

        if not x_positions:
            return False

        # Cluster nearby x positions
        x_positions = np.array(x_positions)
        clustering = KMeans(n_clusters=min(50, len(x_positions)), random_state=0).fit(x_positions.reshape(-1, 1))
        clusters = [int(np.mean(x_positions[clustering.labels_ == i]))for i in range(clustering.n_clusters)]
        clusters.sort()

        # Detect black keys (wider gaps between some white keys)
        key_widths = [clusters[i+1] - clusters[i] for i in range(len(clusters)-1)]
        avg_width = np.median(key_widths)

        # Pattern of black keys: groups of 2 and 3 with different gaps
        black_key_indices = []
        i = 0
        while i < len(clusters) - 1:
            gap = clusters[i+1] - clusters[i]
            if gap > avg_width * 1.5:  # Gap before black key
                # Check if this is part of a black key group
                if i + 3 < len(clusters) and clusters[i+3] - clusters[i+2] > avg_width * 1.5:
                    # Group of 2 black keys
                    black_key_indices.extend([i+1, i+2])
                    i += 3
                elif i + 4 < len(clusters) and clusters[i+4] - clusters[i+3] > avg_width * 1.5:
                    # Group of 3 black keys
                    black_key_indices.extend([i+1, i+2, i+3])
                    i += 4
                else:
                    i += 1
            else:
                i += 1

        # Map positions to piano keys
        self.black_keys_pos = [clusters[i] for i in black_key_indices]
        self.white_keys_pos = [x for i, x in enumerate(clusters) if i not in black_key_indices]

        # Create key mapping (position to note)
        # This maps to a standard 88-key piano
        notes = ['A0', 'A#0', 'B0']
        for octave in range(1, 8):
            notes.extend([
                f'C{octave}', f'C#{octave}', f'D{octave}', f'D#{octave}',
                f'E{octave}', f'F{octave}', f'F#{octave}', f'G{octave}',
                f'G#{octave}', f'A{octave}', f'A#{octave}', f'B{octave}'
            ])
        notes.append('C8')

        # Assign notes to white keys (skipping black keys)
        white_key_notes = []
        skip_indices = {1, 3, 6, 8, 10}  # Black key positions in each octave
        for i, note in enumerate(notes):
            if i % 12 not in skip_indices:
                white_key_notes.append(note)

        for pos, note in zip(self.white_keys_pos, white_key_notes[:len(self.white_keys_pos)]):
            self.key_mapping[pos] = note

        print(f"Detected {len(self.white_keys_pos)} white keys and {len(self.black_keys_pos)} black keys")
        return True

    def detect_color_clusters(self, frame):
        """Cluster the saturated pixels of the piano region by colour with k-means.

        Args:
            frame: BGR image as returned by ``cv2.VideoCapture.read``.

        Side effects:
            On success, ``self.color_centers`` is replaced by the cluster
            centres of the final model.

        Returns:
            The per-pixel cluster labels (``labels_`` of the final model) for
            the masked pixels, in the order produced by boolean-mask indexing;
            or an empty list when there are no masked pixels or fewer than 200
            of them.
        """
        left, top, width, height = self.piano_region
        region = frame[top:top + height, left:left + width]

        # Keep only pixels whose HSV saturation and value are both >= 50.
        hsv_region = cv2.cvtColor(region, cv2.COLOR_BGR2HSV)
        saturated = cv2.inRange(hsv_region, (0, 50, 50), (180, 255, 255))
        pixels = region[saturated > 0]

        pixel_count = len(pixels)
        if pixel_count == 0:
            return []

        # At most 8 clusters, and at least 100 pixels per cluster on average.
        largest_k = min(8, pixel_count // 100)
        if largest_k < 2:
            return []

        # Fit every candidate k and keep the one with the LARGEST inertia
        # (negated KMeans.score); the first k wins ties.
        # NOTE(review): inertia normally shrinks as k grows, so this presumably
        # almost always settles on k=2 - it is neither an elbow nor a silhouette
        # criterion.  Left as is because callers index ``color_centers`` with
        # labels from earlier frames; confirm the intent before changing it.
        inertia_by_k = {}
        for k in range(2, largest_k + 1):
            candidate = KMeans(n_clusters=k, random_state=0).fit(pixels)
            inertia_by_k[k] = -candidate.score(pixels)
        chosen_k = max(inertia_by_k, key=inertia_by_k.get)

        # Final clustering with the chosen k.
        final_model = KMeans(n_clusters=chosen_k, random_state=0).fit(pixels)
        self.color_centers = final_model.cluster_centers_

        return final_model.labels_

    def detect_notes(self):
        """Process video to detect note events with dynamic color detection"""
        if not self.piano_region or not self.key_mapping:
            return False

        x, y, w, h = self.piano_region
        key_positions = sorted(self.key_mapping.keys())

        # Process video frame by frame (sample every nth frame for performance)
        frame_step = max(1, int(self.fps // 30))  # Aim for ~30 fps processing
        for frame_num in range(0, self.total_frames, frame_step):
            self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = self.cap.read()
            if not ret:
                continue

            piano_img = frame[y:y+h, x:x+w]

            # Detect color clusters in this frame
            color_labels = self.detect_color_clusters(frame)
            if len(color_labels) == 0:
                continue

            # Create a label map for the entire piano region
            hsv = cv2.cvtColor(piano_img, cv2.COLOR_BGR2HSV)
            mask = cv2.inRange(hsv, (0, 50, 50), (180, 255, 255))
            label_map = np.zeros(piano_img.shape[:2], dtype=np.int8) - 1
            label_map[mask > 0] = color_labels

            # Check each key position for colored pixels
            for i, pos in enumerate(key_positions):
                # Define key area (adjust based on key width)
                key_start = pos - 5 if i == 0 else (key_positions[i-1] + pos) // 2
                key_end = pos + 5 if i == len(key_positions)-1 else (key_positions[i+1] + pos) // 2

                key_area = label_map[:, key_start:key_end]
                colored_pixels = key_area[key_area >= 0]

                if len(colored_pixels) > self.min_color_area:
                    # Find dominant color in the key area
                    unique, counts = np.unique(colored_pixels, return_counts=True)
                    color_id = unique[np.argmax(counts)]
                    color_bgr = self.color_centers[color_id]

                    note = self.key_mapping[pos]

                    # Check if this note is already active for this color
                    active = False
                    for active_note, start_frame in self.current_notes.get(color_id, []):
                        if active_note == note:
                            active = True
                            break

                    if not active:
                        if color_id not in self.current_notes:
                            self.current_notes[color_id] = []
                        self.current_notes[color_id].append((note, frame_num))
                else:
                    # Check if note was released
                    for color_id in list(self.current_notes.keys()):
                        for idx, (active_note, start_frame) in enumerate(self.current_notes[color_id]):
                            if active_note == note:
                                # Note released
                                duration = frame_num - start_frame
                                self.note_events.append({
                                    'note': note,
                                    'color': color_id,
                                    'color_rgb': self.color_centers[color_id],
                                    'start': start_frame / self.fps,
                                    'duration': duration / self.fps
                                })
                                del self.current_notes[color_id][idx]
                                break

        # Add any remaining active notes
        for color_id in self.current_notes:
            for note, start_frame in self.current_notes[color_id]:
                duration = (self.total_frames - start_frame) / self.fps
                self.note_events.append({
                    'note': note,
                    'color': color_id,
                    'color_rgb': self.color_centers[color_id],
                    'start': start_frame / self.fps,
                    'duration': duration / self.fps
                })

        print(f"Detected {len(self.note_events)} note events across {len(set(e['color'] for e in self.note_events))} colors")
        return True

    def calculate_tempo(self):
        """Estimate tempo from note intervals using autocorrelation"""
        if not self.note_events:
            return 120  # Default tempo

        # Get all note onsets
        onsets = sorted([event['start'] for event in self.note_events])

        if len(onsets) < 2:
            return 120

        # Calculate inter-onset intervals
        intervals = np.diff(onsets)

        # Filter out very short intervals (grace notes/ornaments)
        intervals = intervals[intervals > 0.05]

        if len(intervals) == 0:
            return 120

        # Autocorrelation to find periodicity
        max_lag = int(round(2.0 * self.fps))  # Up to 2 seconds
        autocorr = np.zeros(max_lag)

        for lag in range(1, max_lag):
            sum_product = 0
            count = 0
            for i in range(len(intervals) - lag):
                sum_product += intervals[i] * intervals[i + lag]
                count += 1
            if count > 0:
                autocorr[lag] = sum_product / count

        # Find peaks in autocorrelation
        peaks = []
        for i in range(1, len(autocorr)-1):
            if autocorr[i] > autocorr[i-1] and autocorr[i] > autocorr[i+1]:
                peaks.append(i)

        if not peaks:
            return 120

        # The first significant peak is likely the beat period
        beat_period = peaks[0] / self.fps
        tempo = 60 / beat_period

        # Round to nearest standard tempo
        standard_tempos = [40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100,105, 110, 115, 120, 125, 130, 135, 140, 145, 150]
        tempo = min(standard_tempos, key=lambda x: abs(x - tempo))

        return tempo

    def create_musicxml(self, output_path):
        """Write the detected notes to a MusicXML file, one part per colour id.

        Events are grouped by their ``'color'`` value.  Within a part the
        events are laid out one after another in start-time order, with a
        rest inserted whenever an event starts after the previous one ended.
        Durations are converted to quarter lengths using the tempo from
        ``calculate_tempo``; a metronome mark with that tempo is placed at
        the start of the first part.

        NOTE(review): an event that starts before the previous one ended is
        simply appended after it, so overlapping notes (chords) are shifted
        later rather than stacked - confirm whether that is acceptable.

        Args:
            output_path: Destination file path passed to ``score.write``.
        """
        score = music21.stream.Score()
        tempo = self.calculate_tempo()
        seconds_per_beat = 60 / tempo

        # Bucket the events by colour id, keeping first-seen order.
        events_by_color = {}
        for event in self.note_events:
            events_by_color.setdefault(event['color'], []).append(event)

        for color_id, color_events in events_by_color.items():
            part = music21.stream.Part()

            # Label the part with the colour of its first recorded event.
            color_rgb = color_events[0]['color_rgb']
            part.append(music21.instrument.Instrument(
                instrumentName=f"Part {color_id} (RGB: {color_rgb})"
            ))

            cursor = 0.0  # end time, in seconds, of what has been written so far
            for event in sorted(color_events, key=lambda item: item['start']):
                onset = event['start']
                length = event['duration']

                # Pad the gap up to this onset with a rest.
                if onset > cursor:
                    gap_quarters = (onset - cursor) / seconds_per_beat
                    part.append(music21.note.Rest(quarterLength=gap_quarters))
                    cursor = onset

                pitched = music21.note.Note(event['note'])
                pitched.quarterLength = length / seconds_per_beat
                part.append(pitched)
                cursor = onset + length

            score.insert(0, part)

        # The tempo marking lives in the first part only.
        if len(score.parts) > 0:
            score.parts[0].insert(0, music21.tempo.MetronomeMark(number=tempo))

        score.write('musicxml', fp=output_path)
        print(f"Saved MusicXML to {output_path}")

    def convert_to_sheet_music(self, output_path):
        """Main conversion process"""
        if not self.find_piano_region():
            print("Error: Could not find piano region")
            return False

        if not self.detect_piano_keys():
            print("Error: Could not detect piano keys")
            return False

        if not self.detect_notes():
            print("Error: Could not detect notes")
            return False

        # Create temporary MusicXML file
        musicxml_path = os.path.splitext(output_path)[0] + '.musicxml'
        self.create_musicxml(musicxml_path)

        # Convert to PDF using MuseScore (must be installed)
        try:
            subprocess.run(['mscore', musicxml_path, '-o', output_path], check=True)
            print(f"Successfully created sheet music at {output_path}")
            return True
        except (subprocess.CalledProcessError, FileNotFoundError):
            print("MuseScore not found or conversion failed. Created MusicXML file instead.")
            return False

    def release_resources(self):
        """Release video resources when done"""
        if self.cap.isOpened():
            self.cap.release()

    def __del__(self):
        """Destructor to ensure resources are released"""
        self.release_resources()

# Example usage
if __name__ == "__main__":
    converter = SynthesiaToSheetMusic("../../Data/video/Yakuza_6_-_Destiny_Piano_Cover.mp4")
    try:
        converter.convert_to_sheet_music("output_sheet_music.pdf")
    finally:
        # In a notebook `converter` stays alive as a global, so the destructor
        # may not run for a long time; close the capture explicitly.
        converter.release_resources()

Error: Could not detect piano keys
