## Visualization of Spectrograms with Labels and Predictions

Goals: 
- first row: spectrogram (clearly readable, with axis descriptions)
- second row: predictions: bars from each onset to offset in the color of the class
- third row: ground-truth labels: bars from each onset to offset in the color of the class

Maybe reuse parts of Ben's code, but make it efficient.

In [5]:
import json
import os
import re
from glob import glob

import librosa
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import numpy as npS  # original (apparently mistyped) alias, kept so nothing that uses it breaks
import pandas as pd
from IPython.display import Audio, display
from IPython.display import clear_output, display
from ipywidgets import interact, fixed
from matplotlib.patches import Patch
from matplotlib.patches import Rectangle
from tqdm import tqdm
from transformers import WhisperFeatureExtractor
from transformers.audio_utils import mel_filter_bank

In [6]:
# Use WhisperSegFeatureExtractor to generate log-mel spectrograms (different mel scale and norm than librosa)

class WhisperSegFeatureExtractor( WhisperFeatureExtractor ):
    """WhisperFeatureExtractor configured from a sampling rate and a spectrogram time step.

    The hop length is derived from ``spec_time_step * sr``, the FFT size is
    chosen from the sampling rate, and ``mel_filters`` is replaced by an
    80-filter bank (Slaney mel scale, Slaney normalization) that spans
    ``min_frequency`` .. ``max_frequency``.
    """

    def __init__(self, sr, spec_time_step, min_frequency = None, max_frequency = None, chunk_length = 30 ):
        """Configure the extractor.

        Args:
            sr: Sampling rate of the audio in Hz.
            spec_time_step: Duration of one spectrogram frame in seconds;
                ``spec_time_step * sr`` should be a whole number of samples.
            min_frequency: Lower edge of the mel filter bank in Hz (default 0).
            max_frequency: Upper edge of the mel filter bank in Hz
                (default ``sr // 2``).
            chunk_length: Forwarded unchanged to ``WhisperFeatureExtractor``.

        Raises:
            ValueError: If the time step is shorter than one sample.
        """
        exact_hop = spec_time_step * sr
        nearest_hop = round(exact_hop)
        # Float products such as 0.29 * 100 land slightly below the intended
        # integer; snapping to the nearest integer within a tiny tolerance avoids
        # both a truncated hop length and a spurious warning.
        if abs(exact_hop - nearest_hop) <= 1e-9 * max(1.0, abs(exact_hop)):
            hop_length = int(nearest_hop)
        else:
            hop_length = int(exact_hop)
            print("Warning: spec_time_step * sr must be an integer. Consider changing the sampling rate sr.")

        if hop_length < 1:
            raise ValueError(
                "spec_time_step * sr must be at least one sample, got %r" % (exact_hop,)
            )

        # Larger FFT windows for higher sampling rates; thresholds are inclusive.
        n_fft = 8192
        for max_sr, candidate_n_fft in ((32000, 512), (80000, 1024), (150000, 2048), (300000, 4096)):
            if sr <= max_sr:
                n_fft = candidate_n_fft
                break

        if min_frequency is None:
            min_frequency = 0
        if max_frequency is None:
            max_frequency = sr // 2

        super().__init__(
            feature_size=80,
            sampling_rate=sr,
            hop_length=hop_length,
            chunk_length = chunk_length,
            n_fft=n_fft,
            padding_value=0.0,
            return_attention_mask=False )

        # Replace the filter bank built by the parent class so the requested
        # frequency range is honoured.
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=1 + n_fft // 2,
            num_mel_filters=80,
            min_frequency=min_frequency,
            max_frequency=max_frequency,
            sampling_rate=sr,
            norm="slaney",
            mel_scale="slaney",
        )
            
class SpecViewer:
    def __init__( self,  ):
        """Build the color palette used for cluster bars and the spectrogram colormap.

        Sets ``self.colors`` (an ``(n, 3)`` float array of RGB rows) and
        ``self.cmap`` (the "magma" colormap).
        """
        # Tableau colors first, then the CSS4 colors. The very first color is
        # skipped since it does not look good.
        hex_codes = (list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.CSS4_COLORS.values()))[1:]

        # dict.fromkeys drops repeated RGB triples while keeping first-seen order.
        unique_rgb = dict.fromkeys(tuple(mcolors.hex2color(code)) for code in hex_codes)
        palette = np.array(list(unique_rgb), dtype=float)

        # Keep only colors whose mean channel value is below 0.8 (drops very light ones).
        self.colors = palette[palette.mean(axis=1) < 0.8, :]

        # matplotlib.cm.get_cmap is deprecated and removed in Matplotlib 3.9;
        # pyplot.get_cmap is the supported lookup.
        self.cmap = plt.get_cmap("magma")
            
    """
    The following methods implement an interactive visualization for viewing the spectrogram together with its labels.
    """
    def filter_by_category(self, d, category):
        """Return only the segments of ``d`` whose cluster equals ``category``.

        Args:
            d: Mapping with equally long "onset", "offset" and "cluster" sequences.
            category: Cluster to keep; both it and every cluster entry are
                compared by their ``str`` form.

        Returns:
            A new dict with "onset", "offset" and "cluster" lists restricted to
            the matching segments (all empty when nothing matches).
        """
        wanted = str(category)
        # Build the mask element by element: comparing a whole numpy array with a
        # string is version-dependent for empty (float) arrays and never matches
        # non-string cluster ids.
        mask = np.fromiter((str(cluster) == wanted for cluster in d["cluster"]), dtype=bool)
        return {
            key: np.asarray(d[key])[mask].tolist()
            for key in ("onset", "offset", "cluster")
        }


    def chunk_audio(self, audio, start_time, end_time, sr):
        """Return the samples of ``audio`` between ``start_time`` and ``end_time``.

        Both times are in seconds and are converted to sample indices by
        truncating ``time * sr``; the end index is exclusive.
        """
        first_sample, last_sample = (int(moment * sr) for moment in (start_time, end_time))
        return audio[first_sample:last_sample]

    def chunk_label(self, label, start_time, end_time ):
        """Cut ``label`` down to the window ``[start_time, end_time]``.

        A segment is kept when it starts before ``end_time`` and ends after
        ``start_time``. Kept segments are clipped to the window and expressed
        relative to ``start_time``.

        Returns:
            Dict with "onset", "offset" and "cluster" lists of the kept segments.
        """
        onsets = np.array(label["onset"])
        offsets = np.array(label["offset"])

        inside_window = (onsets < end_time) & (offsets > start_time)

        # Clip to the window first, then shift so the window starts at 0.
        clipped_onsets = np.maximum(onsets[inside_window], start_time) - start_time
        clipped_offsets = np.minimum(offsets[inside_window], end_time) - start_time
        kept_clusters = [label["cluster"][position] for position in np.flatnonzero(inside_window)]

        return {
            "onset": clipped_onsets.tolist(),
            "offset": clipped_offsets.tolist(),
            "cluster": kept_clusters,
        }
    
    def min_max_norm(self, im, min_value = None, max_value = None ):
        """Rescale ``im`` linearly so ``min_value`` maps to 0 and ``max_value`` to 1.

        Missing bounds are taken from ``im`` itself. The divisor is floored at
        1e-12 so a constant image does not divide by zero.
        """
        lower = im.min() if min_value is None else min_value
        upper = im.max() if max_value is None else max_value
        value_range = max( upper - lower, 1e-12 )
        return (im - lower) / value_range

    def overlap_ratio(self, a_start, a_end, b_start, b_end):
        """Return intersection length divided by covered span for two intervals.

        The span runs from the earlier start to the later end. The result is 0
        when the intervals do not overlap or when the span is not positive.
        """
        intersection = max(0, min(a_end, b_end) - max(a_start, b_start))
        span = max(a_end, b_end) - min(a_start, b_start)
        if span > 0:
            return intersection / span
        return 0
    
    
    def get_fp_fn_indices(self, predictions, labels, tolerance=0.1):
        # predictions, labels: Dicts mit onset, offset, cluster
        # Rückgabewerte: Listen der Indizes von FP und FN
        import pandas as pd

        pred_df = pd.DataFrame(predictions)
        label_df = pd.DataFrame(labels)
        matched_pred = set()
        matched_label = set()
        fp_indices = []
        fn_indices = []

        for label_idx, label_row in label_df.iterrows():
            label_onset = label_row["onset"]
            label_offset = label_row["offset"]
            label_cluster = label_row["cluster"]

            found = False
            for pred_idx, pred_row in pred_df.iterrows():
                pred_onset = pred_row["onset"]
                pred_offset = pred_row["offset"]
                pred_cluster = pred_row["cluster"]
                intersection = max(0, min(pred_offset, label_offset) - max(pred_onset, label_onset))
                union = max(pred_offset, label_offset) - min(pred_onset, label_onset)
                overlap_ratio = intersection / union if union > 0 else 0

                if overlap_ratio > tolerance:
                    found = True
                    if pred_cluster == label_cluster:
                        matched_pred.add(pred_idx)
                        matched_label.add(label_idx)
                    else:
                        matched_pred.add(pred_idx) # zählt als falsch klassifiziert, nicht FN
                        matched_label.add(label_idx)
                    break

            if not found:
                fn_indices.append(label_idx)

        for pred_idx in range(len(pred_df)):
            if pred_idx not in matched_pred:
                fp_indices.append(pred_idx)

        return fp_indices, fn_indices
    
    def _render_segment_bar(self, segments, n_frames, spec_time_step, cluster_color_mapper, bar_height):
        """Return a ``(bar_height, n_frames, 3)`` RGB image with one colored span per segment."""
        bar = np.ones( (n_frames, 3), dtype = np.float32 )
        onsets = segments["onset"]
        n_segments = len(onsets)
        for pos in range(n_segments):
            onset_idx = int(np.round(onsets[pos] / spec_time_step))
            offset_idx = int(np.round(segments["offset"][pos] / spec_time_step))
            cluster = segments["cluster"][pos]

            # Leave a one-frame gap when the next segment starts exactly where this
            # one ends and has the same cluster; otherwise the two would be drawn
            # as a single bar.
            if pos + 1 < n_segments and \
                    offset_idx == int(np.round(onsets[pos + 1] / spec_time_step)) and \
                    cluster == segments["cluster"][pos + 1]:
                offset_idx -= 1

            bar[onset_idx:offset_idx, :] = cluster_color_mapper[cluster]
        return np.tile( bar[np.newaxis, :, :], [bar_height, 1, 1] )

    def _outline_segments(self, ax, segments, indices, spec_time_step, bar_height, edgecolor):
        """Draw an unfilled rectangle on ``ax`` around each segment listed in ``indices``."""
        for idx in indices:
            onset_idx = int(np.round(segments["onset"][idx] / spec_time_step))
            offset_idx = int(np.round(segments["offset"][idx] / spec_time_step))
            ax.add_patch(Rectangle((onset_idx, -1), offset_idx - onset_idx, bar_height,
                                   linewidth=2, edgecolor=edgecolor, facecolor='none'))

    def plot_spec_and_labels(self, offset, window_size, audio, prediction, label, sr, audio_file_name, feature_extractor, precision_bits , min_spec_value, max_spec_value, xticks_step_size ):
        """Plot one window of the recording: spectrogram, prediction bar, label bar, and an audio player.

        Args:
            offset: Start of the window in seconds.
            window_size: Length of the window in seconds.
            audio: Full audio signal (sliced by sample index).
            prediction: Predicted segments with "onset", "offset", "cluster".
            label: Ground-truth segments with "onset", "offset", "cluster".
            sr: Sampling rate in Hz.
            audio_file_name: Not used by this method.
            feature_extractor: Callable returning ``{"input_features": [...]}``;
                its ``hop_length`` attribute defines the frame duration.
            precision_bits: Number of decimals in the x tick labels.
            min_spec_value: Lower bound for spectrogram normalization, or None
                to use the window minimum.
            max_spec_value: Upper bound for spectrogram normalization, or None
                to use the window maximum.
            xticks_step_size: Distance between x ticks in seconds.
        """
        bar_height = 40

        # One color per cluster name; sorting keeps the assignment stable across windows.
        all_unique_clusters = sorted(set( list(label["cluster"]) + list(prediction["cluster"]) ))
        cluster_color_mapper = {
            cluster: self.colors[ position % len(self.colors) ]
            for position, cluster in enumerate(all_unique_clusters)
        }

        start_time = offset
        end_time = start_time + window_size

        audio_chunked = self.chunk_audio( audio, start_time, end_time, sr )
        label_chunked = self.chunk_label( label, start_time, end_time )
        prediction_chunked = self.chunk_label( prediction, start_time, end_time )

        spec = feature_extractor( audio_chunked, sampling_rate=sr, padding = "do_not_pad" )["input_features"][0]

        # Map the normalized spectrogram through the colormap (RGB only) and flip
        # it vertically so row 0 of the image is the last spectrogram row.
        spec_colorful = self.cmap(self.min_max_norm(spec, min_spec_value, max_spec_value))[:, :, :3]
        spec_colorful = np.flipud(spec_colorful)

        spec_time_step = feature_extractor.hop_length / sr
        # At least one frame per tick; a step of 0 would make np.arange fail.
        spec_xticks_step_size = max(1, int(np.round( xticks_step_size / spec_time_step )))
        spec_xticks_values = np.arange(0, spec.shape[1] + 1, spec_xticks_step_size )

        xticks_format = "%%.%df"%(precision_bits)
        spec_xticks_labels = [ xticks_format%(v) for v in spec_xticks_values * spec_time_step + start_time ]

        spec_labels_image = self._render_segment_bar(label_chunked, spec.shape[1], spec_time_step, cluster_color_mapper, bar_height)
        spec_preds_image = self._render_segment_bar(prediction_chunked, spec.shape[1], spec_time_step, cluster_color_mapper, bar_height)

        fig = plt.figure(figsize=(12, 3), constrained_layout=True)
        gs = fig.add_gridspec(3, 1, height_ratios=[spec.shape[0], bar_height, bar_height], hspace=0.2)

        # Spectrogram
        ax_spec = fig.add_subplot(gs[0])
        ax_spec.imshow(spec_colorful, aspect='equal', origin='upper')
        ax_spec.set_ylabel("Frequency (kHz)")
        ax_spec.set_xticks(spec_xticks_values)
        ax_spec.set_xticklabels(spec_xticks_labels)
        ax_spec.set_xlabel("Time (s)")

        # Y ticks at whole kHz values.
        # NOTE(review): positions assume the rows are spaced linearly from sr/2
        # (top) down to 0 (bottom); if the rows are mel bins or a custom frequency
        # range is used, the labels are only approximate — confirm.
        num_mel_bins = spec.shape[0]
        mel_bin_freqs = np.linspace(sr / 2, 0, num_mel_bins)
        tick_freqs_khz = np.arange(0, int(sr / 2 / 1000) + 1, 1)
        tick_positions = [np.argmin(np.abs(mel_bin_freqs - f * 1000)) for f in tick_freqs_khz]
        tick_labels = [f"{f}" for f in tick_freqs_khz]
        ax_spec.set_yticks(tick_positions)
        ax_spec.set_yticklabels(tick_labels)

        # Prediction bar
        ax_pred = fig.add_subplot(gs[1])
        ax_pred.imshow(spec_preds_image, aspect='equal', origin='upper')
        ax_pred.set_yticks([])
        ax_pred.set_xticks([])
        ax_pred.spines[['top', 'bottom', 'right', 'left']].set_visible(False)

        # Label bar
        ax_label = fig.add_subplot(gs[2])
        ax_label.imshow(spec_labels_image, aspect='equal', origin='upper')
        ax_label.set_xticks([])
        ax_label.set_yticks([])
        ax_label.spines[['top', 'bottom', 'right', 'left']].set_visible(False)

        # Legend: one entry per cluster plus the FP/FN outline styles. plt.legend
        # attaches it to the current axes, i.e. the label bar created last.
        patches = [Patch(color=color, label=cluster) for cluster, color in cluster_color_mapper.items()]
        patches.append(Patch(facecolor='none', edgecolor='red', linewidth=2, label='False Positive'))
        patches.append(Patch(facecolor='none', edgecolor='blue', linewidth=2, label='False Negative'))
        plt.legend(handles=patches, loc="upper right", bbox_to_anchor=(1, 1))

        fp_indices, fn_indices = self.get_fp_fn_indices(prediction_chunked, label_chunked)
        # False positives are outlined on the prediction bar, false negatives on the label bar.
        self._outline_segments(ax_pred, prediction_chunked, fp_indices, spec_time_step, spec_preds_image.shape[0], 'red')
        self._outline_segments(ax_label, label_chunked, fn_indices, spec_time_step, spec_labels_image.shape[0], 'blue')

        # Play exactly the audio shown in the window (the chunk cut above).
        display(Audio(audio_chunked, rate=sr))
                
    def _as_segment_dict(self, segments):
        """Return ``segments`` as a fresh dict with string clusters; None becomes an empty set of segments."""
        if segments is None:
            return {"onset": [], "offset": [], "cluster": []}
        if isinstance( segments, pd.DataFrame ):
            segments = segments.to_dict("list")
        else:
            # Shallow copy so the caller's mapping is not modified below.
            segments = dict(segments)
        segments["cluster"] = list(map(str, segments["cluster"]))
        return segments

    def visualize( self, audio, sr, prediction = None, label = None, min_frequency = None, max_frequency = None, precision_bits = 3, audio_file_name = "", window_size = 5.0, xticks_step_size = 0.5, spec_width = 1000):
        """Start an interactive viewer with a slider over the window start time.

        Args:
            audio: Full audio signal.
            sr: Sampling rate in Hz.
            prediction: Predicted segments (dict or DataFrame with "onset",
                "offset", "cluster"), or None.
            label: Ground-truth segments in the same format, or None.
            min_frequency: Lower frequency bound passed to the feature extractor.
            max_frequency: Upper frequency bound passed to the feature extractor.
            precision_bits: Number of decimals in the x tick labels.
            audio_file_name: Forwarded to ``plot_spec_and_labels``.
            window_size: Length of the displayed window in seconds.
            xticks_step_size: Distance between x ticks in seconds.
            spec_width: Number of spectrogram frames per window.

        Returns:
            Whatever ``ipywidgets.interact`` returns for ``plot_spec_and_labels``.
        """
        feature_extractor = WhisperSegFeatureExtractor( sr, window_size / spec_width, min_frequency, max_frequency )

        # Each window is normalized by its own min/max. Fixed bounds (for example
        # percentiles of a spectrogram of the whole recording) could be supplied here.
        min_spec_value = None
        max_spec_value = None

        label = self._as_segment_dict(label)
        prediction = self._as_segment_dict(prediction)

        return interact(self.plot_spec_and_labels,
                    # Slider from 0 to the last full window, in steps of 1/20 window.
                    offset=(0, max(0, len(audio)/sr - window_size ), window_size / 20 ),
                    window_size = fixed(window_size),
                    audio = fixed(audio),
                    prediction = fixed(prediction),
                    label = fixed(label),
                    sr = fixed(sr),
                    audio_file_name = fixed(audio_file_name),
                    feature_extractor = fixed(feature_extractor),
                    precision_bits = fixed(precision_bits),
                    min_spec_value = fixed(min_spec_value),
                    max_spec_value = fixed(max_spec_value),
                    xticks_step_size = fixed(xticks_step_size)
                        )
    def _evaluate_performance_single(self, label, prediction, tolerance=0.1



    ):
        import pandas as pd

        label = {"onset": [], "offset": [], "cluster": []} if label is None else label
        prediction = {"onset": [], "offset": [], "cluster": []} if prediction is None else prediction
        label["cluster"] = list(map(str, label["cluster"]))
        prediction["cluster"] = list(map(str, prediction["cluster"]))

        label_df = pd.DataFrame(label)
        pred_df = pd.DataFrame(prediction)
        matched_pred = set()
        matched_label = set()
        false_class = 0

        for label_idx, label_row in label_df.iterrows():
            label_onset = label_row["onset"]
            label_offset = label_row["offset"]
            label_cluster = label_row["cluster"]

            for pred_idx, pred_row in pred_df.iterrows():
                if pred_idx in matched_pred:
                    continue
                pred_onset = pred_row["onset"]
                pred_offset = pred_row["offset"]
                pred_cluster = pred_row["cluster"]

                intersection = max(0, min(pred_offset, label_offset) - max(pred_onset, label_onset))
                union = max(pred_offset, label_offset) - min(pred_onset, label_onset)
                overlap_ratio = intersection / union if union > 0 else 0

                if overlap_ratio > tolerance:
                    if pred_cluster == label_cluster:
                        matched_pred.add(pred_idx)
                        matched_label.add(label_idx)
                        break
                    else:
                        false_class += 1
                        matched_pred.add(pred_idx)
                        matched_label.add(label_idx)
                        break

        tp = len(matched_label) - false_class
        fp = len(pred_df) - len(matched_pred)
        fn = len(label_df) - len(matched_label)
        fc = false_class

        print(f"True Positives (TP): {tp}")
        print(f"False Class (FC):    {fc}  (timed right, wrong label)")
        print(f"False Positives (FP): {fp}")
        print(f"False Negatives (FN): {fn}\n")

        denom_precision = tp + fp + fc
        denom_recall = tp + fn + fc

        precision = tp / denom_precision if denom_precision > 0 else 0
        recall = tp / denom_recall if denom_recall > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        print(f"Segment-wise Precision: {precision:.4f}")
        print(f"Segment-wise Recall:    {recall:.4f}")
        print(f"Segment-wise F1 Score:  {f1:.4f}")


    def evaluate_performance_overall(self, prediction=None, label=None, tolerance=0.1):
        import pandas as pd

        label = {"onset": [], "offset": [], "cluster": []} if label is None else label
        prediction = {"onset": [], "offset": [], "cluster": []} if prediction is None else prediction
        label["cluster"] = list(map(str, label["cluster"]))
        prediction["cluster"] = list(map(str, prediction["cluster"]))

        self.pred_df = pd.DataFrame(prediction)
        self.label_df = pd.DataFrame(label)

        if self.pred_df.empty or self.label_df.empty:
            print("Keine Vorhersagen oder Labels vorhanden.")
            return

        pred_df = self.pred_df.copy()
        label_df = self.label_df.copy()

        # categories can be ["vocal", "target"] or anything that matches your use case
        categories = ["vocal", "target"]

        for cat in categories:
            label_cat = self.filter_by_category(label, cat)
            prediction_cat = self.filter_by_category(prediction, cat)
            print(f"\n=== Evaluation for category: {cat.upper()} ===")
            self._evaluate_performance_single(label_cat, prediction_cat, tolerance)  # see below

        matched_pred = set()
        matched_label = set()
        false_class = 0  # Zähler für zeitlich passende, aber falsch klassifizierte Vorhersagen

        # Suche True Positives und False Class
        for label_idx, label_row in label_df.iterrows():
            label_onset = label_row["onset"]
            label_offset = label_row["offset"]
            label_cluster = label_row["cluster"]

            for pred_idx, pred_row in pred_df.iterrows():
                if pred_idx in matched_pred:
                    continue  # Diese Vorhersage wurde schon gematcht

                pred_onset = pred_row["onset"]
                pred_offset = pred_row["offset"]
                pred_cluster = pred_row["cluster"]

                intersection = max(0, min(pred_offset, label_offset) - max(pred_onset, label_onset))
                union = max(pred_offset, label_offset) - min(pred_onset, label_onset)
                overlap_ratio = intersection / union if union > 0 else 0

                if overlap_ratio > tolerance:
                    if pred_cluster == label_cluster:
                        matched_pred.add(pred_idx)
                        matched_label.add(label_idx)
                        break  # Gültiges Match gefunden
                    else:
                        false_class += 1
                        matched_pred.add(pred_idx)
                        matched_label.add(label_idx)
                        break  # Auch als Match gezählt, aber mit falscher Klasse

        tp = len(matched_label) - false_class
        fp = len(pred_df) - len(matched_pred)
        fn = len(label_df) - len(matched_label)
        fc = false_class

        print("\n--- Gesamtauswertung über alle Zeitbereiche ---")
        print(f"True Positives (TP): {tp}")
        print(f"False Class (FC):    {fc}  (zeitlich korrekt, aber falsches Label)")
        print(f"False Positives (FP): {fp}")
        print(f"False Negatives (FN): {fn}\n")

        # Präzision, Recall, F1 berechnen (unter Berücksichtigung von False Class)
        denom_precision = tp + fp + fc
        denom_recall = tp + fn + fc

        precision = tp / denom_precision if denom_precision > 0 else 0
        recall = tp / denom_recall if denom_recall > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        print(f"Segment-wise Precision: {precision:.4f}")
        print(f"Segment-wise Recall:    {recall:.4f}")
        print(f"Segment-wise F1 Score:  {f1:.4f}")


    
def slice_audio_and_label( audio, label, sr, start_time, end_time ):
    """Cut a time window out of ``audio`` and out of the matching ``label``.

    Segments overlapping the window are clipped to it and expressed relative
    to ``start_time``. When the audio ends before ``end_time``, the window end
    is the actual end of the sliced audio.

    Returns:
        ``(sliced_audio, sliced_label)``; ``sliced_label`` is a DataFrame when
        ``label`` is one, otherwise a dict with "onset", "offset", "cluster".
    """
    first_sample = int( start_time * sr )
    last_sample = int( end_time * sr )
    sliced_audio = audio[ first_sample:last_sample ]

    # The slice may be shorter than requested, so recompute the real window end.
    end_time = start_time + len(sliced_audio) / sr

    all_onsets = np.array( label["onset"] )
    all_offsets = np.array( label["offset"] )
    all_clusters = list(label["cluster"])

    overlapping = np.flatnonzero( (all_onsets < end_time) & (all_offsets > start_time) )

    sliced_label = {"onset": [], "offset": [], "cluster": []}
    for position in overlapping:
        sliced_label["onset"].append( max( 0, all_onsets[position] - start_time ) )
        sliced_label["offset"].append( min( all_offsets[position] - start_time, end_time - start_time ) )
        sliced_label["cluster"].append( all_clusters[position] )

    if isinstance( label, pd.DataFrame ):
        return sliced_audio, pd.DataFrame( sliced_label )
    return sliced_audio, sliced_label


def remove_silent_sections(audio, labels, predictions, sr, silence_threshold=None):
    """Cut event-free stretches longer than ``silence_threshold`` out of a recording.

    Labels and predictions are merged into non-overlapping event intervals.
    A stretch without any event (before the first event, between events, or
    after the last one) is kept when it lasts at most ``silence_threshold``
    seconds and removed otherwise. Event times are shifted so they stay
    aligned with the shortened audio.

    Args:
        audio: Full audio signal.
        labels: Ground-truth segments with "onset", "offset", "cluster".
        predictions: Predicted segments in the same format.
        sr: Sampling rate in Hz.
        silence_threshold: Longest silence in seconds that is kept. ``None``
            disables the function and returns the inputs unchanged.

    Returns:
        ``(new_audio, new_labels, new_predictions)``. The inputs are returned
        as they are when the threshold is ``None`` or there are no events.
    """
    if silence_threshold is None:
        return audio, labels, predictions

    all_onsets = list(labels["onset"]) + list(predictions["onset"])
    all_offsets = list(labels["offset"]) + list(predictions["offset"])

    if not all_onsets or not all_offsets:
        return audio, labels, predictions  # no events at all

    # Merge overlapping or touching events into disjoint intervals, sorted by start.
    merged_intervals = []
    for start, end in sorted(zip(all_onsets, all_offsets), key=lambda x: x[0]):
        if merged_intervals and start <= merged_intervals[-1][1]:
            merged_intervals[-1][1] = max(merged_intervals[-1][1], end)
        else:
            merged_intervals.append([start, end])

    new_audio = []
    new_labels = {"onset": [], "offset": [], "cluster": []}
    new_predictions = {"onset": [], "offset": [], "cluster": []}
    current_time = 0.0  # position in the output audio, in seconds
    last_end = 0.0      # end of the previous event interval in the input audio

    def keep_gap_if_short(gap_start, gap_end):
        """Append the silent stretch to the output if it is short enough; return the kept duration."""
        if not 0 < gap_end - gap_start <= silence_threshold:
            return 0.0
        gap_audio = audio[int(gap_start * sr):int(gap_end * sr)]
        new_audio.append(gap_audio)
        return len(gap_audio) / sr

    for start, end in merged_intervals:
        # The timeline only advances by audio that is actually kept; this is
        # what keeps the shifted events aligned with the output audio.
        current_time += keep_gap_if_short(last_end, start)

        sliced_audio, sliced_label = slice_audio_and_label(audio, labels, sr, start, end)
        _, sliced_pred = slice_audio_and_label(audio, predictions, sr, start, end)
        new_audio.append(sliced_audio)

        for target, sliced in ((new_labels, sliced_label), (new_predictions, sliced_pred)):
            target["onset"] += [o + current_time for o in sliced["onset"]]
            target["offset"] += [o + current_time for o in sliced["offset"]]
            target["cluster"] += sliced["cluster"]

        current_time += len(sliced_audio) / sr
        last_end = end

    # Trailing silence follows the same rule as silence between events.
    keep_gap_if_short(last_end, len(audio) / sr)

    new_audio = np.concatenate(new_audio) if new_audio else np.array([], dtype=np.float32)
    return new_audio, new_labels, new_predictions


In [8]:
import json
import librosa
import numpy as np
import matplotlib.pyplot as plt

# --- Input files ---
# Alternative inputs from earlier experiments; uncomment one to switch.
#json_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\pred\(2019_03_15-12_02_11)_CSWMUW240241_0000_first.json"
#pred_json_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\pred\train.py, only finetune, 5 epochs, on lemur_2call_data, test\(2019_03_15-12_02_11)_CSWMUW240241_0000_first.jsonr"
#pred_json_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\models\16.6. 15 epochs, af_without_cp_newscheduler, only finetuning, lemur_data_2call\Neuer Ordner\(2019_03_15-12_02_11)_CSWMUW240241_0000_first.jsonr"
#pred_json_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\models\16.6. 15 epochs, af_without_cp_newscheduler, only finetuning, lemur_data_2call\pred_empty.jsonr"
#pred_json_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\pred\18.6\(2019_03_15-12_02_11)_CSWMUW240241_0000_first.jsonr"

wav_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\pred\(2019_03_15-12_02_11)_CSWMUW240241_0000_first.wav"
json_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\Raven\labels_yesno.json"
pred_json_path = r"C:\Users\sophi\Documents\Masterarbeit\Results\pred\(2019_03_15-12_02_11)_CSWMUW240241_0000_first_pred.json"


def _load_segments(path, description):
    """Load a segment JSON file and return it with all clusters converted to strings.

    Raises:
        ValueError: If one of the keys "onset", "offset", "cluster" is missing.
    """
    with open(path, "r", encoding="utf-8") as f:
        segments = json.load(f)
    # Explicit check instead of assert, so it is not skipped under `python -O`.
    if not all(k in segments for k in ["onset", "offset", "cluster"]):
        raise ValueError(f"{description} JSON muss 'onset', 'offset', 'cluster' enthalten.")
    segments["cluster"] = list(map(str, segments["cluster"]))
    return segments


# --- Load audio (at its native sampling rate), labels and predictions ---
audio, sr = librosa.load(wav_path, sr=None)
labels = _load_segments(json_path, "Labels")
predictions = _load_segments(pred_json_path, "Predictions")

# --- Remove long silences ---
# Currently disabled: with silence_threshold=None the inputs are returned unchanged.
audio_cleaned, labels_cleaned, predictions_cleaned = remove_silent_sections(
    audio, labels, predictions, sr, silence_threshold=None
)

# --- Print evaluation metrics ---
viewer = SpecViewer()
viewer.evaluate_performance_overall(label=labels_cleaned,
    prediction=predictions_cleaned)

# --- Start the interactive visualization ---
# The cleaned labels are passed so they stay aligned with the cleaned audio
# and the cleaned predictions.
widget = viewer.visualize(
    audio=audio_cleaned,
    sr=sr,
    label=labels_cleaned,
    prediction=predictions_cleaned,
    audio_file_name=wav_path,
    window_size=40.0,      # window length in seconds
    xticks_step_size=5     # distance between x-axis ticks in seconds
)


  self.cmap = cm.get_cmap("magma")



=== Evaluation for category: VOCAL ===
True Positives (TP): 122
False Class (FC):    0  (timed right, wrong label)
False Positives (FP): 119
False Negatives (FN): 59

Segment-wise Precision: 0.5062
Segment-wise Recall:    0.6740
Segment-wise F1 Score:  0.5782

=== Evaluation for category: TARGET ===
True Positives (TP): 0
False Class (FC):    0  (timed right, wrong label)
False Positives (FP): 0
False Negatives (FN): 0

Segment-wise Precision: 0.0000
Segment-wise Recall:    0.0000
Segment-wise F1 Score:  0.0000

--- Gesamtauswertung über alle Zeitbereiche ---
True Positives (TP): 122
False Class (FC):    0  (zeitlich korrekt, aber falsches Label)
False Positives (FP): 119
False Negatives (FN): 59

Segment-wise Precision: 0.5062
Segment-wise Recall:    0.6740
Segment-wise F1 Score:  0.5782


interactive(children=(FloatSlider(value=1076.0, description='offset', max=2155.6073125, step=2.0), Output()), …