In [12]:

import pandas as pd
import numpy as np
import librosa
import os
import subprocess


server_data = os.path.join("G:\\dev","server_data")
metadata = os.path.join("G:\\dev","metadata")
models = os.path.join("G:\\dev\\models", "models")


# server_data = os.path.join("/home/rwd/dev","server_data")
# metadata = os.path.join("/home/rwd","metadata")
# models = os.path.join("/home/rwd", "output")





### ============= 

def validate_audio_files(server_data, upload_id):
    """
    Validate audio files in the server_data directory.
    """

    uploads = os.path.join(server_data, "uploads")
    
    
    audio_files = [f for f in os.listdir(uploads) if f.endswith('.wav') or f.endswith('.mp3') and f.startswith(upload_id)]
    
    
    
    return audio_files

def infer_signals(track):
    """
    Infer signals from audio tracks.
    """
    try:
        y, sr = librosa.load(track, sr=22050)
        return y
    except Exception as e:
        print(f"Error inferring signals for {track}: {e}")
        return None
    
import utils


def extract_y_middle(y, seconds):
        """
        takes in path of a file, loads it, and returns a ndarray of samples
        """
        
        if utils.assert_signal_length(y, 22050, seconds):
            
                # extract signal length * 22050
                y = utils.get_from_middle(y, 22050, seconds)
                y = utils.get_hanned(1, y, 22050, False)

                # normalization [0.0:1.0]
                y = utils.normalize_audio(y)

                print(f"Record was sliced successfully.")
                print(f"Length of y_minute: {len(y)}")
                print(f"Length of y: {len(y) / 22050} seconds")
                print(f"y.max: {np.max(y)}")
                print(f"y.min: {np.min(y)}")
                print(f"y.mean: {np.mean(y)}")
                print(f"y.std: {np.std(y)}")
                print(f"y.shape: {y.shape}")
                return y
        else:
            print(f"Record {y} was not long enough.")
            return pd.Series([y, np.nan])
        
import soundfile as sf

def generate_audio_from_frames(filename: str, signal, sampling_rate, server_data):
    
    os.makedirs(os.path.join(server_data, "30s"), exist_ok=True)
    
    if filename.endswith(".mp3"):
        x = filename.replace(".mp3", ".wav")
    
    
    try:
        sf.write(os.path.join(server_data, "30s", x), signal, sampling_rate)
        print(f"Audio file {x} generated successfully.")
    except Exception as e:
        print(f"Error generating audio file {x}: {e}")
    
    return os.path.join(server_data, "30s", x)


def split_to_frames(y, frame_length=22050, hop_length=11025):
    """
    Split the audio signal into frames.
    """

    frames_amount = y.shape[0] // 22050
    
    if frames_amount < 1:
        print("Audio signal is too short to split into frames.")
        return []

    [x*11025 for x in range(1, frames_amount * 2 + 1)]
    if y is None or len(y) == 0:
        print("No audio signal provided.")
        return []
    
    frames = librosa.util.frame(y, frame_length=frame_length, hop_length=hop_length).T
    return frames


def save_feature_to_server_data(feature_name: str, server_data, upload_id, feature):
    
    upload_dir = os.path.join(server_data, "features", upload_id, feature_name)
    print(upload_dir)
    if not os.path.exists(upload_dir):
        os.makedirs(upload_dir)

    with open(f"{os.path.join(upload_dir)}/{feature_name}.npy", "wb") as f:
        np.save(f, feature)

    print(os.listdir(upload_dir))
    
        


### =============  



In [2]:
from pathlib import Path


### ========= VIDEO GENERATION

def generate_video(upload_id, data_type, sample_location):
    
    match data_type:
        case "ft":
            
            print("ft_chosen")
            dir = os.path.join(server_data, "ft", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            print(f"list_of_files.__len__() {list_of_files.__len__()}")
            print(f"framerate: {framerate_real_time}")
            
            upload_dir = os.path.join(server_data, "ft", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            print(upload_dir)
            input_pattern = str(Path(dir) / "frame_%04d.png")
            

        case "spectr":
            
            print("spectr_chosen")
            dir = os.path.join(server_data, "spectr", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "spectr", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")

        case "mel_spectr":
            
            print("melspectr_chosen")
            dir = os.path.join(server_data, "mel_spectr", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "mel_spectr", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")

        case "power_spectr":
            
            print("power_spectr_chosen")
            dir = os.path.join(server_data, "power_spectr", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "power_spectr", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")
        
        case "mfcc":
            
            print("mfcc _ chosen")
            dir = os.path.join(server_data, "mfcc", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "mfcc", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")

        case "stft":
            
            print("stft _ chosen")
            dir = os.path.join(server_data, "stft", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "stft", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")

        case "cens":
            
            print("cens _ chosen")
            dir = os.path.join(server_data, "cens", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "cens", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")
        
        case "cqt":
            
            print("cqt _ chosen")
            dir = os.path.join(server_data, "cqt", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "cqt", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")
        
        case "tonnetz":
            
            print("tonnetz _ chosen")
            dir = os.path.join(server_data, "tonnetz", upload_id)
            list_of_files = sorted(os.listdir(dir))
            framerate_real_time = list_of_files.__len__() / 30
            
            print(f"framerate: {framerate_real_time}")
            upload_dir = os.path.join(server_data, "tonnetz", upload_id, "video")
            
            if not os.path.exists(upload_dir):
                os.makedirs(upload_dir)
            
            output = os.path.join(upload_dir, upload_id)
            
            print(upload_dir)
            
            input_pattern = str(Path(dir) / "frame_%04d.png")
        

        case _:
            raise ValueError(f"Unsupported data type: {data_type}")
            
        
    cmd = [
        "ffmpeg",
        "-framerate", f"{framerate_real_time:.2f}",
        "-y",
        "-i", input_pattern,
        "-i", sample_location,
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        "-c:a", "aac",
        "-b:a", "192k",
        "-shortest",
        f"{output}.mp4"
    ]
    
    print("Running command:", ' '.join(cmd))
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print("FFmpeg failed!", e)
        
    
### ========== IMAGE GENERATION    
        
def generate_ft_graphs(frames_ft, server_data, upload_id):
    """
    Generate and save ft images from the frames.
    """
    import matplotlib.pyplot as plt
    
    
    
    seconds_per_frame = 30 / frames_ft.__len__()

    if not os.path.exists(os.path.join(server_data, "ft")):
        os.makedirs(os.path.join(server_data, "ft"))
    
    if not os.path.exists(os.path.join(server_data, "ft", upload_id)):
        os.makedirs(os.path.join(server_data, "ft", upload_id))
    
    for i, ft in enumerate(frames_ft):
        plt.figure(figsize=(10, 4))
        plt.xlim(0, 512)
        plt.plot(ft)
        plt.xlabel('Hz')
        plt.ylabel('Amplituda')
        plt.xticks(np.arange(0, 513, 16))
        
        plt.grid(True)
        plt.title(f'Frame {i+1} | Fourier Transform | {(i+1)*seconds_per_frame/100:.2f} s')
        plt.tight_layout()
        plt.savefig(os.path.join(server_data, "ft", upload_id, f'frame_{i+1:04d}.png'))
        plt.close()
        
        
def generate_spectrogram_graphs(frames_spectr, server_data, upload_id):
    """
    Generate and save spectrogram images from the frames.
    """
    import matplotlib.pyplot as plt
    
    seconds_per_frame = 30 / frames_spectr.__len__()
    
    if not os.path.exists(os.path.join(server_data, "spectr")):
        os.makedirs(os.path.join(server_data, "spectr"))
    
    if not os.path.exists(os.path.join(server_data, "spectr", upload_id)):
        os.makedirs(os.path.join(server_data, "spectr", upload_id))
    
    for i, spectr in enumerate(frames_spectr):
        
        fig = plt.figure(figsize=(10, 4))
        ax = fig.add_subplot(111)
        
        librosa.display.specshow(spectr, ax=ax,sr=22050, x_axis='time', y_axis='log')
        
        
        plt.title(f'Frame {i+1} | Spectrogram | {(i+1)*seconds_per_frame/100:.2f} s')
        plt.savefig(os.path.join(server_data, "spectr", upload_id, f'frame_{i+1:04d}.png'))
        plt.close()


def generate_mel_spectrogram_graphs(frames_spectr, server_data, upload_id):
    """
    Generate and save spectrogram images from the frames.
    """
    import matplotlib.pyplot as plt
    
    seconds_per_frame = 30 / frames_spectr.__len__()
    
    if not os.path.exists(os.path.join(server_data, "mel_spectr")):
        os.makedirs(os.path.join(server_data, "mel_spectr"))
    
    if not os.path.exists(os.path.join(server_data, "mel_spectr", upload_id)):
        os.makedirs(os.path.join(server_data, "mel_spectr", upload_id))
    
    for i, spectr in enumerate(frames_spectr):
        
        fig = plt.figure(figsize=(10, 4))
        ax = fig.add_subplot(111)
        
        librosa.display.specshow(spectr, ax=ax,sr=22050, x_axis='time', y_axis='mel', fmax=8000)
        
        
        plt.title(f'Frame {i+1} | Mel Spectrogram | {(i+1)*seconds_per_frame/100:.2f} s')
        plt.savefig(os.path.join(server_data,"mel_spectr", upload_id, f'frame_{i+1:04d}.png'))
        plt.close()


def generate_power_spectrogram_graphs(frames_spectr, server_data, upload_id):
    """
    Generate and save spectrogram images from the frames.
    """
    import matplotlib.pyplot as plt
    
    seconds_per_frame = 30 / frames_spectr.__len__()
    
    if not os.path.exists(os.path.join(server_data, "power_spectr")):
        os.makedirs(os.path.join(server_data, "power_spectr"))
    
    if not os.path.exists(os.path.join(server_data, "power_spectr", upload_id)):
        os.makedirs(os.path.join(server_data, "power_spectr", upload_id))
    
    for i, spectr in enumerate(frames_spectr):
        
        fig = plt.figure(figsize=(10, 4))
        ax = fig.add_subplot(111)
        
        librosa.display.specshow(spectr, ax=ax,sr=22050, x_axis='time', y_axis='hz', fmax=8000)
        
        
        plt.title(f'Frame {i+1} | Power Spectrogram | {(i+1)*seconds_per_frame/100:.2f} s')
        plt.savefig(os.path.join(server_data,"power_spectr", upload_id, f'frame_{i+1:04d}.png'))
        plt.close()


def generate_mfcc_graphs(frames_spectr, server_data, upload_id):
    """
    Generate and save spectrogram images from the frames.
    """
    import matplotlib.pyplot as plt
    
    seconds_per_frame = 30 / frames_spectr.__len__()
    
    if not os.path.exists(os.path.join(server_data, "mfcc")):
        os.makedirs(os.path.join(server_data, "mfcc"))
    
    if not os.path.exists(os.path.join(server_data, "mfcc", upload_id)):
        os.makedirs(os.path.join(server_data, "mfcc", upload_id))
    
    for i, mfcc in enumerate(frames_spectr):
        
        fig = plt.figure(figsize=(10, 4))
        ax = fig.add_subplot(111)
        
        librosa.display.specshow(mfcc, ax=ax,sr=22050, x_axis='time')
        
        
        plt.title(f'Frame {i+1} | MFCC | {(i+1)*seconds_per_frame/100:.2f} s')
        plt.savefig(os.path.join(server_data,"power_spectr", upload_id, f'frame_{i+1:04d}.png'))
        plt.close()


def generate_chroma_graphs(frames_spectr, transformation: str, server_data, upload_id):
    """
    Generate and save spectrogram images from the frames.
    """
    import matplotlib.pyplot as plt
    
    if not transformation in ["cens", "cqt", "stft"]:
        raise ValueError(f"Given {transformation} as transformation, wrong paths will be produced. Aborting.")

    seconds_per_frame = 30 / frames_spectr.__len__()
    
    if not os.path.exists(os.path.join(server_data, transformation)):
        os.makedirs(os.path.join(server_data, transformation))
    
    if not os.path.exists(os.path.join(server_data, transformation, upload_id)):
        os.makedirs(os.path.join(server_data, transformation, upload_id))
    
    for i, mfcc in enumerate(frames_spectr):
        
        fig = plt.figure(figsize=(10, 4))
        ax = fig.add_subplot(111)
        
        librosa.display.specshow(mfcc, sr=22050,  x_axis='time', y_axis='chroma', cmap='coolwarm', ax=ax)
        
        plt.title(f'Frame {i+1} | Chroma ({transformation}) | {(i+1)*seconds_per_frame/100:.2f} s')
        plt.savefig(os.path.join(server_data, transformation, upload_id, f'frame_{i+1:04d}.png'))
        plt.close()

def generate_tonnetz_graphs(frames_spectr, server_data, upload_id):
    """
    Generate and save spectrogram images from the frames.
    """
    import matplotlib.pyplot as plt
    
    seconds_per_frame = 30 / frames_spectr.__len__()
    
    if not os.path.exists(os.path.join(server_data, "tonnetz")):
        os.makedirs(os.path.join(server_data, "tonnetz"))
    
    if not os.path.exists(os.path.join(server_data, "tonnetz", upload_id)):
        os.makedirs(os.path.join(server_data, "tonnetz", upload_id))
    
    for i, tonnetz in enumerate(frames_spectr):
        
        fig = plt.figure(figsize=(10, 4))
        ax = fig.add_subplot(111)
        
        librosa.display.specshow(tonnetz, ax=ax,sr=22050, x_axis='time', y_axis='tonnetz', cmap='twilight_shifted')
        
        
        plt.title(f'Frame {i+1} | Tonnetz | {(i+1)*seconds_per_frame/100:.2f} s')
        plt.savefig(os.path.join(server_data, "tonnetz", upload_id, f'frame_{i+1:04d}.png'))
        plt.close()

### ========= VIDEO GENERATION


In [17]:
### =============== SIGNALS TRANSFORMATIONS


def transform_to_ft(frames, metadata_path, normalized: bool):
    ft = []
    
    for i, frame in enumerate(frames):
        ft.append(np.abs(librosa.stft(frame, hop_length=256)))
        
    
    ft = np.array(ft).astype(np.float32)
    
    print(f"Shape of ft: {ft.shape}")
    print(f"Min/max of ft: {np.min(ft)}/{np.max(ft)}")
    
    if normalized:
        ft_mean = np.load(os.path.join(metadata_path, "ft_mean.npy"))
        ft_std = np.load(os.path.join(metadata_path, "ft_std.npy"))
    

        ft = np.log1p(ft)
        
        ft_mean = np.mean(ft, axis=(0, 1), keepdims=True)
        X = (ft - ft_mean) / (ft_std + 1e-8)

    else: 
        X = ft    
    
    print(f"Shape of ft normalized: {X.shape}")
    print(f"Min/max of ft normalized: {np.min(X)}/{np.max(X)}")
            
    return X


def transform_to_spectr(frames, metadata_path, normalized: bool):
    spectr = []
    
    for i, frame in enumerate(frames):
       spectr.append(librosa.amplitude_to_db(
                np.abs(librosa.stft(frame, hop_length=256)), ref=np.max))
    
    spectr = np.array(spectr).astype(np.float32)
    
    
    print(f"Shape of spectrogram: {spectr.shape}")
    print(f"Min/max of spectrogram: {np.min(spectr)}/{np.max(spectr)}")
    
    if normalized:
        spectr_mean = np.load(os.path.join(metadata_path, "spec_mean.npy"))
        spectr_std = np.load(os.path.join(metadata_path, "spec_std.npy"))
            
        spectr = (spectr - spectr_mean) / (spectr_std)
        
        print(f"Shape of spectrogram normalized: {spectr.shape}")
        print(f"Min/max of spectrogram normalized: {np.min(spectr)}/{np.max(spectr)}")
                
    return spectr


def transform_to_mel_spectr(frames, metadata_path, normalized: bool):
    spectr = []
    
    for i, frame in enumerate(frames):
       
        mel_spect = librosa.feature.melspectrogram(
                y=frame, sr=22050, n_fft=8192, hop_length=256, n_mels=1025)
        mel_spect = librosa.power_to_db(mel_spect, ref=np.max)

        spectr.append(mel_spect)
    
    spectr = np.array(spectr).astype(np.float32)
    
    
    print(f"Shape of mel spectrogram: {spectr.shape}")
    print(f"Min/max of mel spectrogram: {np.min(spectr)}/{np.max(spectr)}")
    
    if normalized:
        spectr_mean = np.load(os.path.join(metadata_path, "mel_spec_mean.npy"))
        spectr_std = np.load(os.path.join(metadata_path, "mel_spec_std.npy"))
            
        spectr = (spectr - spectr_mean) / (spectr_std)
        
        print(f"Shape of mel spectrogram normalized: {spectr.shape}")
        print(f"Min/max of mel spectrogram normalized: {np.min(spectr)}/{np.max(spectr)}")
                
    return spectr


def transform_to_power_spectr(frames, metadata_path, normalized: bool):

    spectr = []
    
    for i, frame in enumerate(frames):

        ft = librosa.stft(frame, hop_length=256)
        power_spec = np.abs(ft) ** 2
        power_db = librosa.power_to_db(power_spec, ref=np.max)
       
        spectr.append(power_db)
    
    spectr = np.array(spectr).astype(np.float32)
    
    
    print(f"Shape of power spectrogram: {spectr.shape}")
    print(f"Min/max of power spectrogram: {np.min(spectr)}/{np.max(spectr)}")
    
    if normalized:
        spectr_mean = np.load(os.path.join(metadata_path, "power_spec_mean.npy"))
        spectr_std = np.load(os.path.join(metadata_path, "power_spec_std.npy"))
            
        spectr = (spectr - spectr_mean) / (spectr_std)
        
        print(f"Shape of power spectrogram normalized: {spectr.shape}")
        print(f"Min/max of power spectrogram normalized: {np.min(spectr)}/{np.max(spectr)}")
                
    return spectr

def transform_to_mfcc(frames, metadata_path, normalized: bool):

    mfcc = []
    for i, frame in enumerate(frames):

        mfcc_x = librosa.feature.mfcc(
                y=frame, sr=22050, n_mfcc=12, hop_length=256)

        mfcc.append(mfcc_x)
    
    mfcc = np.array(mfcc).astype(np.float32)
    
    
    print(f"Shape of mfcc: {mfcc.shape}")
    print(f"Min/max of mfcc: {np.min(mfcc)}/{np.max(mfcc)}")
    
    if np.min(mfcc) <= -1.0 or np.max(mfcc) >= 1.0:
        print("Data needs normalization")

        if normalized:

            
            mfcc_mean = np.load(os.path.join(metadata_path, "mfcc_mean.npy"))
            mfcc_std = np.load(os.path.join(metadata_path, "mfcc_std.npy"))
            
            mfcc = (mfcc - mfcc_mean) / (mfcc_std)
            
            print(f"Shape of mfcc normalized: {mfcc.shape}")
            print(f"Min/max of mfcc normalized: {np.min(mfcc)}/{np.max(mfcc)}")
                
    return mfcc


def transform_to_chroma(frames, metadata_path, transformation:str, normalized: bool):

    chroma = []
    
    for i, frame in enumerate(frames):

        match transformation:
            case "stft":
                chroma_x = librosa.feature.chroma_stft(
                y=frame, sr=22050, n_chroma=12,
                hop_length=256, n_fft=2048)
            case "cens":
                chroma_x = librosa.feature.chroma_cens(
                y=frame, sr=22050, n_chroma=12,
                hop_length=512)

            case "cqt":
                chroma_x = librosa.feature.chroma_cqt(
                y=frame, sr=22050, n_chroma=12,
                hop_length=512)

        chroma.append(chroma_x)
    
    chroma = np.array(chroma).astype(np.float32)
    
    
    print(f"Shape of chroma {transformation} spectrogram: {chroma.shape}")
    print(f"Min/max of chroma {transformation} spectrogram: {np.min(chroma)}/{np.max(chroma)}")
    
    if np.min(chroma) <= -1.0 or np.max(chroma) >= 1.0:
        print("Data needs normalization")

        if normalized:

            chroma_mean = np.load(os.path.join(metadata_path, f"chroma_{transformation}_mean.npy"))
            chroma_std = np.load(os.path.join(metadata_path, f"chroma_{transformation}_std.npy"))
                
            chroma = (chroma - chroma_mean) / (chroma_std)
            
            print(f"Shape of chroma {transformation} normalized: {chroma.shape}")
            print(f"Min/max of chroma {transformation} normalized: {np.min(chroma)}/{np.max(chroma)}")
                
    return chroma

def transform_to_tonnetz(frames, metadata_path, normalized: bool):

    tonnetz = []
    
    for i, frame in enumerate(frames):

        
        chroma = librosa.feature.chroma_cqt(y=frame, sr=22050)
        tonnetz_x = librosa.feature.tonnetz(chroma=chroma, sr=22050)
        
        tonnetz.append(tonnetz_x)
    
    tonnetz = np.array(tonnetz).astype(np.float32)
    
    
    print(f"Shape of tonnetz: {tonnetz.shape}")
    print(f"Min/max of tonnetz: {np.min(tonnetz)}/{np.max(tonnetz)}")
    
    if np.min(tonnetz) <= -1.0 or np.max(tonnetz) >= 1.0:
        
        print("Data needs normalization")

        if normalized:
            tonnetz_mean = np.load(os.path.join(metadata_path, "tonnetz_mean.npy"))
            tonnetz_std = np.load(os.path.join(metadata_path, "tonnetz_std.npy"))
        
            tonnetz = (tonnetz - tonnetz_mean) / (tonnetz_std)
            
            print(f"Shape of tonnetz normalized: {tonnetz.shape}")
            print(f"Min/max of tonnetz normalized: {np.min(tonnetz)}/{np.max(tonnetz)}")
                
    return tonnetz


### =============== SIGNALS TRANSFORMATIONS

In [13]:

tracks = validate_audio_files(server_data, "glupoty")


tracks

['glupoty.mp3']

In [14]:


tracks = validate_audio_files(server_data, "glupoty")
signal = infer_signals(os.path.join(server_data, "uploads", tracks[0]))
        
y = extract_y_middle(signal, 30)  # Example usage with the first track


sample_location = generate_audio_from_frames("sample.mp3", y, 22050, server_data)


frames_no_hop = split_to_frames(y, frame_length=22050, hop_length=22050)
frames_hop_2205 = split_to_frames(y, hop_length=2205)

Length of audio signal: 269.80 seconds
Record was sliced successfully.
Length of y_minute: 661500
Length of y: 30.0 seconds
y.max: 0.9730347394943237
y.min: -1.0
y.mean: -3.840836507151835e-05
y.std: 0.18719276785850525
y.shape: (661500,)
Audio file sample.wav generated successfully.


In [15]:

import IPython.display as ipd
ipd.Audio(y, rate=22050)  # Play the audio sign

In [18]:
frames_ft = transform_to_ft(frames_hop_2205, metadata, True)

generate_ft_graphs(frames_ft, server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "ft", sample_location)

save_feature_to_server_data("ft", server_data, "example-fasf23-23123-23asd", frames_ft)

Shape of ft: (291, 1025, 87)
Min/max of ft: 1.7289931986130824e-10/189.89466857910156
Shape of ft normalized: (291, 1025, 87)
Min/max of ft normalized: -1.036533236503601/7.253082752227783
ft_chosen
list_of_files.__len__() 292
framerate: 9.733333333333333
G:\dev\server_data\ft\example-fasf23-23123-23asd\video
Running command: ffmpeg -framerate 9.73 -y -i G:\dev\server_data\ft\example-fasf23-23123-23asd\frame_%04d.png -i G:\dev\server_data\30s\sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest G:\dev\server_data\ft\example-fasf23-23123-23asd\video\example-fasf23-23123-23asd.mp4
G:\dev\server_data\features\example-fasf23-23123-23asd\ft
['ft.npy']


In [19]:
spectr_normalized = transform_to_spectr(frames_hop_2205, metadata, True)

generate_spectrogram_graphs(spectr_normalized, server_data ,"example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "spectr", sample_location)

save_feature_to_server_data("spectr", server_data, "example-fasf23-23123-23asd", spectr_normalized)

Shape of spectrogram: (291, 1025, 87)
Min/max of spectrogram: -80.0/0.0
Shape of spectrogram normalized: (291, 1025, 87)
Min/max of spectrogram normalized: -2.0571823120117188/2.960369110107422
spectr_chosen
framerate: 9.733333333333333
G:\dev\server_data\spectr\example-fasf23-23123-23asd\video
Running command: ffmpeg -framerate 9.73 -y -i G:\dev\server_data\spectr\example-fasf23-23123-23asd\frame_%04d.png -i G:\dev\server_data\30s\sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest G:\dev\server_data\spectr\example-fasf23-23123-23asd\video\example-fasf23-23123-23asd.mp4
G:\dev\server_data\features\example-fasf23-23123-23asd\spectr
['spectr.npy']


In [None]:
mel_spectr_normalized = transform_to_mel_spectr(frames_hop_2205, metadata, True)

generate_mel_spectrogram_graphs(mel_spectr_normalized, server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "mel_spectr", sample_location)


save_feature_to_server_data("mel_spectr", server_data, "example-fasf23-23123-23asd", mel_spectr_normalized)

Shape of mel spectrogram: (291, 1025, 87)
Min/max of mel spectrogram: -80.0/3.814697265625e-06
Shape of mel spectrogram normalized: (291, 1025, 87)
Min/max of mel spectrogram normalized: -2.089002847671509/2.861764430999756
melspectr_chosen
framerate: 9.733333333333333
/home/rwd/dev/server_data/mel_spectr/example-fasf23-23123-23asd/video
Running command: ffmpeg -framerate 9.73 -y -i /home/rwd/dev/server_data/mel_spectr/example-fasf23-23123-23asd/frame_%04d.png -i /home/rwd/dev/server_data/30s/sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /home/rwd/dev/server_data/mel_spectr/example-fasf23-23123-23asd/video/example-fasf23-23123-23asd.mp4


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
Input #0, image2, from '/home/rwd/dev/server_data/mel_spectr/example-fasf23-23123-23asd/frame_%04d.png':
  Duration: 00:00:29.91, start: 0.000000, bitrate: N/A
  Stream #0:0: Video: png, rgba(pc, gbr/unknown/unknown), 1000x400 [SAR 3937:3937 DAR 5:2], 9.73 fps, 9.73 tbr, 9.73 tbn
[aist#1:0/pcm_s16le @ 0x22b092c0] Guessed Channel Layout: mono
Input #1, wav, from '/home/rwd/dev/server_data/30s/sample.wav':
  Duration: 00:00:30.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, mono, s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (

In [75]:
power_spectr_normalized = transform_to_power_spectr(frames_hop_2205, metadata, True)

generate_power_spectrogram_graphs(power_spectr_normalized, server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "power_spectr", sample_location)


save_feature_to_server_data("power_spectr", server_data, "example-fasf23-23123-23asd", power_spectr_normalized)

Shape of power spectrogram: (291, 1025, 87)
Min/max of power spectrogram: -80.0/3.814697265625e-06
Shape of power spectrogram normalized: (291, 1025, 87)
Min/max of power spectrogram normalized: -1.427354097366333/3.3731274604797363
power_spectr_chosen
framerate: 9.733333333333333
/home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/video
Running command: ffmpeg -framerate 9.73 -y -i /home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/frame_%04d.png -i /home/rwd/dev/server_data/30s/sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/video/example-fasf23-23123-23asd.mp4


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
Input #0, image2, from '/home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/frame_%04d.png':
  Duration: 00:00:29.91, start: 0.000000, bitrate: N/A
  Stream #0:0: Video: png, rgba(pc, gbr/unknown/unknown), 1000x400 [SAR 3937:3937 DAR 5:2], 9.73 fps, 9.73 tbr, 9.73 tbn
[aist#1:0/pcm_s16le @ 0x38b38580] Guessed Channel Layout: mono
Input #1, wav, from '/home/rwd/dev/server_data/30s/sample.wav':
  Duration: 00:00:30.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, mono, s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0

/home/rwd/dev/server_data/features/example-fasf23-23123-23asd/power_spectr
['power_spectr.npy']


In [76]:
mfcc_normalized = transform_to_mfcc(frames_hop_2205, metadata, True)

generate_mfcc_graphs(mfcc_normalized, server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "power_spectr", sample_location)


save_feature_to_server_data("mfcc", server_data, "example-fasf23-23123-23asd", mfcc_normalized)

Shape of mfcc: (291, 12, 87)
Min/max of mfcc: -557.2332153320312/193.99948120117188
Data needs normalization
Shape of mfcc normalized: (291, 12, 87)
Min/max of mfcc normalized: -6.921818733215332/2.9211695194244385
power_spectr_chosen
framerate: 9.733333333333333
/home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/video
Running command: ffmpeg -framerate 9.73 -y -i /home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/frame_%04d.png -i /home/rwd/dev/server_data/30s/sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/video/example-fasf23-23123-23asd.mp4


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
Input #0, image2, from '/home/rwd/dev/server_data/power_spectr/example-fasf23-23123-23asd/frame_%04d.png':
  Duration: 00:00:29.91, start: 0.000000, bitrate: N/A
  Stream #0:0: Video: png, rgba(pc, gbr/unknown/unknown), 1000x400 [SAR 3937:3937 DAR 5:2], 9.73 fps, 9.73 tbr, 9.73 tbn
[aist#1:0/pcm_s16le @ 0x2719a380] Guessed Channel Layout: mono
Input #1, wav, from '/home/rwd/dev/server_data/30s/sample.wav':
  Duration: 00:00:30.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, mono, s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0

/home/rwd/dev/server_data/features/example-fasf23-23123-23asd/mfcc
['mfcc.npy']


[out#0/mp4 @ 0x271eaa00] video:1310KiB audio:417KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.561838%
frame=  291 fps= 53 q=-1.0 Lsize=    1737KiB time=00:00:29.70 bitrate= 479.0kbits/s speed=5.41x    
[libx264 @ 0x27193a00] frame I:2     Avg QP:19.14  size:  8995
[libx264 @ 0x27193a00] frame P:243   Avg QP:21.37  size:  4843
[libx264 @ 0x27193a00] frame B:46    Avg QP:22.18  size:  3169
[libx264 @ 0x27193a00] consecutive B-frames: 69.4% 27.5%  3.1%  0.0%
[libx264 @ 0x27193a00] mb I  I16..4: 38.0% 46.8% 15.2%
[libx264 @ 0x27193a00] mb P  I16..4: 19.1% 11.9%  3.4%  P16..4: 19.4%  5.3%  1.3%  0.0%  0.0%    skip:39.5%
[libx264 @ 0x27193a00] mb B  I16..4:  6.5%  4.2%  0.6%  B16..8: 24.0%  5.2%  0.3%  direct: 8.4%  skip:50.8%  L0:53.3% L1:41.4% BI: 5.3%
[libx264 @ 0x27193a00] 8x8 transform intra:35.0% inter:81.8%
[libx264 @ 0x27193a00] coded y,uvDC,uvAC intra: 15.0% 84.0% 43.1% inter: 9.2% 24.3% 2.1%
[libx264 @ 0x27193a00] i16 v,h,dc,p: 47% 47%  4%  3%
[libx264

In [77]:
normalized_chroma_stft = transform_to_chroma(frames_hop_2205, metadata,  "stft", True)

generate_chroma_graphs(normalized_chroma_stft, "stft", server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "stft", sample_location)


save_feature_to_server_data("chroma_stft", server_data, "example-fasf23-23123-23asd", normalized_chroma_stft)

Shape of chroma stft spectrogram: (291, 12, 87)
Min/max of chroma stft spectrogram: 0.0007928688428364694/1.0
Data needs normalization
Shape of chroma stft normalized: (291, 12, 87)
Min/max of chroma stft normalized: -1.430617094039917/2.0787482261657715
stft _ chosen
framerate: 9.733333333333333
/home/rwd/dev/server_data/stft/example-fasf23-23123-23asd/video
Running command: ffmpeg -framerate 9.73 -y -i /home/rwd/dev/server_data/stft/example-fasf23-23123-23asd/frame_%04d.png -i /home/rwd/dev/server_data/30s/sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /home/rwd/dev/server_data/stft/example-fasf23-23123-23asd/video/example-fasf23-23123-23asd.mp4


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
Input #0, image2, from '/home/rwd/dev/server_data/stft/example-fasf23-23123-23asd/frame_%04d.png':
  Duration: 00:00:29.91, start: 0.000000, bitrate: N/A
  Stream #0:0: Video: png, rgba(pc, gbr/unknown/unknown), 1000x400 [SAR 3937:3937 DAR 5:2], 9.73 fps, 9.73 tbr, 9.73 tbn
[aist#1:0/pcm_s16le @ 0x25367780] Guessed Channel Layout: mono
Input #1, wav, from '/home/rwd/dev/server_data/30s/sample.wav':
  Duration: 00:00:30.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, mono, s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (png (n

/home/rwd/dev/server_data/features/example-fasf23-23123-23asd/chroma_stft
['chroma_stft.npy']


[out#0/mp4 @ 0x25367900] video:2566KiB audio:417KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.306410%
frame=  291 fps= 50 q=-1.0 Lsize=    2992KiB time=00:00:29.70 bitrate= 825.3kbits/s speed=5.06x    
[libx264 @ 0x252d7a00] frame I:2     Avg QP:21.10  size: 14254
[libx264 @ 0x252d7a00] frame P:273   Avg QP:24.66  size:  9075
[libx264 @ 0x252d7a00] frame B:16    Avg QP:25.81  size:  7546
[libx264 @ 0x252d7a00] consecutive B-frames: 89.3%  9.6%  1.0%  0.0%
[libx264 @ 0x252d7a00] mb I  I16..4: 35.6% 26.9% 37.5%
[libx264 @ 0x252d7a00] mb P  I16..4: 16.1%  6.0% 17.2%  P16..4: 15.3%  6.8%  2.3%  0.0%  0.0%    skip:36.3%
[libx264 @ 0x252d7a00] mb B  I16..4: 12.7%  4.9% 10.3%  B16..8: 14.9%  8.7%  1.5%  direct: 5.8%  skip:41.2%  L0:57.8% L1:30.4% BI:11.8%
[libx264 @ 0x252d7a00] 8x8 transform intra:15.6% inter:59.7%
[libx264 @ 0x252d7a00] coded y,uvDC,uvAC intra: 31.7% 79.3% 45.1% inter: 22.0% 22.5% 7.7%
[libx264 @ 0x252d7a00] i16 v,h,dc,p: 76% 21%  2%  1%
[libx26

In [78]:
normalized_chroma_cens = transform_to_chroma(frames_hop_2205, metadata,  "cens", True)

generate_chroma_graphs(normalized_chroma_cens, "cens", server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "cens", sample_location)


save_feature_to_server_data("chroma_cens", server_data, "example-fasf23-23123-23asd", normalized_chroma_cens)



Shape of chroma cens spectrogram: (291, 12, 44)
Min/max of chroma cens spectrogram: 0.0/0.9283891916275024
cens _ chosen
framerate: 9.733333333333333
/home/rwd/dev/server_data/cens/example-fasf23-23123-23asd/video
Running command: ffmpeg -framerate 9.73 -y -i /home/rwd/dev/server_data/cens/example-fasf23-23123-23asd/frame_%04d.png -i /home/rwd/dev/server_data/30s/sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /home/rwd/dev/server_data/cens/example-fasf23-23123-23asd/video/example-fasf23-23123-23asd.mp4


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
Input #0, image2, from '/home/rwd/dev/server_data/cens/example-fasf23-23123-23asd/frame_%04d.png':
  Duration: 00:00:29.91, start: 0.000000, bitrate: N/A
  Stream #0:0: Video: png, rgba(pc, gbr/unknown/unknown), 1000x400 [SAR 3937:3937 DAR 5:2], 9.73 fps, 9.73 tbr, 9.73 tbn
[aist#1:0/pcm_s16le @ 0x288a0e80] Guessed Channel Layout: mono
Input #1, wav, from '/home/rwd/dev/server_data/30s/sample.wav':
  Duration: 00:00:30.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, mono, s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (png (n

/home/rwd/dev/server_data/features/example-fasf23-23123-23asd/chroma_cens
['chroma_cens.npy']


[out#0/mp4 @ 0x288a10c0] video:1028KiB audio:417KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.609211%
frame=  291 fps= 59 q=-1.0 Lsize=    1454KiB time=00:00:29.70 bitrate= 401.0kbits/s speed=6.06x    
[libx264 @ 0x28812a00] frame I:2     Avg QP:19.44  size:  9470
[libx264 @ 0x28812a00] frame P:289   Avg QP:22.39  size:  3575
[libx264 @ 0x28812a00] mb I  I16..4: 50.3% 37.5% 12.1%
[libx264 @ 0x28812a00] mb P  I16..4: 32.9%  9.9%  4.4%  P16..4: 10.9%  2.0%  0.2%  0.0%  0.0%    skip:39.6%
[libx264 @ 0x28812a00] 8x8 transform intra:21.2% inter:66.4%
[libx264 @ 0x28812a00] coded y,uvDC,uvAC intra: 11.4% 58.1% 18.5% inter: 5.1% 8.7% 1.8%
[libx264 @ 0x28812a00] i16 v,h,dc,p: 19% 78%  2%  1%
[libx264 @ 0x28812a00] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 41% 27% 29%  1%  0%  0%  0%  0%  1%
[libx264 @ 0x28812a00] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 42% 38% 19%  0%  0%  0%  0%  0%  0%
[libx264 @ 0x28812a00] i8c dc,h,v,p: 13% 80%  6%  1%
[libx264 @ 0x28812a00] Weighted P-Frames: Y:

In [79]:
normalized_chroma_cqt = transform_to_chroma(frames_hop_2205, metadata,  "cqt", True)

generate_chroma_graphs(normalized_chroma_cqt, "cqt", server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "cqt", sample_location)


save_feature_to_server_data("chroma_cqt", server_data, "example-fasf23-23123-23asd", normalized_chroma_cqt)



Shape of chroma cqt spectrogram: (291, 12, 44)
Min/max of chroma cqt spectrogram: 0.012269821017980576/1.0
Data needs normalization
Shape of chroma cqt normalized: (291, 12, 44)
Min/max of chroma cqt normalized: -1.9077993631362915/1.984749436378479
cqt _ chosen
framerate: 9.733333333333333
/home/rwd/dev/server_data/cqt/example-fasf23-23123-23asd/video
Running command: ffmpeg -framerate 9.73 -y -i /home/rwd/dev/server_data/cqt/example-fasf23-23123-23asd/frame_%04d.png -i /home/rwd/dev/server_data/30s/sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /home/rwd/dev/server_data/cqt/example-fasf23-23123-23asd/video/example-fasf23-23123-23asd.mp4


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
Input #0, image2, from '/home/rwd/dev/server_data/cqt/example-fasf23-23123-23asd/frame_%04d.png':
  Duration: 00:00:29.91, start: 0.000000, bitrate: N/A
  Stream #0:0: Video: png, rgba(pc, gbr/unknown/unknown), 1000x400 [SAR 3937:3937 DAR 5:2], 9.73 fps, 9.73 tbr, 9.73 tbn
[aist#1:0/pcm_s16le @ 0x16f81340] Guessed Channel Layout: mono
Input #1, wav, from '/home/rwd/dev/server_data/30s/sample.wav':
  Duration: 00:00:30.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, mono, s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (png (na

/home/rwd/dev/server_data/features/example-fasf23-23123-23asd/chroma_cqt
['chroma_cqt.npy']


[out#0/mp4 @ 0x16f814c0] video:2181KiB audio:417KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.347293%
frame=  291 fps= 53 q=-1.0 Lsize=    2607KiB time=00:00:29.70 bitrate= 719.1kbits/s speed=5.37x    
[libx264 @ 0x16ea29c0] frame I:2     Avg QP:20.69  size: 13376
[libx264 @ 0x16ea29c0] frame P:278   Avg QP:23.93  size:  7669
[libx264 @ 0x16ea29c0] frame B:11    Avg QP:25.01  size:  6726
[libx264 @ 0x16ea29c0] consecutive B-frames: 92.8%  6.2%  1.0%  0.0%
[libx264 @ 0x16ea29c0] mb I  I16..4: 41.8% 24.5% 33.7%
[libx264 @ 0x16ea29c0] mb P  I16..4: 19.5%  7.0% 17.4%  P16..4: 13.4%  4.7%  1.1%  0.0%  0.0%    skip:36.8%
[libx264 @ 0x16ea29c0] mb B  I16..4: 14.5%  5.5% 11.6%  B16..8: 15.7%  7.3%  1.0%  direct: 4.3%  skip:40.0%  L0:57.0% L1:30.6% BI:12.5%
[libx264 @ 0x16ea29c0] 8x8 transform intra:16.2% inter:61.9%
[libx264 @ 0x16ea29c0] coded y,uvDC,uvAC intra: 23.2% 82.8% 49.2% inter: 14.7% 17.4% 4.7%
[libx264 @ 0x16ea29c0] i16 v,h,dc,p: 56% 40%  4%  1%
[libx26

In [83]:
normalized_tonnetz = transform_to_tonnetz(frames_hop_2205, metadata, True)

generate_tonnetz_graphs(normalized_tonnetz, server_data, "example-fasf23-23123-23asd")
generate_video("example-fasf23-23123-23asd", "tonnetz", sample_location)


save_feature_to_server_data("tonnetz", server_data, "example-fasf23-23123-23asd", normalized_tonnetz)



Shape of tonnetz: (291, 6, 44)
Min/max of tonnetz: -0.608512282371521/0.5110496878623962
tonnetz _ chosen
framerate: 9.733333333333333
/home/rwd/dev/server_data/tonnetz/example-fasf23-23123-23asd/video
Running command: ffmpeg -framerate 9.73 -y -i /home/rwd/dev/server_data/tonnetz/example-fasf23-23123-23asd/frame_%04d.png -i /home/rwd/dev/server_data/30s/sample.wav -c:v libx264 -pix_fmt yuv420p -c:a aac -b:a 192k -shortest /home/rwd/dev/server_data/tonnetz/example-fasf23-23123-23asd/video/example-fasf23-23123-23asd.mp4


ffmpeg version 7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 13.3.0 (GCC)
  libavutil      59. 39.100 / 59. 39.100
  libavcodec     61. 19.100 / 61. 19.100
  libavformat    61.  7.100 / 61.  7.100
  libavdevice    61.  3.100 / 61.  3.100
  libavfilter    10.  4.100 / 10.  4.100
  libswscale      8.  3.100 /  8.  3.100
  libswresample   5.  3.100 /  5.  3.100
  libpostproc    58.  3.100 / 58.  3.100
Input #0, image2, from '/home/rwd/dev/server_data/tonnetz/example-fasf23-23123-23asd/frame_%04d.png':
  Duration: 00:00:29.91, start: 0.000000, bitrate: N/A
  Stream #0:0: Video: png, rgba(pc, gbr/unknown/unknown), 1000x400 [SAR 3937:3937 DAR 5:2], 9.73 fps, 9.73 tbr, 9.73 tbn
[aist#1:0/pcm_s16le @ 0x1c8ba400] Guessed Channel Layout: mono
Input #1, wav, from '/home/rwd/dev/server_data/30s/sample.wav':
  Duration: 00:00:30.00, bitrate: 352 kb/s
  Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 22050 Hz, mono, s16, 352 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (png

/home/rwd/dev/server_data/features/example-fasf23-23123-23asd/tonnetz
['tonnetz.npy']


[out#0/mp4 @ 0x1c9ac780] video:1395KiB audio:417KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: 0.485938%
frame=  291 fps= 57 q=-1.0 Lsize=    1821KiB time=00:00:29.70 bitrate= 502.2kbits/s speed=5.78x    
[libx264 @ 0x1c8a1a00] frame I:2     Avg QP:17.83  size: 10770
[libx264 @ 0x1c8a1a00] frame P:289   Avg QP:21.61  size:  4865
[libx264 @ 0x1c8a1a00] mb I  I16..4: 57.0% 20.2% 22.8%
[libx264 @ 0x1c8a1a00] mb P  I16..4: 33.5%  9.2% 10.3%  P16..4:  7.7%  2.1%  0.4%  0.0%  0.0%    skip:36.9%
[libx264 @ 0x1c8a1a00] 8x8 transform intra:17.4% inter:56.6%
[libx264 @ 0x1c8a1a00] coded y,uvDC,uvAC intra: 12.7% 54.7% 28.5% inter: 8.1% 9.9% 1.8%
[libx264 @ 0x1c8a1a00] i16 v,h,dc,p: 84% 14%  1%  0%
[libx264 @ 0x1c8a1a00] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 58% 14% 26%  2%  0%  0%  0%  0%  0%
[libx264 @ 0x1c8a1a00] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 60% 31%  8%  0%  0%  0%  0%  0%  0%
[libx264 @ 0x1c8a1a00] i8c dc,h,v,p: 17% 22% 59%  1%
[libx264 @ 0x1c8a1a00] Weighted P-Frames: Y: