# Work for 2nd Project Update

First we'll take the relevant methods from preview_request so we can get the information of any song

In [None]:
from preview_request import preview_request, convert_to_wav

We'll need to work with arrays and audio

In [None]:
import numpy as np
import pandas as pd
import librosa

For some *mild* color manipulation

In [None]:
import colorsys

For animation

In [None]:
from manim import *

### Example of Manim in Jupyter

In [None]:
class RemoveObjectExample(Scene):
    """Minimal demo scene: show a caption, drop it, then draw and drop a circle."""

    def construct(self):
        # Static caption: added without animation, held, then taken off the scene.
        caption = Text("Alice and Bob")
        caption.scale(3)
        self.add(caption)
        self.wait(3)
        self.remove(caption)

        # Circle: drawn with the Create animation, held briefly, then removed.
        ring = Circle()
        draw_ring = Create(ring)
        self.play(draw_ring)
        self.wait(1)
        self.remove(ring)

%manim -ql -v WARNING RemoveObjectExample

# First Method: Changing Backgrounds

We'll need to convert from rgb to hex codes:

In [None]:
def rgb_to_hex(rgb):
    '''
    Convert an RGB triple to a hex color string (no leading '#').

    param: tuple (or any iterable such as a list or np array row) of three
           0-255 channel values; float channels are truncated to integers
    returns: 6-character lowercase hex string, e.g. 'ff0010'
    '''
    # '%02x' only formats true integers, so coerce every channel first. This
    # also accepts any iterable, not just tuples.
    red, green, blue = (int(channel) for channel in rgb)
    return '%02x%02x%02x' % (red, green, blue)

Normalize values between 0-255 for rgb codes

In [None]:
def norm_for_color(array):
    '''
    param: np array (or array-like)
    returns: np array normalized between 0-255; an input whose values are
             all equal maps to all zeros
    '''
    values = np.asarray(array)
    min_val = np.min(values)
    span = np.max(values) - min_val

    # A constant input has zero span; dividing by it would fill the result
    # with NaN, which cannot be turned into a color channel.
    if span == 0:
        return np.zeros(values.shape)

    return (values - min_val) / span * 255

Here's the driving manim code:

In [None]:
class colorChanger(Scene):
    '''
    Change the background color on every beat of the song.

    The raw waveform is sampled at (and shortly before) each beat; the three
    samples are normalized to 0-255 and used as the R, G and B channels of
    that beat's background color. The beat time is shown as text meanwhile.
    '''

    def construct(self):
        # audio_file_name = preview_request() #for terminal
        audio_file_name = "GMGM_short.wav"
        y, sr = librosa.load(audio_file_name)

        tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
        beat_times = librosa.frames_to_time(beat_frames, sr=sr)

        song_time = librosa.get_duration(y=y, sr=sr)

        # Fix last couple seconds: fill the stretch after the final detected
        # beat with evenly spaced cues, stopping just short of the end so the
        # sample lookup below stays inside y. The first linspace point is the
        # last beat itself and is dropped -- keeping it would repeat that beat
        # and produce a zero-length cue. Comparing against last_cue (not
        # song_time) keeps the padded times increasing.
        last_cue = song_time - 0.01
        if beat_times[-1] < last_cue:
            padding = np.linspace(beat_times[-1], last_cue, 4)[1:]
            beat_times = np.concatenate((beat_times, padding))

        percent_through_song = beat_times / song_time

        def sample_at(offset):
            # Waveform value at each beat position shifted by `offset`
            # (a fraction of the song). Indices are clipped: for beats in the
            # first 2% of the song the shifted index would be negative and
            # silently wrap around to the end of the audio.
            indices = ((percent_through_song + offset) * y.size).astype(int)
            return y[np.clip(indices, 0, y.size - 1)]

        # Normalize the audio values to generate RGB color channels
        r = norm_for_color(sample_at(-0.02)).astype(int)
        g = norm_for_color(sample_at(-0.01)).astype(int)
        b = norm_for_color(sample_at(0.00)).astype(int)

        list_of_colors = [rgb_to_hex(color) for color in zip(r, g, b)]

        # Time between consecutive cues = how long each text stays on screen
        cues = np.diff(beat_times)

        self.add_sound(audio_file_name)

        for i, cue in enumerate(cues):
            text = Text(f"{beat_times[i]:.2f}", font_size=144)
            self.add(text)
            self.wait(cue)

            self.camera.background_color = "#" + list_of_colors[i]
            self.remove(text)
            
%manim -ql -v WARNING colorChanger

# Second Method:

We're going to use this song for the following example:

In [None]:
# Song used by the cells below; the path is kept in one place so the file
# that is loaded and the name that is reported cannot drift apart.
audio_file_name = "GMGM_short.wav"
y, sr = librosa.load(audio_file_name)

# Beat positions, converted from frame indices to seconds
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

To make the visualizations more "human", I'll think about the colors in HSV. Since manim accepts hex values, we need a way to convert from HSV to hex:

In [None]:
def hsv_to_hex(hsv):
    '''
    Convert an HSV color to a hex color string (no leading '#').

    param: sequence (h, s, v) with hue in degrees (0-360) and saturation /
           value in percent (0-100); hue wraps around, saturation and value
           are clamped to their range
    returns: 6-character lowercase hex string, e.g. 'ff0000'
    '''
    h, s, v = hsv

    # Bring everything onto the 0-1 scale colorsys expects. Hue is an angle,
    # so it wraps; saturation and value are clamped, otherwise colorsys
    # returns channels outside 0-1 and the formatted string is no longer a
    # valid 6-digit color.
    h = (h % 360) / 360
    s = min(max(s, 0), 100) / 100
    v = min(max(v, 0), 100) / 100

    # Convert HSV to RGB using colorsys
    r, g, b = colorsys.hsv_to_rgb(h, s, v)

    # Convert RGB to HEX; each channel is truncated (not rounded) to 0-255
    return '{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))

We'll want a way to normalize a numpy array between 0-n.

In [None]:
def norm_to_n(array, n=1):
    '''
    param: np array (or array-like such as a list), upper bound n
    returns: np array normalized between 0-n; an input whose values are all
             equal maps to all zeros
    '''
    values = np.asarray(array)
    min_val = np.min(values)
    span = np.max(values) - min_val

    # A constant input has zero span; dividing by it would fill the result
    # with NaN instead of usable values.
    if span == 0:
        return np.zeros(values.shape)

    return (values - min_val) / span * n

## Step 1: Get *Pitch* for Brightness

In [None]:
# Function to extract pitch at specific timestamps
def extract_max_pitches(y, sr, beat_times, duration=0.1):
    '''
    Largest pitch value found in a short window starting at each timestamp.

    param: y - audio samples, sr - sample rate, beat_times - timestamps in
           seconds, duration - window length in seconds
    returns: np array with one pitch per timestamp; 0 where no pitch was
             detected or the window lies past the end of the audio
    '''
    pitches = []
    for time in beat_times:
        # Extract a segment of audio starting at each timestamp
        start_sample = int(time * sr)
        end_sample = int((time + duration) * sr)
        segment = y[start_sample:end_sample]

        # A timestamp at or after the end of the audio gives an empty slice;
        # there is nothing to analyse, so report "no pitch" for it.
        if len(segment) == 0:
            pitches.append(0)
            continue

        # Get pitch candidates for the segment (magnitudes are not used)
        pitches_segment, _ = librosa.core.piptrack(y=segment, sr=sr)

        # Take the largest candidate over the whole segment, regardless of
        # its magnitude
        max_pitch = np.max(pitches_segment)
        if max_pitch > 0:
            pitches.append(max_pitch)
        else:
            pitches.append(0)  # No pitch detected

    return np.array(pitches, dtype=float)

Let's test out the function:

In [None]:
# One pitch value per entry of beat_times
pitches = extract_max_pitches(y,sr,beat_times)

Notice `beat_times` and `pitches` have the same size:

In [None]:
# Display both shapes side by side for comparison
beat_times.shape, pitches.shape

Here's what the `pitches` array looks like.

In [None]:
# Raw pitch values, before any normalization
pitches

And what normalizing will do:

In [None]:
# Brightness channel: pitches rescaled to 0-100
B = norm_to_n(pitches, 100)
B.astype(int) #for viewing purposes

## Step 2: Get Volume for Saturation

In [None]:
# Function to extract volume at specific timestamps
def extract_max_volume(y, sr, beat_times, duration=0.1):
    '''
    Loudest RMS level (in dB) in a short window starting at each timestamp.

    param: y - audio samples, sr - sample rate, beat_times - timestamps in
           seconds, duration - window length in seconds
    returns: np array with one dB value per timestamp; windows that lie past
             the end of the audio are reported as silence (-100 dB)
    '''
    # Level used for a window with no samples. -100 dB is what
    # amplitude_to_db reports for a zero amplitude with its default amin
    # of 1e-5, i.e. the same value a silent window would get.
    silence_db = -100.0

    volumes = []
    for time in beat_times:
        # Extract a segment of audio starting at each timestamp
        start_sample = int(time * sr)
        end_sample = int((time + duration) * sr)
        segment = y[start_sample:end_sample]

        # A timestamp at or after the end of the audio gives an empty slice;
        # there is nothing to measure, so treat it as silence.
        if len(segment) == 0:
            volumes.append(silence_db)
            continue

        # Get volume: RMS energy of the segment, converted to decibels
        rms = librosa.feature.rms(y=segment)
        rms_db = librosa.amplitude_to_db(rms)

        # Get the maximum volume
        volumes.append(np.max(rms_db))

    return np.array(volumes, dtype=float)

In [None]:
# One volume value (dB) per entry of beat_times
volumes = extract_max_volume(y, sr, beat_times)
volumes

In [None]:
# Saturation channel: volumes rescaled to 0-100
S = norm_to_n(volumes, 100)
S.astype(int) #for viewing purposes

## Step 3: Get Note (within Octave) for Hue

In [None]:
# Note name for each pitch; octave=False asks for the name without the octave number.
# NOTE(review): a pitch of 0 ("no pitch detected") presumably has no valid
# note name -- confirm how hz_to_note handles it.
notes = librosa.hz_to_note(pitches, octave=False)
notes

In [None]:
# Position of each note within the octave: naturals sit on whole numbers and
# each sharp sits halfway between its natural and the next one.
# Keys use the unicode sharp sign (♯).
note_map = {
    'C': 1,
    'C♯': 1.5,
    'D': 2,
    'D♯': 2.5,
    'E': 3,
    'F': 4,
    'F♯': 4.5,
    'G': 5,
    'G♯': 5.5,
    'A': 6,
    'A♯': 6.5,
    'B': 7,
}

In [None]:
# Look up the numeric position of every note name
note_vals = [note_map[note] for note in notes]
np.array(note_vals) #made nparray for viewing purposes

In [None]:
# Hue channel: note positions rescaled to 0-360
H = norm_to_n(note_vals,360)
H.astype(int)  #for viewing purposes

## Step 4: Separate and Detect Different Sounds 

In [None]:
#harmonic percussive separation: split y into a harmonic and a percussive signal
y_harmonic, y_percussive = librosa.effects.hpss(y)
y_harmonic

In [None]:
# Percussive signal from the separation above
y_percussive

We can detect local note onset events with librosa in our percussive array

In [None]:
# onset detection
o_env = librosa.onset.onset_strength(y=y_percussive, sr=sr)


# onset strength per unit
onset_frames = librosa.onset.onset_detect(y=y_percussive, sr=sr, units="time")
onset_frames

From the documentation, here is an example of how we can visualize onsets!

In [None]:
import matplotlib.pyplot as plt
# specshow lives in the display submodule; import it explicitly because older
# librosa releases do not load it with a plain `import librosa`.
import librosa.display

# Time axis matching the frames of the onset envelope
times = librosa.times_like(o_env, sr=sr)

# Top panel: magnitude spectrogram of the full signal, in dB
D = np.abs(librosa.stft(y))
fig, ax = plt.subplots(nrows=2, sharex=True)
librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max), x_axis='time', y_axis='log', ax=ax[0], sr=sr)

# Bottom panel: onset strength over time
ax[1].plot(times, o_env, label='Onset strength')
ax[1].legend()

We can also compute the spectral centroid for each frame, which we could use to, for example, expand the text during high-frequency, brighter sounds.

In [None]:
# Spectral centroid of the signal; [0] takes the first row of the result
spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
# Min-max scale the centroid to the 0-1 range
norm_centroid = (spectral_centroid - spectral_centroid.min()) / (spectral_centroid.max() - spectral_centroid.min())

## Zip and Colorize!

As before, we'll zip to make the color channels and colorize!

In [None]:
# Pair up the three channels into one (hue, saturation, brightness) tuple per beat
zip_obj = zip(H,S,B)
color_list = list(zip_obj)

Let's look at the first ~10 values

In [None]:
# First ten (H, S, B) tuples
color_list[:10] 

Again we'll vectorize the function and apply:

In [None]:
# signature='(n)->()' makes vectorize hand each row (one HSV triple) to
# hsv_to_hex and collect a single value per row
vectorized_function = np.vectorize(hsv_to_hex, signature='(n)->()')
list_of_colors = vectorized_function(np.array(color_list))
list_of_colors

## Play!

In [None]:
class hsvAnim(Scene):
    '''
    Beat-driven background colors built in HSV space.

    For every beat, hue comes from the note name of the detected pitch,
    saturation from the volume and brightness from the pitch itself. The
    beat time is shown as text, scaled up where the spectral centroid is high.
    '''

    def construct(self):
        # audio_file_name = preview_request()
        audio_file_name = "GMGM_30.wav"
        y, sr = librosa.load(audio_file_name)

        tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
        beat_times = librosa.frames_to_time(beat_frames, sr=sr)

        song_time = librosa.get_duration(y=y, sr=sr)

        # prevent last 5 sec from being constant: fill the stretch after the
        # final detected beat with evenly spaced cues. The first linspace
        # point is the last beat itself and is dropped -- keeping it would
        # repeat that beat and produce a zero-length cue. Comparing against
        # last_cue (not song_time) keeps the padded times increasing.
        last_cue = song_time - 0.01
        if beat_times[-1] < last_cue:
            padding = np.linspace(beat_times[-1], last_cue, 4)[1:]
            beat_times = np.concatenate((beat_times, padding))

        # spectral centroid for brightness
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        norm_centroid = (spectral_centroid - spectral_centroid.min()) / (spectral_centroid.max() - spectral_centroid.min())

        # The centroid has one value per analysis frame, not per beat, so
        # look up the frame each beat falls in (clipped to the last frame).
        centroid_frames = np.clip(librosa.time_to_frames(beat_times, sr=sr), 0, len(norm_centroid) - 1)

        # Extract Pitch and Volume (once per beat)
        pitches = extract_max_pitches(y, sr, beat_times)
        volumes = extract_max_volume(y, sr, beat_times)

        # Position of each note within the octave; sharps sit halfway
        # between their natural and the next one.
        note_map = {
            'C':1,'C♯':1.5,
            'D':2,'D♯':2.5,
            'E':3,
            'F':4,'F♯':4.5,
            'G':5,'G♯':5.5,
            'A':6,'A♯':6.5,
            'B':7
        }

        # Beats with a pitch of 0 ("no pitch detected") are not passed to
        # hz_to_note -- presumably a 0 Hz value has no valid note name -- and
        # fall back to the lowest note value instead.
        lowest_note = min(note_map.values())
        note_vals = [
            note_map[librosa.hz_to_note(pitch, octave=False)] if pitch > 0 else lowest_note
            for pitch in pitches
        ]

        H = norm_to_n(np.array(note_vals), 360)
        S = norm_to_n(volumes, 100)
        B = norm_to_n(pitches, 100)

        list_of_colors = [hsv_to_hex(color) for color in zip(H, S, B)]

        # Time between consecutive cues = how long each text stays on screen
        cues = np.diff(beat_times)

        self.add_sound(audio_file_name)

        for i, cue in enumerate(cues):
            text = Text(f"{beat_times[i]:.2f}", font_size=144)
            # text size expands during high frequency sounds
            text.scale(1 + norm_centroid[centroid_frames[i]] * 6)

            self.add(text)
            self.wait(cue)
            self.camera.background_color = "#" + list_of_colors[i]
            self.remove(text)

        self.wait(3)
    
%manim -ql -v WARNING hsvAnim

# Some Extra Animations:

In [None]:
import numpy as np

class RosePattern(VMobject):
    """Rose curve r = radius * cos(k * theta), sampled and smoothed into a path."""

    def __init__(self, radius: float = 2, k: float = 3, theta_range=TAU, **kwargs):
        super().__init__(**kwargs)
        self.radius = radius
        self.k = k

        def point_at(angle):
            # Polar (r, angle) converted to Cartesian coordinates, z fixed at 0.
            r = radius * np.cos(k * angle)
            return [r * np.cos(angle), r * np.sin(angle), 0]

        # Sample angles in steps of 0.1; the stop value is pushed out by one
        # step because arange excludes its stop.
        step_size = 0.1
        angles = np.arange(0, theta_range + step_size, step_size)

        self.set_points_smoothly([point_at(angle) for angle in angles])
        
class ShowingRosePattern(Scene):
    """Draw a k=8 rose curve in red over a fixed background color."""

    def construct(self):
        self.camera.background_color = '#7fbf26'

        # Thick red rose, traced out over five seconds and then held.
        petals = RosePattern(k=8, color=RED, stroke_width=15)
        draw_petals = Create(petals)
        self.play(draw_petals, run_time=5)
        self.wait()
        
%manim -ql -v WARNING ShowingRosePattern

# Other Project updates

Example of using onset detection for color change instead of beats:
- issues with matching up onset times with color list and cues but first 5 seconds works (with little visual appeal)

In [None]:
class hsvAnim(Scene):
    '''
    Onset-driven variant of the HSV background animation.

    Colors change on percussive onsets instead of beats. For every onset,
    hue comes from the note name of the detected pitch, saturation from the
    volume and brightness from the pitch itself. The onset time is shown as
    text, scaled up where the spectral centroid is high.
    '''

    def construct(self):
        # audio_file_name = preview_request()
        audio_file_name = "GMGM_30.wav"
        y, sr = librosa.load(audio_file_name)

        # harmonic-percussive separation; only the percussive part is used
        y_harmonic, y_percussive = librosa.effects.hpss(y)

        # onset detection, reported in seconds (units="time")
        onset_times = librosa.onset.onset_detect(y=y_percussive, sr=sr, units="time")

        # spectral centroid for brightness
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        norm_centroid = (spectral_centroid - spectral_centroid.min()) / (spectral_centroid.max() - spectral_centroid.min())

        # The centroid has one value per analysis frame, not per onset, so
        # look up the frame each onset falls in (clipped to the last frame).
        centroid_frames = np.clip(librosa.time_to_frames(onset_times, sr=sr), 0, len(norm_centroid) - 1)

        # Extract Pitch and Volume at the onsets themselves. Sampling at the
        # same timestamps that drive the animation gives exactly one color
        # per onset, so onset_times, cues and colors stay aligned.
        pitches = extract_max_pitches(y, sr, onset_times)
        volumes = extract_max_volume(y, sr, onset_times)

        # Position of each note within the octave; sharps sit halfway
        # between their natural and the next one.
        note_map = {
            'C':1,'C♯':1.5,
            'D':2,'D♯':2.5,
            'E':3,
            'F':4,'F♯':4.5,
            'G':5,'G♯':5.5,
            'A':6,'A♯':6.5,
            'B':7
        }

        # Onsets with a pitch of 0 ("no pitch detected") are not passed to
        # hz_to_note -- presumably a 0 Hz value has no valid note name -- and
        # fall back to the lowest note value instead.
        lowest_note = min(note_map.values())
        note_vals = [
            note_map[librosa.hz_to_note(pitch, octave=False)] if pitch > 0 else lowest_note
            for pitch in pitches
        ]

        H = norm_to_n(np.array(note_vals), 360)
        S = norm_to_n(volumes, 100)
        B = norm_to_n(pitches, 100)

        list_of_colors = [hsv_to_hex(color) for color in zip(H, S, B)]

        self.add_sound(audio_file_name)

        # Time between consecutive onsets = how long each text stays on screen
        cues = np.diff(onset_times)

        for i, cue in enumerate(cues):
            text = Text(f"{onset_times[i]:.2f}", font_size=144)
            # text size expands during high frequency sounds
            text.scale(1 + norm_centroid[centroid_frames[i]] * 6)

            self.add(text)
            self.wait(cue)
            self.camera.background_color = "#" + list_of_colors[i]
            self.remove(text)

        # NOTE(review): the loop above has already waited sum(cues) seconds;
        # this final hold waits that long again plus 3 s -- confirm intended.
        duration = sum(cues) + 3
        self.wait(duration)
    
%manim -ql -v WARNING hsvAnim