# Loading Audio

In [5]:
import numpy as np
import scipy.io.wavfile as wav

# Load the audio file
def load_audio(file_path):
    """Read a WAV file and return it as peak-normalised mono float32 audio.

    Parameters
    ----------
    file_path : str or file-like
        Passed unchanged to ``scipy.io.wavfile.read``.

    Returns
    -------
    sr : int
        Sample rate reported by the WAV file.
    audio : numpy.ndarray
        1-D ``float32`` signal. Multi-channel input is averaged across
        channels. The signal is divided by its largest absolute sample so
        the peak is 1; silent or empty audio is returned unscaled instead
        of being turned into NaNs by a division by zero.
    """
    sr, audio = wav.read(file_path)
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)  # Convert multi-channel to mono
    audio = audio.astype(np.float32)

    # np.max raises on an empty array, and a zero peak would yield 0/0 = NaN.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio /= peak  # Normalize to range [-1, 1]
    return sr, audio


# Computing STFT Magnitude

In [6]:
# Function to compute STFT magnitude
def compute_stft_magnitude(audio, sr, frame_size, hop_size):
    """Compute the magnitude of a Hamming-windowed short-time Fourier transform.

    Parameters
    ----------
    audio : numpy.ndarray
        1-D signal.
    sr : int
        Sample rate. Accepted for interface compatibility; not used here.
    frame_size : int
        Number of samples per frame (also the FFT length).
    hop_size : int
        Number of samples between the starts of consecutive frames.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_frames, frame_size // 2 + 1)`` holding
        ``|rfft(frame * hamming)|`` for each frame.
    """
    # Zero-pad only up to the next multiple of frame_size. The modulo of the
    # negated length is 0 when the signal is already aligned, so no spurious
    # all-zero frame is appended in that case.
    pad_len = (-len(audio)) % frame_size
    audio = np.pad(audio, (0, pad_len), mode='constant')

    # Clamp at zero so a signal shorter than one frame (e.g. empty input)
    # produces an empty result rather than a negative array dimension.
    num_frames = max(0, (len(audio) - frame_size) // hop_size + 1)

    stft_magnitude = np.zeros((num_frames, frame_size // 2 + 1))

    # The window is identical for every frame, so build it once.
    window = np.hamming(frame_size)
    for i in range(num_frames):
        start = i * hop_size
        frame = audio[start:start + frame_size]
        stft_magnitude[i, :] = np.abs(np.fft.rfft(frame * window))

    return stft_magnitude


# Mapping Frequencies to Chroma Bins and Conversion to Integer

In [7]:
# Function to map frequencies to chroma bins and convert to integer
def map_to_chroma(stft_magnitude, sr, num_chroma_bins):
    """Sum STFT magnitudes into equal-width frequency bands and round to int.

    The frequency axis is split by index into ``num_chroma_bins`` contiguous,
    equal-width bands; the magnitudes inside each band are summed per frame
    and rounded to the nearest integer.

    NOTE(review): despite the name, the bands are linear slices of the
    spectrum, not the 12 musical pitch classes folded across octaves.

    Parameters
    ----------
    stft_magnitude : numpy.ndarray
        Array of shape ``(num_frames, num_freq_bins)``.
    sr : int
        Sample rate. Accepted for interface compatibility; the band edges
        depend only on the number of frequency bins, so it is not used.
    num_chroma_bins : int
        Number of output bands.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(num_frames, num_chroma_bins)``.
    """
    num_frames = stft_magnitude.shape[0]
    num_freq_bins = stft_magnitude.shape[1]
    chroma_matrix = np.zeros((num_frames, num_chroma_bins), dtype=int)

    # Band edges as fractions of the frequency axis. Only the bin count is
    # needed to turn them into column indices, so no frequency vector (and
    # no division by sr) is required.
    band_edges = np.linspace(0, 1, num_chroma_bins + 1)

    for i in range(num_chroma_bins):
        bin_start = int(band_edges[i] * num_freq_bins)
        bin_end = int(band_edges[i + 1] * num_freq_bins)

        # Accumulate magnitude within the band and convert to integer.
        band_sum = np.sum(stft_magnitude[:, bin_start:bin_end], axis=1)
        chroma_matrix[:, i] = np.round(band_sum).astype(int)

    return chroma_matrix


# Processing a Single File


In [13]:
# Driver: load one WAV file, compute its STFT magnitude over the first
# `duration` seconds, reduce it to `num_chroma_bins` bands, and print the result.
if __name__ == '__main__':
    # NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
    file_path =  r'C:\Users\Sachin\Desktop\ai_projects\command_word\DATA\Blue\akash_blue_0.wav'
    sr, audio = load_audio(file_path)
    
    # Parameters
    duration = 1  # Duration of audio segment to process (in seconds)
    frame_size = 256  # Length of each frame for STFT
    hop_size =   256  # Samples between frame starts; equal to frame_size, so frames do not overlap
    num_chroma_bins = 12  # Number of output bands per frame
    
    # Process audio: keep only the first `duration` seconds of samples
    audio_segment = audio[:int(duration * sr)]
    stft_magnitude = compute_stft_magnitude(audio_segment, sr, frame_size, hop_size)
    chroma_matrix = map_to_chroma(stft_magnitude, sr, num_chroma_bins)
    
    # Print shapes of intermediate outputs, then the full integer matrix
    print("STFT Magnitude shape:", stft_magnitude.shape)
    print("Chroma matrix shape:", chroma_matrix.shape)
    print("Chroma matrix (integer):\n", chroma_matrix)


STFT Magnitude shape: (32, 129)
Chroma matrix shape: (32, 12)
Chroma matrix (integer):
 [[ 1  1  2  3  1  1  1  0  0  0  0  0]
 [ 1  1  2  3  1  1  1  0  0  0  0  0]
 [ 1  1  2  2  1  1  0  0  0  0  0  0]
 [ 1  2  2  2  1  1  1  0  0  0  0  0]
 [ 1  1  1  1  1  1  0  0  0  0  0  0]
 [ 3  1  1  1  1  1  1  0  0  0  0  0]
 [ 8  4  2  2  1  1  1  0  0  0  0  0]
 [ 9  4  2  2  1  1  1  1  0  0  0  0]
 [19 28  5 13 46 32  5  3  0  0  0  0]
 [20 35  6  9 29 25 24  8  2  1  1  0]
 [25 63  9 16 20 10 12 13  2  1  2  0]
 [26 45 10 17  6  5  6  3  1  0  1  0]
 [23 31 10  8  4  3  2  2  1  0  0  0]
 [23 24  7  6  6  3  2  2  0  0  0  0]
 [11 14  3  4  4  1  2  2  0  0  0  0]
 [ 9  9  2  3  4  1  2  1  0  0  0  0]
 [ 5 11  3  4  2  1  1  1  0  0  0  0]
 [ 6 10  3  4  3  1  1  1  0  0  0  0]
 [ 3  5  2  4  3  1  1  1  0  0  0  0]
 [ 5  8  5  3  3  2  1  1  0  0  0  0]
 [ 2  7  5  2  2  1  1  0  0  0  0  0]
 [ 3  4  4  5  2  1  1  1  0  0  0  0]
 [ 1  3  3  4  2  2  1  1  0  0  0  0]
 [ 1  1  1  1  

# Processing Multiple Files

In [1]:
import os
import numpy as np
import scipy.io.wavfile as wav

# Function to load audio file
def load_audio(file_path):
    """Read a WAV file and return it as peak-normalised mono float32 audio.

    Parameters
    ----------
    file_path : str or file-like
        Passed unchanged to ``scipy.io.wavfile.read``.

    Returns
    -------
    sr : int
        Sample rate reported by the WAV file.
    audio : numpy.ndarray
        1-D ``float32`` signal. Multi-channel input is averaged across
        channels. The signal is divided by its largest absolute sample so
        the peak is 1; silent or empty audio is returned unscaled instead
        of being turned into NaNs by a division by zero.
    """
    sr, audio = wav.read(file_path)
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)  # Convert multi-channel to mono
    audio = audio.astype(np.float32)

    # np.max raises on an empty array, and a zero peak would yield 0/0 = NaN.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio /= peak  # Normalize to range [-1, 1]
    return sr, audio

# Function to compute STFT magnitude
def compute_stft_magnitude(audio, sr, frame_size, hop_size):
    """Compute the magnitude of a Hamming-windowed short-time Fourier transform.

    Parameters
    ----------
    audio : numpy.ndarray
        1-D signal.
    sr : int
        Sample rate. Accepted for interface compatibility; not used here.
    frame_size : int
        Number of samples per frame (also the FFT length).
    hop_size : int
        Number of samples between the starts of consecutive frames.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_frames, frame_size // 2 + 1)`` holding
        ``|rfft(frame * hamming)|`` for each frame.
    """
    # Zero-pad only up to the next multiple of frame_size. The modulo of the
    # negated length is 0 when the signal is already aligned, so no spurious
    # all-zero frame is appended in that case.
    pad_len = (-len(audio)) % frame_size
    audio = np.pad(audio, (0, pad_len), mode='constant')

    # Clamp at zero so a signal shorter than one frame (e.g. empty input)
    # produces an empty result rather than a negative array dimension.
    num_frames = max(0, (len(audio) - frame_size) // hop_size + 1)

    stft_magnitude = np.zeros((num_frames, frame_size // 2 + 1))

    # The window is identical for every frame, so build it once.
    window = np.hamming(frame_size)
    for i in range(num_frames):
        start = i * hop_size
        frame = audio[start:start + frame_size]
        stft_magnitude[i, :] = np.abs(np.fft.rfft(frame * window))

    return stft_magnitude

# Function to map frequencies to chroma bins and convert to integer
def map_to_chroma(stft_magnitude, sr, num_chroma_bins):
    """Sum STFT magnitudes into equal-width frequency bands and round to int.

    The frequency axis is split by index into ``num_chroma_bins`` contiguous,
    equal-width bands; the magnitudes inside each band are summed per frame
    and rounded to the nearest integer.

    NOTE(review): despite the name, the bands are linear slices of the
    spectrum, not the 12 musical pitch classes folded across octaves.

    Parameters
    ----------
    stft_magnitude : numpy.ndarray
        Array of shape ``(num_frames, num_freq_bins)``.
    sr : int
        Sample rate. Accepted for interface compatibility; the band edges
        depend only on the number of frequency bins, so it is not used.
    num_chroma_bins : int
        Number of output bands.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(num_frames, num_chroma_bins)``.
    """
    num_frames = stft_magnitude.shape[0]
    num_freq_bins = stft_magnitude.shape[1]
    chroma_matrix = np.zeros((num_frames, num_chroma_bins), dtype=int)

    # Band edges as fractions of the frequency axis. Only the bin count is
    # needed to turn them into column indices, so no frequency vector (and
    # no division by sr) is required.
    band_edges = np.linspace(0, 1, num_chroma_bins + 1)

    for i in range(num_chroma_bins):
        bin_start = int(band_edges[i] * num_freq_bins)
        bin_end = int(band_edges[i + 1] * num_freq_bins)

        # Accumulate magnitude within the band and convert to integer.
        band_sum = np.sum(stft_magnitude[:, bin_start:bin_end], axis=1)
        chroma_matrix[:, i] = np.round(band_sum).astype(int)

    return chroma_matrix


In [4]:
import os
import numpy as np
import scipy.io.wavfile as wav

# Function to load audio file
def load_audio(file_path):
    """Read a WAV file and return it as peak-normalised mono float32 audio.

    Parameters
    ----------
    file_path : str or file-like
        Passed unchanged to ``scipy.io.wavfile.read``.

    Returns
    -------
    sr : int
        Sample rate reported by the WAV file.
    audio : numpy.ndarray
        1-D ``float32`` signal. Multi-channel input is averaged across
        channels. The signal is divided by its largest absolute sample so
        the peak is 1; silent or empty audio is returned unscaled instead
        of being turned into NaNs by a division by zero.
    """
    sr, audio = wav.read(file_path)
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)  # Convert multi-channel to mono
    audio = audio.astype(np.float32)

    # np.max raises on an empty array, and a zero peak would yield 0/0 = NaN.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio /= peak  # Normalize to range [-1, 1]
    return sr, audio

# Function to compute STFT magnitude
def compute_stft_magnitude(audio, sr, frame_size, hop_size):
    """Compute the magnitude of a Hamming-windowed short-time Fourier transform.

    Parameters
    ----------
    audio : numpy.ndarray
        1-D signal.
    sr : int
        Sample rate. Accepted for interface compatibility; not used here.
    frame_size : int
        Number of samples per frame (also the FFT length).
    hop_size : int
        Number of samples between the starts of consecutive frames.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_frames, frame_size // 2 + 1)`` holding
        ``|rfft(frame * hamming)|`` for each frame.
    """
    # Zero-pad only up to the next multiple of frame_size. The modulo of the
    # negated length is 0 when the signal is already aligned, so no spurious
    # all-zero frame is appended in that case.
    pad_len = (-len(audio)) % frame_size
    audio = np.pad(audio, (0, pad_len), mode='constant')

    # Clamp at zero so a signal shorter than one frame (e.g. empty input)
    # produces an empty result rather than a negative array dimension.
    num_frames = max(0, (len(audio) - frame_size) // hop_size + 1)

    stft_magnitude = np.zeros((num_frames, frame_size // 2 + 1))

    # The window is identical for every frame, so build it once.
    window = np.hamming(frame_size)
    for i in range(num_frames):
        start = i * hop_size
        frame = audio[start:start + frame_size]
        stft_magnitude[i, :] = np.abs(np.fft.rfft(frame * window))

    return stft_magnitude

# Function to map frequencies to chroma bins and convert to integer
def map_to_chroma(stft_magnitude, sr, num_chroma_bins):
    """Sum STFT magnitudes into equal-width frequency bands and round to int.

    The frequency axis is split by index into ``num_chroma_bins`` contiguous,
    equal-width bands; the magnitudes inside each band are summed per frame
    and rounded to the nearest integer.

    NOTE(review): despite the name, the bands are linear slices of the
    spectrum, not the 12 musical pitch classes folded across octaves.

    Parameters
    ----------
    stft_magnitude : numpy.ndarray
        Array of shape ``(num_frames, num_freq_bins)``.
    sr : int
        Sample rate. Accepted for interface compatibility; the band edges
        depend only on the number of frequency bins, so it is not used.
    num_chroma_bins : int
        Number of output bands.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(num_frames, num_chroma_bins)``.
    """
    num_frames = stft_magnitude.shape[0]
    num_freq_bins = stft_magnitude.shape[1]
    chroma_matrix = np.zeros((num_frames, num_chroma_bins), dtype=int)

    # Band edges as fractions of the frequency axis. Only the bin count is
    # needed to turn them into column indices, so no frequency vector (and
    # no division by sr) is required.
    band_edges = np.linspace(0, 1, num_chroma_bins + 1)

    for i in range(num_chroma_bins):
        bin_start = int(band_edges[i] * num_freq_bins)
        bin_end = int(band_edges[i + 1] * num_freq_bins)

        # Accumulate magnitude within the band and convert to integer.
        band_sum = np.sum(stft_magnitude[:, bin_start:bin_end], axis=1)
        chroma_matrix[:, i] = np.round(band_sum).astype(int)

    return chroma_matrix


Processing Alexa_Sachin_1.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Alexa_Sachin_1_chroma.npy
Processing Alexa_Sachin_1_1.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Alexa_Sachin_1_1_chroma.npy
Processing Alexa_Sachin_1_2.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Alexa_Sachin_1_2_chroma.npy
Processing Alexa_Sachin_1_3.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Alexa_Sachin_1_3_chroma.npy
Processing Alexa_Sachin_1_4.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Alexa_Sachin_1_4_chroma.npy
Processing Alexa_Sachin_1_5.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Alexa_Sachin_1_5_chroma.npy
Processing Alexa_Sachin_2.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_project

Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Sachin_alexa (mp3cut (mp3cut.net) (14)_2_chroma.npy
Processing Sachin_alexa (mp3cut (mp3cut.net) (14)_3.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Sachin_alexa (mp3cut (mp3cut.net) (14)_3_chroma.npy
Processing Sachin_alexa (mp3cut (mp3cut.net) (14)_4.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Sachin_alexa (mp3cut (mp3cut.net) (14)_4_chroma.npy
Processing Sachin_alexa (mp3cut (mp3cut.net) (14)_5.wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Sachin_alexa (mp3cut (mp3cut.net) (14)_5_chroma.npy
Processing Sachin_alexa (mp3cut (mp3cut.net) (15).wav...
Chroma features saved to C:\Users\Sachin\Desktop\ai_projects\features_human_voice\sachin\Sachin_alexa (mp3cut (mp3cut.net) (15)_chroma.npy
Processing Sachin_alexa (mp3cut (mp3cut.net) (15)_1.wav...
Chro

 [ 1  0  1  1  0  0  0  0  0  0  0  0]]
Processed Sachin_alexa (mp3cut (mp3cut.net) (16).wav:
[[ 3  3  1  2  1  0  0  0  0  0  0  0]
 [ 3  2  1  2  1  0  0  0  0  0  0  0]
 [ 3  3  7  4  2  1  1  1  0  0  0  0]
 [ 3  2  6  3  2  0  1  1  0  0  0  0]
 [ 3  2  2  3  1  0  0  0  0  0  0  0]
 [ 5  8 30 14 10  4  5  4  6  2  4  4]
 [ 6  7 26 10 12  4  5  4  4  3  2  4]
 [11 20 44 11 22  8 10  8 11  8  6  5]
 [10 24 22  6 15  5  7  4  6  4  3  4]
 [13 24 12  7  7  1  1 10  5  7  3  3]
 [13 23 11  4  9  2  1 10  3  5  6  3]
 [14 27  9  4  5  2  2 11  4  3  5  4]
 [12 45 13  6  6 19  9 16  6  3  5  6]
 [ 8 51 22  9  6 34 16 30 12  3  5  9]
 [11 64 20  7  6 38 17 36 14  4  6 11]
 [ 7 45 29  9 10 43 25 44 11  4  6 14]
 [ 6 38 31  9  8 45 32 39  6  2  3  7]
 [ 7 34 14  7  5 15 13  7  1  1  1  2]
 [ 6 19  9  5  4  8  5  5  1  1  1  1]
 [ 3 20  8  4  4  5  3  5  1  0  1  1]
 [ 6 15  9  4  3  7  3  3  1  1  1  1]
 [ 4 13  6  4  3  6  2  5  1  1  1  1]
 [ 4 11  6  2  2  5  3  4  1  1  1  1]
 [ 4 12  