In [2]:
import numpy as np
from scipy.fftpack import dct

def compute_cepstrum(fft_frames, num_ceps):
    """
    Computes the cepstrum time series values from the frequency domain of audio frames.

    Parameters:
    fft_frames (np.array): A numpy array of frequency domain audio frames obtained from
                            the output of transform_frames_to_frequency_domain function.
    num_ceps (int): The number of cepstral coefficients to be returned.

    Returns:
    np.array: A numpy array of cepstrum time series values for each audio frame.
    """
    # Calculate the power spectrum of the fft frames
    power_spectrum = np.abs(fft_frames) ** 2

    # Compute the logarithm of the power spectrum
    log_power_spectrum = np.log10(power_spectrum)

    # Apply the discrete cosine transform (DCT) on the log power spectrum
    cepstrum = dct(log_power_spectrum, type=2, axis=1, norm='ortho')

    # Keep the first 'num_ceps' cepstral coefficients
    cepstrum = cepstrum[:, :num_ceps]

    return cepstrum

In [6]:
from functions.time_domain import *
from functions.frequency_domain import *

percent_frame_size = 0.2
percent_hop_length = 0.3
path = '../recordings/4_10/Znormalizowane/zdanie_3.wav'
audio, frame_rate, audio_time, n_samples = read_wave(path)
frames, n_, N_ = split_to_frames(audio, frame_rate, percent_frame_size=percent_frame_size,
                                 percent_hop_length=percent_hop_length)
fft_frames: np.ndarray = transform_frames_to_frequency_domain(frames, frame_rate, N_)
df = compute_cepstrum(fft_frames,2)
print(df)

[[-1.82769791e+02  9.20856893e-02]
 [-1.27738594e+02  8.27060342e-02]
 [-7.05334167e+01  5.79222888e-02]
 [-2.23629074e+01  3.95049751e-02]
 [-1.44851370e+01  6.51441738e-02]
 [-1.50461092e+01 -4.30442840e-02]
 [-2.04135208e+01  4.73913550e-02]
 [-3.04894123e+01  5.56733608e-02]
 [-4.81538582e+01  3.81604880e-02]
 [-2.47828922e+01  8.37649405e-03]
 [-3.81883202e+01  5.95449209e-02]
 [-3.85649223e+01  4.54579815e-02]
 [-5.76145401e+01  6.27445057e-02]
 [-7.41307526e+01  5.01513779e-02]
 [-9.90066147e+01  6.94577247e-02]
 [-1.11652145e+02  7.55222738e-02]
 [-1.15711617e+02  8.43020454e-02]
 [-1.12537590e+02  5.85235655e-02]
 [-1.52278900e+02  9.05300379e-02]
 [-7.71760254e+01  5.84254190e-02]
 [-5.10065002e+01  2.03598738e-02]
 [-5.22336998e+01  4.97384071e-02]
 [-1.69274120e+01  1.50683057e-02]
 [-1.46727884e+00  4.48463336e-02]
 [ 3.69346476e+00  1.26481205e-02]
 [ 7.62961864e+00  4.48246375e-02]
 [-4.81203423e+01  5.75460494e-02]
 [-5.48380051e+01  6.14643171e-02]
 [-5.45381470e+01  4

In [7]:
frame_rate

22050

In [14]:
import numpy as np
import scipy.signal

def find_base_frequency(signal, sampling_frequency, min_freq=50, max_freq=400):
    # Compute the real cepstrum of the signal
    real_cepstrum = np.real(np.fft.ifft(np.log(np.abs(np.fft.fft(signal)))))

    # Convert frequency range to quefrency range
    min_quefrency = int(sampling_frequency / max_freq)
    max_quefrency = int(sampling_frequency / min_freq)
    print(min_quefrency)
    print(max_quefrency)
    print(len(real_cepstrum))
    # print(real_cepstrum[min_quefrency:max_quefrency])
    # Find the local maximum of the real cepstrum signal within the quefrency range
    local_max_quefrency = np.argmax(real_cepstrum[min_quefrency:max_quefrency]) + min_quefrency

    # Convert the quefrency value back to the frequency domain
    base_frequency = sampling_frequency / local_max_quefrency

    return base_frequency
find_base_frequency(audio, frame_rate)

55
441
48595


92.25941422594143