# Analiza i przetwarzanie dźwięku - Projekt 1


In [173]:
import wave
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio
import plotly.graph_objs as go
from plotly.subplots import make_subplots

In [174]:
def read_wave(path):
    """Load a 16-bit PCM WAV file and show an inline audio player for it.

    Args:
        path: Location of the ``.wav`` file to open.

    Returns:
        A tuple ``(audio, frame_rate, audio_time, n_samples)``:
        ``audio`` is a 1-D ``int32`` array with the decoded samples,
        ``frame_rate`` is the sampling rate in Hz, ``audio_time`` is the
        duration in seconds and ``n_samples`` is the number of frames
        reported by the file header.

    Raises:
        ValueError: If the file does not store 2-byte (16-bit) samples.
    """
    with wave.open(path, 'rb') as wav_file:
        sample_width = wav_file.getsampwidth()
        if sample_width != 2:
            # The raw bytes are decoded as int16 below; any other width would
            # be reinterpreted silently and produce garbage samples.
            raise ValueError(
                f"only 16-bit PCM WAV files are supported, got {8 * sample_width}-bit samples"
            )
        frame_rate = wav_file.getframerate()
        n_samples = wav_file.getnframes()
        samples = wav_file.readframes(n_samples)
        # Widen to int32 so later arithmetic has headroom beyond the int16 range.
        # NOTE(review): for multi-channel files the channels stay interleaved in
        # this 1-D array; the rest of the notebook appears to assume mono input.
        audio = np.frombuffer(samples, dtype=np.int16).astype(np.int32)
    audio_time = n_samples / frame_rate  # in seconds
    display(Audio(data=audio, rate=frame_rate))
    return audio, frame_rate, audio_time, n_samples

def split_to_frames(audio, frame_rate, percent_frame_size=0.1, percent_hop_length=0.5):
    """Cut a signal into equally sized, possibly overlapping frames.

    Both "percent" arguments are fractions, not percentages:
    the frame length is ``percent_frame_size`` of one second of audio
    (``percent_frame_size * frame_rate`` samples) and the hop between frame
    starts is ``percent_hop_length`` of that frame length. With the defaults
    a frame covers 0.1 s and consecutive frames overlap by 50%.

    Args:
        audio: Sequence of samples; sliced along its first axis.
        frame_rate: Sampling rate in Hz.
        percent_frame_size: Frame length as a fraction of ``frame_rate``.
        percent_hop_length: Hop length as a fraction of the frame length.

    Returns:
        A tuple ``(frames, n_, N_)`` where ``frames`` stacks the frames
        row-wise, ``n_`` is the number of frames and ``N_`` is the number of
        samples per frame. Trailing samples that do not fill a whole frame
        are dropped. When ``audio`` is shorter than one frame, ``frames`` is
        an empty array with ``n_ == 0``.

    Raises:
        ValueError: If the requested frame or hop length rounds down to zero
            samples.
    """
    # naming convention: n_ - number of frames, N_ - number of samples in a frame
    # convention is consistent with "Cechy sygnalu audio w dziedzinie czasu.pdf"
    audio = np.asarray(audio)
    frame_size = int(percent_frame_size * frame_rate)
    hop_length = int(percent_hop_length * percent_frame_size * frame_rate)
    if frame_size < 1:
        raise ValueError("frame size must be at least one sample")
    if hop_length < 1:
        # A zero step would otherwise fail inside range() with an opaque message.
        raise ValueError("hop length must be at least one sample")

    # Only start positions that leave room for a complete frame are used.
    starts = range(0, len(audio) - frame_size + 1, hop_length)
    if len(starts) == 0:
        # np.stack cannot handle an empty list; return an explicit empty result.
        frames = np.empty((0, frame_size) + audio.shape[1:], dtype=audio.dtype)
    else:
        frames = np.stack([audio[start:start + frame_size] for start in starts])
    n_ = frames.shape[0]
    N_ = frames.shape[1]
    return frames, n_, N_


def plot_audio(audio,audio_time):
    """Show the waveform of ``audio`` as an interactive line plot.

    The x axis spans ``0 .. audio_time`` with one evenly spaced point per
    entry along the first axis of ``audio``; the y axis shows the values.
    """
    time_axis = np.linspace(0, audio_time, num=audio.shape[0])
    waveform = go.Scatter(x=time_axis, y=audio, mode='lines')

    figure = make_subplots(rows=1, cols=1)
    figure.add_trace(waveform)
    figure.update_layout(
        title="Audio Waveform",
        xaxis_title="Time (s)",
        yaxis_title="Amplitude",
    )
    figure.show()

In [175]:
# Framing parameters (these could probably be turned into user inputs).
# Both are fractions: the frame length is percent_frame_size * frame_rate
# samples, and the hop is percent_hop_length of that frame length.
percent_frame_size = 0.01
percent_hop_length = 0.3

In [176]:
# Load the recording; read_wave also displays an audio player for it.
path = 'recordings/4_10/Nieznormalizowane/zdanie_2.wav'
audio, frame_rate, audio_time, n_samples  = read_wave(path)
# Scale the samples down by a factor of 10 (the division also yields floats).
audio = audio/10
# Cut the scaled signal into overlapping frames using the parameters set above.
frames, n_, N_= split_to_frames(audio, frame_rate, percent_frame_size=percent_frame_size, percent_hop_length=percent_hop_length)

In [177]:
# time-domain signal features at the frame level

def get_volume(audio,N_):
    """Return the volume of a frame: ``sqrt(sum(audio ** 2) / N_)``.

    With ``N_`` equal to the number of samples this is the RMS amplitude.

    Args:
        audio: Samples of one frame.
        N_: Normalisation count, normally the number of samples in the frame.

    Returns:
        The volume as a float, in the same linear units as the samples.
    """
    # Square in float64: squaring in a narrow integer dtype (e.g. int16)
    # wraps around and silently corrupts the result.
    samples = np.asarray(audio, dtype=np.float64)
    return np.sqrt(np.sum(np.square(samples)) / N_)


def plot_volumes(frames, n_=None, N_=None):
    """Plot the volume of every frame against its frame index.

    Args:
        frames: 2-D array with one frame per row.
        n_: Number of points on the x axis; defaults to the number of rows
            in ``frames``.
        N_: Normalisation count passed to ``get_volume``; defaults to the
            number of samples per frame.
    """
    frames = np.asarray(frames)
    # Derive the sizes from the data at call time. Binding them to the
    # module-level n_/N_ in the signature would freeze whatever values existed
    # when the function was defined and break for any other frame matrix.
    if n_ is None:
        n_ = frames.shape[0]
    if N_ is None:
        N_ = frames.shape[1]

    volumes = np.apply_along_axis(get_volume, 1, frames, N_=N_)
    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(
        go.Scatter(x=np.arange(0, n_), y=volumes, mode='lines'),
    )

    fig.update_layout(
        title="Volume of audio frames",
        xaxis_title="Frame number",
        # The plotted value is a linear RMS amplitude, not a decibel level.
        yaxis_title="Volume (RMS)"
    )
    fig.show()
    
    

def get_ste(audio, N_=None):
    """Return the short-time energy (STE) of a frame: ``sum(audio ** 2) / N_``.

    This equals the squared volume of the frame; it is computed directly so
    the value does not pass through a square root and back.

    Args:
        audio: Samples of one frame.
        N_: Normalisation count; defaults to the number of samples in
            ``audio``.

    Returns:
        The short-time energy as a float.
    """
    # Float64 avoids integer wrap-around when squaring narrow integer samples.
    samples = np.asarray(audio, dtype=np.float64)
    if N_ is None:
        # Resolved per call instead of freezing a module-level value in the signature.
        N_ = samples.size
    return np.sum(np.square(samples)) / N_


def plot_ste(frames, n_=None, N_=None):
    """Plot the short-time energy of every frame against its frame index.

    Args:
        frames: 2-D array with one frame per row.
        n_: Number of points on the x axis; defaults to the number of rows
            in ``frames``.
        N_: Normalisation count passed to ``get_ste``; defaults to the number
            of samples per frame.
    """
    frames = np.asarray(frames)
    # Sizes come from the data at call time rather than from module-level
    # values frozen into the signature when the function was defined.
    if n_ is None:
        n_ = frames.shape[0]
    if N_ is None:
        N_ = frames.shape[1]

    ste = np.apply_along_axis(get_ste, 1, frames, N_=N_)
    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(
        go.Scatter(x=np.arange(0, n_), y=ste, mode='lines'),
    )

    fig.update_layout(
        title="Short Time Energy of audio frames",
        xaxis_title="Frame number",
        # The energy is a mean of squared linear samples; "dB^2" is not a unit of it.
        yaxis_title="Short Time Energy"
    )
    fig.show()
    
    
def get_zcr(audio):
    """Return the zero-crossing count of a frame.

    Computed as half the summed absolute difference between the signs of
    neighbouring samples, so a direct +/- flip contributes 1.
    """
    signs = np.sign(audio)
    sign_steps = signs[..., 1:] - signs[..., :-1]
    return np.sum(np.abs(sign_steps)) / 2


def plot_zcr(frames, n_=None):
    """Plot the zero-crossing value of every frame against its frame index.

    Args:
        frames: 2-D array with one frame per row.
        n_: Number of points on the x axis; defaults to the number of rows
            in ``frames``.
    """
    frames = np.asarray(frames)
    if n_ is None:
        # Taken from the data at call time instead of a module-level value
        # frozen into the signature at definition time.
        n_ = frames.shape[0]

    zcr = np.apply_along_axis(get_zcr, 1, frames)
    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(
        go.Scatter(x=np.arange(0, n_), y=zcr, mode='lines'),
    )

    fig.update_layout(
        title="Zero Crossing Rate of audio frames",
        xaxis_title="Frame number",
    )
    fig.show()
    
    
    

def get_sr(audio, N_=None):
    """Return the "silent ratio" of a frame: its volume divided by its ZCR.

    Args:
        audio: Samples of one frame.
        N_: Normalisation count passed to ``get_volume``; defaults to the
            number of samples in ``audio``.

    Returns:
        ``get_volume(audio, N_) / get_zcr(audio)``, or ``0`` when the frame
        has no zero crossings (which would otherwise divide by zero).
    """
    if N_ is None:
        # Resolved per call instead of freezing a module-level value in the signature.
        N_ = np.size(audio)
    zcr = get_zcr(audio)
    if zcr == 0:
        return 0
    return get_volume(audio, N_) / zcr


def plot_sr(frames, n_=None, N_=None):
    """Plot the silent ratio of every frame against its frame index.

    Args:
        frames: 2-D array with one frame per row.
        n_: Number of points on the x axis; defaults to the number of rows
            in ``frames``.
        N_: Normalisation count passed to ``get_sr``; defaults to the number
            of samples per frame.
    """
    frames = np.asarray(frames)
    # Sizes come from the data at call time rather than from module-level
    # values frozen into the signature when the function was defined.
    if n_ is None:
        n_ = frames.shape[0]
    if N_ is None:
        N_ = frames.shape[1]

    sr = np.apply_along_axis(get_sr, 1, frames, N_=N_)
    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(
        go.Scatter(x=np.arange(0, n_), y=sr, mode='lines'),
    )

    fig.update_layout(
        title="Silent Ratio of audio frames",
        xaxis_title="Frame number",
    )
    fig.show()
    
    

def get_f0(audio, l_,amdf=False):
    """Evaluate the pitch-detection function of a frame at lag ``l_``.

    Despite the name, this does not return a frequency: it returns the value
    of the autocorrelation function (default) or of the average magnitude
    difference function (AMDF) for one lag. These are the quantities a
    fundamental-frequency (F0) estimator would search over lags.

    Args:
        audio: Samples of one frame.
        l_: Lag in samples, ``0 <= l_ < len(audio)``.
        amdf: When true, return ``sum(|audio[n] - audio[n + l_]|)``;
            otherwise return ``sum(audio[n] * audio[n + l_])``.

    Returns:
        The summed value over all sample pairs that are ``l_`` apart.

    Raises:
        ValueError: If ``l_`` is negative or not smaller than the frame length.
    """
    n_samples = len(audio)
    if l_ < 0:
        raise ValueError("l_ must be non-negative")
    if l_ >= n_samples:
        raise ValueError("l_ must be smaller than the length of audio")

    # Slice with an explicit end index: audio[:-l_] is empty for l_ == 0,
    # which made a lag of zero fail instead of pairing the frame with itself.
    leading = audio[:n_samples - l_]
    lagged = audio[l_:]
    if amdf:
        return np.sum(np.abs(leading - lagged))
    return np.sum(leading * lagged)
    
    
def plot_f0(frames, l_, amdf=False, n_=None):
    """Plot the ``get_f0`` value of every frame against its frame index.

    Args:
        frames: 2-D array with one frame per row.
        l_: Lag in samples forwarded to ``get_f0``.
        amdf: Forwarded to ``get_f0``; selects AMDF instead of autocorrelation.
        n_: Number of points on the x axis; defaults to the number of rows
            in ``frames``.
    """
    frames = np.asarray(frames)
    if n_ is None:
        # Taken from the data at call time instead of a module-level value
        # frozen into the signature at definition time.
        n_ = frames.shape[0]

    f0 = np.apply_along_axis(get_f0, 1, frames, l_=l_, amdf=amdf)
    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(
        go.Scatter(x=np.arange(0, n_), y=f0, mode='lines'),
    )

    fig.update_layout(
        title="Fundamental frequency of audio frames",
        xaxis_title="Frame number",
    )
    fig.show()


In [178]:
# time-domain audio signal features at the clip level

def get_avg_amplitue(audio):
    """Return the mean of the absolute sample values of ``audio``."""
    magnitudes = np.abs(audio)
    return np.mean(magnitudes)


def get_vstd(audio):
    """Return the standard deviation of ``audio`` divided by its peak magnitude.

    vstd stands for "volume standard deviation", normalised by the maximum
    value. The statistic is taken over whatever values are passed in.
    NOTE(review): the name suggests a per-frame volume series as input -
    confirm what callers are expected to pass.
    """
    peak = np.max(np.abs(audio))
    spread = np.std(audio)
    return spread / peak


def get_vdr(audio):
    """Return ``(max - min) / max`` of the values in ``audio``.

    vdr stands for "volume dynamic range". The statistic is taken over
    whatever values are passed in.
    NOTE(review): with signed samples the minimum is negative and the result
    exceeds 1 - confirm whether a volume series was intended as input.
    """
    highest = np.max(audio)
    lowest = np.min(audio)
    return (highest - lowest) / highest

In [179]:
# Summarise the loaded clip and the framing that was applied to it.
print(f"Audio length: {np.format_float_positional(audio_time,2)} s")
print(f"Frame rate: {frame_rate} Hz")
print(f"Number of frames: {n_}")
print(f"Number of samples in a frame: {N_}")
# Frame duration in seconds = samples per frame / sampling rate.
print(f"Frame length: {np.format_float_positional(N_/frame_rate,3)}s")

Audio length: 2.21 s
Frame rate: 22050 Hz
Number of frames: 736
Number of samples in a frame: 220
Frame length: 0.01s


In [180]:
# Waveform of the whole (scaled) clip over time.
plot_audio(audio,audio_time)

In [181]:
# Per-frame volume; the size arguments fall back to the function's defaults.
plot_volumes(frames)

In [182]:
# Per-frame short-time energy.
plot_ste(frames)    

In [183]:
# Per-frame zero-crossing value.
plot_zcr(frames)

In [184]:
# Per-frame silent ratio (volume divided by the zero-crossing value).
plot_sr(frames)

In [185]:
# Per-frame AMDF value at a lag of 100 samples.
plot_f0(frames, l_=100, amdf=True)       

In [186]:
# Per-frame autocorrelation value at a lag of 100 samples.
plot_f0(frames, l_=100, amdf=False)     

In [187]:
# Clip-level features computed on the whole scaled signal (not on the frames).
print(f"Avarege amplitude: {np.format_float_positional(get_avg_amplitue(audio), precision=1)}")
print(f"VSTD: {np.format_float_positional(get_vstd(audio),precision=4)}")
print(f"VDR: {np.format_float_positional(get_vdr(audio),precision=4)}")

Avarege amplitude: 88.4
VSTD: 0.1156
VDR: 1.7741


In [197]:
# energy-based features


def split_to_sec_frames(audio, frame_rate):
    """Split ``audio`` into consecutive chunks of ``frame_rate`` samples.

    Each chunk therefore covers one second; the final chunk holds whatever
    samples remain and may be shorter.
    """
    total_samples = len(audio)
    cut_points = np.arange(frame_rate, total_samples, frame_rate)
    return np.split(audio, cut_points)


def get_lstr(frame_sec,frame_rate, percent_frame_size,percent_hop_length):
    """Return the low short-time energy ratio (LSTER) of one audio segment.

    The segment is framed with ``split_to_frames`` and the result is the
    fraction of frames whose short-time energy is below half of the segment's
    mean short-time energy:

        LSTER = 1 / (2 * n) * sum(sgn(0.5 * mean_STE - STE(i)) + 1)

    where ``n`` is the number of frames in the segment.

    Args:
        frame_sec: Samples of the segment (nominally one second of audio).
        frame_rate: Sampling rate in Hz.
        percent_frame_size: Frame length as a fraction of ``frame_rate``.
        percent_hop_length: Hop length as a fraction of the frame length.

    Returns:
        A ratio between 0 and 1.
    """
    frames, n_, N_ = split_to_frames(frame_sec, frame_rate, percent_frame_size,percent_hop_length)
    stes = np.apply_along_axis(get_ste, 1, frames, N_=N_)
    ste_mean = np.mean(stes)
    # sgn(...) + 1 is 2 for a low-energy frame and 0 otherwise, so dividing the
    # sum by twice the FRAME count gives the share of low-energy frames.
    # (A boolean comparison + 1 yields 1/2 instead of 0/2, and normalising by
    # the sample count does not produce a ratio.)
    return np.sum(np.sign(0.5*ste_mean - stes) + 1)/(2*n_)

def plot_lstr(audio, frame_rate, percent_frame_size,percent_hop_length):
    """Plot the ``get_lstr`` value of each one-second chunk of ``audio``.

    The signal is cut with ``split_to_sec_frames``; every chunk is then
    passed to ``get_lstr`` with the given framing parameters and the results
    are drawn against the chunk index.
    """
    seconds = split_to_sec_frames(audio, frame_rate)
    ratios = [
        get_lstr(chunk, frame_rate, percent_frame_size, percent_hop_length)
        for chunk in seconds
    ]
    second_index = np.arange(0, len(seconds))

    figure = make_subplots(rows=1, cols=1)
    figure.add_trace(go.Scatter(x=second_index, y=ratios, mode='lines'))
    figure.update_layout(
        title="Low short time energy ratio of audio frames",
        yaxis_title="Ratio",
        xaxis_title="Second",
    )
    figure.show()


def get_energy_entropy(frames):
    """Return the entropy (in bits) of how energy is spread across frames.

    Each frame's energy (sum of squared samples) is divided by the total
    energy to form a distribution ``p``; the result is ``-sum(p * log2(p))``.

    Args:
        frames: 2-D array with one frame per row.

    Returns:
        The entropy as a float. Frames with zero energy contribute nothing,
        and the result is ``0.0`` when the total energy is zero.
    """
    # Float64 avoids integer wrap-around when squaring narrow integer samples.
    frames = np.asarray(frames, dtype=np.float64)
    energy = np.sum(np.square(frames), axis=1)
    total_energy = np.sum(energy)
    if total_energy == 0:
        # No energy at all: the distribution is undefined, report zero entropy
        # instead of dividing by zero.
        return 0.0
    energy_dist = energy / total_energy
    # 0 * log2(0) evaluates to NaN in floating point although its limit is 0,
    # so silent frames are left out of the sum.
    occupied = energy_dist[energy_dist > 0]
    return -np.sum(occupied * np.log2(occupied))


def get_zstd(frames):
    """Return the standard deviation of the per-frame ``get_zcr`` values.

    ``frames`` is expected to hold one frame per row.
    """
    per_frame_zcr = np.apply_along_axis(get_zcr, 1, frames)
    return np.std(per_frame_zcr)


def get_hzcrr(frame_sec,frame_rate, percent_frame_size,percent_hop_length):
    """Return the high zero-crossing rate ratio (HZCRR) of one audio segment.

    The segment is framed with ``split_to_frames`` and the result is the
    fraction of frames whose zero-crossing value exceeds 1.5 times the
    segment's mean zero-crossing value:

        HZCRR = 1 / (2 * n) * sum(sgn(ZCR(i) - 1.5 * mean_ZCR) + 1)

    where ``n`` is the number of frames in the segment.

    Args:
        frame_sec: Samples of the segment (nominally one second of audio).
        frame_rate: Sampling rate in Hz.
        percent_frame_size: Frame length as a fraction of ``frame_rate``.
        percent_hop_length: Hop length as a fraction of the frame length.

    Returns:
        A ratio between 0 and 1.
    """
    frames, n_, N_ = split_to_frames(frame_sec, frame_rate, percent_frame_size,percent_hop_length)
    zcrs = np.apply_along_axis(get_zcr, 1, frames)
    zcr_mean = np.mean(zcrs)
    # sgn(...) + 1 is 2 for a high-ZCR frame and 0 otherwise, so dividing the
    # sum by twice the FRAME count gives the share of high-ZCR frames.
    # (A boolean comparison + 1 yields 1/2 instead of 0/2, and normalising by
    # the sample count does not produce a ratio.)
    return np.sum(np.sign(zcrs - 1.5*zcr_mean) + 1)/(2*n_)

def plot_hzcrr(audio, frame_rate, percent_frame_size,percent_hop_length):
    """Plot the ``get_hzcrr`` value of each one-second chunk of ``audio``.

    Args:
        audio: Samples of the whole clip.
        frame_rate: Sampling rate in Hz; also the chunk length in samples.
        percent_frame_size: Frame length as a fraction of ``frame_rate``.
        percent_hop_length: Hop length as a fraction of the frame length.
    """
    frames_sec = split_to_sec_frames(audio, frame_rate)
    hzcrr = [
        get_hzcrr(frame_sec, frame_rate, percent_frame_size, percent_hop_length)
        for frame_sec in frames_sec
    ]
    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(
        go.Scatter(x=np.arange(0, len(frames_sec)), y=hzcrr, mode='lines'),
    )

    fig.update_layout(
        # The title used to be copied from the low short-time energy plot.
        title="High zero crossing rate ratio of audio frames",
        yaxis_title="Ratio",
        xaxis_title="Second",
    )
    fig.show()
    


To będzie lepiej działało na dłuższych nagraniach, np. 30 sekund radia z mową/muzyką.

In [194]:
# LSTER per one-second chunk; frames are 0.05 * frame_rate samples long with a hop of 0.3 frame.
plot_lstr(audio, frame_rate, 0.05, 0.3)



In [195]:
# HZCRR per one-second chunk, using the same framing as the LSTER plot.
plot_hzcrr(audio, frame_rate, 0.05, 0.3)

In [196]:
# Entropy of the energy distribution across the frames of the whole clip.
get_energy_entropy(frames)

8.29254968871542

In [198]:
# Standard deviation of the per-frame zero-crossing values.
get_zstd(frames)

24.019418784255777

## Frequency Spectrum

In [None]:
# Spectrogram of the whole clip drawn with matplotlib.
plt.figure(figsize=(10, 6))
# Fs tells specgram the sampling rate so the axes come out in Hz and seconds.
plt.specgram(audio, Fs=frame_rate)
plt.title('Singal spectrogram')
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')
# Limit the time axis to the clip duration.
plt.xlim(0, audio_time)
plt.colorbar()
plt.show()