In [1]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Load the recording. No explicit `sr` is passed, so librosa resamples to its
# default target rate (22050 Hz in the recorded output below); pass sr=None to
# keep the file's native rate instead.
audio_file = "q5_audio.wav"
y, sr = librosa.load(audio_file)

# Basic facts about the signal. Note that the last line is really the number
# of *samples* that fit in a 20 ms window (20 ms * sr / 1000), which is a
# useful guide when picking frame_length / hop_length.
summary_lines = (
    f"sampling rate = {sr} and no of sample = {len(y)}",
    f"duration = {len(y)/sr}",
    f"frames in 20ms = {20*sr/1000}",
)
print(*summary_lines, sep="\n")

sampling rate = 22050 and no of sample = 259840
duration = 11.784126984126985
frames in 20ms = 441.0


In [None]:
# Framing parameters shared by every short-time feature below (in samples).
frame_length = 512                # samples per analysis frame (window size)
hop_length = frame_length // 2    # shift between frame starts: 256, i.e. 50% overlap


In [None]:
# Both features use the same framing so that their frame indices line up
# and they can share one time axis.
framing = dict(frame_length=frame_length, hop_length=hop_length)

# Short-time energy, measured as the root-mean-square value of each frame
# (an amplitude-scale quantity, not the raw sum of squares). [0] takes the
# first row of the returned 2-D array, which has one row for a mono signal.
energy = librosa.feature.rms(y=y, **framing)[0]

# Short-time zero-crossing rate: one value per frame, indexed the same way.
zero_crossings = librosa.feature.zero_crossing_rate(y=y, **framing)[0]


In [None]:

# Time axis: one index per analysis frame, converted to seconds using the
# same hop_length the features were computed with.
frames = range(energy.shape[0])
t = librosa.frames_to_time(frames, hop_length=hop_length, sr=sr)


In [None]:
# One figure with two stacked panels that share the same time axis values:
# short-time energy on top, zero-crossing rate underneath.
fig, (ax_energy, ax_zcr) = plt.subplots(2, 1, figsize=(10, 6))

# Top panel: short-time energy
ax_energy.plot(t, energy, label='Energy')
ax_energy.set_xlabel('Time (s)')
ax_energy.set_ylabel('Energy')
ax_energy.set_title('Short-Time Energy')
ax_energy.legend()

# Bottom panel: short-time zero-crossing rate (drawn in red)
ax_zcr.plot(t, zero_crossings, label='Zero-Crossing Rate', color='r')
ax_zcr.set_xlabel('Time (s)')
ax_zcr.set_ylabel('Zero-Crossing Rate')
ax_zcr.set_title('Short-Time Zero-Crossing Rate')
ax_zcr.legend()

# Keep the upper panel's x-label from colliding with the lower panel's title.
fig.tight_layout()
plt.show()
In this code, `frame_length` is the length of each analysis frame in samples, and `hop_length` is the number of samples between the starts of consecutive frames. Together, these parameters control the window size and the window shift of the analysis. A smaller `frame_length` provides higher time resolution but lower frequency resolution, and vice versa. The `hop_length` determines the overlap between consecutive frames (`frame_length - hop_length` samples): a smaller hop means more overlap and a smoother, more densely sampled feature curve.

In the example above, I've used 512 samples for `frame_length` and 256 samples for `hop_length` (roughly 23 ms and 12 ms at the 22050 Hz sampling rate, i.e. 50% overlap), but you can adjust these values based on the characteristics of your audio and the trade-off between time and frequency resolution that you desire.





