In [1]:
import numpy as np 
import scipy as sp
import pandas as pd 
import librosa 
import sklearn
import matplotlib.pyplot as plt 
from pathlib import Path 
import pathlib
from IPython.display import Audio,display
from tqdm import tqdm   


In [2]:
audio_dir:Path = Path("/home/pujan/D/projects/dsp/datasets/audio/")

In [3]:
audio_paths:list[pathlib.PosixPath]= [file for file in audio_dir.iterdir()]

##### Human voices are often less than 8k hz. so according to nyquist theorem, the sampling rate should be twice the maximum number of frequency present , hence :- 16k HZ is often enough for human audio analysis.

In [4]:
SAMPLING_RATE:int = 16000;

In [5]:
y:np.array;
sr:int;
y,sr = librosa.load(audio_paths[100],sr=SAMPLING_RATE);
y

array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
       -1.9121170e-04,  2.9802322e-05,  1.4925003e-04],
      shape=(140160,), dtype=float32)

In [6]:
Audio(y,rate=SAMPLING_RATE)

In [7]:
target_df = pd.read_csv("/home/pujan/D/projects/dsp/datasets/metadata.csv")

In [8]:
target_df.head()

Unnamed: 0,file_name,gender
0,7159862176919548258.wav,male
1,13394269605176372148.wav,male
2,17562067154452877054.wav,male
3,11846076079276902855.wav,male
4,4490555107572319616.wav,male


In [9]:
gender = target_df.loc[target_df["file_name"]=="4490555107572319616.wav",["gender"]].values[0][0]

#### Lowcut :-	80 – 100 Hz	 //Removes low-frequency noise: mic hum (50/60 Hz), rumble, handling noise; keeps fundamental male voice frequencies
#### Highcut :-	3000 – 3500 Hz	//Removes high-frequency hiss, some background noise; preserves most speech intelligibility and clarity

In [10]:
#constansts

LOW_CUTOFF = 80 #HZ
HIGH_CUTOFF = 3000 #HZ
ORDER = 4 #order of the filter

In [11]:
# Designing the Butterworth Bandpass Filter 
# 'fs' is the sampling frequency.
# 'btype' is 'band' for bandpass.
# 'output='sos'' is crucial for numerical stability.
sos = sp.signal.butter(ORDER, [LOW_CUTOFF, HIGH_CUTOFF], btype='band', analog=False, output='sos', fs=SAMPLING_RATE)
print("Filter designed successfully.")

Filter designed successfully.


In [12]:
filtered_signal = sp.signal.sosfilt(sos,y)
Audio(filtered_signal,rate=SAMPLING_RATE)

In [13]:
mfcc = librosa.feature.mfcc(
    y=y, sr=sr,
    n_mfcc=20,           # number of mfcc coefficients
    n_fft=512,          # fft window size
    hop_length=128,      # Step between frames
    n_mels=40,           # number of mel filters
    );

In [14]:
s = {}

for index,val in enumerate(mfcc,start=1):
    s[f"mfcc_{index}"] = val.mean()

In [15]:
s

{'mfcc_1': np.float32(-266.62024),
 'mfcc_2': np.float32(68.539764),
 'mfcc_3': np.float32(2.3052788),
 'mfcc_4': np.float32(10.778232),
 'mfcc_5': np.float32(-2.006714),
 'mfcc_6': np.float32(-9.526618),
 'mfcc_7': np.float32(-13.331796),
 'mfcc_8': np.float32(-4.8699117),
 'mfcc_9': np.float32(-10.175492),
 'mfcc_10': np.float32(-5.098766),
 'mfcc_11': np.float32(-3.8963141),
 'mfcc_12': np.float32(-10.575413),
 'mfcc_13': np.float32(-1.0341904),
 'mfcc_14': np.float32(-6.0163465),
 'mfcc_15': np.float32(-8.175736),
 'mfcc_16': np.float32(-2.9310784),
 'mfcc_17': np.float32(-5.3520722),
 'mfcc_18': np.float32(-5.7421722),
 'mfcc_19': np.float32(-5.88935),
 'mfcc_20': np.float32(-4.214374)}

**What are Mel-Frequency Cepstral Coefficients?**


In simple terms, MFCCs are the "fingerprint" of a sound. They provide a compact representation of the shape of a sound's spectrum, which is crucial for identifying its phonetic content (like vowels and consonants).
They are the most widely used features in automatic speech recognition (ASR) because they effectively model how humans perceive sound.


Let's break down the name:


**Mel-Frequency**: The Mel-scale is specifically designed to mimic the way humans perceive sound, particularly how we discern differences in pitch. Human hearing is more sensitive to changes in lower frequencies than to equivalent changes in higher frequencies.


This refers to the Mel scale, a perceptual scale of pitches judged by listeners to be equal in distance from one another. Humans are much better at distinguishing between small changes in low frequencies than in high frequencies (e.g., the difference between 100 Hz and 200 Hz is much more obvious than between 10,000 Hz and 10,100 Hz). The Mel scale warps the standard frequency (Hz) to match this human perception.


**Cepstral**: This is a clever (and slightly confusing) term. It's an anagram of "spectral". A cepstrum is the spectrum of a spectrum. The purpose of this step is to separate the source of the sound (e.g., the buzzing of the vocal cords) from the filter (e.g., the shape of the mouth and vocal tract). The shape of the vocal tract is what determines the phoneme we produce, so that's the information we want to keep. The DCT step (explained below) accomplishes this separation.


**Coefficients**: The final output is a set of numbers (coefficients) that represent this filtered shape for a very short slice of audio.


**Why are they so useful?**


By focusing on the perceptually meaningful shape of the spectrum and discarding less relevant information like the fundamental frequency (pitch), MFCCs provide a robust set of features for machine learning models to identify the underlying speech content, regardless of who is speaking.





## **How to compute MFCC?**

To calculate MFCCs, we follow these steps:

Pre-emphasize the signal: Amplify higher frequencies to balance the spectrum.

Framing: Break the signal into small, overlapping frames.

Windowing: To soften the edges of each frame, apply a Hamming window.

FFT: Convert each frame from the time domain to the frequency domain.

Mel-filterbank: Apply overlapping triangular filters spaced according to the Mel-scale.

Logarithm: To replicate the way a human ear reacts to sound strength take the logarithm of the filterbank outputs.

DCT: Apply the DCT to the log Mel-spectrum to obtain the Mel-frequency Cepstral Coefficients.

In [16]:
%%time

f0_mean_list = []
f0_std_list = []
centroid_list = []
mfcc_mean_list = []
audio_file_name = []
zcr_mean_list = []
gender_list:list[str] = []


for audio_path in tqdm(audio_paths,desc="Extracting features"):
    y:np.array;
    sr:int;
    y,sr = librosa.load(audio_path,sr=16000,mono=True);
    y = sp.signal.sosfilt(sos,y)
    f0_yin = librosa.yin(y, fmin=80, fmax=400, sr=sr);
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr);
    mfcc = librosa.feature.mfcc(
    y=y, sr=sr,
    n_mfcc=20,           # number of mfcc coefficients
    n_fft=512,          # fft window size
    hop_length=128,      # Step between frames
    n_mels=40,           # number of mel filters
    );
    mfcc = mfcc[1:,:]

    mfccs = {f"mfcc_{i}":round(val.mean(),3) for i,val in enumerate(mfcc,start=1)}

    zcr = librosa.feature.zero_crossing_rate(y, frame_length=512, hop_length=128);

    gender = target_df.loc[target_df["file_name"]==audio_path.name,["gender"]].values[0][0];

    f0_mean_list.append(round(f0_yin.mean(),3));
    f0_std_list.append(round(f0_yin.std(),3));
    centroid_list.append(round(centroid.mean(),3));
    mfcc_mean_list.append(mfccs);
    zcr_mean_list.append(round(zcr.mean(),3));
    audio_file_name.append(audio_path.name);
    gender_list.append(gender);
    

Extracting features: 100%|██████████| 3058/3058 [08:10<00:00,  6.23it/s] 

CPU times: user 18min 34s, sys: 16.3 s, total: 18min 51s
Wall time: 8min 10s





In [17]:
mfcc_mean_list

[{'mfcc_1': np.float64(103.231),
  'mfcc_2': np.float64(0.718),
  'mfcc_3': np.float64(25.404),
  'mfcc_4': np.float64(2.038),
  'mfcc_5': np.float64(-1.094),
  'mfcc_6': np.float64(5.204),
  'mfcc_7': np.float64(-5.565),
  'mfcc_8': np.float64(0.577),
  'mfcc_9': np.float64(3.539),
  'mfcc_10': np.float64(-2.244),
  'mfcc_11': np.float64(-2.243),
  'mfcc_12': np.float64(2.965),
  'mfcc_13': np.float64(-1.824),
  'mfcc_14': np.float64(-4.525),
  'mfcc_15': np.float64(1.5),
  'mfcc_16': np.float64(-1.237),
  'mfcc_17': np.float64(-4.069),
  'mfcc_18': np.float64(-0.35),
  'mfcc_19': np.float64(0.058)},
 {'mfcc_1': np.float64(96.974),
  'mfcc_2': np.float64(-15.822),
  'mfcc_3': np.float64(15.12),
  'mfcc_4': np.float64(1.485),
  'mfcc_5': np.float64(-14.47),
  'mfcc_6': np.float64(0.921),
  'mfcc_7': np.float64(-1.934),
  'mfcc_8': np.float64(-4.715),
  'mfcc_9': np.float64(-2.218),
  'mfcc_10': np.float64(-3.464),
  'mfcc_11': np.float64(-3.024),
  'mfcc_12': np.float64(-3.933),
  'mfc

In [18]:
def return_mfcc(i:int):
    mfcc =[a[f"mfcc_{i}"] for a in mfcc_mean_list]
    return mfcc

In [19]:
len(audio_file_name)

3058

In [None]:
features:pd.DataFrame = pd.DataFrame({
    "file_name":audio_file_name,
    "f0_mean":f0_mean_list,
    "f0_std":f0_std_list,
    "centroid_mean":centroid_list,
    "zcr_mean":zcr_mean_list,
    "gender":gender_list
});

In [21]:
features.head()

Unnamed: 0,file_name,f0_mean,f0_std,centroid_mean,zcr_mean
0,7159862176919548258.wav,135.841,24.243,740.331,0.051
1,12951848363959320156.wav,226.774,55.77,909.476,0.076
2,13394269605176372148.wav,139.675,21.848,772.593,0.055
3,17562067154452877054.wav,134.99,30.237,1161.595,0.1
4,9431791451113209518.wav,210.459,45.223,1007.198,0.082


In [None]:
for i in range(1,20):
    features[f"mfcc_{i}"] = return_mfcc(i)


In [None]:
features.head()     

Index(['file_name', 'f0_mean', 'f0_std', 'centroid_mean', 'zcr_mean', 'mfcc_1',
       'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8',
       'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'mfcc_14',
       'mfcc_15', 'mfcc_16', 'mfcc_17', 'mfcc_18', 'mfcc_19'],
      dtype='object')

In [None]:
features.to_csv("train_data.csv",index=False);