# Audio Analysis of FLAC Files
This notebook analyzes FLAC audio files to extract features such as bitrate, amplitude, and more, and then plots them to give a basic overview of the audio format.

## Install Dependencies from Requirements.txt

In [None]:
#  pip install -r requirements.txt

## Load Dependencies

In [None]:
import os
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf 
import tensorflow_io as tfio


## Set Defined Configuration

In [None]:
speaker = "2"
database_name = "Mendeley Data\differentPhrase"
dataset_path = f"audio\speakers\{database_name}"
preprocess = True

## Function to Extract Audio Features
This function will extract features such as bitrate, amplitude, and more from FLAC files.

In [None]:
def extract_features(file_path):
	# Load audio file, refer to https://librosa.org/doc/main/generated/librosa.resample.html for res_type
	audio, sample_rate = librosa.load(file_path, res_type='kaiser_best')
 
 	# Chromas
	stft = np.abs(librosa.stft(audio))
	chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

	cqt = np.abs(librosa.cqt(y=audio, sr=sample_rate))
	# chroma_cqt = np.mean(librosa.feature.chroma_cqt(C=cqt, sr=sample_rate).T, axis=0)

	# vqt = np.abs(librosa.vqt(audio, sr=sample_rate))
	# chroma_vqt = np.mean(librosa.feature.chroma_vqt(V=vqt, sr=sample_rate).T, axis=0)

	chroma_cens = np.mean(librosa.feature.chroma_cens(C=cqt, sr=sample_rate).T, axis=0)

    # Extract Mel Spectrogram
	mel_spectrogram = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate).T, axis=0)
    
	# Mel-Frequency Cepstral Coefficients
	mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
	# Standardization: The first approach ((mfccs - np.mean(mfccs, axis=1, keepdims=True)) / (np.std(mfccs, axis=1, keepdims=True) + 1e-6)) is used to standardize the MFCC features, maintaining their temporal structure while normalizing their scales.
	# Averaging: The second approach (np.mean(mfccs.T, axis=0)) simply averages the MFCC features over time, losing the temporal information but providing a single summary vector.
	mfccs_scaled = (mfccs - np.mean(mfccs, axis=1, keepdims=True)) / (np.std(mfccs, axis=1, keepdims=True) + 1e-6)
	# mfccs_scaled = np.mean(mfccs.T, axis=0)
	mfccs_mean = np.mean(mfccs_scaled.T, axis=0)

	# root-mean-square
	rms = np.mean(librosa.feature.rms(y=audio).T, axis=0)
 
	# spectral centroid
	spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate).T, axis=0)
 
	# polynomial features
	poly_features = np.mean(librosa.feature.poly_features(y=audio, sr=sample_rate).T, axis=0)
 
 	#	tonal centroid features 
	tonnetz = np.mean(librosa.feature.tonnetz(y=audio, sr=sample_rate).T, axis=0)
 
  	# Zero-Crossing Rate
	zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio).T, axis=0)

	# Stack features
	features = np.hstack([
		chroma_stft,
		# chroma_cens,
		mel_spectrogram,
		mfccs_mean,
		rms,
		# spectral_centroid,
		# poly_features,
		# tonnetz,
		zcr
	])

	# features = np.vstack([
	# 	zcr,
	# 	mfcc,
	# 	mel_spectrogram,
	# 	chroma_stft
	# ])
 
	# stacked_features = librosa.feature.stack_memory(features.T, n_steps=5)
    
	return features


## Load Database and Analyze Multiple FLAC Files
This code will load multiple FLAC files from a directory and extract features from each file.

In [None]:
count = 0
audio_features = []

for speaker in os.listdir(dataset_path):
    speaker_path = os.path.join(dataset_path, speaker)
    if os.path.isdir(speaker_path):
        for file_name in os.listdir(speaker_path):
            file_path = os.path.join(speaker_path, file_name)
            if file_path.endswith('.flac') and count < 2:
                audio = tf.io.read_file(file_path)
                print(audio.shape)
                # features = extract_audio_features(file_path)
                # audio_features.append(features)
                count+=1

df_features = pd.DataFrame(audio_features)
df_features.head()

## Plotting Audio Features
This section will plot the extracted audio features for better visualization.

In [7]:
# Plot duration
plt.figure(figsize=(10, 6))
plt.hist(df_features['duration'], bins=20, color='blue', alpha=0.7)
plt.title('Audio Duration Distribution')
plt.xlabel('Duration (seconds)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Plot amplitude
plt.figure(figsize=(10, 6))
plt.hist(df_features['amplitude'], bins=20, color='green', alpha=0.7)
plt.title('Amplitude Distribution')
plt.xlabel('Amplitude')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Plot RMS
plt.figure(figsize=(10, 6))
plt.hist(df_features['rms'], bins=20, color='red', alpha=0.7)
plt.title('RMS Energy Distribution')
plt.xlabel('RMS Energy')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Plot Zero Crossing Rate
plt.figure(figsize=(10, 6))
plt.hist(df_features['zcr'], bins=20, color='purple', alpha=0.7)
plt.title('Zero Crossing Rate Distribution')
plt.xlabel('Zero Crossing Rate')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Plot Spectral Centroid
plt.figure(figsize=(10, 6))
plt.hist(df_features['spectral_centroid'], bins=20, color='orange', alpha=0.7)
plt.title('Spectral Centroid Distribution')
plt.xlabel('Spectral Centroid')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

KeyError: 'duration'

<Figure size 1000x600 with 0 Axes>

## Save Features to CSV
Finally, save the extracted features to a CSV file for later use in machine learning tasks.

In [None]:
df_features.to_csv('data/audio_features.csv', index=False)
print('Audio features saved to audio_features.csv')