# Mel Spectrogram

In [55]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [56]:
#libraries 
#basic imports
import os
import numpy as np
from pathlib import Path

# PyTorch setup
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split

# metrics
from sklearn.metrics import classification_report

import librosa
import matplotlib.pyplot as plt


## Mel Spectrogram Feature Extraction Function 

*  Taking an audio file path and getting a fixed size mel spectrogram
* visual representation of the sound frequencies over time
* meant to mimic how humans perceive sound

In [49]:
# define
def extract_mel_spectrogram(file_path, n_mels=128, max_len=128):
    try:
        y, sr = librosa.load(file_path, sr=16000)  # resample to 16kHz
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        S_dB = librosa.power_to_db(S, ref=np.max)

        # padding or truncating to fixed length
        if S_dB.shape[1] < max_len:
            pad_width = max_len - S_dB.shape[1]
            S_dB = np.pad(S_dB, ((0, 0), (0, pad_width)), mode='constant')
        else:
            S_dB = S_dB[:, :max_len]
            
# processed spectrogram
        return S_dB
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [50]:
# directory setup 
# set paths for mel spectrogram features
real_mel_dir = Path("mel_features/real")
fake_mel_dir = Path("mel_features/fake")

# creating directories if they don't exist
real_mel_dir.mkdir(parents=True, exist_ok=True)
fake_mel_dir.mkdir(parents=True, exist_ok=True)

# path for raw audio files
real_base = Path("fake_audio/REAL")
fake_base = Path("fake_audio/FAKE")

## Extraction and Save 
* converting raw files into mel spectrogram and saving as .npy files

In [51]:
# loop through all folders
def process_and_save(base_path, out_path):
    for folder in base_path.iterdir():
        if folder.is_dir():
            for wav_file in folder.glob("*.wav"):
                save_name = f"{folder.name}_{wav_file.name.replace('.wav', '.npy')}"
                save_path = out_path / save_name

                # skip if already processed - skip logic! 
                if save_path.exists():
                    print(f"Skipping already processed: {save_path}")
                    continue

                features = extract_mel_spectrogram(str(wav_file))
                if features is not None:
                    np.save(save_path, features)
                    print(f"Saved features to: {save_path}")


In [52]:
# calling function 
process_and_save(real_base, real_mel_dir)
process_and_save(fake_base, fake_mel_dir)


Skipping already processed: mel_features/real/obama-original_segment_5660.npy
Skipping already processed: mel_features/real/obama-original_segment_5667.npy
Skipping already processed: mel_features/real/obama-original_segment_5615.npy
Skipping already processed: mel_features/real/obama-original_segment_5669.npy
Skipping already processed: mel_features/real/obama-original_segment_5731.npy
Skipping already processed: mel_features/real/obama-original_segment_5703.npy
Skipping already processed: mel_features/real/obama-original_segment_5627.npy
Skipping already processed: mel_features/real/obama-original_segment_5620.npy
Skipping already processed: mel_features/real/obama-original_segment_5704.npy
Skipping already processed: mel_features/real/obama-original_segment_5683.npy
Skipping already processed: mel_features/real/obama-original_segment_5652.npy
Skipping already processed: mel_features/real/obama-original_segment_5684.npy
Skipping already processed: mel_features/real/obama-original_seg

Skipping already processed: mel_features/fake/biden-to-ryan_segment_2470.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2477.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2553.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2452.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2483.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2455.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2484.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2503.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2504.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2469.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2531.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2536.npy
Skipping already processed: mel_features/fake/biden-to-ryan_segment_2544.npy

## Truncating / Padding Features

* function so all features have the same shape 
input: a 2d array with shape 
output: a 2d array padded or truncated 

* mel spectrogram can have slightly different time durations, resulting in inconsistent shapes. Function standardizes them. 

In [41]:
# all features have same shape - for NN uniform input size
def pad_or_truncate(feature, target_shape=(128, 256)):
    h, w = feature.shape
    target_h, target_w = target_shape

    # truncate if too big
    feature = feature[:target_h, :target_w]

    # pad if too small
    pad_h = target_h - feature.shape[0]
    pad_w = target_w - feature.shape[1]

    return np.pad(feature, ((0, pad_h), (0, pad_w)), mode='constant')


In [42]:
# loading and labeling features 
# apply padding/ truncation to .npy files
X = []
y = []

# loading/ processing files
for file in Path("mel_features/real").glob("*.npy"):
    mel = np.load(file)
    X.append(pad_or_truncate(mel))
    y.append(0)

for file in Path("mel_features/fake").glob("*.npy"):
    mel = np.load(file)
    X.append(pad_or_truncate(mel))
    y.append(1)

# converting into numpy arrays 
X = np.array(X)
y = np.array(y)

print("Shape:", X.shape, y.shape)


Shape: (3725, 128, 256) (3725,)


In [43]:
#flattening mel spectrogram - 1D vector
X = X.reshape(X.shape[0], -1)
print("Flattened shape:", X.shape)  


Flattened shape: (3725, 32768)


In [44]:
# splitting dataset for training and testing 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (2980, 32768) Test shape: (745, 32768)


In [None]:
## Defining and Training on Mel Spectrogram 

In [45]:
# import libraries 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report

# convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# create a dataloader
# batches training data and shuffles to improve generalization
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=32, shuffle=True)

# model def: MLP model
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(), # ReLU
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid() # single neuron with sigmoid function
        )

    def forward(self, x):
        return self.net(x)

model = MLP(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# training loop
for epoch in range(100):
    model.train()
    for xb, yb in train_loader:
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward() # binary cross entropy
        optimizer.step() # adam to adjust weight during backprop
    print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")


Epoch 1 Loss: 1.5813
Epoch 2 Loss: 0.1814
Epoch 3 Loss: 0.0003
Epoch 4 Loss: 0.0088
Epoch 5 Loss: 0.0149
Epoch 6 Loss: 0.0401
Epoch 7 Loss: 0.1454
Epoch 8 Loss: 0.0023


KeyboardInterrupt: 

In [25]:
# evaluation
model.eval()
with torch.no_grad():
    test_preds = model(X_test_tensor)
    test_preds = torch.round(test_preds).squeeze() # values >= are classified as 1 
    # values < 0.5 are classified as real 

from sklearn.metrics import classification_report
print(classification_report(y_test, test_preds.numpy()))


              precision    recall  f1-score   support

           0       0.74      0.38      0.50       151
           1       0.86      0.97      0.91       594

    accuracy                           0.85       745
   macro avg       0.80      0.67      0.70       745
weighted avg       0.84      0.85      0.83       745



In [26]:
# saving model 
torch.save(model.state_dict(), "mlp_melspec_model.pth")

In [27]:
# saving testing
np.save("y_test.npy", y_test)