In [1]:
%%capture
pip install PyWavelets

In [2]:
%%bash
git clone https://github.com/fbcotter/pytorch_wavelets.git
cd pytorch_wavelets
pip install .
cd ..
rm -r pytorch_wavelets

Cloning into 'pytorch_wavelets'...


Processing /media/greca/HD/GitHub/ser-pytorch/notebooks/pytorch_wavelets
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pytorch-wavelets
  Building wheel for pytorch-wavelets (setup.py): started
  Building wheel for pytorch-wavelets (setup.py): finished with status 'done'
  Created wheel for pytorch-wavelets: filename=pytorch_wavelets-1.3.0-py3-none-any.whl size=54851 sha256=8eb842a3b29fe5fa00de6912abdcbdd69dc585d76c3b885de71d39e609aa66e3
  Stored in directory: /tmp/pip-ephem-wheel-cache-4287ticv/wheels/d9/6f/3c/68cbd8ac7dc59a9d948be15c30f0c07cbf4b8432b16e42b125
Successfully built pytorch-wavelets
Installing collected packages: pytorch-wavelets
  Attempting uninstall: pytorch-wavelets
    Found existing installation: pytorch-wavelets 1.3.0
    Uninstalling pytorch-wavelets-1.3.0:
      Successfully uninstalled pytorch-wavelets-1.3.0
Successfully installed pytorch-wavelets-1.3.0


In [3]:
import pywt
import pandas as pd
import os
import torch
import torch.nn as nn
import torchaudio
import numpy as np
import torch.nn.functional as F
import pywt
import random
from audiomentations import Normalize
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from pytorch_wavelets import DWTForward, DTCWTForward, ScatLayer
from typing import Tuple, List, Dict

# Making sure the experiments are reproducible
seed = 2109
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [37]:
def create_propor_train_dataframe(
    path: str
) -> pd.DataFrame:
    """
    Creates a PROPOR 2022's pandas DataFrame containing
    all the training files using the same structure as the
    `test_ser_metadata.csv` file.
    
    Args:
        path (str): the path to the CSV file.
    
    Returns:
        df (pd.DataFrame): the pandas DataFrame.
    """
    wav_files = [
        file
        for file in os.listdir(path)
        if file.endswith(".wav")
    ]
    df = pd.DataFrame()
    
    for wav in wav_files:
        wav_file = os.path.basename(wav)
        wav_file = wav_file.split("/")[0]
        label = wav_file.split("_")[-1].replace(".wav", "")
        
        row = pd.DataFrame({
            "file": [os.path.join(path, wav_file)],
            "label": [label],
            "wav_file": [wav_file]
        })
        
        df = pd.concat(
            [df, row],
            axis=0
        )
    
    return df.reset_index(drop=True)

def stereo_to_mono(
    audio: torch.Tensor
) -> torch.Tensor:
    """
    Converts a stereo audio to mono.
    
    Args:
        audio (torch.Tensor): the audio's waveform (stereo).
    
    Returns:
        torch.Tensor: the audio's waveform (mono).
    """
    audio = torch.mean(audio, dim=0, keepdim=True)
    return audio

def read_audio(
    path: str,
    to_mono: bool = True,
    sample_rate: int = None
) -> Tuple[torch.Tensor, int]:
    """
    Reads a audio file.

    Args:
        path (str): the audio file's path.
        to_mono (bool, optional): convert the signal to mono. Defaults to True.
        sample_rate (int, optional): resample the audio to that specific sample rate
                                     (None if you won't resample). Defaults to None.

    Returns:
        Tuple[torch.Tensor, int]: the audio waveform and the sample rate.
    """
    audio, sr = torchaudio.load(filepath=path)
    
    if sample_rate is not None and sample_rate != sr:
        audio = resample_audio(
            audio=audio,
            sample_rate=sr,
            new_sample_rate=sample_rate
        )
    
    if to_mono and audio.shape[0] > 1:
        audio = stereo_to_mono(audio=audio)
    
    return audio, sr

def resample_audio(
    audio: torch.Tensor,
    sample_rate: int,
    new_sample_rate: int
) -> torch.Tensor:
    """
    Resamples a given audio.

    Args:
        audio (torch.Tensor): the audio's waveform.
        sample_rate (int): the original audio's sample rate.
        new_sample_rate (int): the new audio's sample rate.

    Returns:
        torch.Tensor: the resampled audio's waveform.
    """
    transform = torchaudio.transforms.Resample(
        orig_freq=sample_rate,
        new_freq=new_sample_rate
    )
    audio = transform(audio)
    return audio

def extract_spectrogram(
    audio: torch.Tensor,
    n_fft: int,
    hop_length: int
) -> torch.Tensor:
    """
    Extracts the spectrogram of a given audio.
    
    Args:
        audio (torch.Tensor): the audio's waveform.
        n_fft (int): the number of fft.
        hop_length (int): the hop length.
        
    Returns:
        torch.Tensor: the extracted Spectogram.
    """
    transform = torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        hop_length=hop_length
    )
    spectrogram = transform(audio)
    return spectrogram

def extract_melspectrogram(
    audio: torch.Tensor,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    n_mels: int
) -> torch.Tensor:
    """
    Extracts the mel spectrogram of a given audio.
    
    Args:
        audio (np.ndarray): the audio's waveform.
        sample_rate (int): the audio's sample rate.
        n_fft (int): the number of fft.
        hop_length (int): the hop length.
        n_mels (int): the number of mels.
        f_min (int): the minimum frequency.
        f_max (int): the maximum frequency.
        
    Returns:
        torch.Tensor: the extracted Mel Spectrogram.
    """
    transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        power=1
    )
    mel_spectrogram = transform(audio)
    return mel_spectrogram

def pad_features(
    features: List,
    max_height: int,
    max_width: int
) -> torch.Tensor:
    """
    Auxiliary function to pad the features.
    
    Args:
        features (List): the features that will be padded (mfcc, spectogram or mel_spectogram).
        max_height (int): the height max value.
        max_width (int): the width max value.
    
    Returns:
        List: the padded features.
    """
    features = [
        F.pad(f, (0, max_width - f.size(2), 0, max_height - f.size(1)))
        for f in features
    ]
    return features

In [5]:
propor_path = "/media/greca/HD/Datasets/PROPOR 2022/data_train/train"
df = create_propor_train_dataframe(propor_path)
df["label"] = df["label"].replace({
    "neutral": 0,
    "non-neutral-male": 1,
    "non-neutral-female": 2
})
df.head()

Unnamed: 0,file,label,wav_file
0,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,1,bpubdl02_segment247_non-neutral-male.wav
1,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bpubmn14_segment89_neutral.wav
2,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bfamdl26_segment93_neutral.wav
3,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bfammn27_segment275_neutral.wav
4,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bfamcv02_segment276_neutral.wav


In [6]:
labels = []
data = []
audios = []

for file, label in zip(df["file"], df["label"]):
    audio, sr = read_audio(
        path=file,
        to_mono=True,
        sample_rate=None
    )
    
    feat = extract_melspectrogram(
        audio=audio,
        sample_rate=sr,
        n_fft=1024,
        hop_length=512,
        n_mels=60
    )
    
#     feat = extract_spectrogram(
#         audio=audio,
#         n_fft=1024,
#         hop_length=512    
#     )
    
    data.append(feat)
    labels.append(label)
    audios.append(audio)

In [7]:
max_height = max([x.size(1) for x in data])
max_width = max([x.size(2) for x in data])

data = pad_features(
    features=data,
    max_height=max_height,
    max_width=max_width
)

data = torch.cat(data, 0).to(dtype=torch.float32)
data = data.unsqueeze(1)
labels = torch.as_tensor(labels).to(dtype=torch.long)

print(data.shape, labels.shape)

torch.Size([625, 1, 60, 439]) torch.Size([625])


## Comparing PyTorch Wavelets vs PyWavelets

In [8]:
audio_test = data[0, :, :, :].unsqueeze(0)
maxlevel = 4
wavelet="db6"

# PyTorch Wavelets
xfm = DWTForward(
    J=maxlevel,
    wave=wavelet, 
    mode="symmetric"
)
Yl, Yh = xfm(audio_test)
print(Yl.shape)
print(len(Yh))

for i in range(len(Yh)):
    print(Yh[i].shape)

print(); print();

# PyWavelets
coeffs = pywt.wavedec2(
    data=audio_test,
    level=maxlevel,
    wavelet=wavelet, 
    mode="symmetric",
    axes=(-2,-1)
)

print(coeffs[0].shape)
for j in range(maxlevel):
    for b in range(3):
        print(coeffs[maxlevel-j][b].shape)

print()

arr, coeff_slices = pywt.coeffs_to_array(coeffs, axes=(-2, -1))
print(arr.shape)

torch.Size([1, 1, 14, 37])
4
torch.Size([1, 1, 3, 35, 225])
torch.Size([1, 1, 3, 23, 118])
torch.Size([1, 1, 3, 17, 64])
torch.Size([1, 1, 3, 14, 37])


(1, 1, 14, 37)
(1, 1, 35, 225)
(1, 1, 35, 225)
(1, 1, 35, 225)
(1, 1, 23, 118)
(1, 1, 23, 118)
(1, 1, 23, 118)
(1, 1, 17, 64)
(1, 1, 17, 64)
(1, 1, 17, 64)
(1, 1, 14, 37)
(1, 1, 14, 37)
(1, 1, 14, 37)

(1, 1, 103, 481)




In [9]:
# Code taken from: https://github.com/fbcotter/pytorch_wavelets/blob/master/tests/test_dwt.py
from pytorch_wavelets import DWTForward, DWTInverse
import numpy as np

PREC_FLT = 3
PREC_DBL = 7
J = 4
wave = "db6"
mode = "symmetric"
x = audio_test

dwt = DWTForward(J=J, wave=wave, mode=mode)
yl, yh = dwt(x)

coeffs = pywt.wavedec2(x.cpu().numpy(), wave, level=J, axes=(-2,-1),
                       mode=mode)
np.testing.assert_array_almost_equal(yl.cpu(), coeffs[0], decimal=PREC_FLT)
for j in range(J):
    for b in range(3):
        np.testing.assert_array_almost_equal(
            coeffs[J-j][b], yh[j][:,:,b].cpu(), decimal=PREC_FLT)

In [10]:
arr, coeff_slices = pywt.coeffs_to_array(coeffs, axes=(-2, -1))
print(arr.shape)

(1, 1, 103, 481)


In [11]:
arr, coeff_slices = pywt.coeffs_to_array(coeffs, axes=(-2, -1))
print(arr.shape)

(1, 1, 103, 481)


## PyTorch Wavelets

### Extracting the Wavelets

In [12]:
class SER_Dataset_PTH(Dataset):
    def __init__(
        self,
        X: torch.Tensor,
        y: torch.Tensor
    ) -> None:
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(
        self,
        index: int
    ) -> Dict:
        batch = {}
        batch["features"] = self.X[index, :, :, :]
        batch["labels"] = self.y[index]
        return batch
    
def create_dataloader_pth(
    X: torch.Tensor,
    y: torch.Tensor
) -> DataLoader:
    dataset = SER_Dataset_PTH(X, y)
    dataloader = DataLoader(
        dataset,
        batch_size=32,
        num_workers=0,
        drop_last=True
    )
    
    return dataloader

In [13]:
dwt = DWTForward(wave="db6", J=3, mode="symmetric")
yl, yh = dwt(data)

print(yl.shape)
print(len(yh))
print(yh[0].shape)
print(yh[1].shape)
print(yh[2].shape)
print()

torch.Size([625, 1, 17, 64])
3
torch.Size([625, 1, 3, 35, 225])
torch.Size([625, 1, 3, 23, 118])
torch.Size([625, 1, 3, 17, 64])



In [14]:
dtcwt = DTCWTForward(J=3, biort="near_sym_b", qshift="qshift_b")
yl, yh = dtcwt(data)

print(yl.shape)
print(len(yh))
print(yh[0].shape)
print(yh[1].shape)
print(yh[2].shape)
print()

torch.Size([625, 1, 16, 110])
3
torch.Size([625, 1, 6, 30, 220, 2])
torch.Size([625, 1, 6, 15, 110, 2])
torch.Size([625, 1, 6, 8, 55, 2])



In [15]:
scat = ScatLayer()
output = scat(data)
print(output.shape)

torch.Size([625, 7, 30, 220])


### Training the model

In [16]:
class BaselinePTH(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.model = nn.Sequential(
            ScatLayer(),
            nn.Conv2d(
                in_channels=7,
                out_channels=64,
                kernel_size=(2, 2),
                padding="valid"
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=(2, 2)
            ),
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=(2, 2),
                padding="valid"
            ),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=(2, 2)
            ),
            nn.Flatten(),
            nn.Dropout(p=0.5),
            nn.Linear(
                in_features=41472,
                out_features=3
            )
        )
    
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        return self.model(X)

In [17]:
dataloader = create_dataloader_pth(data, labels)
device = torch.device("cpu")
model = BaselinePTH().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
loss = torch.nn.CrossEntropyLoss()
epochs = 30

for epoch in range(1, epochs+1):
    print(f"\nTraining epoch: {epoch}/{epochs}")
    model.train()
    train_loss = 0.0
    train_f1 = 0.0
    
    for batch in dataloader:
        data = batch["features"].to(device)
        target = batch["labels"].to(device)
        optimizer.zero_grad()
        
        output = model(data)
        
        l = loss(output, target)
        train_loss += l.item()
        
        l.backward()
        optimizer.step()
        
        prediction = output.argmax(dim=-1, keepdim=True)
        train_f1 += f1_score(
            target.detach().cpu().numpy(),
            prediction.detach().cpu().numpy(),
            average="macro"
        )
        
    train_loss /= len(dataloader)
    train_f1 /= len(dataloader)
    
    print(f"Training loss: {train_loss:1.6f}")
    print(f"Training F1: {train_f1:1.6f}")


Training epoch: 1/30
Training loss: 2.101768
Training F1: 0.310213

Training epoch: 2/30
Training loss: 0.978775
Training F1: 0.444106

Training epoch: 3/30
Training loss: 0.926789
Training F1: 0.460981

Training epoch: 4/30
Training loss: 1.142287
Training F1: 0.486497

Training epoch: 5/30
Training loss: 0.713688
Training F1: 0.582194

Training epoch: 6/30
Training loss: 0.709239
Training F1: 0.607228

Training epoch: 7/30
Training loss: 0.715909
Training F1: 0.590082

Training epoch: 8/30
Training loss: 0.638277
Training F1: 0.607354

Training epoch: 9/30
Training loss: 0.445954
Training F1: 0.684647

Training epoch: 10/30
Training loss: 0.361411
Training F1: 0.735316

Training epoch: 11/30
Training loss: 0.330701
Training F1: 0.754670

Training epoch: 12/30
Training loss: 0.253081
Training F1: 0.807139

Training epoch: 13/30
Training loss: 0.286743
Training F1: 0.769616

Training epoch: 14/30
Training loss: 0.232459
Training F1: 0.848193

Training epoch: 15/30
Training loss: 0.283

## PyWavelets

### Extracting the Wavelets

In [18]:
def stereo_to_mono(
    audio: torch.Tensor
) -> torch.Tensor:
    """
    Converts a stereo audio to mono.
    
    Args:
        audio (torch.Tensor): the audio's waveform (stereo).
    
    Returns:
        torch.Tensor: the audio's waveform (mono).
    """
    audio = torch.mean(audio, dim=0, keepdim=True)
    return audio

def read_audio(
    path: str,
    to_mono: bool = True,
    sample_rate: int = None
) -> Tuple[torch.Tensor, int]:
    """
    Reads a audio file.

    Args:
        path (str): the audio file's path.
        to_mono (bool, optional): convert the signal to mono. Defaults to True.
        sample_rate (int, optional): resample the audio to that specific sample rate
                                     (None if you won't resample). Defaults to None.

    Returns:
        Tuple[torch.Tensor, int]: the audio waveform and the sample rate.
    """
    audio, sr = torchaudio.load(filepath=path)
    
    if sample_rate is not None and sample_rate != sr:
        audio = resample_audio(
            audio=audio,
            sample_rate=sr,
            new_sample_rate=sample_rate
        )
        sr = sample_rate
    
    if to_mono and audio.shape[0] > 1:
        audio = stereo_to_mono(audio=audio)
    
    return audio, sr

def torch_to_numpy(
    tensor: torch.Tensor
) -> np.ndarray:
    """
    Converts a torch's tensor to numpy.

    Args:
        tensor (torch.Tensor): torch's tensor.

    Returns:
        np.ndarray: numpy's tensor.
    """
    np_tensor = tensor.detach().permute(1, 0).numpy()
    np_tensor = np_tensor.astype(np.float32)
    return np_tensor

def numpy_to_torch(
    tensor: np.ndarray
) -> torch.Tensor:
    """
    Converts a numpy's tensor to torch.

    Args:
        tensor (np.ndarray): numpy's tensor.

    Returns:
        torch.Tensor: torch's tensor.
    """
    torch_tensor = torch.from_numpy(tensor).permute(1, 0)
    torch_tensor = torch_tensor.to(dtype=torch.float32)
    return torch_tensor

def create_propor_train_dataframe(
    path: str
) -> pd.DataFrame:
    """
    Creates a PROPOR 2022's pandas DataFrame containing
    all the training files using the same structure as the
    `test_ser_metadata.csv` file.
    
    Args:
        path (str): the path to the CSV file.
    
    Returns:
        df (pd.DataFrame): the pandas DataFrame.
    """
    wav_files = [
        file
        for file in os.listdir(path)
        if file.endswith(".wav")
    ]
    df = pd.DataFrame()
    
    for wav in wav_files:
        wav_file = os.path.basename(wav)
        wav_file = wav_file.split("/")[0]
        label = wav_file.split("_")[-1].replace(".wav", "")
        
        row = pd.DataFrame({
            "file": [wav_file],
            "label": [label],
            "wav_file": [wav_file]
        })
        
        df = pd.concat(
            [df, row],
            axis=0
        )
    
    return df.reset_index(drop=True)

def wavelet_from_raw_audio(
    df: pd.DataFrame,
    wavelet: str,
    maxlevel: int,
    type: str,
    params: dict
) -> pd.DataFrame:
    """
    Extract the wavelet from the raw audio.

    Args:
        spectrogram (torch.Tensor): the raw audios.
        labels (torch.Tensor): the raw audios' labels.
        wavelet (str): the wavelet's name.
        maxlevel (int): the wavelet's max level.
        type (str): which wavelet to extract (packet).
        params (Dict): the parameters dict.

    Returns:
        pd.DataFrame: the extracted wavelet packet.
    """
    wavelet_df = pd.DataFrame()
    normalizer = Normalize(p=1)
    
    if type == "packet":
        for label, file_path in zip(df["label"], df["wav_file"]):
            mean_nodes = []
            audio, sr = read_audio(
                path=file_path,
                to_mono=params["feature"]["to_mono"],
                sample_rate=params["feature"]["sample_rate"]
            )
            audio = audio.squeeze().numpy()
            audio = normalizer(audio, params["feature"]["sample_rate"])
            audio = np.squeeze(audio)
            
            wp = pywt.WaveletPacket(
                data=audio,
                wavelet=wavelet,
                mode="symmetric",
                maxlevel=maxlevel
            )
            
            if wp.maxlevel > 0:
                nodes = [node.path for node in wp.get_level(maxlevel, "natural")]
                
                for node in nodes:
                    data = wp[node].data
                    data = np.multiply(data, data)
                    data = sum(data)/len(data)
                    # data = sum(data)
                    mean_data = torch.as_tensor(data)
                    mean_data = mean_data.to(torch.float32).item()
                    mean_nodes.append([mean_data])
                
                row = pd.DataFrame(mean_nodes).T
                row.columns = nodes
                row["target"] = label
                wavelet_df = pd.concat([wavelet_df, row], axis=0)
                
    wavelet_df = wavelet_df.reset_index(drop=True)
    return wavelet_df

propor_train_path = "/media/greca/HD/Datasets/PROPOR 2022/data_train/train"
propor_train_df = create_propor_train_dataframe(propor_train_path)
propor_train_df["wav_file"] = propor_train_df["wav_file"].apply(lambda x: os.path.join(propor_train_path, x))

wavelet_df = wavelet_from_raw_audio(
    df=propor_train_df,
    wavelet="db6",
    maxlevel=3,
    type="packet",
    params={
        "feature":{
            "to_mono": True,
            "sample_rate": 16000
        }
    }
)

In [19]:
wavelet_df.head(5)

Unnamed: 0,aaa,aad,ada,add,daa,dad,dda,ddd,target
0,0.254451,0.019808,0.007313,0.008119,0.000257,0.000768,0.006321,0.001077,non-neutral-male
1,0.078333,0.00107,2.8e-05,0.00011,1e-05,1.1e-05,2e-05,8e-06,neutral
2,0.088213,0.043541,0.000455,0.008713,5e-06,3.2e-05,0.000135,9.6e-05,neutral
3,0.063264,0.004273,0.00011,0.000697,8e-06,1.6e-05,6.2e-05,2.2e-05,neutral
4,0.055455,0.038655,0.006121,0.014573,0.000165,0.000527,0.004849,0.000741,neutral


### Training the model

In [20]:
class SER_Dataset_PYWT(Dataset):
    def __init__(
        self,
        df: pd.DataFrame
    ) -> None:
        self.X = df.drop(columns=["target"])
        self.y = df["target"]
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(
        self,
        index: int
    ) -> Dict:
        batch = {}
        features = self.X.iloc[index, :].values.tolist()
        features = torch.as_tensor(features)
        
        batch["features"] = features
        batch["labels"] = self.y[index]
        return batch

def create_dataloader_pywt(
    df: pd.DataFrame
) -> DataLoader:
    dataset = SER_Dataset_PYWT(df)
    dataloader = DataLoader(
        dataset,
        batch_size=64,
        num_workers=0,
        drop_last=True
    )
    
    return dataloader

In [21]:
class BaselinePYWT(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(
                in_features=8,
                out_features=64
            ),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(
                in_features=64,
                out_features=128
            ),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(
                in_features=128,
                out_features=3
            )
        )
    
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        return self.model(X)

In [22]:
wavelet_df["target"] = wavelet_df["target"].replace({
    "neutral": 0,
    "non-neutral-male": 1,
    "non-neutral-female": 2
})
dataloader = create_dataloader_pywt(wavelet_df)
device = torch.device("cpu")
model = BaselinePYWT().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
loss = torch.nn.CrossEntropyLoss()
epochs = 150

for epoch in range(1, epochs+1):
    print(f"\nTraining epoch: {epoch}/{epochs}")
    model.train()
    train_loss = 0.0
    train_f1 = 0.0
    
    for batch in dataloader:
        data = batch["features"].to(device)
        target = batch["labels"].to(device)
        optimizer.zero_grad()
        
        output = model(data)
        
        l = loss(output, target)
        train_loss += l.item()
        
        l.backward()
        optimizer.step()
        
        prediction = output.argmax(dim=-1, keepdim=True)
        train_f1 += f1_score(
            target.detach().cpu().numpy(),
            prediction.detach().cpu().numpy(),
            average="macro"
        )
        
    train_loss /= len(dataloader)
    train_f1 /= len(dataloader)
    
    print(f"Training loss: {train_loss:1.6f}")
    print(f"Training F1: {train_f1:1.6f}")


Training epoch: 1/150
Training loss: 0.985934
Training F1: 0.264924

Training epoch: 2/150
Training loss: 0.744077
Training F1: 0.292592

Training epoch: 3/150
Training loss: 0.675169
Training F1: 0.292987

Training epoch: 4/150
Training loss: 0.650148
Training F1: 0.292987

Training epoch: 5/150
Training loss: 0.639814
Training F1: 0.292987

Training epoch: 6/150
Training loss: 0.632420
Training F1: 0.292987

Training epoch: 7/150
Training loss: 0.624934
Training F1: 0.292987

Training epoch: 8/150
Training loss: 0.620964
Training F1: 0.292987

Training epoch: 9/150
Training loss: 0.617439
Training F1: 0.301507

Training epoch: 10/150
Training loss: 0.613711
Training F1: 0.301507

Training epoch: 11/150
Training loss: 0.609810
Training F1: 0.301507

Training epoch: 12/150
Training loss: 0.606681
Training F1: 0.310025

Training epoch: 13/150
Training loss: 0.603920
Training F1: 0.310025

Training epoch: 14/150
Training loss: 0.600803
Training F1: 0.310025

Training epoch: 15/150
Train

Training loss: 0.425387
Training F1: 0.519810

Training epoch: 119/150
Training loss: 0.429469
Training F1: 0.557568

Training epoch: 120/150
Training loss: 0.428940
Training F1: 0.550299

Training epoch: 121/150
Training loss: 0.424570
Training F1: 0.585718

Training epoch: 122/150
Training loss: 0.424342
Training F1: 0.558216

Training epoch: 123/150
Training loss: 0.429889
Training F1: 0.557096

Training epoch: 124/150
Training loss: 0.422810
Training F1: 0.531556

Training epoch: 125/150
Training loss: 0.420247
Training F1: 0.575299

Training epoch: 126/150
Training loss: 0.424688
Training F1: 0.557103

Training epoch: 127/150
Training loss: 0.418997
Training F1: 0.550732

Training epoch: 128/150
Training loss: 0.421119
Training F1: 0.593124

Training epoch: 129/150
Training loss: 0.412850
Training F1: 0.581460

Training epoch: 130/150
Training loss: 0.410699
Training F1: 0.594812

Training epoch: 131/150
Training loss: 0.417403
Training F1: 0.577079

Training epoch: 132/150
Traini

## Test

In [23]:
def time_distribute(
    data: np.ndarray, sequence_length: int, stride: int = None, z_pad: bool = True
) -> np.ndarray:
    """
    Takes a sequence of features or labels and creates an np.ndarray of time
    distributed sequences for input to a Keras TimeDistributed() layer.

    Args:
        data: The array to be time distributed.
        sequence_length: The length of the output sequences in samples.
        stride (optional): The number of samples between sequences. Defaults to sequence_length.
        z_pad (optional): Zero padding to ensure all sequences to have the same dimensions.
        Defaults to True.

    Returns:
        The time ditributed data sequences.

    Example:
        Given an np.ndarray of data:
        >>> data.shape
        (10000, 32, 32, 1)
        >>> time_distribute(data, 10).shape
        (1000, 10, 32, 32, 1)
        The function yeilds 1000 training sequences, each 10 samples long.
    """

    if stride is None:
        stride = sequence_length
        
    if stride > sequence_length:
        print(
            "WARNING: Stride longer than sequence length, causing missed samples. This is not recommended."
        )
    
    td_data = []
    
    for n in range(0, len(data) - sequence_length + 1, stride):
        td_data.append(data[n : n + sequence_length])
    
    if z_pad:
        if len(td_data) * stride + sequence_length != len(data) + stride:
            z_needed = len(td_data) * stride + sequence_length - len(data)
            z_padded = np.zeros(td_data[0].shape)
            for i in range(sequence_length - z_needed):
                z_padded[i] = data[-(sequence_length - z_needed) + i]
            td_data.append(z_padded)
    
    return np.array(td_data)

In [24]:
propor_path = "/media/greca/HD/Datasets/PROPOR 2022/data_train/train"
df = create_propor_train_dataframe(propor_path)
df["label"] = df["label"].replace({
    "neutral": 0,
    "non-neutral-male": 1,
    "non-neutral-female": 2
})
df.head()

Unnamed: 0,file,label,wav_file
0,bpubdl02_segment247_non-neutral-male.wav,1,bpubdl02_segment247_non-neutral-male.wav
1,bpubmn14_segment89_neutral.wav,0,bpubmn14_segment89_neutral.wav
2,bfamdl26_segment93_neutral.wav,0,bfamdl26_segment93_neutral.wav
3,bfammn27_segment275_neutral.wav,0,bfammn27_segment275_neutral.wav
4,bfamcv02_segment276_neutral.wav,0,bfamcv02_segment276_neutral.wav


In [25]:
from torchaudio import transforms
from scipy import stats

audio, sr = read_audio(
    path=df["file"][0],
    to_mono=True,
    sample_rate=16000
)

coeffs = pywt.wavedec(
    audio.squeeze(0).numpy(),
    wavelet="db4",
    mode='symmetric',
    level=4,
    axis=-1
)

transform = transforms.MFCC(
    sample_rate=16000,
    n_mfcc=13
)

means = []
variances = []

for i in range(len(coeffs)):
    coeff = coeffs[i]
    coeff = time_distribute(coeff, 400)
    mfcc_coeff = transform(torch.from_numpy(coeff).to(dtype=torch.float32))
    
    means_mfcc_coeff = torch.mean(mfcc_coeff, dim=(-2, -1))
    means.append(torch.mean(means_mfcc_coeff))
    
    variances_mfcc_coeff = torch.var(mfcc_coeff, dim=(-2, -1))
    variances.append(torch.mean(variances_mfcc_coeff))
    
    print(mfcc_coeff.shape, stats.skew(mfcc_coeff, axis=(-2, -1)).shape)

RuntimeError: Failed to load audio from bpubdl02_segment247_non-neutral-male.wav

## Test 2

In [26]:
def pad_raw_audio(
    features: List,
    max_frames: int
) -> torch.Tensor:
    """
    Auxiliary function to pad the features.
    
    Args:
        features (List): the features that will be padded (mfcc, spectogram or mel_spectogram).
        max_frames (int): the max frames value.
    
    Returns:
        List: the padded features.
    """
    features = [
        F.pad(f, (0, max_frames - f.size(1)))
        for f in features
    ]
    return features

def find_max_dimensions_raw_audio(
    df: pd.DataFrame,
    params: Dict
) -> int:
    """
    Workaround to find the maximum dimension of the raw audio (number of frames).

    Args:
        audios (List): List of audio waveforms.

    Returns:
        int: the max number of frames.
    """
    audios = []
    
    for file_path in df["file"]:
        audio, _ = read_audio(
            path=file_path,
            to_mono=params["feature"]["to_mono"],
            sample_rate=params["feature"]["sample_rate"]
        )
        audios.append(audio)
        
    max_frames = max([x.size(1) for x in audios])
    return max_frames

In [80]:
propor_path = "/media/greca/HD/Datasets/PROPOR 2022/data_train/train"
df = create_propor_train_dataframe(propor_path)
df["label"] = df["label"].replace({
    "neutral": 0,
    "non-neutral-male": 1,
    "non-neutral-female": 2
})
df.head()

Unnamed: 0,file,label,wav_file
0,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,1,bpubdl02_segment247_non-neutral-male.wav
1,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bpubmn14_segment89_neutral.wav
2,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bfamdl26_segment93_neutral.wav
3,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bfammn27_segment275_neutral.wav
4,/media/greca/HD/Datasets/PROPOR 2022/data_trai...,0,bfamcv02_segment276_neutral.wav


In [81]:
sample_rate = 16000
n_mels = 60
audios = []
labels = []

for file, label in zip(df["file"], df["label"]):
    audio, sr = read_audio(
        path=file,
        to_mono=True,
        sample_rate=sample_rate
    )

    labels.append(label)
    audios.append(audio)

In [82]:
max_size = find_max_dimensions_raw_audio(df, params={"feature": {"to_mono": True, "sample_rate": sample_rate}})
padded_audio = pad_raw_audio(audios, max_size)
padded_audio = torch.concat(padded_audio, dim=0)
padded_audio.shape

torch.Size([625, 224512])

In [83]:
specs = []

for i in range(padded_audio.shape[0]):

    coeffs = pywt.wavedec(
        padded_audio[i].squeeze().numpy(),
        wavelet="db4",
        level=4,
        mode="symmetric",
    )

    new_coeffs = [torch.from_numpy(coeffs[i]).unsqueeze(0) for i in range(len(coeffs))]

    transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_mels=n_mels,
        n_fft=512,
        hop_length=256,
        power=1
    )
#     transform = transforms.MFCC(
#         sample_rate=sample_rate,
#         n_mfcc=13,
#         melkwargs={"n_fft": 512, "hop_length": 256, "n_mels": n_mels, "center": False}
#     )

    mel_specs = [transform(coeff) for coeff in new_coeffs]

    max_height = max([x.size(1) for x in mel_specs])
    max_width = max([x.size(2) for x in mel_specs])

    mel_specs = pad_features(
        features=mel_specs,
        max_height=max_height,
        max_width=max_width
    )
    mel_specs = torch.concat(mel_specs, dim=0).unsqueeze(0)
    specs.append(mel_specs)

specs = torch.concat(specs, dim=0)
specs.shape

torch.Size([625, 5, 60, 439])

In [88]:
class BaselineTeste2(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(
                in_channels=5,
                out_channels=64,
                kernel_size=(2, 2),
                padding="valid"
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=(2, 2)
            ),
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=(2, 2),
                padding="valid"
            ),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(
                kernel_size=(2, 2)
            ),
            nn.Flatten(),
            nn.Linear(
                in_features=195328,
                out_features=3
            )
        )
    
    def forward(self, X: torch.Tensor) -> torch.Tensor:
        return self.model(X)

In [89]:
dataloader = create_dataloader_pth(specs, labels)
device = torch.device("cpu")
model = BaselineTeste2().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
loss = torch.nn.CrossEntropyLoss()
epochs = 30

for epoch in range(1, epochs+1):
    print(f"\nTraining epoch: {epoch}/{epochs}")
    model.train()
    train_loss = 0.0
    train_f1 = 0.0
    
    for batch in dataloader:
        data = batch["features"].to(device)
        target = batch["labels"].to(device)
        optimizer.zero_grad()
        
        output = model(data)
        
        l = loss(output, target)
        train_loss += l.item()
        
        l.backward()
        optimizer.step()
        
        prediction = output.argmax(dim=-1, keepdim=True)
        train_f1 += f1_score(
            target.detach().cpu().numpy(),
            prediction.detach().cpu().numpy(),
            average="macro"
        )
        
    train_loss /= len(dataloader)
    train_f1 /= len(dataloader)
    
    print(f"Training loss: {train_loss:1.6f}")
    print(f"Training F1: {train_f1:1.6f}")


Training epoch: 1/30
Training loss: 5.010028
Training F1: 0.281255

Training epoch: 2/30
Training loss: 1.694842
Training F1: 0.394194

Training epoch: 3/30
Training loss: 1.243173
Training F1: 0.403906

Training epoch: 4/30
Training loss: 0.992872
Training F1: 0.555003

Training epoch: 5/30
Training loss: 0.844646
Training F1: 0.552987

Training epoch: 6/30
Training loss: 1.172102
Training F1: 0.534522

Training epoch: 7/30
Training loss: 0.868625
Training F1: 0.613065

Training epoch: 8/30
Training loss: 0.823009
Training F1: 0.624471

Training epoch: 9/30
Training loss: 0.743554
Training F1: 0.652289

Training epoch: 10/30
Training loss: 0.568447
Training F1: 0.703164

Training epoch: 11/30
Training loss: 0.427177
Training F1: 0.693793

Training epoch: 12/30
Training loss: 0.343550
Training F1: 0.765916

Training epoch: 13/30
Training loss: 0.296858
Training F1: 0.805410

Training epoch: 14/30
Training loss: 0.306968
Training F1: 0.792278

Training epoch: 15/30
Training loss: 0.346