In [None]:
import os
import torch
import numpy as np
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

class MelSpectrogramDataset(Dataset):
    def __init__(self, root_dir, file_paths=None, labels=None):
        self.root_dir = root_dir
        self.data = []
        self.language_to_idx = {}

        # Locate the mel directory
        mel_dir = os.path.join(self.root_dir, 'mel')
        
        # Get the sorted list of language directories
        languages = sorted([
            lang for lang in os.listdir(mel_dir)
            if os.path.isdir(os.path.join(mel_dir, lang))
        ])

        # Create mapping from language to index
        self.language_to_idx = {lang: idx for idx, lang in enumerate(languages)}

        if file_paths is None or labels is None:
            # Store all (file_path, one-hot label) pairs if not provided
            for lang in languages:
                lang_dir = os.path.join(mel_dir, lang)
                for file_name in os.listdir(lang_dir):
                    if file_name.endswith('.npy'):
                        file_path = os.path.join(lang_dir, file_name)
                        # Create one-hot encoded label
                        label = torch.zeros(len(self.language_to_idx))
                        label[self.language_to_idx[lang]] = 1
                        self.data.append((file_path, label))
        else:
            # Use provided file paths and one-hot labels
            for file_path, label in zip(file_paths, labels):
                self.data.append((file_path, label))

        self.num_classes = len(self.language_to_idx)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_path, one_hot_label = self.data[idx]
        mel_tensor = torch.tensor(np.load(file_path), dtype=torch.float32)

        return one_hot_label, mel_tensor

# Function to perform train-test split
def create_train_test_split(dataset, test_size=0.2):
    file_paths = [x[0] for x in dataset.data]
    labels = [x[1] for x in dataset.data]

    # Split the dataset
    train_file_paths, test_file_paths, train_labels, test_labels = train_test_split(
        file_paths, labels, test_size=test_size, random_state=42
    )

    # Create new datasets for train and test
    train_dataset = MelSpectrogramDataset(root_dir=dataset.root_dir, file_paths=train_file_paths, labels=train_labels)
    test_dataset = MelSpectrogramDataset(root_dir=dataset.root_dir, file_paths=test_file_paths, labels=test_labels)

    return train_dataset, test_dataset

# Example Usage
dataset = MelSpectrogramDataset(root_dir='data')
train_dataset, test_dataset = create_train_test_split(dataset, test_size=0.2)

# Creating DataLoaders for train and test datasets
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

print(train_dataset[0])
# Example of using the train dataloader
for label, mel in train_dataloader:
    print("Train Label (one-hot):", label)
    print("Train Mel tensor shape:", mel)
    break

(tensor([0., 0., 1., 0.]), tensor([[[-100.0000, -100.0000, -100.0000,  ..., -100.0000, -100.0000,
          -100.0000],
         [-100.0000, -100.0000, -100.0000,  ...,  -21.8491,  -14.9475,
             7.9811],
         [-100.0000, -100.0000, -100.0000,  ...,  -14.5379,   -7.6362,
            15.2923],
         ...,
         [-100.0000, -100.0000, -100.0000,  ...,  -28.1896,  -24.7541,
           -27.9030],
         [-100.0000, -100.0000, -100.0000,  ...,  -27.8419,  -29.9519,
           -32.3267],
         [-100.0000, -100.0000, -100.0000,  ...,  -29.2723,  -28.5787,
           -30.4753]]]))
Train Label (one-hot): tensor([[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])
Train Mel tensor shape: tensor([[[[-100.0000, -100.0000, -100.0000,  ..., -100.0000, -100.0000,
           -100.0000],
          [ -56.0409,  -64.4637,  -67.6327,  ...,  -48.5726,  -54.6184,
            -51.3472],
          [ -48.7296,  -57.1525,  -60.3215,  ...,  -41.2

In [None]:
# import os
# import torch
# import numpy as np
# from torch.utils.data import Dataset, DataLoader

# class MelSpectrogramDataset(Dataset):
#     def __init__(self, root_dir):
#         self.root_dir = root_dir
#         self.data = []
#         self.language_to_idx = {}

#         mel_dir = os.path.join(self.root_dir, 'mel')

#         # Sort languages for consistent one-hot encoding
#         languages = sorted([
#             lang for lang in os.listdir(mel_dir)
#             if os.path.isdir(os.path.join(mel_dir, lang))
#         ])

#         # Create mapping from language name to index
#         self.language_to_idx = {lang: idx for idx, lang in enumerate(languages)}
#         self.num_classes = len(self.language_to_idx)

#         # Store file paths and labels
#         for lang in languages:
#             lang_dir = os.path.join(mel_dir, lang)
#             for file_name in os.listdir(lang_dir):
#                 if file_name.endswith('.npy'):
#                     file_path = os.path.join(lang_dir, file_name)
#                     label_idx = self.language_to_idx[lang]
#                     self.data.append((file_path, label_idx))

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         file_path, label_idx = self.data[idx]

#         # Load mel-spectrogram from .npy
#         mel_tensor = torch.tensor(np.load(file_path), dtype=torch.float32)

#         # One-hot encode the label
#         one_hot_label = torch.zeros(self.num_classes, dtype=torch.float32)
#         one_hot_label[label_idx] = 1.0

#         return one_hot_label, mel_tensor

In [None]:
dataset = MelSpectrogramDataset(root_dir='data')
train_dataset, test_dataset = create_train_test_split(dataset, test_size=0.2)

# Creating DataLoaders for train and test datasets
from torch.utils.data import DataLoader

batch_size = 32
train_loader = DataLoader(dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(dataset, batch_size = batch_size, shuffle = False)

# For testing
# for label, mel in train_dataset:
#     print("Label (one-hot):", label)
#     print("Mel tensor shape:", mel.shape)
#     break


AttributeError: 'tuple' object has no attribute 'shape'

In [3]:
import torch
import torch.nn as nn

class ResBlock(torch.nn.Module):
    """
    ResNet block.
    1x1, 3x3, 1x1 kernels.
    """

    EXP = 4

    def __init__(self, in_channels: int, out_channels: int, stride: int=1):
        super(ResBlock, self).__init__()
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = torch.nn.BatchNorm2d(out_channels)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = torch.nn.BatchNorm2d(out_channels)
        self.conv3 = torch.nn.Conv2d(out_channels, out_channels * self.EXP, kernel_size=1, bias=False)
        self.bn3 = torch.nn.BatchNorm2d(out_channels * self.EXP)
        self.relu = torch.nn.ReLU(inplace=True)
        
        self.downsample = None if (stride == 1 and in_channels == out_channels * self.EXP) else torch.nn.Sequential(
                torch.nn.Conv2d(in_channels, out_channels * self.EXP, kernel_size=1, stride=stride, bias=False),
                torch.nn.BatchNorm2d(out_channels * self.EXP)
                )

    def forward(self, x):
        inp = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        
        out = self.relu(self.bn2(self.conv2(out)))

        out = self.bn3(self.conv3(out))

        out += inp
        out = self.relu(out)

        return out

class ResLayer(torch.nn.Module):
    """
    Chain of ResNet blocks.
    """
    def __init__(self, in_channels: int, out_channels: int, blocks: int, stride: int=1):
        super(ResLayer, self).__init__()
        layers = []
        layers.append(ResBlock(in_channels, out_channels, stride=stride))

        for _ in range(1, blocks):
            layers.append(ResBlock(out_channels * ResBlock.EXP, out_channels))

        self.layers = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

class SelfAttentionPooling(torch.nn.Module):

    def __init__(self, input_dim: int):
        super(SelfAttentionPooling, self).__init__()
        self.query = torch.nn.Parameter(torch.randn(input_dim))

    def forward(self, x):
        attn_scores = torch.einsum('sbi,i->sb', x, self.query)
        attn_weights = torch.nn.functional.softmax(attn_scores, dim=0)
        attn_weights = attn_weights.unsqueeze(-1)
        return torch.sum(x * attn_weights, dim=0)

class CaiNet(torch.nn.Module):

    def __init__(self, classes: int):
        super(CaiNet, self).__init__()
        self.in_channels = 16
        self.conv = torch.nn.Conv2d(1, 16, kernel_size=7, stride=1, padding=3, bias=False)
        self.bn = torch.nn.BatchNorm2d(16)
        self.relu = torch.nn.ReLU(inplace=True)

        self.res1 = ResLayer(16, 4, 3, 1)
        self.res2 = ResLayer(16, 8, 4, 2)
        self.res3 = ResLayer(32, 16, 6, 2)
        self.res4 = ResLayer(64, 32, 3, 2)

        # Dropout layers
        # self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        # self.dropout3 = nn.Dropout(0.25)

        self.pool = torch.nn.AdaptiveAvgPool2d((1, None))
        self.lstm = torch.nn.LSTM(128, 128, bidirectional=True)

        self.attention = SelfAttentionPooling(input_dim=256)
        self.fc = torch.nn.Linear(256, classes)

    def forward(self, x):

        # input transform
        c1out = self.relu(self.bn(self.conv(x)))

        # through resnet layers
        resout = self.res1(c1out)
        resout = self.res2(resout)
        resout = self.dropout2(resout)
        resout = self.res3(resout)
        resout = self.dropout2(resout)
        resout = self.res4(resout)

        # bi-lstm
        lstmin = self.pool(resout)
        lstmin = lstmin.squeeze(2)
        lstmin = lstmin.permute(2, 0, 1)
        lstmout, _ = self.lstm(lstmin)

        # attention and classification
        attn = self.attention(lstmout)
        pred = self.fc(attn)

        return pred
