In [1]:
import pandas as pd
import config

# TARGET_FILE is parsed as a tab-separated table without a header row;
# its two columns are named 'file' and 'gender'.
train_labels = pd.read_csv(
    config.TARGET_FILE,
    sep='\t',
    header=None,
    names=['file', 'gender'],
)

In [2]:
import librosa
import numpy as np

def extract_features(file_path, n_mfcc=config.MFCC_N):
    """Summarise one audio file as MFCC statistics.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        A 1-D NumPy array holding the mean of every coefficient followed by
        the standard deviation of every coefficient.
    """
    # sr=None is forwarded to librosa so no target sampling rate is imposed.
    signal, sample_rate = librosa.load(file_path, sr=None)

    # Transpose once so that axis 0 is the axis being averaged away.
    coeffs_transposed = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc).T

    stats = (coeffs_transposed.mean(axis=0), coeffs_transposed.std(axis=0))
    return np.concatenate(stats)

In [3]:
from torch.utils.data import Dataset
import os
import torch

class AudioDataset(Dataset):
    """Map-style dataset of the ``.wav`` files in a folder plus their labels.

    Each item is a ``(features, label)`` pair: the result of
    ``extract_features`` for the file (after the optional ``transform``) as a
    ``float32`` tensor, and the file's ``gender`` value as a ``long`` tensor.

    Args:
        folder: Directory whose ``*.wav`` entries make up the dataset
            (sub-directories are not visited).
        labels: pandas DataFrame with a ``file`` column (file name without
            the ``.wav`` suffix) and a ``gender`` column. The mapping is
            snapshotted at construction time; when a file name appears more
            than once the first row wins.
        transform: Optional callable applied to the feature vector before it
            is converted to a tensor.

    Raises:
        KeyError: from ``__getitem__`` when the requested file has no row in
            ``labels``.
    """

    _SUFFIX = ".wav"

    def __init__(self, folder, labels, transform=None):
        self.folder = folder
        self.labels = labels
        self.transform = transform

        # Slice off only the trailing suffix: str.replace would also delete
        # ".wav" occurring in the middle of a name and break the path rebuilt
        # in __getitem__. Sorting makes the index -> file mapping
        # deterministic, since os.listdir returns entries in arbitrary order.
        self.files = sorted(
            entry[:-len(self._SUFFIX)]
            for entry in os.listdir(folder)
            if entry.endswith(self._SUFFIX)
        )

        # Build the name -> label lookup once instead of scanning the whole
        # DataFrame for every item. setdefault keeps the first occurrence.
        self._label_by_file = {}
        for name, gender in zip(labels['file'], labels['gender']):
            self._label_by_file.setdefault(name, gender)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        # An out-of-range idx raises IndexError here, which is what ends
        # plain ``for item in dataset`` iteration.
        file_name = self.files[idx]

        # Resolve the label before decoding audio: it is cheap, and a missing
        # label must not surface as IndexError, which the sequence iteration
        # protocol would interpret as "end of data" and silently truncate.
        try:
            label = self._label_by_file[file_name]
        except KeyError:
            raise KeyError(
                f"no label found for audio file {file_name!r} in folder {self.folder!r}"
            ) from None

        file_path = os.path.join(self.folder, file_name + self._SUFFIX)
        features = extract_features(file_path)

        if self.transform is not None:
            features = self.transform(features)

        return torch.tensor(features, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

In [4]:
# Both datasets read their labels from the same `train_labels` table.
# NOTE(review): confirm that this table also contains rows for the files in
# config.TEST_FOLDER; items without a matching row cannot be loaded.
train_dataset = AudioDataset(folder=config.TRAIN_FOLDER, labels=train_labels)
test_dataset = AudioDataset(folder=config.TEST_FOLDER, labels=train_labels)

In [5]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass; test batches keep dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=config.BATCH_SIZE,
    shuffle=False,
)

In [6]:
# extract_features stacks MFCC means and MFCC standard deviations, so the
# model input carries two values per MFCC coefficient.
num_classes = config.NUM_CLASSES
hidden_size = config.LSTM_HIDDEN_SIZE
input_size = 2 * config.MFCC_N

In [7]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """LSTM followed by a linear layer that maps its hidden state to class scores.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class scores for a batch of feature vectors.

        A new axis is inserted at position 1 so that every sample is fed to
        the batch-first LSTM as a sequence of length one; the final hidden
        state of the last layer is then passed through the linear head.
        """
        as_sequence = x.unsqueeze(dim=1)
        _, (final_hidden, _) = self.lstm(as_sequence)
        return self.fc(final_hidden[-1])

In [8]:
# Build the classifier, then move its parameters to the configured device.
model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)
model = model.to(config.DEVICE)

In [9]:
import torch.optim as optim

# Cross-entropy loss on the model outputs, optimised with Adam over every
# model parameter at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)

In [10]:
from tensorboardX import SummaryWriter

# TensorBoard event writer, created with config.LOG_DIR as its log directory.
# The directory is passed positionally on purpose: the keyword name of this
# parameter is not the same across tensorboardX releases.
writer = SummaryWriter(config.LOG_DIR)

In [None]:
for epoch in range(config.EPOCHS):
    # Switch to training mode and reset the per-epoch counters.
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for inputs, labels in train_loader:
        inputs = inputs.to(config.DEVICE)
        labels = labels.to(config.DEVICE)

        # One optimisation step on this batch.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Predicted class = index of the largest output along dim 1.
        predicted = outputs.data.max(dim=1).indices
        total += labels.shape[0]
        correct += predicted.eq(labels).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = correct / total

    writer.add_scalar('Loss/train', epoch_loss, epoch)
    writer.add_scalar('Accuracy/train', epoch_accuracy, epoch)

    print(
        f"Epoch [{epoch + 1}/{config.EPOCHS}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

In [None]:
# Run the trained model over the training loader without gradients and keep
# its outputs (used as "embeddings" below) together with the true labels.
embedding_batches, label_batches = [], []

model.eval()
with torch.no_grad():
    for inputs, target_labels in train_loader:
        inputs = inputs.to(config.DEVICE)
        outputs = model(inputs)
        # Outputs are moved back to the CPU before conversion to NumPy.
        embedding_batches.append(outputs.cpu().numpy())
        label_batches.append(target_labels.numpy())

# Join the per-batch arrays along the sample axis.
embeddings = np.concatenate(embedding_batches, axis=0)
labels = np.concatenate(label_batches)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the collected embeddings to two dimensions with t-SNE (fixed seed).
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# One point per sample, coloured by its label.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=labels,
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(scatter, ax=ax, label='Целевая переменная')
ax.set_title("t-SNE проекция эмбеддингов")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")
plt.show()

In [None]:
import umap

# Project the collected embeddings to two dimensions with UMAP.
# UMAP is stochastic: without a seed every run yields a different layout.
# random_state=42 mirrors the seed used for the t-SNE projection so the
# figure is reproducible (seeding makes UMAP give up some parallelism).
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embeddings_2d = reducer.fit_transform(embeddings)

# One point per sample, coloured by its label.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=labels,
    cmap='viridis',
    alpha=0.7,
)
fig.colorbar(scatter, ax=ax, label='Целевая переменная')
ax.set_title("UMAP проекция эмбеддингов")
ax.set_xlabel("UMAP Dimension 1")
ax.set_ylabel("UMAP Dimension 2")
plt.show()