In [2]:
import numpy as np
sample_file = "D:\\projects\\voice_conversion\\data\\features\\LJ001-0001_features.npy"
features = np.load(sample_file)
print("Feature Shape:", features.shape)

Feature Shape: (13, 416)


In [None]:
#Find your max MFCC length
import numpy as np
import os

data_dir = "D:\\projects\\voice_conversion\\data\\features\\"
max_len = 0

for file in os.listdir(data_dir):
    if file.endswith("_features.npy"):
        mfcc = np.load(os.path.join(data_dir, file))
        max_len = max(max_len, mfcc.shape[0])  # Update max length

print(f"🔍 Maximum sequence length: {max_len}")


🔍 Maximum sequence length: 13


In [None]:
#Modify your VoiceDataset class to pad all sequences to this max length.


import torch
from torch.nn.utils.rnn import pad_sequence

class VoiceDataset(Dataset):
    """Loads MFCC features from .npy files and pads sequences"""
    def __init__(self, data_dir, max_len):
        self.files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith("_features.npy")]
        self.max_len = max_len

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = np.load(self.files[idx])  # Shape: (time_frames, 13)
        pad_size = self.max_len - data.shape[0]

        # Pad the sequence with zeros if it's shorter than max_len
        if pad_size > 0:
            data = np.pad(data, ((0, pad_size), (0, 0)), mode='constant')

        return torch.tensor(data, dtype=torch.float32)

# Set max_len from step 1
max_len = 13  # (Example value, replace with your actual max_len)
dataset = VoiceDataset("D:\\projects\\voice_conversion\\data\\features\\", max_len)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
#Modify Dataset 

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os

class VoiceDataset(Dataset):
    """Loads MFCC features from .npy files without padding"""
    def __init__(self, data_dir):
        self.files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith("_features.npy")]

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        data = np.load(self.files[idx])  # Shape: (time_frames, 13)
        return torch.tensor(data, dtype=torch.float32)



In [None]:
# Modify your collate_fn to pad along the second dimension (time frames)
# instead of the first dimension (features):
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """
    Custom collate function to pad sequences along the time dimension.
    """
    # Convert list of tensors to a list
    batch = [x.T for x in batch]  # Transpose to shape (Time Frames, 13)

    # Pad sequences along time frames to match the longest sequence in the batch
    batch_padded = pad_sequence(batch, batch_first=True, padding_value=0)

    # Transpose back to shape (Batch, 13, Time Frames)
    return batch_padded.permute(0, 2, 1)


In [13]:
dataset = VoiceDataset("D:\\projects\\voice_conversion\\data\\features\\")
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)


In [14]:
class VoiceConversionModel(nn.Module):
    """Neural network for voice conversion with dynamic sequence length"""
    def __init__(self, input_dim=13, hidden_dim=128):
        super(VoiceConversionModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, input_dim)  # Output same feature dimension

    def forward(self, x):
        x, _ = self.lstm(x)  # LSTM for sequence modeling
        return self.fc(x)  # Apply fully connected layer


In [16]:
model = VoiceConversionModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

for epoch in range(10):
    for batch in dataloader:
        batch = batch.permute(0, 2, 1)  # Reshape input for model
        
        optimizer.zero_grad()
        output = model(batch)
        loss = loss_fn(output, batch)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Save model
torch.save(model.state_dict(), "D:\\projects\\voice_conversion\\models\\voice_conversion.pth")
print("✅ Model training completed! Model saved.")


Epoch 1, Loss: 15225.76171875
Epoch 2, Loss: 11782.896484375
Epoch 3, Loss: 10468.3388671875
Epoch 4, Loss: 7459.92919921875
Epoch 5, Loss: 5485.767578125
Epoch 6, Loss: 3637.851806640625
Epoch 7, Loss: 2971.788818359375
Epoch 8, Loss: 2276.29150390625
Epoch 9, Loss: 1435.498779296875
Epoch 10, Loss: 1084.8226318359375
✅ Model training completed! Model saved.
