In [7]:
from utils import load_feature_tensor_data, set_seed, load_tensor_dataset, load_feature_tensor_data_full, load_data
from torch.utils.data import DataLoader, Subset
from trainer import Trainer
from torch import nn, optim
from mlp import basic_mlp
from pretrained_models import HubertClassifier, Wav2Vec2Classifier
import torch
from custom_transformer import AudioTransformer
import numpy as np
import torch.nn.functional as F

In [None]:
# Fix the seed once, before any data is loaded or any model is built.
# NOTE(review): which RNGs set_seed covers (random / numpy / torch) is defined in utils -- confirm.
SEED = 42
set_seed(SEED)

# Noise class handling

In [None]:
# split_noise is imported in this cell rather than in the top import cell.
from noise_pp import split_noise
# NOTE(review): the meaning of the two 0.14 positional arguments is not visible here --
# presumably split fractions for the 'silence' class; confirm against noise_pp.split_noise.
# NOTE(review): if split_noise writes files, re-running this cell may not be idempotent -- confirm.
split_noise('silence', 0.14,0.14)

# Pretrained transformer

In [13]:
# Arguments for the feature loader. All paths are relative to the working directory.
feature_loader_args = {
    "audio_dir": "tensorflow-speech-recognition-challenge/train/audio",
    "val_txt": "tensorflow-speech-recognition-challenge/train/validation_list.txt",
    "test_txt": "tensorflow-speech-recognition-challenge/train/testing_list.txt",
    "batch_size": 16,
    "sample_rate": 16000,
}

# For the RNN and LSTM models, call load_feature_tensor_data_full instead.
train_loader, val_loader, test_loader = load_feature_tensor_data(**feature_loader_args)

# Per-class weights for nn.CrossEntropyLoss, in the label order printed by the loader
# (yes, no, up, down, left, right, on, off, stop, go, silence, unknown).
# They are inverse class frequencies normalised to sum to 1: weight_i * count_i is the
# same constant for every class, e.g. 0.06074972 * 1860 ~= 0.38696739 * 292 ~= 112.99.
# The tensor is placed on the GPU when one is available and on the CPU otherwise;
# it must end up on the same device as the model outputs.
weights = torch.tensor([0.06074972, 0.06097921, 0.06131008, 0.06134336, 0.06144344,
       0.06101214, 0.06061936, 0.06144344, 0.05994402, 0.06071708,
       0.38696739, 0.00347077]).to('cuda' if torch.cuda.is_available() else 'cpu')

Training samples: 51386
Validation samples: 6851
Test samples: 6888
Label distribution in training set:
yes: 1860
no: 1853
up: 1843
down: 1842
left: 1839
right: 1852
on: 1864
off: 1839
stop: 1885
go: 1861
silence: 292
unknown: 32556


In [None]:
# Run configuration for the training cell below.
LEARNING_RATE = 1e-4       # step size handed to optim.Adam
MODEL_NAME = 'test_model'  # stored on the model as model.name

In [None]:
# Model to train. NOTE(review): the argument 12 is presumably the number of output
# classes (twelve labels are listed in the loader output) -- confirm against mlp.basic_mlp.
model = basic_mlp(12)
model.name = MODEL_NAME

# Class-weighted cross-entropy; pass weight=None for an unweighted loss.
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Trainer: bundles the model, the optimisation objects and the three data loaders.
trainer = Trainer(
    train_loader=train_loader,
    valid_loader=val_loader,
    test_loader=test_loader,
    model_instance=model,
    criterion=criterion,
    optimizer=optimizer,
)

# Training
trainer.train_multiple()

In [2]:
# One loader per split, each built from a saved tensor file with batch size 8.
# NOTE(review): judging by the file names these hold pre-extracted wav2vec data -- confirm.
train_loader, val_loader, test_loader = [
    load_tensor_dataset(tensor_file, batch_size=8)
    for tensor_file in ('wav2vec_train.pt', 'wav2vec_valid.pt', 'wav2vec_test.pt')
]

# Custom transformer

In [None]:
# Build the train / validation / test loaders for the custom transformer.
# All paths are relative to the working directory.
loaders = load_data(
    audio_dir="tensorflow-speech-recognition-challenge/train/audio",
    val_txt="tensorflow-speech-recognition-challenge/train/validation_list.txt",
    test_txt="tensorflow-speech-recognition-challenge/train/testing_list.txt",
    batch_size=16,
    sample_rate=16000,
)
train_loader, val_loader, test_loader = loaders

# Per-class weights for nn.CrossEntropyLoss. The twelve values are normalised to sum to 1;
# index 10 (0.38696739) is the largest weight and index 11 (0.00347077) the smallest.
# NOTE(review): presumably inverse class frequencies of the training split -- confirm they
# match the label distribution produced by load_data.
# The tensor is placed on the GPU when one is available and on the CPU otherwise;
# it must end up on the same device as the model outputs.
weights = torch.tensor([0.06074972, 0.06097921, 0.06131008, 0.06134336, 0.06144344,
       0.06101214, 0.06061936, 0.06144344, 0.05994402, 0.06071708,
       0.38696739, 0.00347077]).to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Settings for the custom-transformer run: the label stored on the model and Adam's learning rate.
MODEL_NAME, LEARNING_RATE = 'test_model', 1e-4

In [None]:
# Custom transformer classifier with 12 output classes and patch_time=8.
model = AudioTransformer(patch_time=8, num_classes=12)
model.name = MODEL_NAME

# Class-weighted cross-entropy loss and an Adam optimiser over all model parameters.
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Trainer: bundles the model, the optimisation objects and the three data loaders.
trainer = Trainer(
    train_loader=train_loader,
    valid_loader=val_loader,
    test_loader=test_loader,
    model_instance=model,
    criterion=criterion,
    optimizer=optimizer,
)

# Training: a single run of 10 epochs.
trainer.train(n_epochs=10)

# Undersampling

In [None]:
# Fraction of the undersampled class that is kept in the training set.
UNDERSAMPLING_FACTOR = 0.1
# Label index of the class to thin out.
# NOTE(review): presumably 'unknown', the twelfth and by far the largest class in the
# printed label distribution -- confirm against the label encoding.
UNDERSAMPLED_CLASS = 11

# Requires train_loader.dataset to expose `.tensors` with the labels at position 1
# (TensorDataset-like), so run this after a loader of that kind has been created.
dataset = train_loader.dataset
# Tensor.numpy() only works for CPU tensors; .cpu() is a no-op when the labels are
# already on the CPU and avoids a TypeError when they were restored onto the GPU.
labels = dataset.tensors[1].cpu().numpy()
indices_class_11 = np.where(labels == UNDERSAMPLED_CLASS)[0]
indices_other = np.where(labels != UNDERSAMPLED_CLASS)[0]
# Draw the kept subset without replacement from NumPy's global RNG.
undersampled_class_11 = np.random.choice(
    indices_class_11,
    size=int(UNDERSAMPLING_FACTOR * len(indices_class_11)),
    replace=False,
)
final_indices = np.concatenate([indices_other, undersampled_class_11])
np.random.shuffle(final_indices)
undersampled_dataset = Subset(dataset, final_indices)

# NOTE(review): class weights computed for the full training set no longer match the
# label distribution after undersampling -- recompute them or use an unweighted loss.
undersampled_loader = DataLoader(undersampled_dataset, batch_size=32, shuffle=True, num_workers=0)

# Use `undersampled_loader` in place of `train_loader`

The rest of the training code (model, optimizer, criterion and `Trainer` setup) remains the same.