In [1]:
import torch
import pickle
from model.EffectDecoder import EffectDecoder
from transformers import ASTModel
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class EffectClassifier(torch.nn.Module):
    def __init__(self, n_classes,embed_dim=768):
        super(EffectClassifier, self).__init__()
        self.pretrained = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
        self.embed = torch.nn.Linear(embed_dim,embed_dim)
        self.cls = torch.nn.Linear(embed_dim, n_classes)
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.pretrained(**x).pooler_output
        x = self.relu(self.embed(x))
        x = self.cls(x)
        return self.softmax(x)

In [21]:
dataset = pickle.load(open("data/guitar_sample_dataset_multiclass.pkl", "rb"))

In [24]:
dataset[0]

{'dry_tone_path': 'data/instrument_dataset/Train_submission/Train_submission/1-E1-Major 00.wav',
 'wet_tone_path': 'data/wet_tones/1-E1-Major 00_wet_0.wav',
 'wet_tone_features': {'input_values': tensor([[[-1.2310, -1.2776, -1.1612,  ..., -1.2776, -1.2776, -1.2776],
          [-1.1927, -1.2776, -1.0318,  ..., -1.2776, -1.2776, -1.2776],
          [-1.0118, -1.2629, -0.8861,  ..., -1.2776, -1.2776, -1.2776],
          ...,
          [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
          [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
          [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]]])},
 'effect_names': ['Gain'],
 'effects': tensor([[0., 0., 0., 1., 0.]]),
 'parameters': tensor([[  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000,   0.0000, -17.1298,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000,   0.0000,

In [25]:
train_data, test_data = train_test_split(dataset, test_size=0.2)

In [26]:
def eval(model, loss_fn, dl):
    model.eval()
    total_loss = 0
    labels = []
    preds = []
    for batch in tqdm.tqdm(dl):
        features = batch['wet_tone_features'].to(device)
        label = batch['effects'].to(device)
        with torch.no_grad():
            output = model(features)
        loss = loss_fn(output, label)
        total_loss += loss.item()
        preds.append(torch.argmax(output, dim=-1).cpu().numpy())
        labels.append(torch.argmax(label).cpu().numpy())
    print(f"Accuracy:{accuracy_score(labels, preds)} | Total Loss:{total_loss}")
    return

In [27]:
def train(model, optimizer, loss_fn, train_loader,test_loader, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm.tqdm(train_loader):
            optimizer.zero_grad()
            features = batch['wet_tone_features'].to(device)
            labels = batch['effects'].to(device)
            output = model(features)
            loss = loss_fn(output, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss}")
        eval(model, loss_fn, test_loader)
    return

In [28]:
model = EffectClassifier(5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=.0000005)
loss_fn = torch.nn.CrossEntropyLoss()

In [33]:
train(model, optimizer, loss_fn, train_data, test_data, epochs=5)

100%|██████████| 400/400 [01:34<00:00,  4.23it/s]


Epoch 1, Loss: 627.8390402793884


100%|██████████| 100/100 [00:06<00:00, 15.92it/s]


Accuracy:0.56 | Total Loss:152.8575165271759


100%|██████████| 400/400 [01:34<00:00,  4.21it/s]


Epoch 2, Loss: 582.9692931175232


100%|██████████| 100/100 [00:06<00:00, 15.92it/s]


Accuracy:0.67 | Total Loss:140.95802009105682


100%|██████████| 400/400 [01:32<00:00,  4.34it/s]


Epoch 3, Loss: 537.9732059836388


100%|██████████| 100/100 [00:06<00:00, 15.64it/s]


Accuracy:0.7 | Total Loss:132.32380890846252


100%|██████████| 400/400 [01:41<00:00,  3.95it/s]


Epoch 4, Loss: 509.97031432390213


100%|██████████| 100/100 [00:06<00:00, 15.88it/s]


Accuracy:0.73 | Total Loss:127.62350732088089


100%|██████████| 400/400 [01:29<00:00,  4.47it/s]


Epoch 5, Loss: 492.1641817688942


100%|██████████| 100/100 [00:05<00:00, 16.82it/s]

Accuracy:0.74 | Total Loss:124.5223405957222





I0000 00:00:1730742580.129816  279245 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5520 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [None]:
import crepe
from ddsp.spectral_ops import compute_loudness, stft


In [5]:
import crepe
from scipy.io import wavfile

sr, audio = wavfile.read('data/dry_tones/Electric1.wav')
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)

  sr, audio = wavfile.read('data/dry_tones/Electric1.wav')
I0000 00:00:1730761482.262423   16273 service.cc:148] XLA service 0x7fc3d800b690 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730761482.263291   16273 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2024-11-04 17:04:42.298456: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1730761482.457039   16273 cuda_dnn.cc:529] Loaded cuDNN version 90300
2024-11-04 17:04:50.179877: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng11{k2=4,k3=0} for conv (f32[32,128,128,1]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,1024,191,1]{3,2,1,0}, f32[128,1024,64,1]{3,2,1,0}, f32[128]{0}), window={size=64x1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", back

[1m 7/19[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m0s[0m 9ms/step 

I0000 00:00:1730761485.271276   16273 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m13/19[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 9ms/step

2024-11-04 17:04:46.602068: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 74.62GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-11-04 17:04:47.572371: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:306] Allocator (GPU_0_bfc) ran out of memory trying to allocate 16.93GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 151ms/step


In [None]:
import tf2onnx
tf2onnx.convert.from_keras(model, output_path='model.onnx')

In [7]:
frequency.shape

(596,)

In [9]:
confidence

array([0.5334961 , 0.6296221 , 0.8698378 , 0.93597   , 0.9421039 ,
       0.94829744, 0.9376753 , 0.93722534, 0.9454341 , 0.9401556 ,
       0.94271576, 0.94477797, 0.9450136 , 0.94556755, 0.9531479 ,
       0.94853216, 0.9408216 , 0.9497386 , 0.9543725 , 0.9439076 ,
       0.9441276 , 0.95267344, 0.9476809 , 0.9511987 , 0.9556084 ,
       0.9535724 , 0.95312613, 0.9589498 , 0.955145  , 0.9472393 ,
       0.9572332 , 0.9571534 , 0.9460222 , 0.9498669 , 0.95371455,
       0.9439485 , 0.9469752 , 0.9478384 , 0.9500707 , 0.9501963 ,
       0.9503716 , 0.948262  , 0.94317514, 0.95212287, 0.94810647,
       0.93393356, 0.9448331 , 0.9501995 , 0.9366317 , 0.9421654 ,
       0.9387107 , 0.9382332 , 0.9428285 , 0.9382944 , 0.9340488 ,
       0.9387747 , 0.9382097 , 0.9353085 , 0.9322601 , 0.9364581 ,
       0.9396669 , 0.92812693, 0.9326183 , 0.9353108 , 0.9333069 ,
       0.93721473, 0.93084145, 0.9314979 , 0.94108725, 0.93568885,
       0.93526834, 0.9308967 , 0.93698275, 0.9380307 , 0.92509

In [11]:
time

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  , 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09,
       1.1 , 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2 ,
       1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3 , 1.31,
       1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4 , 1.41, 1.42,
       1.43, 1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.