# Imports and Constants

### Data Loading

In [2]:
!pip install datasets

import torch
from torch import nn
from datasets import load_dataset
from torch.utils.data import DataLoader

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [3]:
# Common Voice configuration names, one per language class (list position = class index)
LANGUAGES = ["en", "de", "nl", "sv-SE", "da"]
# Prefer the GPU when PyTorch can see one
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LEARNING_RATE = 0.001
NUM_EPOCHS = 20
# Mini-batch size handed to the DataLoader
BATCH_SIZE = 20
# Clips streamed per language by load_data (also used there as the shuffle-buffer size)
TRAIN_BATCH = 100
# Validation clips per language; with 0, load_data returns empty validation lists
VAL_BATCH = 0
# Target sampling rate (Hz) every clip is resampled to before feature extraction
SAMPLE_RATE = 48000
# NOTE(review): OPTIMIZER and LOSS_FUNCTION are not referenced by the cells visible here,
# which construct their own optimizer and loss - confirm whether they are still needed.
OPTIMIZER = torch.optim.Adam
LOSS_FUNCTION = nn.CrossEntropyLoss()

# Constants related to the MFCC processing
# Number of samples per FFT window
N_FFT = 2048
# Number of samples the analysis window advances between consecutive frames
HOP_LENGTH = 512
# Number of MFCC coefficients extracted per frame
N_MFCC = 13

### Data Processing

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

from glob import glob # list out files in directory -> reading wave files

import librosa
import librosa.display
import IPython.display as ipd # play files

from itertools import cycle # colours and gimiks

### Training

In [47]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import random

# Download Dataset

In [6]:
# Authenticate this runtime with the Hugging Face Hub (prompts interactively for an access token)
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `Sneed` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authen

In [55]:
def load_data(languages, train_batch, val_batch):
  """Stream a shuffled sample of Common Voice 17 clips for every language.

  Args:
    languages: Common Voice configuration names; the position in the list is the class index.
    train_batch: number of clips to take per language from the "train" split.
    val_batch: number of clips to take per language from the "validation" split.

  Returns:
    (train_audio, train_labels, train_sr, val_audio, val_labels, val_sr): parallel lists
    holding the raw waveform arrays, one-hot float label rows and sampling rates.
    A split that runs out of examples contributes fewer clips than requested, and a
    split with a requested count of 0 is not opened at all.
  """
  from itertools import islice  # local import so the cell does not depend on import-cell order

  train_audio, train_labels, train_sr = [], [], []
  val_audio, val_labels, val_sr = [], [], []

  # One float one-hot row per language. Sized from `languages` (not a fixed 5) and created
  # as float up front: casting rows of an integer tensor in place does not change its dtype.
  one_hot = torch.eye(len(languages), dtype=torch.float)

  def take(language, split, count, seed, label, audio_out, sr_out, label_out):
    """Append up to `count` shuffled clips of one split to the given output lists."""
    if count <= 0:
      return
    # Streaming avoids downloading the whole split; custom code is required by this dataset
    stream = load_dataset("mozilla-foundation/common_voice_17_0", language, split=split, streaming=True, trust_remote_code=True)
    shuffled = stream.shuffle(buffer_size=count, seed=seed)
    # islice stops cleanly if the split holds fewer than `count` examples
    for item in islice(shuffled, count):
      if item:
        audio_out.append(item['audio']['array'])
        sr_out.append(item['audio']['sampling_rate'])
        label_out.append(label)

  for i, language in enumerate(languages):
    # A fresh seed per language and per call, so repeated calls draw different samples
    random_seed = random.randrange(1,100)

    take(language, "train", train_batch, random_seed, one_hot[i], train_audio, train_sr, train_labels)
    take(language, "validation", val_batch, random_seed, one_hot[i], val_audio, val_sr, val_labels)

    print(f"Loaded {language}")

  return train_audio, train_labels, train_sr, val_audio, val_labels, val_sr

In [52]:
# Pull TRAIN_BATCH training clips (and VAL_BATCH validation clips) per language into memory
train_audio, train_labels, train_sr, val_audio, val_labels, val_sr = load_data(LANGUAGES, TRAIN_BATCH, VAL_BATCH)

Reading metadata...: 1101170it [00:20, 54898.62it/s]


Loaded en


Reading metadata...: 589100it [00:11, 53546.93it/s]


Loaded de


Reading metadata...: 34898it [00:00, 67459.85it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 36730.85it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 26111.30it/s]
Exception ignored from cffi callback <function SoundFile._init_virtual_io.<locals>.vio_read at 0x7f8d2da79120>:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/soundfile.py", line 1290, in vio_read
    @_ffi.callback("sf_vio_read")

KeyboardInterrupt: 


Loaded da


# Data Processing

In [8]:
# Preprocessing audio

def process_batch(audio_data, sample_rates, batch_size):
  """Convert the first `batch_size` clips into fixed-length MFCC matrices.

  Each clip is resampled to SAMPLE_RATE, trimmed of leading/trailing silence,
  cut or zero-padded to a fixed number of samples and turned into MFCCs.

  Args:
    audio_data: sequence of raw waveform arrays.
    sample_rates: sampling rate of each waveform, parallel to `audio_data`.
    batch_size: maximum number of clips to process, counted from the start.

  Returns:
    List of MFCC arrays, one per processed clip. When fewer than `batch_size`
    clips are available, only the available ones are processed.
  """
  # NOTE(review): 49000 * 7 does not equal SAMPLE_RATE (48000) * 7, and AudioDataset pads to
  # SAMPLE_RATE*5 instead - confirm which clip length is intended before relying on both.
  audio_length = 49000 * 7

  # Never index past the data actually supplied
  clip_count = min(batch_size, len(audio_data), len(sample_rates))

  audio_processed = []

  for i in range(clip_count):
    # Resample
    audio_resampled = librosa.resample(audio_data[i], orig_sr = sample_rates[i], target_sr = SAMPLE_RATE)

    # Trimming decibels
    audio_trimmed, _ = librosa.effects.trim(audio_resampled, top_db=80)

    if len(audio_trimmed) > audio_length:
      audio_trimmed = audio_trimmed[:audio_length]

    elif len(audio_trimmed) < audio_length:
      padding = audio_length - len(audio_trimmed)
      audio_trimmed = np.pad(audio_trimmed, (0, padding), mode='constant')

    # NOTE(review): the waveform is rectified with np.abs before the MFCC transform;
    # kept as-is so features stay comparable with earlier runs - confirm this is intended.
    mfcc = librosa.feature.mfcc(y = np.abs(audio_trimmed), sr=SAMPLE_RATE, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mfcc=N_MFCC)

    audio_processed.append(mfcc)

  return audio_processed

In [9]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm

# Index of the training clip whose MFCCs are previewed
TEST_INDEX = 1

MFCC_data = process_batch(train_audio, train_sr, TRAIN_BATCH)

# MFCCs are already log-scaled cepstral coefficients, so they are plotted directly:
# converting them with amplitude_to_db (and labelling the rows in Hz) would misrepresent them.
D = MFCC_data[TEST_INDEX]
img = librosa.display.specshow(D, x_axis='time', sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
plt.colorbar(img)
plt.ylabel("MFCC coefficient")
plt.title(f"MFCCs of training clip {TEST_INDEX}")

print(MFCC_data[TEST_INDEX].shape)

NameError: name 'train_audio' is not defined

In [58]:
import torch
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np

class AudioDataset(Dataset):
    """Map-style dataset that converts raw clips to fixed-size MFCC tensors on access.

    Every item is resampled to SAMPLE_RATE, trimmed of leading/trailing silence,
    cut or zero-padded to exactly five seconds and converted to MFCCs.
    """

    def __init__(self, audios, labels, sample_rates, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mfcc=N_MFCC, transform=None):
        """
        audios: list of raw audio arrays
        labels: list of labels, parallel to `audios`
        sample_rates: list with the sampling rate of each clip, parallel to `audios`
        n_fft, hop_length, n_mfcc: parameters forwarded to librosa.feature.mfcc
        transform: optional callable applied to the MFCC tensor before it is returned
        """
        self.audios = audios
        self.labels = labels
        self.sample_rates = sample_rates
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mfcc = n_mfcc
        self.transform = transform

    def __len__(self):
        """Number of clips in the dataset."""
        return len(self.audios)

    def __getitem__(self, idx):
        """Return (mfcc, label) for clip `idx`; mfcc is a float tensor."""
        audio = self.audios[idx]
        label = self.labels[idx]
        sr = self.sample_rates[idx]

        # Every clip is forced to exactly 5 seconds at SAMPLE_RATE
        audio_length = SAMPLE_RATE*5

        # Resample
        audio_resampled = librosa.resample(audio, orig_sr = sr, target_sr = SAMPLE_RATE)

        # Trimming decibels
        audio_trimmed, _ = librosa.effects.trim(audio_resampled, top_db=80)

        # If the clip is longer than the target length, keep only its beginning
        if len(audio_trimmed) > audio_length:
            audio_trimmed = audio_trimmed[:audio_length]

        # If the clip is shorter, zero-pad its end
        elif len(audio_trimmed) < audio_length:
            padding = audio_length - len(audio_trimmed)
            audio_trimmed = np.pad(audio_trimmed, (0, padding), mode='constant')

        # Calculate MFCCs for the fixed-length audio.
        # NOTE(review): the waveform is rectified with np.abs first; kept unchanged so the
        # features match those the saved checkpoints were trained on - confirm it is intended.
        mfcc = librosa.feature.mfcc(y = np.abs(audio_trimmed), sr=SAMPLE_RATE, n_fft=self.n_fft, hop_length=self.hop_length, n_mfcc=self.n_mfcc)
        mfcc = torch.tensor(mfcc, dtype=torch.float)

        # The transform was previously stored but never used
        if self.transform is not None:
            mfcc = self.transform(mfcc)

        return mfcc, label

In [39]:
# Wrap the in-memory clips so MFCCs are computed lazily, then batch and shuffle them
audio_dataset = AudioDataset(train_audio, train_labels, train_sr)
train_loader = DataLoader(audio_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

# CNN Model

In [13]:
class ConvNetwork(nn.Module):
  """1-D convolutional language classifier over MFCC sequences.

  Three Conv1d -> BatchNorm1d -> ReLU -> MaxPool2d -> Dropout stages are followed by
  one more MaxPool2d, a flatten and a linear layer with one output per language.

  NOTE(review): the pooling layers are nn.MaxPool2d although the activations are 3-D.
  PyTorch then pools over the last two axes (channels and time), which halves the
  channel count as well - this is why c2/c3 take 16/32 input channels rather than
  32/64. Saved checkpoints depend on these shapes, so the layout is left untouched.

  NOTE(review): the linear layer needs 1344 flattened features, which a
  (batch, 13, 670) input produces; the 469-frame MFCCs of 5-second clips would not
  match - confirm the intended clip length.
  """

  def __init__(self):
    super().__init__()

    def conv_stage(channels_in, channels_out):
      """Build one convolution stage; layer order fixes the state-dict keys."""
      return nn.Sequential(
        nn.Conv1d(
          in_channels=channels_in,
          out_channels=channels_out,
          kernel_size=3,
          stride=1,
          padding=2,
        ),
        nn.BatchNorm1d(channels_out),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2),
        nn.Dropout(p=0.2),
      )

    self.c1 = conv_stage(13, 32)
    self.c2 = conv_stage(16, 64)
    self.c3 = conv_stage(32, 128)

    self.global_max = nn.MaxPool2d(kernel_size=2)
    self.flatten = nn.Flatten()
    # 1344 is tied to the input length, not to BATCH_SIZE
    self.linear = nn.Linear(1344, len(LANGUAGES))
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input):
    """Return (logits, softmax probabilities) for a batch of MFCC sequences."""
    features = input
    for stage in (self.c1, self.c2, self.c3):
      features = stage(features)

    pooled = self.global_max(features)
    logits = self.linear(self.flatten(pooled))
    return logits, self.softmax(logits)

### Train

In [14]:
from google.colab import drive
import os
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
def save_result():
  """Make sure the Drive folder for training results exists; nothing is written yet."""
  results_folder = "/content/drive/My Drive/training_results"
  os.makedirs(results_folder, exist_ok=True)
  # TODO: persist metrics here, e.g. np.savetxt(f"{save_dir}/{model_path}_train_err.csv", train_err)


def save_weight(net, model_code):
  """Write the state dict of `net` to Drive as `<model_code>.pth`.

  model_code is the string identifier assigned to each model.
  """
  target_folder = "/content/drive/My Drive/model_weights"
  os.makedirs(target_folder, exist_ok=True)

  checkpoint_file = os.path.join(target_folder, model_code) + ".pth"
  torch.save(net.state_dict(), checkpoint_file)


def load_weight(net, model_code):
  """Load the checkpoint `<model_code>.pth` from Drive into `net`.

  model_code is the string identifier assigned to each model.

  Returns:
    The same `net`, with the loaded weights and switched to evaluation mode.
  """
  weight_dir = "/content/drive/My Drive/model_weights"
  checkpoint_path = os.path.join(weight_dir, model_code) + ".pth"

  # map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one;
  # weights_only restricts unpickling to tensors, which is all a state dict contains.
  state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
  net.load_state_dict(state_dict)
  net.eval()
  return net

In [36]:
def train(model, data_loader, device, optimizer, loss_function, num_epochs):
  """Optimise `model` on `data_loader` for `num_epochs` epochs.

  Args:
    model: module whose forward pass returns (logits, predictions).
    data_loader: iterable of (data, label) batches; labels are one-hot rows.
    device: device the model and every batch are moved to.
    optimizer: optimizer already bound to the parameters of `model`.
    loss_function: callable taking (logits, float labels).
    num_epochs: number of passes over `data_loader`.

  Prints the sample-weighted mean loss of each epoch (nan for an empty loader)
  and the total wall-clock time. The model is left in training mode on `device`.
  """
  start_time = time.time()

  # The device argument used to be ignored; keep model and data on the same device
  model.to(device)
  # A model returned by load_weight is in eval mode, so switch dropout/batch-norm back on
  model.train()

  for epoch in range(num_epochs):
    epoch_loss = 0.0
    seen = 0

    for data, label in data_loader:
      data = data.to(device)
      label = label.to(device=device, dtype=torch.float)

      logits, _ = model(data)
      loss = loss_function(logits, label)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      epoch_loss += loss.item() * data.size(0)
      seen += data.size(0)

    # Report the epoch mean rather than only the last (noisy) batch loss
    mean_loss = epoch_loss / seen if seen else float("nan")
    print(f"Epoch #{epoch} | Loss: {mean_loss}")

  end_time = time.time()
  elapsed_time = end_time - start_time
  print(f"Time elapsed: {elapsed_time:.4f} seconds")

def test_accuracy_old(model, data_loader, device, optimizer, loss_function, num_epochs):
  start_time = time.time()

  for epoch in range(num_epochs):
    for data, label in data_loader:
      logits, predictions = model(data)
      loss = loss_function(logits, label.to(dtype=torch.float))

    print(f"Epoch #{epoch} | Loss: {loss.item()}")

  end_time = time.time()
  elapsed_time = end_time - start_time
  print(f"Time elapsed: {elapsed_time:.4f} seconds")


def test_accuracy(model, data_loader, device, optimizer, loss_function, num_epochs):
    start_time = time.time()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        correct = 0
        total = 0

        for data, label in data_loader:
            data = data.to(device)
            label = label.to(device)

            logits, predictions = model(data)
            loss = loss_function(logits, label.to(dtype=torch.float))
            epoch_loss += loss.item() * data.size(0)

            pred_classes = torch.argmax(predictions, dim=1)
            true_classes = torch.argmax(label, dim=1)

            correct += torch.eq(pred_classes, true_classes).sum().item()

            total += label.size(0)

        avg_loss = epoch_loss / total
        accuracy = correct / total * 100
        print(f"Epoch #{epoch+1} | Loss: {avg_loss:.4f} | Accuracy: {accuracy:.2f}%")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time elapsed: {elapsed_time:.4f} seconds")

In [26]:
# Train the first CNN from freshly initialised weights
cnn = ConvNetwork()

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

train(cnn, train_loader, DEVICE, optimizer, loss_function, NUM_EPOCHS)

# Persist the trained weights; "Placeholder" is the checkpoint name used on Drive
save_weight(cnn, "Placeholder")

KeyboardInterrupt: 

In [57]:
cnn = ConvNetwork()

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)
load_weight(cnn, "cnn_test_1")

# Score the checkpoint on five independently drawn samples of the training split.
# load_data reseeds its shuffle on every call, so each round sees different clips.
for eval_round in range(5):
  train_audio, train_labels, train_sr, val_audio, val_labels, val_sr = load_data(LANGUAGES, TRAIN_BATCH, VAL_BATCH)
  audio_dataset = AudioDataset(train_audio, train_labels, train_sr)
  train_loader = DataLoader(audio_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
  test_accuracy(cnn, train_loader, DEVICE, optimizer, loss_function, 1)

  net.load_state_dict(torch.load(f"{model_path}.pth"))
Reading metadata...: 1101170it [00:18, 59452.39it/s]


Loaded en


Reading metadata...: 589100it [00:12, 47126.42it/s]


Loaded de


Reading metadata...: 34898it [00:00, 61409.25it/s]


Loaded nl


Reading metadata...: 7744it [00:01, 6881.94it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 25758.32it/s]


Loaded da
Epoch #1 | Loss: 0.5542 | Accuracy: 79.40%
Time elapsed: 45.6363 seconds


Reading metadata...: 1101170it [00:18, 58839.69it/s]


Loaded en


Reading metadata...: 589100it [00:09, 59928.85it/s]


Loaded de


Reading metadata...: 34898it [00:00, 68365.23it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 44515.07it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 29142.50it/s]


Loaded da
Epoch #1 | Loss: 0.7725 | Accuracy: 68.20%
Time elapsed: 49.1997 seconds


Reading metadata...: 1101170it [00:18, 58934.08it/s]


Loaded en


Reading metadata...: 589100it [00:10, 58383.55it/s]


Loaded de


Reading metadata...: 34898it [00:00, 48584.42it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 22470.69it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 31083.19it/s]


Loaded da
Epoch #1 | Loss: 0.5353 | Accuracy: 72.00%
Time elapsed: 47.0952 seconds


Reading metadata...: 1101170it [00:19, 56554.44it/s]


Loaded en


Reading metadata...: 589100it [00:10, 54613.69it/s]


Loaded de


Reading metadata...: 34898it [00:00, 64501.37it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 11521.92it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 22867.15it/s]


Loaded da
Epoch #1 | Loss: 0.4643 | Accuracy: 86.40%
Time elapsed: 45.2568 seconds


Reading metadata...: 1101170it [00:17, 61235.65it/s]


Loaded en


Reading metadata...: 589100it [00:10, 54180.35it/s]


Loaded de


Reading metadata...: 34898it [00:00, 46457.42it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 39352.31it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 30182.64it/s]


Loaded da
Epoch #1 | Loss: 0.7375 | Accuracy: 70.40%
Time elapsed: 46.7276 seconds


In [59]:
class ConvNetwork_2(nn.Module):
  """2-D convolutional language classifier that treats an MFCC matrix as a 1-channel image.

  Three Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d -> Dropout stages are followed by
  one more MaxPool2d, a flatten and a linear layer with one output per language.

  NOTE(review): the linear layer needs 3840 flattened features, which a
  (batch, 13, 469) input produces - confirm this matches the clip length in use.
  """

  def __init__(self):
    super().__init__()

    def conv_stage(channels_in, channels_out):
      """Build one convolution stage; layer order fixes the state-dict keys."""
      return nn.Sequential(
        nn.Conv2d(
          in_channels=channels_in,
          out_channels=channels_out,
          kernel_size=3,
          stride=1,
          padding=2,
        ),
        nn.BatchNorm2d(channels_out),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2),
        nn.Dropout(p=0.2),
      )

    self.c1 = conv_stage(1, 32)
    self.c2 = conv_stage(32, 64)
    self.c3 = conv_stage(64, 128)

    self.global_max = nn.MaxPool2d(kernel_size=2)
    self.flatten = nn.Flatten()
    # 3840 is tied to the input size, not to BATCH_SIZE
    self.linear = nn.Linear(3840, len(LANGUAGES))
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input):
    """Return (logits, softmax probabilities) for a batch of MFCC matrices.

    input: assumed to be (batch, n_mfcc, frames); a channel axis is inserted here.
    """
    # Add the single image channel: (batch, n_mfcc, frames) -> (batch, 1, n_mfcc, frames).
    # The former squeeze(0) removed the batch axis whenever the batch held one sample,
    # which BatchNorm2d then rejected as a 3-D input.
    x = input.unsqueeze(1)
    x = self.c1(x)
    x = self.c2(x)
    x = self.c3(x)
    x = self.global_max(x)
    x = self.flatten(x)
    logits = self.linear(x)
    predictions = self.softmax(logits)
    return logits, predictions

In [62]:
cnn = ConvNetwork_2()

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

# Draw one sample of clips and reuse it for every checkpoint so the scores are comparable
train_audio, train_labels, train_sr, val_audio, val_labels, val_sr = load_data(LANGUAGES, TRAIN_BATCH, VAL_BATCH)
audio_dataset = AudioDataset(train_audio, train_labels, train_sr)
train_loader = DataLoader(audio_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

# Checkpoints are named cnn_test_2_epoch_1 ... cnn_test_2_epoch_20
for i in range(1, 21):
  name = f"cnn_test_2_epoch_{i}"
  load_weight(cnn, name)
  test_accuracy(cnn, train_loader, DEVICE, optimizer, loss_function, 1)

Reading metadata...: 1101170it [00:17, 62206.15it/s]


Loaded en


Reading metadata...: 589100it [00:08, 68827.65it/s]


Loaded de


Reading metadata...: 34898it [00:00, 42719.24it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 44559.71it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 29489.73it/s]


Loaded da


  net.load_state_dict(torch.load(f"{model_path}.pth"))


Epoch #1 | Loss: 2.3915 | Accuracy: 35.60%
Time elapsed: 43.4324 seconds
Epoch #1 | Loss: 1.5724 | Accuracy: 32.20%
Time elapsed: 43.8623 seconds
Epoch #1 | Loss: 1.4882 | Accuracy: 24.40%
Time elapsed: 43.0753 seconds
Epoch #1 | Loss: 1.4285 | Accuracy: 31.40%
Time elapsed: 44.8217 seconds
Epoch #1 | Loss: 1.4025 | Accuracy: 41.00%
Time elapsed: 44.4776 seconds
Epoch #1 | Loss: 1.3951 | Accuracy: 38.20%
Time elapsed: 47.4758 seconds
Epoch #1 | Loss: 1.4038 | Accuracy: 39.60%
Time elapsed: 44.6774 seconds
Epoch #1 | Loss: 1.4037 | Accuracy: 34.60%
Time elapsed: 43.4356 seconds
Epoch #1 | Loss: 1.3276 | Accuracy: 36.00%
Time elapsed: 44.5422 seconds
Epoch #1 | Loss: 1.2436 | Accuracy: 43.60%
Time elapsed: 45.0891 seconds
Epoch #1 | Loss: 1.1931 | Accuracy: 46.40%
Time elapsed: 42.6359 seconds
Epoch #1 | Loss: 1.2307 | Accuracy: 49.20%
Time elapsed: 42.3193 seconds
Epoch #1 | Loss: 1.1192 | Accuracy: 51.80%
Time elapsed: 44.6080 seconds
Epoch #1 | Loss: 1.2257 | Accuracy: 57.00%
Time ela

In [63]:
cnn = ConvNetwork_2()

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)
name = "cnn_test_2_epoch_20"
load_weight(cnn, name)

# Score the final checkpoint on three independently drawn samples of the training split.
# load_data reseeds its shuffle on every call, so each round sees different clips.
for eval_round in range(3):
  train_audio, train_labels, train_sr, val_audio, val_labels, val_sr = load_data(LANGUAGES, TRAIN_BATCH, VAL_BATCH)
  audio_dataset = AudioDataset(train_audio, train_labels, train_sr)
  train_loader = DataLoader(audio_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
  test_accuracy(cnn, train_loader, DEVICE, optimizer, loss_function, 1)

  net.load_state_dict(torch.load(f"{model_path}.pth"))
Reading metadata...: 1101170it [00:17, 62380.90it/s]


Loaded en


Reading metadata...: 589100it [00:10, 58878.25it/s]


Loaded de


Reading metadata...: 34898it [00:00, 67409.05it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 12752.73it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 13559.62it/s]


Loaded da
Epoch #1 | Loss: 0.9949 | Accuracy: 57.00%
Time elapsed: 44.9910 seconds


Reading metadata...: 1101170it [00:17, 61510.74it/s]


Loaded en


Reading metadata...: 589100it [00:10, 57719.03it/s]


Loaded de


Reading metadata...: 34898it [00:00, 45526.25it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 29898.84it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 28689.25it/s]


Loaded da
Epoch #1 | Loss: 1.0520 | Accuracy: 59.20%
Time elapsed: 43.7125 seconds


Reading metadata...: 1101170it [00:17, 62320.14it/s]


Loaded en


Reading metadata...: 589100it [00:10, 54999.00it/s]


Loaded de


Reading metadata...: 34898it [00:00, 47267.91it/s]


Loaded nl


Reading metadata...: 7744it [00:00, 36281.95it/s]


Loaded sv-SE


Reading metadata...: 3484it [00:00, 35879.82it/s]


Loaded da
Epoch #1 | Loss: 0.9829 | Accuracy: 61.40%
Time elapsed: 46.0831 seconds
