In [1]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt

DIR_WAV = "Dataset_wav\\"
DIR_PNG = "Dataset_png\\"
 
def PlotFreq_Time(rFolder, sFolder, name):
    source_path = rFolder + name
    output_path = sFolder + name.split('.')[0] + ".png"
    if not os.path.exists(output_path):
        audio_data, sample_rate = librosa.load(source_path)
        stft = librosa.stft(audio_data)
        spectrogram = librosa.amplitude_to_db(abs(stft))
        plt.figure(figsize=(5, 5))
        librosa.display.specshow(spectrogram, sr=sample_rate, cmap='gray')
        plt.tight_layout()
        plt.savefig(output_path)
        plt.close()

In [2]:
import torch
from PIL import Image
from torchvision import transforms

pre_NoNorm = transforms.Compose([
    transforms.Resize((360, 360)),
    transforms.ToTensor()])

def loaderNoNorm(folder, name):
    img = Image.open(folder + name)
    img_t = pre_NoNorm(img)
    return img_t

In [3]:
# Loading all WAV file and convert to PNG
all_WAV_Names = [file for file in os.listdir(DIR_WAV) if file.endswith(".wav")]
for name in all_WAV_Names:
    PlotFreq_Time(DIR_WAV, DIR_PNG, name)

# Generating random train and valid indexs
n_Datas = len(all_WAV_Names)
n_valid = int(0.3*n_Datas)
shuffled_indexs = torch.randperm(n_Datas)
tra_indexs = shuffled_indexs[n_valid:]
val_indexs = shuffled_indexs[:n_valid]
tra_indexs.shape, val_indexs.shape

(torch.Size([840]), torch.Size([360]))

In [4]:
# Loading train and valid PNG file and calculate the std and mean
all_PNG_Names = [file for file in os.listdir(DIR_PNG) if file.endswith(".png")]
tra_PNG_Names = [all_PNG_Names[index] for index in tra_indexs]
val_PNG_Names = [all_PNG_Names[index] for index in val_indexs]
tra_imgs = torch.stack([loaderNoNorm(DIR_PNG, name) for name in tra_PNG_Names])
val_imgs = torch.stack([loaderNoNorm(DIR_PNG, name) for name in val_PNG_Names])
tra_mean = tra_imgs.view(4, -1).mean(dim=1)
tra_std = tra_imgs.view(4, -1).std(dim=1)
val_mean = val_imgs.view(4, -1).mean(dim=1)
val_std = val_imgs.view(4, -1).std(dim=1)
print(tra_mean, tra_std)
print(val_mean, val_std)
label_dict = {"front":0, "back":1, "left":2, "right":3, "up":4, "down":5}

tensor([0.3463, 0.3521, 0.3502, 0.3516]) tensor([0.4564, 0.4546, 0.4549, 0.4546])
tensor([0.3446, 0.3485, 0.3468, 0.3533]) tensor([0.4567, 0.4557, 0.4560, 0.4541])


In [6]:
# Building train and valid preprocess
tra_preprocess = transforms.Compose([
  transforms.Resize([360, 360]),
  transforms.ToTensor(),
  transforms.Normalize(mean=tra_mean, std=tra_std)
])
val_preprocess = transforms.Compose([
  transforms.Resize([360, 360]),
  transforms.ToTensor(),
  transforms.Normalize(mean=tra_mean, std=tra_std)
])
def tra_loader(path):
  img = Image.open(path)
  img_t = tra_preprocess(img)
  return img_t
def val_loader(path):
  img = Image.open(path)
  img_t = val_preprocess(img)
  return img_t

# Using in categorizing label by PNG name
import re
def removeNum(s):
    return re.sub(r'\d+|\..*', '', s)
# Some Data the Dataset need
tra_PNG_Paths = [DIR_PNG + name for name in tra_PNG_Names]
tra_PNG_Label = torch.tensor([label_dict[removeNum(name)] for name in tra_PNG_Names])
val_PNG_Paths = [DIR_PNG + name for name in val_PNG_Names]
val_PNG_Label = torch.tensor([label_dict[removeNum(name)] for name in val_PNG_Names])

In [7]:
# Defining the Custom DataSet
from torch.utils.data import Dataset, DataLoader

class VoiceDataSet(Dataset):
  def __init__(self, isTrain = True):
    self.paths = tra_PNG_Paths if isTrain else val_PNG_Paths
    self.labels = tra_PNG_Label if isTrain else val_PNG_Label
    self.loader = tra_loader if isTrain else val_loader

  def __len__(self):
    return len(self.paths)
  
  def __getitem__(self, index):
    path = self.paths[index]
    image = self.loader(path)
    label = self.labels[index]
    return image, label
  

In [8]:
# Defining the Accuracy calculation function
def Accuracy(model, tra_loader, val_loader):
  result = {}
  model.eval()
  for name, loader in [("train", tra_loader),("valid", val_loader)]:
    correct = 0
    total = 0
    with torch.no_grad():
      for imgs, labels in loader:
        outputs = model(imgs)
        _, predicted = torch.max(outputs, dim=1)
        total+= labels.shape[0]
        correct+= int((predicted==labels).sum())
    result[name] = correct / total
  return result

In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Setting the CNN module
class CNN(nn.Module):
  def __init__(self, n_chan=32):
    super().__init__()
    self.n_chan = n_chan
    self.conv1 = nn.Conv2d(4, n_chan, kernel_size=5, padding=1)
    self.conv2 = nn.Conv2d(n_chan, n_chan//2, kernel_size=5, padding=1)
    self.fc1 = nn.Linear(n_chan//2 * 88 * 88, 32)
    self.fc2 = nn.Linear(32, 6)

  def forward(self, out):
    print(out.shape)
    out = F.max_pool2d(F.relu(self.conv1(out)), 2)
    print(out.shape)
    out = F.max_pool2d(F.relu(self.conv2(out)), 2)
    print(out.shape)
    out = out.view(-1, self.n_chan//2 * 88 * 88)
    print(out.shape)
    out = F.relu(self.fc1(out))
    print(out.shape)
    out = self.fc2(out)
    print(out.shape)
    return out
  
# Setting the training loop
def training_loop(n_epochs, optimizer, model, loss_fcn, train_loader, valid_loader):
    for epoch in range(1, n_epochs+1):
      loss_train = 0.
      loss_valid = 0.
      model.train()
      for imgs, labels in train_loader:
        outputs=model(imgs)
        loss=loss_fcn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_train+= loss.item()
      avg_traLoss = loss_train/len(train_loader)
      model.eval()
      with torch.no_grad():
        for imgs, labels in valid_loader:
          outputs = model(imgs)
          loss = loss_fcn(outputs, labels)
          loss_valid += loss.item()
      avg_valLoss = loss_valid/len(valid_loader)
      accuracy = Accuracy(model, train_loader, valid_loader)
      print("| Epoch: {0:3} | Train_Loss: {1:.4f} | Valid_Loss: {2:.4f} | Train_Accuracy: {3:.4f} | Valid_Accuracy: {4:.4f} |".format(epoch, avg_traLoss, avg_valLoss, accuracy["train"], accuracy["valid"]))

In [21]:
# Loading data
tra_data = VoiceDataSet(isTrain = True)
val_data = VoiceDataSet(isTrain = False)
trainLoader = DataLoader(tra_data, batch_size=10, shuffle=True)
validLoader = DataLoader(val_data, batch_size=10, shuffle=True)

In [None]:
# Starting training
model = CNN()
optimizer = optim.SGD(model.parameters(), lr = 5e-4)
loss_fcn = nn.CrossEntropyLoss()
training_loop(n_epochs = 100, optimizer = optimizer, model = model, loss_fcn = loss_fcn, train_loader = trainLoader, valid_loader=validLoader)

In [34]:
# Saving the model to .pth file
filepath = 'model.pth'
torch.save(model.state_dict(), filepath)

In [None]:
model_in = CNN()
model_in.load_state_dict(torch.load("model.pth"))
Accuracy(model_in, trainLoader, validLoader)