## Import modules

In [1]:
import torch
import librosa
import pandas as pd 
import numpy as np
import random

torch.manual_seed(0)
random.seed(0)

## Read data

In [2]:
def read_files(csv_path, audio_folder_path, sr=22050):
    track_df = pd.read_csv(csv_path)

    audio_dict = {
        "track": [],
        "y": [],
        "sr": [],
    }

    for track_name in track_df["track"]:
        print(f"loading {track_name}")
        y, sr = librosa.load(f"{audio_folder_path}/{track_name}", sr=sr)
        audio_dict["track"].append(track_name)
        audio_dict["y"].append(y)
        audio_dict["sr"].append(sr)

    return track_df, pd.DataFrame(audio_dict)


In [3]:
data_folder_path = "../data"
sr = 22050
track_df, audio_df = read_files(f"{data_folder_path}/train.csv", f"{data_folder_path}/audios/clips", sr)


loading normalize_5s_intro_thc1MtNagC8.wav
loading normalize_5s_intro_Wo2qUD1g7xM.wav
loading normalize_5s_intro_3ObVN3QQiZ8.wav
loading normalize_5s_intro_S-zQJFRX5Fg.wav
loading normalize_5s_intro_SyZOAgXiPMw.wav
loading normalize_5s_intro_GQT8ejgV2_A.wav
loading normalize_5s_intro_PQAIxeSIQU4.wav
loading normalize_5s_intro_E-8pyVBvCPQ.wav
loading normalize_5s_intro_Qr8eZSVaw10.wav
loading normalize_5s_intro_p7j-tz1Cn4o.wav
loading normalize_5s_intro_nISI4qF55F4.wav
loading normalize_5s_intro_RoeRU5zxkak.wav
loading normalize_5s_intro_EygNk739nnY.wav
loading normalize_5s_intro_w1G3rqVil1s.wav
loading normalize_5s_intro_KKc_RMln5UY.wav
loading normalize_5s_intro_Ng2JdroNfC0.wav
loading normalize_5s_intro_xc0sWhVhmkw.wav
loading normalize_5s_intro_VVRszjvg3_U.wav
loading normalize_5s_intro_C7u6rtswjCU.wav
loading normalize_5s_intro_HiPkwl5p1GY.wav
loading normalize_5s_intro_mYa_9d2Daas.wav
loading normalize_5s_intro_6MSYrN4YfKY.wav
loading normalize_5s_intro_O2q_9lBDM7I.wav
loading nor

## Preprocess data

In [4]:
def preprocess_data(track_df, audio_df):
    # x = np.array([[librosa.feature.melspectrogram(y=y, sr=sr)] for y, sr in zip(audio_df["y"], audio_df["sr"])])
    x = np.array([[[value]] for value in audio_df["y"].values])
    return x


In [5]:
# print(audio_df)
# for index, (track, y, sr) in audio_df.iterrows():
#     print(y)

x = preprocess_data(track_df, audio_df)
y = np.array([track_df['score'].values]).T

# print(x)
print("x shape =", x.shape)
print("y shape =", y.shape)


x shape = (220, 1, 1, 110250)
y shape = (220, 1)


## Prepare dataset and Data Augmentation

In [6]:
def augmentation_nothing(y, sr):
    return y


def augmentation_change_amplitude(y, sr):
    gain = random.uniform(0.8, 1.25)
    return y * gain


def augmentation_inverse(y, sr):
    return y * -1


def augmentation_noise(y, sr):
    signal = y[0][0]
    RMS = np.sqrt(np.mean(signal**2))
    STD_n = random.uniform(0, 0.01)
    noise = np.random.normal(0, STD_n, signal.shape[0])
    signal_noise = signal+noise
    return np.array([[signal_noise]])


def augmentation_pitch(y, sr):
    signal = y[0][0]
    pitch_factor = random.uniform(-10, 10)
    return np.array([[librosa.effects.pitch_shift(signal, sr=sr, n_steps=pitch_factor)]])


def augmentation_speed(y, sr):
    signal = y[0][0]
    speed_factor = random.uniform(0.5, 2)
    return np.array([[librosa.effects.time_stretch(signal, rate=speed_factor)]])


def augmentation_func(y, sr):
    if random.choice([True, False]):
        y = augmentation_change_amplitude(y, sr)
    if random.choice([True, False]):
        y = augmentation_inverse(y, sr)
    if random.choice([True, False]):
        y = augmentation_noise(y, sr)
    if random.choice([True, False]):
        y = augmentation_pitch(y, sr)
    return y


In [7]:
from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    def __init__(self, x, y, augmentation = False):
        self.x = x
        self.y = torch.tensor(y, dtype=torch.float32)
        self.len = x.shape[0]
        self.augmentation = augmentation


    def __getitem__(self, index):
        x = self.x[index]
        if(self.augmentation):
            x = augmentation_func(x, sr)
        
        return torch.tensor(x, dtype=torch.float32), self.y[index]

    def __len__(self):
        return self.len



In [8]:
from sklearn.model_selection import train_test_split

train_test_ratio = 0.85

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=train_test_ratio, random_state=0)

train_set = MyDataset(x=x_train, y=y_train, augmentation=True)
test_set = MyDataset(x=x_test, y=y_test, augmentation=False)

# train_size = len(x_train)
# test_size =  len(x_test)

# train_set, _ = torch.utils.data.random_split(train_set, [train_size, 0])
# test_set, _ = torch.utils.data.random_split(test_set, [test_size, 0])


print("Augmentation testing")
print("train_set")
for i in range(10):
    print(train_set[0][0][0][0])

print("test_set")
for i in range(10):
    print(test_set[0][0][0][0])


Augmentation testing
train_set
tensor([ 0.0146,  0.0104,  0.0135,  ..., -0.0471, -0.0485, -0.0632])
tensor([-0.0137, -0.0127, -0.0031,  ...,  0.0475,  0.0520,  0.0631])
tensor([-0.0077, -0.0117, -0.0104,  ...,  0.0459,  0.0498,  0.0660])
tensor([ 0.0080,  0.0120,  0.0077,  ..., -0.0495, -0.0451, -0.0620])
tensor([ 0.0081,  0.0131,  0.0103,  ..., -0.0584, -0.0596, -0.0791])
tensor([-0.0094, -0.0198, -0.0059,  ...,  0.0456,  0.0352,  0.0610])
tensor([ 0.0071,  0.0111,  0.0102,  ..., -0.0208, -0.0146,  0.0000])
tensor([-0.0077, -0.0117, -0.0104,  ...,  0.0459,  0.0498,  0.0660])
tensor([ 0.0077,  0.0117,  0.0104,  ..., -0.0459, -0.0498, -0.0660])
tensor([ 0.0138,  0.0129,  0.0130,  ..., -0.0550, -0.0498, -0.0804])
test_set
tensor([-4.8946e-05, -1.3814e-05,  1.8567e-06,  ..., -1.2329e-02,
        -1.2139e-02, -3.1391e-02])
tensor([-4.8946e-05, -1.3814e-05,  1.8567e-06,  ..., -1.2329e-02,
        -1.2139e-02, -3.1391e-02])
tensor([-4.8946e-05, -1.3814e-05,  1.8567e-06,  ..., -1.2329e-02,
  

## Build Model

In [9]:
import torch.nn as nn
import torch.nn.functional as F


# class Model(nn.Module):
#     def __init__(self):
#         super(Model, self).__init__()
#         self.conv1 = nn.Conv2d(1, 16, (5, 5), 1)
#         self.conv2 = nn.Conv2d(16, 32, (5, 5), 1)
#         self.pool = nn.MaxPool2d((2, 2), 2)
#         self.fc1 = nn.Linear(199680, 1024)
#         self.dropout1 = nn.Dropout(0.25)
#         self.fc2 = nn.Linear(1024, 512)
#         self.dropout2 = nn.Dropout(0.25)
#         self.fc3 = nn.Linear(512, 1)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         x = self.conv1(x)
#         x = F.relu(x)
#         x = self.conv2(x)
#         x = F.relu(x)
#         x = self.pool(x)
#         x = torch.flatten(x, 1)
#         x = self.fc1(x)
#         x = F.relu(x)
#         x = self.dropout1(x)
#         x = self.fc2(x)
#         x = F.relu(x)
#         x = self.dropout2(x)
#         x = self.fc3(x)
#         output = self.sigmoid(x)
#         return output


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, (1, 32), 1)
        self.conv2 = nn.Conv2d(16, 32, (1, 32), 1)
        self.pool1 = nn.MaxPool2d((1, 2), 2)

        self.dropout1 = nn.Dropout(0.25)
        self.fc1 = nn.Linear(1763008, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        # x = self.pool1(x)

        x = self.conv2(x)
        x = F.relu(x)
        x = self.pool1(x)

        x = self.dropout1(x)

        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)

        x = self.fc2(x)

        output = self.sigmoid(x)
        
        return output


In [10]:
def train(model, device, train_loader, criterion, optimizer, epoch, log_interval=10):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # print(f"output = {output} target = {target} loss = {loss}")
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(model, device,criterion, test_loader):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            test_loss += loss.item()  # sum up batch loss

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}\n'.format(test_loss))


## Training

In [11]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

torch.cuda.empty_cache()


learning_rate = 0.001
gamma = 0.1
epochs = 50

train_loader = torch.utils.data.DataLoader(train_set, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
print(model)
criterion = torch.nn.MSELoss()
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

Model(
  (conv1): Conv2d(1, 16, kernel_size=(1, 32), stride=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(1, 32), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=(1, 2), stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout1): Dropout(p=0.25, inplace=False)
  (fc1): Linear(in_features=1763008, out_features=256, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [12]:
# Training
for epoch in range(1, epochs + 1):
    train(model, device, train_loader, criterion, optimizer, epoch, log_interval=100)
    test(model, device, criterion, test_loader)
    scheduler.step()



Test set: Average loss: 0.0256


Test set: Average loss: 0.0265


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set: Average loss: 0.0264


Test set:

In [13]:
test(model, device, criterion, test_loader)


Test set: Average loss: 0.0264



## Save Model

In [14]:
model_folder_path = "../model"
save_model_name = "model1.pt"

torch.save(model, f"{model_folder_path}/{save_model_name}")

## Predict

In [15]:
data_folder_path = "../data"
model_folder_path = "../model"
load_model_name = "model1.pt"
# sr = 11025

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model = torch.load(f"{model_folder_path}/{load_model_name}")
model.eval()

test_track_df, test_audio_df = read_files(f"{data_folder_path}/test.csv", f"{data_folder_path}/audios/clips", sr=sr)
test_x = preprocess_data(test_track_df, test_audio_df)

output_dict = {
    "track": [],
    "score": []
}

for track, features in zip(test_track_df['track'], test_x):
    features = np.array([features])
    features = torch.tensor(features, dtype=torch.float32).to(device)
    score = model(features)
    output_dict["track"].append(track)
    output_dict["score"].append(score[0][0].cpu().detach().numpy())

output_df = pd.DataFrame(output_dict)
output_df.to_csv(f"{data_folder_path}/submission.csv", index=False)


loading normalize_5s_intro_0EVVKs6DQLo.wav
loading normalize_5s_intro_d7to9URtLZ4.wav
loading normalize_5s_intro_TzhhbYS9EO4.wav
loading normalize_5s_intro_nn5nypm7GG8.wav
loading normalize_5s_intro_hed6HkYNA7g.wav
loading normalize_5s_intro_rWznOAwxM1g.wav
loading normalize_5s_intro_zyQkFh-E4Ak.wav
loading normalize_5s_intro_agKkcRXN2iE.wav
loading normalize_5s_intro_SZaZU_qi6Xc.wav
loading normalize_5s_intro_ZpDQJnI4OhU.wav
loading normalize_5s_intro_D4nWzd63jV4.wav
loading normalize_5s_intro_9odM1BRqop4.wav
loading normalize_5s_intro_F64yFFnZfkI.wav
loading normalize_5s_intro_Js2JQH_kt0I.wav
loading normalize_5s_intro_Skt_NKI4d6U.wav


| Change | loss   |
| ------ | ------ |
| baseline | 0.0269 |
| Move dropout | 0.0272 |
| 3 conv 2 fc | 0.0269 |
| 3 conv 3 fc | 0.0260 |
| 2 conv 3 fc | 0.0258 |
| 2 conv 2 fc | 0.0530 |
| augmentation | 0.0268 |




