# ANN Project - Artist Classification Using MP3 song files.
### Project Members - Dhrumil Shah & Sarthak Tandon

#### The aim of this project was to use song files in mp3 format from 20 artists and to make Neural Network architectures to identify the artists using the song files.  

### Dataset Link : https://www.kaggle.com/dhrumil140396/mp3s32k 

# Utility

### This code block has the following purpose:
#### 1)  transform - Reads the mp3 files from the source folder and converts them to mel spectrograms, applies a log (decibel) transformation, and finally saves each song as a pickle file
#### 2)  load_album - Load the transformed files and split them to training, validation and testing set based on the albums from artists. For each artist, songs from one album will go to  testing set, one to validation set and the rest to training set.
#### 3) load_songs - Loads the transformed files and performs a simple training, validation and testing split. The split is stratified to ensure that all the artists are present in all the sets.
#### 4) slice_song - Takes each song from the training, validation or testing set and slices it into smaller samples based on the number of frames provided.
#### 5) encode - Takes the labels from the training, validation or testing set and performs label encoding, followed by one-hot encoding.
#### 6) plot_spectrogram - Plots a spectrogram 

In [7]:
#Importing necessary libraries

import os
import random
import librosa as lib
import pickle as pk
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

 
def transform(data_path, dataset_path, sampling_rate=16000):
    """Convert every song under ``data_path`` into a pickled log-mel spectrogram.

    ``data_path`` is walked as ``artist/album/song``. Each song is decoded with
    librosa at ``sampling_rate``, turned into a mel spectrogram, converted to
    decibels and written to ``dataset_path`` as a pickle file named
    ``<artist>--<album>--<song>`` that holds the tuple
    ``(spectrogram, artist, song)``.

    Args:
        data_path: Root folder containing one sub-folder per artist.
        dataset_path: Output folder; it is created when it does not exist.
        sampling_rate: Rate the audio is resampled to while loading.
    """
    os.makedirs(dataset_path, exist_ok=True)
    start = datetime.now()  # Timer used to report the total processing time

    # Only directories are artists; stray files at the top level are ignored.
    artists = [
        artist for artist in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, artist))
    ]

    for artist in artists:
        print("Accessing Artist: {}".format(artist))
        artist_path = os.path.join(data_path, artist)
        # Same rule one level down: a stray file next to the album folders
        # would otherwise make os.listdir(album_path) fail.
        albums = [
            album for album in os.listdir(artist_path)
            if os.path.isdir(os.path.join(artist_path, album))
        ]

        for album in albums:
            print("-- Album: {}".format(album))
            album_path = os.path.join(artist_path, album)

            for song in os.listdir(album_path):
                print("---- Song: {}".format(song))
                song_path = os.path.join(album_path, song)
                y, sr = lib.load(song_path, sr=sampling_rate)

                # The audio is passed by keyword: recent librosa releases no
                # longer accept it as a positional argument.
                S = lib.feature.melspectrogram(y=y, sr=sr)
                S = lib.power_to_db(S)  # log (decibel) scaling of the mel power spectrogram

                # The output name keeps the artist and album so that the
                # loaders can recover them by splitting on "--".
                file = artist + "--" + album + "--" + song
                data = (S, artist, song)

                with open(os.path.join(dataset_path, file), 'wb') as file_path:
                    pk.dump(data, file_path)

    print("Time taken: {}".format(datetime.now() - start))

 

def load_album(data_path, dataset_path, random_state=1234):
    """Load the pickled songs and split them by album.

    For every artist folder in ``data_path`` one randomly chosen album goes to
    the test set, one to the validation set and the remaining albums to the
    training set. Songs are then read from ``dataset_path`` and routed to the
    split their ``<artist>--<album>`` file-name prefix belongs to; files whose
    prefix is in no split are ignored.

    Args:
        data_path: Root folder with one sub-folder per artist, each holding
            one sub-folder per album.
        dataset_path: Folder with pickles named ``<artist>--<album>--<song>``,
            each containing ``(spectrogram, artist, song)``.
        random_state: Seed for the album shuffle.

    Returns:
        ``X_train, y_train, s_train, X_val, y_val, s_val, X_test, y_test,
        s_test`` as lists of spectrograms, artist names and song names.

    Raises:
        ValueError: If an artist has fewer than two albums, so that a test and
            a validation album cannot both be drawn.
    """
    random.seed(random_state)

    # os.listdir returns entries in an arbitrary order; sorting first makes
    # the seeded shuffle below give the same split on every machine.
    artists = sorted(
        artist for artist in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, artist))
    )

    # Sets give O(1) membership tests when routing the songs further down.
    train, val, test = set(), set(), set()

    for artist in artists:
        artist_path = os.path.join(data_path, artist)
        albums = sorted(
            album for album in os.listdir(artist_path)
            if os.path.isdir(os.path.join(artist_path, album))
        )
        if len(albums) < 2:
            raise ValueError(
                "Artist {!r} needs at least two albums for an album split, "
                "found {}".format(artist, len(albums))
            )
        random.shuffle(albums)  # Shuffle this artist's albums
        test.add(artist + "--" + albums[0])  # one album to the test split
        val.add(artist + "--" + albums[1])  # one album to the validation split
        train.update(artist + "--" + album for album in albums[2:])  # the rest to training

    # Each split collects (spectrograms, artist names, song names).
    X_train, y_train, s_train = [], [], []
    X_val, y_val, s_val = [], [], []
    X_test, y_test, s_test = [], [], []
    buckets = (
        (train, X_train, y_train, s_train),
        (val, X_val, y_val, s_val),
        (test, X_test, y_test, s_test),
    )

    for song in sorted(os.listdir(dataset_path)):
        # maxsplit=2 keeps any "--" inside the song name intact instead of
        # failing with "too many values to unpack".
        artist, album, _ = song.split("--", 2)
        artist_album = artist + "--" + album

        for members, X_split, y_split, s_split in buckets:
            if artist_album not in members:
                continue
            # NOTE: pickle can execute arbitrary code while loading; only read
            # files that were produced by transform().
            with open(os.path.join(dataset_path, song), 'rb') as file_path:
                load = pk.load(file_path)
            X_split.append(load[0])  # spectrogram
            y_split.append(load[1])  # artist name
            s_split.append(load[2])  # song name
            break

    return X_train, y_train, s_train, X_val, y_val, s_val, X_test, y_test, s_test

   

def load_songs(data_path,dataset_path,val_size,test_size,random_state = 1234):
    """Load every pickled song and make a stratified train/val/test split.

    Args:
        data_path: Not used by this function; it is part of the signature only.
        dataset_path: Folder with the pickles, each holding
            ``(spectrogram, artist, song)``.
        val_size: ``test_size`` passed to the second split, i.e. the share of
            the data left after removing the test set that goes to validation.
        test_size: ``test_size`` passed to the first split.
        random_state: Seed used for both splits.

    Returns:
        ``X_train, Y_train, s_train, X_val, Y_val, s_val, X_test, Y_test,
        s_test``.
    """
    spectrograms, labels, names = [], [], []

    for file_name in os.listdir(dataset_path):
        with open(os.path.join(dataset_path, file_name), 'rb') as handle:
            record = pk.load(handle)
        spectrograms.append(record[0])  # spectrogram
        labels.append(record[1])        # artist name
        names.append(record[2])         # song name

    # Options that are the same for both splits.
    common = dict(shuffle=True, random_state=random_state)

    # First cut: separate the test set from everything else.
    X_rest, X_test, Y_rest, Y_test, s_rest, s_test = train_test_split(
        spectrograms, labels, names,
        test_size=test_size, stratify=labels, **common
    )
    # Second cut: carve the validation set out of the remainder.
    X_train, X_val, Y_train, Y_val, s_train, s_val = train_test_split(
        X_rest, Y_rest, s_rest,
        test_size=val_size, stratify=Y_rest, **common
    )

    return X_train, Y_train, s_train, X_val, Y_val, s_val, X_test, Y_test, s_test

 

def slice_song(X, y, S, slice_len):
    """Cut every spectrogram into consecutive, non-overlapping slices.

    Each spectrogram is split along axis 1 into chunks of ``slice_len``
    columns. Columns left over after the last complete chunk are discarded.
    The artist and song name of the source spectrogram are repeated once per
    slice.

    Args:
        X: Sequence of 2-D arrays.
        y: Artist label for each entry of ``X``.
        S: Song name for each entry of ``X``.
        slice_len: Number of columns per slice; must be positive.

    Returns:
        Three numpy arrays: the slices, their artists and their song names.

    Raises:
        ValueError: If ``slice_len`` is not positive.
    """
    if slice_len <= 0:
        raise ValueError("slice_len must be a positive number of frames")

    spectrograms, artists, songs = [], [], []

    for spectrogram, artist, song in zip(X, y, S):
        # Floor division already counts only complete slices, so all of them
        # are used (iterating to slices - 1 silently dropped the last one).
        slices = spectrogram.shape[1] // slice_len

        for j in range(slices):
            begin = slice_len * j
            spectrograms.append(spectrogram[:, begin:begin + slice_len])
            artists.append(artist)
            songs.append(song)

    return np.array(spectrograms), np.array(artists), np.array(songs)

   
def encode(y, label=None, onehot=None):
    """Label-encode and then one-hot encode ``y``.

    Encoders that are passed in already fitted are reused as they are, so
    validation and test labels get exactly the mapping learned on the training
    labels. An encoder is fitted here only when it is missing or not fitted
    yet.

    Args:
        y: Sequence of labels.
        label: Optional ``LabelEncoder``; a new one is created when ``None``.
        onehot: Optional ``OneHotEncoder``; a new one is created when ``None``.

    Returns:
        ``(y_onehot, label, onehot)``: the dense one-hot array and the two
        encoders, which can be passed to later calls.
    """
    # Label encoding. ``is None`` is used rather than truthiness so that a
    # supplied encoder is never replaced by accident.
    if label is None:
        label = preprocessing.LabelEncoder()
    # A fitted LabelEncoder exposes ``classes_``; re-fitting it on another
    # split could change the label -> index mapping.
    if hasattr(label, "classes_"):
        y_label = label.transform(y)
    else:
        y_label = label.fit_transform(y)
    y_label = np.asarray(y_label).reshape(len(y), 1)

    # One-hot encoding, with the same fit-once rule (``categories_`` marks a
    # fitted OneHotEncoder).
    if onehot is None:
        onehot = preprocessing.OneHotEncoder()
    if hasattr(onehot, "categories_"):
        y_onehot = onehot.transform(y_label).toarray()
    else:
        y_onehot = onehot.fit_transform(y_label).toarray()

    return y_onehot, label, onehot

def plot_spectrogram(S, sr=16000):
    """Draw a mel spectrogram with a time axis, a mel axis and a dB colour bar.

    Args:
        S: Spectrogram array passed to ``librosa.display.specshow``.
        sr: Sample rate used to label the axes. It defaults to 16000, the
            default ``sampling_rate`` of ``transform``.
    """
    # Imported here so the function does not depend on a later part of the
    # file having imported pyplot, or on librosa's display sub-module having
    # been loaded implicitly.
    import matplotlib.pyplot as plt
    from librosa import display as lib_display

    plt.figure(figsize=(12, 5))
    lib_display.specshow(S, sr=sr, x_axis='time', y_axis='mel')
    plt.title('mel power spectrogram')
    plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()

# Trainer

### This code block has the following purpose:
#### 1) load_data: creates a DataLoader for the dataset using the batch size provided. It also rearranges the axes of each sample to be compatible with the models, or applies a custom transformation instead if one is provided.
#### 2) train_predict: trains the model provided based on the epochs and learning rate provided. Returns the training loss, validation loss, training F1 scores, validation F1 scores, time and the final model.
#### 3) test_predict: runs the model on the testing dataset and returns the F1 score
#### 4) plot_trn_val_loss: plot the loss between training and validation
#### 5) plot_trn_val_f1: plot the F1 scores between training and validation

In [8]:
#Importing necessary libraries

import numpy as np
import torch
import torch.nn  as nn
from torch.autograd import Variable 
import matplotlib.pyplot as plt
from ignite.metrics import Accuracy, Precision, Recall, Fbeta

def load_data(X, y, batch_size, transform=None):
    """Wrap samples and labels in a shuffling ``DataLoader``.

    Args:
        X: Sequence of samples. Without ``transform`` each sample must be a
            3-D array; its last axis is moved to the front, i.e.
            ``(a, b, c) -> (c, a, b)``.
        y: Sequence of labels, one per sample.
        batch_size: Batch size of the returned loader.
        transform: Optional callable applied to each sample in place of the
            default axis reordering. (Inside this function the name hides the
            module-level ``transform`` function.)

    Returns:
        A ``torch.utils.data.DataLoader`` over ``[sample, label]`` pairs with
        ``shuffle=True``.
    """
    data = []
    for sample, target in zip(X, y):
        if transform is not None:
            # Was misspelled "transfrom", which raised NameError whenever a
            # transform was supplied.
            x = transform(sample)
        else:
            # Same result as swapaxes(swapaxes(sample, 0, 2), 1, 2).
            x = np.transpose(sample, (2, 0, 1))
        data.append([x, target])
    return torch.utils.data.DataLoader(data, shuffle=True, batch_size=batch_size)

def _run_epoch(model, dataloader, loss_fn, metrics, use_cuda, optimizer=None):
    """Run one pass over ``dataloader`` and return the mean loss per sample.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one it is put in evaluation mode and gradients are
    disabled. Every metric in ``metrics`` is updated with each batch.
    """
    training = optimizer is not None
    model.train(training)
    total_loss, seen = 0.0, 0

    with torch.set_grad_enabled(training):
        for img, label in dataloader:
            if use_cuda:
                img = img.cuda()
                label = label.cuda()
            # Labels arrive as one row of scores per sample (one-hot); the
            # loss needs the class index.
            target = label.argmax(dim=1)

            if training:
                optimizer.zero_grad()
            pred = model(img)
            loss = loss_fn(pred, target)
            if training:
                loss.backward()
                optimizer.step()

            batch = target.size(0)
            total_loss += loss.item() * batch
            seen += batch
            for metric in metrics:
                metric.update((pred.detach(), target))

    return total_loss / max(seen, 1)


def train_predict(dataloader_train,dataloader_val,model,epochs,learning_rate,use_cuda):
    """Train ``model`` with Adam and cross-entropy, validating after each epoch.

    Args:
        dataloader_train: Loader yielding ``(input, label)`` batches, where
            each label is a row whose argmax is the class index.
        dataloader_val: Loader with the same layout for validation.
        model: The network to train; moved to the GPU when ``use_cuda``.
        epochs: Number of passes over the training data.
        learning_rate: Learning rate for Adam.
        use_cuda: Run on the GPU and time the run with CUDA events.

    Returns:
        ``(train_loss_list, val_loss_list, train_f1, val_f1, time, model)``.
        The loss lists hold the mean per-sample loss of each epoch, the F1
        lists hold percentages, and ``time`` is the elapsed time in
        milliseconds. The returned model is in evaluation mode because the
        last pass is a validation pass.
    """
    from time import perf_counter  # only needed for the CPU timing path

    # CUDA events exist only on GPU builds, so they are used only when the
    # run is actually on the GPU; otherwise a wall-clock timer is used.
    if use_cuda:
        model = model.cuda()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    else:
        started_at = perf_counter()

    train_loss_list = []
    val_loss_list = []
    train_f1 = []
    val_f1 = []
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Evaluation metrics for the training set
    precision_train = Precision()
    accuracy_train = Accuracy()
    recall_train = Recall()
    f1_train = Fbeta(beta=1.0, average=True, precision=precision_train, recall=recall_train)
    metrics_train = (precision_train, recall_train, f1_train, accuracy_train)

    # Evaluation metrics for the validation set
    precision_val = Precision()
    recall_val = Recall()
    f1_val = Fbeta(beta=1.0, average=True, precision=precision_val, recall=recall_val)
    accuracy_val = Accuracy()
    metrics_val = (precision_val, recall_val, f1_val, accuracy_val)

    for epoch in range(epochs):
        print("Epoch: {}".format(epoch+1))

        # Metrics accumulate across batches, so they start fresh every epoch.
        for metric in metrics_train:
            metric.reset()
        train_loss_list.append(
            _run_epoch(model, dataloader_train, loss_fn, metrics_train, use_cuda, optimizer)
        )

        train_f1.append(f1_train.compute()*100)
        print("\tTrain loss: {:0.2f}".format(train_loss_list[-1]))
        print("\tTrain Accuracy: {:0.2f}%".format(accuracy_train.compute()*100))
        print("\tTrain Precision: {:0.2f}%".format(precision_train.compute().mean()*100))
        print("\tTrain Recall: {:0.2f}%".format(recall_train.compute().mean()*100))
        print("\tTrain F1 Score: {:0.2f}%".format(train_f1[-1]))

        # Validation pass: evaluation mode (dropout off, batch-norm using its
        # running statistics) and no gradients.
        for metric in metrics_val:
            metric.reset()
        val_loss_list.append(
            _run_epoch(model, dataloader_val, loss_fn, metrics_val, use_cuda)
        )

        val_f1.append(f1_val.compute()*100)
        print("\n\tVal loss: {:0.2f}".format(val_loss_list[-1]))
        print("\tVal Accuracy: {:0.2f}%".format(accuracy_val.compute()*100))
        print("\tVal Precision: {:0.2f}%".format(precision_val.compute().mean()*100))
        print("\tVal Recall: {:0.2f}%".format(recall_val.compute().mean()*100))
        print("\tVal F1 Score: {:0.2f}%".format(val_f1[-1]))

    if use_cuda:
        end.record()
        torch.cuda.synchronize()
        time = start.elapsed_time(end)
    else:
        # Milliseconds, to match the unit of the CUDA event timer.
        time = (perf_counter() - started_at) * 1000.0
    return (train_loss_list,val_loss_list,train_f1,val_f1,time,model)

def test_predict(model,dataloader_test,use_cuda):
    """Evaluate ``model`` on ``dataloader_test`` and report the F1 score.

    Args:
        model: Trained network; moved to the GPU when ``use_cuda``. It is
            left in evaluation mode.
        dataloader_test: Loader yielding ``(input, label)`` batches, where
            each label is a row whose argmax is the class index.
        use_cuda: Whether to run on the GPU.

    Returns:
        The F1 score as a percentage (it is also printed).
    """
    if use_cuda:
        model = model.cuda()
    model.eval()  # dropout off, batch-norm uses its running statistics

    # Evaluation metrics for the testing set
    precision = Precision()
    recall = Recall()
    f1 = Fbeta(beta=1.0, average=True, precision=precision, recall=recall)

    with torch.no_grad():
        for img, label in dataloader_test:
            if use_cuda:
                img = img.cuda()
                label = label.cuda()
            # Prediction and metric updates run for every batch; they used to
            # sit inside the ``if use_cuda`` branch and were skipped on CPU.
            pred = model(img)
            my_label = label.argmax(dim=1)
            precision.update((pred, my_label))
            recall.update((pred, my_label))
            f1.update((pred, my_label))

    score = f1.compute()*100
    print("\tF1 Score: {:0.2f}%".format(score))
    return score

def plot_trn_val_loss(train,val):
    """Plot the per-epoch training and validation loss on one figure.

    Args:
        train: Sequence of training losses, one per epoch.
        val: Sequence of validation losses, one per epoch.
    """
    curves = ((train, "Training Dataset"), (val, "Validation Dataset"))

    plt.figure(figsize=(12,6))
    for values, name in curves:
        plt.plot(values, label=name)
    plt.xlabel("Number of epochs")
    plt.ylabel("Cross Entropy Loss")
    # One tick for every second epoch.
    every_second_epoch = range(0, len(train), 2)
    plt.xticks(every_second_epoch)
    plt.legend(loc='upper right')
    plt.show()
    
def plot_trn_val_f1(train,val):
    """Plot the per-epoch training and validation F1 scores on one figure.

    Args:
        train: Sequence of training F1 scores, one per epoch.
        val: Sequence of validation F1 scores, one per epoch.
    """
    curves = ((train, "Training Dataset"), (val, "Validation Dataset"))

    plt.figure(figsize=(12,6))
    for values, name in curves:
        plt.plot(values, label=name)
    plt.xlabel("Number of epochs")
    plt.ylabel("F1 score")
    # One tick for every second epoch.
    every_second_epoch = range(0, len(train), 2)
    plt.xticks(every_second_epoch)
    plt.legend(loc='upper right')
    plt.show()

# Models

# Model 1 
#### Model with Inception Block
For the task of the detecting artist from Mel-spectrogram, we have developed a model which uses **inception blocks, convolution blocks, and GRU** as a basic block.
1. Since Mel-spectogram is a time vs Mel image it makes sense to use a convolution block.<br>
1. One main idea behind using the inception block was to detect **macro- and micro-level features** in the image, because <br> the inception block applies kernels of different sizes - (1,1), (3,3) and (5,5) convolutions plus a (3,3) average-pooling branch - and concatenates the outputs of all these branches. <br>
1. Since mel-spectrogram is a mel-time based image we are using 2 layers **stacked-GRU** in order to detect the **feature based on time-based changes**. <br>
1. At last output of the GRU is passed through a linear dense layer with output size = number of total artists. We <br> are not calculating the SoftMax probability because it's taken care of by the CrossEntropyLoss we are using while training the model. <br>

In [9]:
import torch 
import torch.nn  as nn
from torch.autograd import Variable
import torch.nn.functional as F

class conv_block(nn.Module):
    """Convolution followed by batch normalisation and an ELU activation.

    Extra keyword arguments (kernel size, padding, stride, ...) are passed on
    to ``nn.Conv2d`` unchanged.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.elu = nn.ELU()
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        convolved = self.conv(x)
        normalised = self.batchnorm(convolved)
        return self.elu(normalised)
    
class Inception_Block(nn.Module):
    """Four parallel branches whose outputs are concatenated along channels.

    Branches: a 1x1 convolution; a 1x1 reduction followed by a 3x3
    convolution; a 1x1 reduction followed by a 5x5 convolution; and a 3x3
    average pool followed by a 1x1 convolution. Padding keeps the spatial size
    unchanged, so the output has
    ``out_1x1 + out_3x3 + out_5x5 + out_1x1pool`` channels.
    """

    def __init__(self, input_channels, out_1x1, reduction_3x3, out_3x3,
                 reduction_5x5, out_5x5, out_1x1pool):
        super().__init__()

        # Branch 1: plain 1x1 convolution.
        self.branch1 = conv_block(input_channels, out_1x1, kernel_size=(1, 1))

        # Branch 2: 1x1 channel reduction, then 3x3 convolution.
        reduce_for_3x3 = conv_block(input_channels, reduction_3x3, kernel_size=(1, 1))
        conv_3x3 = conv_block(reduction_3x3, out_3x3, kernel_size=(3, 3), padding=(1, 1))
        self.branch2 = nn.Sequential(reduce_for_3x3, conv_3x3)

        # Branch 3: 1x1 channel reduction, then 5x5 convolution.
        reduce_for_5x5 = conv_block(input_channels, reduction_5x5, kernel_size=(1, 1))
        conv_5x5 = conv_block(reduction_5x5, out_5x5, kernel_size=(5, 5), padding=(2, 2))
        self.branch3 = nn.Sequential(reduce_for_5x5, conv_5x5)

        # Branch 4: size-preserving average pool, then 1x1 convolution.
        pool = nn.AvgPool2d(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        pool_projection = conv_block(input_channels, out_1x1pool, kernel_size=(1, 1))
        self.branch4 = nn.Sequential(pool, pool_projection)

    def forward(self, x):
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        return torch.cat([branch(x) for branch in branches], dim=1)

class MusicArtistClassificationModel(nn.Module):
    """Convolution + inception front end followed by a 2-layer GRU classifier.

    The input is ``(batch, channels, frequency, time)``. Three stages (conv
    block, inception block, conv block), each followed by batch normalisation
    and a stride-2 average pool, shrink frequency and time. The result is read
    by a GRU along the time axis and the last GRU output is mapped to
    ``total_artists`` logits. No softmax is applied.

    Args:
        input_channels: Number of channels of the input.
        total_artists: Number of output classes.
        use_cuda: Stored on the instance as ``self.use_cuda``; ``forward``
            does not need it because the GRU creates its initial state on the
            device of its input.
    """

    def __init__(self,input_channels,total_artists,use_cuda):
        super(MusicArtistClassificationModel,self).__init__()

        self.batchnorm0 =  nn.BatchNorm2d(input_channels)

        # NOTE: the "maxpool*" attributes are average pools; the names are
        # kept so existing code that refers to them keeps working.
        # NOTE(review): dropout1..3 are created but never applied in forward.
        self.layer1 = conv_block(input_channels,3,kernel_size=(3,3),padding=(1,1))
        self.batchnorm1 = nn.BatchNorm2d(3)
        self.maxpool1 = nn.AvgPool2d(kernel_size=(3,3),stride=2,padding=1)
        self.dropout1 = nn.Dropout(0.1)

        # Inception output channels: 64 + 128 + 32 + 32 = 256.
        self.layer2 = Inception_Block(3,64,96,128,16,32,32)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.maxpool2 = nn.AvgPool2d(kernel_size=(3,3),stride=2,padding=1)
        self.dropout2 = nn.Dropout(0.1)

        self.layer3 = conv_block(256,32,kernel_size=(3,3),padding=(1,1))
        self.batchnorm3 = nn.BatchNorm2d(32)
        self.maxpool3 = nn.AvgPool2d(kernel_size=(3,3),stride=2,padding=1)
        self.dropout3 = nn.Dropout(0.1)

        # input_size must equal channels * frequency after the last pool:
        # 32 channels * 16 bins = 512, which requires 128 input frequency
        # bins (halved three times).
        self.rnn = nn.GRU(
            input_size=512,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
            dropout=0.3)
        self.linear = nn.Linear(32,total_artists)
        self.use_cuda = use_cuda

    def forward(self,x):
        x = self.batchnorm0(x)
        x = self.maxpool1(self.batchnorm1(F.elu(self.layer1(x))))
        x = self.maxpool2(self.batchnorm2(F.elu(self.layer2(x))))
        x = self.maxpool3(self.batchnorm3(F.elu(self.layer3(x))))
        # The tensor is (B, C, F, T); the GRU needs (B, T, C*F), with channel
        # and frequency flattened into one feature vector per time step.
        batch_size, _, _, timesteps = x.size()
        x = x.permute(0, 3, 1, 2).reshape(batch_size, timesteps, -1)
        # With no initial state given, nn.GRU starts from zeros on the same
        # device and dtype as its input, so no manual h0 or .cuda() call is
        # needed.
        out, _ = self.rnn(x)
        # Classify from the GRU output at the last time step.
        return self.linear(out[:, -1, :])


# Model 2: 
##### **ResNet** is one of the best-known models used in computer vision tasks and can have more than 100 layers. Acknowledging the performance of ResNet in computer vision applications, we decided to use ResNet and its variants for artist classification on **Mel-spectrogram images**. ResNet uses **residual connections** in deep networks in order to mitigate the **vanishing gradient problem**. In a typical ResNet model, each residual connection skips approximately 3-4 layers. For the task of classifying Mel-spectrogram images by artist we have used the following variants of ResNet:
 *  **Variant 1 : ResNet50**
 * **Variant 2 :3 layers of ResNet50 + 2 Layers of GRU**:- in order to detect time-based feature in the time vs Mel image we have added 2 Layers of GRU network at the end of 3 layers (each layer in ResNet50 is comprised of multiple Conv nets) of ResNet50 model. This model was the best performing model from all the models we tried implementing.


In [10]:

# -*- coding: utf-8 -*-
"""
reference code Programmed by Aladdin Persson <aladdin.persson@hotmail.com>
"""

import torch
import torch.nn as nn


class block(nn.Module):
    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convolutions plus a skip.

    The last convolution outputs ``intermediate_channels * 4`` channels. ELU
    is the activation (the attribute is still called ``relu``), and dropout is
    applied before the last convolution.

    Args:
        in_channels: Channels of the incoming tensor.
        intermediate_channels: Channels inside the bottleneck.
        identity_downsample: Optional module applied to the skip path so that
            it matches the main path in shape.
        stride: Stride of the 3x3 convolution.
    """

    def __init__(self, in_channels, intermediate_channels,
                 identity_downsample=None, stride=1):
        super().__init__()
        self.expansion = 4
        expanded_channels = intermediate_channels * self.expansion

        self.dropout = nn.Dropout(0.2)

        # 1x1: reduce to the bottleneck width.
        self.conv1 = nn.Conv2d(in_channels, intermediate_channels,
                               kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(intermediate_channels)

        # 3x3: spatial convolution; carries the stride.
        self.conv2 = nn.Conv2d(intermediate_channels, intermediate_channels,
                               kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(intermediate_channels)

        # 1x1: expand to four times the bottleneck width.
        self.conv3 = nn.Conv2d(intermediate_channels, expanded_channels,
                               kernel_size=1, stride=1, padding=0)
        self.bn3 = nn.BatchNorm2d(expanded_channels)

        self.relu = nn.ELU()
        self.identity_downsample = identity_downsample
        self.stride = stride

    def forward(self, x):
        # A copy of the input is kept for the skip connection.
        shortcut = x.clone()

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.dropout(out)
        out = self.bn3(self.conv3(out))

        if self.identity_downsample is not None:
            shortcut = self.identity_downsample(shortcut)

        return self.relu(out + shortcut)


class ResNet(nn.Module):
    """ResNet stem plus the first two residual stages, followed by a GRU.

    ``forward`` runs the stem (7x7 convolution, batch norm, ELU, max pool),
    ``layer1`` and ``layer2``, flattens channels and frequency into one
    feature vector per time step, and feeds that sequence to a 2-layer GRU.
    The last GRU output is mapped to ``num_classes`` logits.

    NOTE(review): ``layer3``, ``layer4``, ``avgpool``, ``fc`` and ``dropout``
    are constructed but not used by ``forward``. They are kept so that the
    set of parameters (and ``state_dict`` keys) is unchanged.

    Args:
        block: Residual block class, called as
            ``block(in_channels, intermediate_channels, identity_downsample, stride)``
            and expected to output ``intermediate_channels * 4`` channels.
        layers: Number of residual blocks in each of the four stages.
        image_channels: Channels of the input.
        num_classes: Number of output classes.
        use_cuda: Stored on the instance as ``self.use_cuda``; ``forward``
            does not need it because the GRU creates its initial state on the
            device of its input.
    """

    def __init__(self, block, layers, image_channels, num_classes, use_cuda):
        super(ResNet, self).__init__()
        self.dropout = nn.Dropout(0.15)
        self.in_channels = 64
        self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ELU()  # the attribute name is historical; this is an ELU
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # The four residual stages of the ResNet architecture.
        self.layer1 = self._make_layer(
            block, layers[0], intermediate_channels=64, stride=1
        )
        self.layer2 = self._make_layer(
            block, layers[1], intermediate_channels=128, stride=2
        )
        self.layer3 = self._make_layer(
            block, layers[2], intermediate_channels=256, stride=2
        )
        self.layer4 = self._make_layer(
            block, layers[3], intermediate_channels=512, stride=2
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * 4, num_classes)

        # input_size must equal channels * frequency after layer2:
        # 512 channels * 16 bins = 8192, which requires 128 input frequency
        # bins (the stem and layer2 divide the frequency axis by 8).
        self.rnn = nn.GRU(
            input_size=8192,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
            dropout=0.2)
        self.linear = nn.Linear(32,num_classes)
        self.use_cuda = use_cuda

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        # The tensor is (B, C, F, T); the GRU needs (B, T, C*F), with channel
        # and frequency flattened into one feature vector per time step.
        batch_size, _, _, timesteps = x.size()
        x = x.permute(0, 3, 1, 2).reshape(batch_size, timesteps, -1)
        # With no initial state given, nn.GRU starts from zeros on the same
        # device and dtype as its input, so no manual h0 or .cuda() call is
        # needed.
        out, _ = self.rnn(x)
        # Classify from the GRU output at the last time step.
        return self.linear(out[:, -1, :])

    def _make_layer(self, block, num_residual_blocks, intermediate_channels, stride):
        """Build one stage of ``num_residual_blocks`` residual blocks.

        Only the first block applies ``stride`` and, when the shape changes,
        a projection on the skip path. ``self.in_channels`` is updated to the
        stage's output width.
        """
        identity_downsample = None
        layers = []

        # When the spatial size is halved (stride != 1) or the channel count
        # changes, the skip connection needs a 1x1 projection so that it can
        # be added to the main path.
        if stride != 1 or self.in_channels != intermediate_channels * 4:
            identity_downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels,
                    intermediate_channels * 4,
                    kernel_size=1,
                    stride=stride,
                ),
                nn.BatchNorm2d(intermediate_channels * 4),
            )

        layers.append(
            block(self.in_channels, intermediate_channels, identity_downsample, stride)
        )

        # The expansion factor is 4 for the bottleneck ResNets (50/101/152).
        self.in_channels = intermediate_channels * 4

        # The remaining blocks map e.g. 256 -> 64 -> 256 with stride 1, so
        # their skip connections need no projection.
        for _ in range(num_residual_blocks - 1):
            layers.append(block(self.in_channels, intermediate_channels))

        return nn.Sequential(*layers)


def MusicArtistClassificationModel_ResNet50(use_cuda,img_channel=3, num_classes=1000):
    """Build the ResNet model with the ResNet50 stage depths [3, 4, 6, 3].

    Note the argument order: ``use_cuda`` comes first, followed by the number
    of input channels and the number of classes.
    """
    resnet50_depths = [3, 4, 6, 3]
    return ResNet(
        block,
        resnet50_depths,
        image_channels=img_channel,
        num_classes=num_classes,
        use_cuda=use_cuda,
    )

# Main

In [11]:
#Declaring paths
# data_path: root of the raw mp3 collection; transform() and load_album() treat its sub-folders as artists.
# dataset_path: folder where transform() writes the pickled spectrograms and from which the load_* helpers read them.
data_path = "../input/mp3s32k/mp3s-32k"
dataset_path = "../input/song-dataset/song_dataset"

In [13]:
#Transforming the mp3 files to log-mel spectrograms (one pickle per song in dataset_path)
transform(data_path=data_path, dataset_path=dataset_path)

In [None]:
#Loading the dataset, split either per song (load_songs) or per album (load_album)
#Old code (kept as an inert string literal; it discarded the song names)
'''
#X_train, y_train, _ , X_val, y_val, _ , X_test, y_test, _ = load_songs(data_path,dataset_path,0.1,0.1)
X_train, y_train, _ , X_val, y_val, _ , X_test, y_test, _ = load_album(data_path,dataset_path)
'''

#Changed code: the song names are kept as well (S_*).
#Uncomment the load_songs line (and comment out load_album) for a stratified per-song split instead.
#X_train, y_train, S_train, X_val, y_val, S_val, X_test, y_test, S_test = load_songs(data_path,dataset_path,0.1,0.1)
X_train, y_train, S_train, X_val, y_val, S_val, X_test, y_test, S_test = load_album(data_path,dataset_path)


In [None]:
#Slice songs into 94-frame samples. NOTE(review): the earlier "1 second (31 frames)" wording did not match
#the value used here; 94 frames is about 3 seconds at 16 kHz if librosa's default hop length (512) applies - TODO confirm.
X_train,y_train,S_train = slice_song(X_train,y_train,S_train, 94)
X_val,y_val,S_val = slice_song(X_val,y_val,S_val, 94)
X_test,y_test,S_test = slice_song(X_test,y_test,S_test, 94)

In [None]:
#Plot the first training slice as a mel spectrogram (sanity check of the slicing output)
plot_spectrogram(X_train[0])

In [None]:
#Encoding the artists
#The first call creates the encoders; the encoder objects it returns are passed to the validation and test calls.
y_train, label, onehot = encode(y_train)
y_val, label, onehot = encode(y_val, label, onehot)
y_test, label, onehot = encode(y_test, label, onehot)

In [None]:
#Adding a trailing channel axis of size 1 to every slice
X_train = np.expand_dims(X_train, axis=-1)
X_val = np.expand_dims(X_val, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

In [None]:
#Creating batches of 32 for the training, validation and testing sets
train_dataloader = load_data(X_train, y_train, batch_size=32)
validation_dataloader = load_data(X_val, y_val, batch_size=32)
test_dataloader = load_data(X_test, y_test, batch_size=32)

In [None]:
#Garbage Collection
#gc was never imported anywhere above, so it is imported here before use.
import gc

#The dataloaders hold their own references to the samples; the raw arrays are no longer needed.
del X_train, y_train, S_train, X_val, y_val, S_val, X_test, y_test, S_test
gc.collect()

In [None]:
#Checking hardware
use_cuda = torch.cuda.is_available()
#torch.manual_seed seeds the CPU generator as well as CUDA, so weight initialisation is reproducible on either device.
torch.manual_seed(1234)

#Initializing models
#model = MusicArtistClassificationModel(1,20,use_cuda)
#The factory's signature is (use_cuda, img_channel, num_classes). The earlier positional call (1, 20, use_cuda)
#built a model with 20 input channels and use_cuda as the number of classes, so keywords are used here.
model = MusicArtistClassificationModel_ResNet50(use_cuda, img_channel=1, num_classes=20)

In [None]:
#Training the model for 300 epochs with a learning rate of 1e-4
train_loss, val_loss, train_f1, val_f1, time, model = train_predict(
    dataloader_train=train_dataloader,
    dataloader_val=validation_dataloader,
    model=model,
    epochs=300,
    learning_rate=1e-4,
    use_cuda=use_cuda,
)

In [12]:
#Print the time taken by the model in minutes
#time is in milliseconds: floor-divide by 1000 for whole seconds, then divide by 60 for minutes.
print((time//1000)/60)

In [None]:
#Plot the per-epoch F1 scores (in percent) for training and validation
plot_trn_val_f1(train_f1,val_f1)

In [None]:
#Plot the per-epoch cross-entropy loss for training and validation
plot_trn_val_loss(train_loss,val_loss)

In [None]:
#Make predictions using the testing dataset and the final model
#The held-out test loader is used here; the validation loader was passed by mistake before.
test_predict(model,test_dataloader,use_cuda)

# Future Work

### 1) Improving the F1 scores  
### 2) Working with more song samples by adjusting the sample length 