# Guide for data_processing.py file



## Importing required libraries

In [12]:
%matplotlib inline

import os
import shutil
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa
import librosa.display

import utils
import data_processing as datap

# Default size (width, height in inches) for every matplotlib figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (17, 5)

## Importing music files and metadata files

In [5]:
# Read the four FMA metadata tables with the project's loader, in a fixed order.
tracks, genres, features, echonest = map(utils.load, (
    'data/fma_metadata/tracks.csv',
    'data/fma_metadata/genres.csv',
    'data/fma_metadata/features.csv',
    'data/fma_metadata/echonest.csv',
))

# Sanity checks: features and tracks must share the exact same index,
# and every echonest row must refer to a track present in tracks.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

# Report the (rows, columns) shape of each table.
for csv_name, frame in (
    ("tracks", tracks),
    ("genres", genres),
    ("features", features),
    ("echonest", echonest),
):
    print("shape of {}.csv file is :".format(csv_name), frame.shape)



shape of tracks.csv file is : (106574, 52)
shape of genres.csv file is : (163, 4)
shape of features.csv file is : (106574, 518)
shape of echonest.csv file is : (13129, 249)


## 1 Metadata

The metadata table, a CSV file in the `fma_metadata.zip` archive, is composed of many columns:
1. The index is the ID of the song, taken from the website, used as the name of the audio file.
2. Per-track, per-album and per-artist metadata from the Free Music Archive website.
3. Two columns to indicate the subset (small, medium, large) and the split (training, validation, test).

In [6]:
# Print the column dtypes of every loaded table, each under its own banner line.
for csv_name, frame in (
    ("tracks", tracks),
    ("genres", genres),
    ("features", features),
    ("echonest", echonest),
):
    print("-----dtypes of {}.csv-----".format(csv_name))
    print(frame.dtypes)

-----dtypes of tracks.csv-----
album   comments                      int64
        date_created         datetime64[ns]
        date_released        datetime64[ns]
        engineer                     object
        favorites                     int64
        id                            int64
        information                category
        listens                       int64
        producer                     object
        tags                         object
        title                        object
        tracks                        int64
        type                       category
artist  active_year_begin    datetime64[ns]
        active_year_end      datetime64[ns]
        associated_labels            object
        bio                        category
        comments                      int64
        date_created         datetime64[ns]
        favorites                     int64
        id                            int64
        latitude                    float64
 

## 2 Splitting Example

In this part, we call the `splitter` function to split the tracks of the chosen genres into training, validation and test folders, following the artist ordering of the `tracks` dataframe.

In [13]:
# Genres whose tracks are distributed into train/validation/test folders.
chosen_genres = [
    'Hip-Hop',
    'Pop',
    'Rock',
    'Folk',
    'Experimental',
    'Electronic',
    'Classical',
    'Old-Time / Historic',
]

# Split percentages for training, validation and test (80 + 10 + 10 = 100).
train_perc, validation_perc, test_perc = 80, 10, 10

datap.splitter(train_perc, validation_perc, test_perc, chosen_genres, tracks)

Distributing 400 tracks of Hip-Hop genre to training folder...
Folder 'train' created for genre 'Hip-Hop'
Distributing 50 tracks of Hip-Hop genre to validation folder...
Folder 'validation' created for genre 'Hip-Hop'
Distributing 50 tracks of Hip-Hop genre to test folder...
Folder 'test' created for genre 'Hip-Hop'
Distributing 400 tracks of Pop genre to training folder...
Folder 'train' created for genre 'Pop'
Distributing 50 tracks of Pop genre to validation folder...
Folder 'validation' created for genre 'Pop'
Distributing 50 tracks of Pop genre to test folder...
Folder 'test' created for genre 'Pop'
Distributing 400 tracks of Rock genre to training folder...
Folder 'train' created for genre 'Rock'
Distributing 50 tracks of Rock genre to validation folder...
Folder 'validation' created for genre 'Rock'
Distributing 50 tracks of Rock genre to test folder...
Folder 'test' created for genre 'Rock'
Distributing 400 tracks of Folk genre to training folder...
Folder 'train' created for g

## 3 Resetting Example

The `data_reset` function undoes the split: it moves the files back to their genres, so that the data can be split again with different percentages.

In [17]:
# Undo the current train/validation/test split for every genre in chosen_genres;
# this is required before splitting again with different percentages.
datap.data_reset(chosen_genres)

Resetting training data of Hip-Hop genre
Resetting validation data of Hip-Hop genre
Resetting test data of Hip-Hop genre
Resetting training data of Pop genre
Resetting validation data of Pop genre
Resetting test data of Pop genre
Resetting training data of Rock genre
Resetting validation data of Rock genre
Resetting test data of Rock genre
Resetting training data of Folk genre
Resetting validation data of Folk genre
Resetting test data of Folk genre
Resetting training data of Experimental genre
Resetting validation data of Experimental genre
Resetting test data of Experimental genre
Resetting training data of Electronic genre
Resetting validation data of Electronic genre
Resetting test data of Electronic genre
Resetting training data of Classical genre
Resetting validation data of Classical genre
Resetting test data of Classical genre
Resetting training data of Old-Time / Historic genre
Resetting validation data of Old-Time / Historic genre
Resetting test data of Old-Time / Historic ge

### Note: After splitting, you have to reset the data in order to split with different percentages

## 4 Checking statistics

Let's check some statistics and compare the resulting splits. First, let's split the data again.

In [15]:
# Split again with the percentages and genres defined in the splitting example,
# so that the per-genre split folders exist for the statistics cell.
datap.splitter(train_perc, validation_perc, test_perc ,chosen_genres, tracks)

Distributing 400 tracks of Hip-Hop genre to training folder...
Folder 'train' created for genre 'Hip-Hop'
Distributing 50 tracks of Hip-Hop genre to validation folder...
Folder 'validation' created for genre 'Hip-Hop'
Distributing 50 tracks of Hip-Hop genre to test folder...
Folder 'test' created for genre 'Hip-Hop'
Distributing 400 tracks of Pop genre to training folder...
Folder 'train' created for genre 'Pop'
Distributing 50 tracks of Pop genre to validation folder...
Folder 'validation' created for genre 'Pop'
Distributing 50 tracks of Pop genre to test folder...
Folder 'test' created for genre 'Pop'
Distributing 400 tracks of Rock genre to training folder...
Folder 'train' created for genre 'Rock'
Distributing 50 tracks of Rock genre to validation folder...
Folder 'validation' created for genre 'Rock'
Distributing 50 tracks of Rock genre to test folder...
Folder 'test' created for genre 'Rock'
Distributing 400 tracks of Folk genre to training folder...
Folder 'train' created for g

In [16]:
# Split folders of the Rock genre (presumably the ones created by datap.splitter).
path_train = "datasets/Rock/train"
path_val = "datasets/Rock/validation"
path_test = "datasets/Rock/test"

# File names found in each split folder.
rock_train_tracks = np.array(os.listdir(path_train))
rock_val_tracks = np.array(os.listdir(path_val))
rock_test_tracks = np.array(os.listdir(path_test))
# Backward-compatible alias: this array was previously named `rock_train_test`
# even though it holds the *test* split.
rock_train_test = rock_test_tracks

# Per-split summary (track, genre, artist and album counts are printed).
print("---Statistics of Rock Training Tracks---")
datap.show_statistics(rock_train_tracks, tracks)
print("---Statistics of Rock Validation Tracks---")
datap.show_statistics(rock_val_tracks, tracks)
print("---Statistics of Rock Test Tracks---")
datap.show_statistics(rock_test_tracks, tracks)

# Artist overlap between the training split and each held-out split;
# common artists across splits indicate potential leakage.
print("---Artist comparison of Training and Validation Data---")
datap.common_artists(rock_train_tracks, rock_val_tracks, tracks)
print("---Artist comparison of Training and Test Data---")
datap.common_artists(rock_train_tracks, rock_test_tracks, tracks)








---Statistics of Rock Training Tracks---
+++ The data contains: 400 tracks
+++ The genre is:  ['Rock']
+++ The data contains 82 artists 
+++ The data contains 100 albums
---Statistics of Rock Validation Tracks---
+++ The data contains: 50 tracks
+++ The genre is:  ['Rock']
+++ The data contains 11 artists 
+++ The data contains 13 albums
---Statistics of Rock Test Tracks---
+++ The data contains: 50 tracks
+++ The genre is:  ['Rock']
+++ The data contains 18 artists 
+++ The data contains 22 albums
---Artist comparison of Training and Validation Data---
+++ The first dataset contains 82 artists 
+++ The second dataset contains 11 artists 
+++ The datasets have 1 common artists
---Artist comparison of Training and Test Data---
+++ The first dataset contains 82 artists 
+++ The second dataset contains 18 artists 
+++ The datasets have 0 common artists
