In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [3]:
# Location of the acoustic-features CSV, relative to the notebook's working directory.
path = "Datasets/Turkish Music Emotion Dataset Data Set/Acoustic Features.csv"

# Read the CSV; header=0 takes the column names from the first line of the file.
turkish = pd.read_csv(path, header=0)

Goal: classify each music piece into one of 4 emotion classes using its acoustic attributes. 

Info (from https://archive.ics.uci.edu/ml/datasets/Turkish+Music+Emotion+Dataset):
The dataset is designed as a discrete model. 
Four classes in the dataset: happy, sad, angry, relax. 
Verbal and non-verbal music.
Different genres of Turkish music.
A total of 100 music pieces are determined for each class. 
There are 400 observations. 
Each observation is 30 seconds of the original song. 

Attribute information
Mel Frequency Cepstral Coefficients (MFCCs), https://learn.flucoma.org/reference/mfcc/ 
Describe the contour (overall shape) of the spectrum, i.e. how energy is distributed across the music's frequencies.  
<br>
Tempo, https://en.wikipedia.org/wiki/Tempo 
Describes the speed at which the music is played.
<br>
Chromagram, https://en.wikipedia.org/wiki/Chroma_feature 
Describes the pitch content as discrete values, one for each of the 12 pitch classes. 
<br>
Spectral features (definition adapted from the Oxford Dictionary):
Features concerning the spectrum of the music's frequencies. 
<br>
Harmonic features, https://en.wikipedia.org/wiki/Harmonic
A harmonic is a frequency equal to a positive integer multiplied by a fundamental frequency. 
<br>
These features analyze the emotional content in music signals.
<br>
MIR toolbox is used for feature extraction.

In [4]:
# Display the loaded DataFrame as the cell output.
turkish

Unnamed: 0,Class,_RMSenergy_Mean,_Lowenergy_Mean,_Fluctuation_Mean,_Tempo_Mean,_MFCC_Mean_1,_MFCC_Mean_2,_MFCC_Mean_3,_MFCC_Mean_4,_MFCC_Mean_5,...,_Chromagram_Mean_9,_Chromagram_Mean_10,_Chromagram_Mean_11,_Chromagram_Mean_12,_HarmonicChangeDetectionFunction_Mean,_HarmonicChangeDetectionFunction_Std,_HarmonicChangeDetectionFunction_Slope,_HarmonicChangeDetectionFunction_PeriodFreq,_HarmonicChangeDetectionFunction_PeriodAmp,_HarmonicChangeDetectionFunction_PeriodEntropy
0,relax,0.052,0.591,9.136,130.043,3.997,0.363,0.887,0.078,0.221,...,0.426,1.000,0.008,0.101,0.316,0.261,0.018,1.035,0.593,0.970
1,relax,0.125,0.439,6.680,142.240,4.058,0.516,0.785,0.397,0.556,...,0.002,1.000,0.000,0.984,0.285,0.211,-0.082,3.364,0.702,0.967
2,relax,0.046,0.639,10.578,188.154,2.775,0.903,0.502,0.329,0.287,...,0.184,0.746,0.016,1.000,0.413,0.299,0.134,1.682,0.692,0.963
3,relax,0.135,0.603,10.442,65.991,2.841,1.552,0.612,0.351,0.011,...,0.038,1.000,0.161,0.757,0.422,0.265,0.042,0.354,0.743,0.968
4,relax,0.066,0.591,9.769,88.890,3.217,0.228,0.814,0.096,0.434,...,0.004,0.404,1.000,0.001,0.345,0.261,0.089,0.748,0.674,0.957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,angry,0.174,0.578,5.265,161.144,2.985,0.065,0.718,0.046,0.265,...,0.374,0.838,0.413,0.125,0.323,0.140,0.120,2.691,0.867,0.969
396,angry,0.176,0.498,5.042,179.787,2.790,-0.148,0.342,-0.011,0.029,...,0.020,1.000,0.640,0.010,0.233,0.127,0.145,3.364,0.879,0.967
397,angry,0.187,0.557,4.724,134.032,1.672,0.566,0.880,-0.113,0.244,...,0.052,0.184,0.287,0.101,0.269,0.157,0.111,3.364,0.819,0.962
398,angry,0.140,0.573,4.470,113.600,1.997,-0.210,0.694,0.089,0.206,...,0.137,1.000,0.304,0.140,0.277,0.144,0.061,3.364,0.811,0.969


In [5]:
# List the column labels of the DataFrame.
turkish.columns

Index(['Class', '_RMSenergy_Mean', '_Lowenergy_Mean', '_Fluctuation_Mean',
       '_Tempo_Mean', '_MFCC_Mean_1', '_MFCC_Mean_2', '_MFCC_Mean_3',
       '_MFCC_Mean_4', '_MFCC_Mean_5', '_MFCC_Mean_6', '_MFCC_Mean_7',
       '_MFCC_Mean_8', '_MFCC_Mean_9', '_MFCC_Mean_10', '_MFCC_Mean_11',
       '_MFCC_Mean_12', '_MFCC_Mean_13', '_Roughness_Mean', '_Roughness_Slope',
       '_Zero-crossingrate_Mean', '_AttackTime_Mean', '_AttackTime_Slope',
       '_Rolloff_Mean', '_Eventdensity_Mean', '_Pulseclarity_Mean',
       '_Brightness_Mean', '_Spectralcentroid_Mean', '_Spectralspread_Mean',
       '_Spectralskewness_Mean', '_Spectralkurtosis_Mean',
       '_Spectralflatness_Mean', '_EntropyofSpectrum_Mean',
       '_Chromagram_Mean_1', '_Chromagram_Mean_2', '_Chromagram_Mean_3',
       '_Chromagram_Mean_4', '_Chromagram_Mean_5', '_Chromagram_Mean_6',
       '_Chromagram_Mean_7', '_Chromagram_Mean_8', '_Chromagram_Mean_9',
       '_Chromagram_Mean_10', '_Chromagram_Mean_11', '_Chromagram_Mean_12

In [6]:
# TODO: perform and explain data cleaning steps. 

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
turkish.describe()

Unnamed: 0,_RMSenergy_Mean,_Lowenergy_Mean,_Fluctuation_Mean,_Tempo_Mean,_MFCC_Mean_1,_MFCC_Mean_2,_MFCC_Mean_3,_MFCC_Mean_4,_MFCC_Mean_5,_MFCC_Mean_6,...,_Chromagram_Mean_9,_Chromagram_Mean_10,_Chromagram_Mean_11,_Chromagram_Mean_12,_HarmonicChangeDetectionFunction_Mean,_HarmonicChangeDetectionFunction_Std,_HarmonicChangeDetectionFunction_Slope,_HarmonicChangeDetectionFunction_PeriodFreq,_HarmonicChangeDetectionFunction_PeriodAmp,_HarmonicChangeDetectionFunction_PeriodEntropy
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,...,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.13465,0.553605,7.145932,123.68202,2.456422,0.07189,0.488065,0.030465,0.178897,0.038307,...,0.354632,0.590975,0.34234,0.38562,0.328213,0.192997,-0.000157,1.762288,0.76969,0.966712
std,0.064368,0.05075,2.280145,34.234344,0.799262,0.537865,0.294607,0.275839,0.19523,0.203754,...,0.334976,0.357981,0.315808,0.348117,0.05552,0.047092,0.104743,0.930352,0.072107,0.003841
min,0.01,0.302,3.58,48.284,0.323,-3.484,-0.87,-1.636,-0.494,-0.916,...,0.0,0.0,0.0,0.0,0.112,0.06,-0.285,0.187,0.53,0.939
25%,0.085,0.523,5.8595,101.49025,1.9485,-0.26275,0.28125,-0.117,0.06125,-0.07825,...,0.06675,0.2645,0.0595,0.06075,0.29075,0.16,-0.058,0.961,0.725,0.965
50%,0.128,0.553,6.734,120.1325,2.3895,0.0685,0.4645,0.0445,0.181,0.0495,...,0.247,0.612,0.247,0.2965,0.333,0.19,-0.002,1.682,0.786,0.967
75%,0.174,0.58325,7.8235,148.98625,2.86025,0.41325,0.686,0.19825,0.2885,0.15125,...,0.612,1.0,0.56525,0.67075,0.36725,0.226,0.06325,2.243,0.824,0.969
max,0.431,0.703,23.475,195.026,5.996,1.937,1.622,1.126,1.055,0.799,...,1.0,1.0,1.0,1.0,0.488,0.34,0.442,4.486,0.908,0.977


In [8]:
# Print the index range plus each column's non-null count and dtype.
turkish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 51 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Class                                           400 non-null    object 
 1   _RMSenergy_Mean                                 400 non-null    float64
 2   _Lowenergy_Mean                                 400 non-null    float64
 3   _Fluctuation_Mean                               400 non-null    float64
 4   _Tempo_Mean                                     400 non-null    float64
 5   _MFCC_Mean_1                                    400 non-null    float64
 6   _MFCC_Mean_2                                    400 non-null    float64
 7   _MFCC_Mean_3                                    400 non-null    float64
 8   _MFCC_Mean_4                                    400 non-null    float64
 9   _MFCC_Mean_5                               

In [9]:
# Check for missing values: select every row that contains at least one NaN.
# An empty result means no row has a missing value.
turkish[turkish.isna().any(axis=1)]

Unnamed: 0,Class,_RMSenergy_Mean,_Lowenergy_Mean,_Fluctuation_Mean,_Tempo_Mean,_MFCC_Mean_1,_MFCC_Mean_2,_MFCC_Mean_3,_MFCC_Mean_4,_MFCC_Mean_5,...,_Chromagram_Mean_9,_Chromagram_Mean_10,_Chromagram_Mean_11,_Chromagram_Mean_12,_HarmonicChangeDetectionFunction_Mean,_HarmonicChangeDetectionFunction_Std,_HarmonicChangeDetectionFunction_Slope,_HarmonicChangeDetectionFunction_PeriodFreq,_HarmonicChangeDetectionFunction_PeriodAmp,_HarmonicChangeDetectionFunction_PeriodEntropy


In [10]:
# sanity check
# ordinal and categorical data: none among the features (only the target column `Class` is categorical). 

# discrete values: require semantic understanding.


In [11]:
# Collect the MFCC mean columns (_MFCC_Mean_1 ... _MFCC_Mean_13) into their own frame.
NUM_MFCC_MEAN = 13
mfcc_columns = ["_MFCC_Mean_" + str(k) for k in range(1, NUM_MFCC_MEAN + 1)]

# Select every column in one step instead of growing an empty frame column by column;
# .copy() keeps mfccdf independent of `turkish`.
mfccdf = turkish[mfcc_columns].copy()

There are 13 coefficients that describe the bounds (spectral contour) of the given sample. 

In [12]:
# Summary statistics of the MFCC mean columns collected in mfccdf.
mfccdf.describe()

Unnamed: 0,_MFCC_Mean_1,_MFCC_Mean_2,_MFCC_Mean_3,_MFCC_Mean_4,_MFCC_Mean_5,_MFCC_Mean_6,_MFCC_Mean_7,_MFCC_Mean_8,_MFCC_Mean_9,_MFCC_Mean_10,_MFCC_Mean_11,_MFCC_Mean_12,_MFCC_Mean_13
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,2.456422,0.07189,0.488065,0.030465,0.178897,0.038307,0.059943,0.043467,0.02301,0.027793,0.028798,0.016667,0.024118
std,0.799262,0.537865,0.294607,0.275839,0.19523,0.203754,0.180982,0.165184,0.159239,0.152235,0.136156,0.128528,0.13347
min,0.323,-3.484,-0.87,-1.636,-0.494,-0.916,-0.936,-0.744,-0.621,-0.544,-0.487,-0.418,-0.62
25%,1.9485,-0.26275,0.28125,-0.117,0.06125,-0.07825,-0.04125,-0.04925,-0.071,-0.05925,-0.044,-0.056,-0.0455
50%,2.3895,0.0685,0.4645,0.0445,0.181,0.0495,0.072,0.0395,0.0165,0.0315,0.037,0.0225,0.039
75%,2.86025,0.41325,0.686,0.19825,0.2885,0.15125,0.17225,0.13,0.123,0.126,0.114,0.0945,0.10125
max,5.996,1.937,1.622,1.126,1.055,0.799,0.571,0.728,0.539,0.51,0.494,0.355,0.536


In [13]:
# Count the samples whose first MFCC mean is greater than 2.
mfccm = "_MFCC_Mean_"
first_mfcc = mfccm + str(1)
mfccdf.loc[mfccdf[first_mfcc] > 2, first_mfcc].count()

285

In [14]:
# Count the samples whose first MFCC mean is greater than 5.
above_five = mfccdf[mfccm + "1"] > 5
mfccdf.loc[above_five, mfccm + "1"].count()

4

There seem to be no outliers; however, the average value of MFCC 1 is higher than the means of the other MFCCs.

Tempo (mean) 

In [15]:
# Summary statistics of the _Tempo_Mean column.
turkish["_Tempo_Mean"].describe()

count    400.000000
mean     123.682020
std       34.234344
min       48.284000
25%      101.490250
50%      120.132500
75%      148.986250
max      195.026000
Name: _Tempo_Mean, dtype: float64

Describes the average tempo across 30 seconds. 

Chromagrams 

In [16]:
# Collect the chromagram mean columns (_Chromagram_Mean_1 ... _Chromagram_Mean_12).
NUM_CHROMOGRAM = 12
chromagram_columns = ["_Chromagram_Mean_" + str(k) for k in range(1, NUM_CHROMOGRAM + 1)]

# Select every column in one step instead of growing an empty frame column by column;
# .copy() keeps chromogramdf independent of `turkish`.
chromogramdf = turkish[chromagram_columns].copy()

In [17]:
# Summary statistics of the chromagram mean columns collected in chromogramdf.
chromogramdf.describe()

Unnamed: 0,_Chromagram_Mean_1,_Chromagram_Mean_2,_Chromagram_Mean_3,_Chromagram_Mean_4,_Chromagram_Mean_5,_Chromagram_Mean_6,_Chromagram_Mean_7,_Chromagram_Mean_8,_Chromagram_Mean_9,_Chromagram_Mean_10,_Chromagram_Mean_11,_Chromagram_Mean_12
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.35256,0.253035,0.365098,0.208295,0.350412,0.26388,0.242797,0.391873,0.354632,0.590975,0.34234,0.38562
std,0.323071,0.287694,0.32457,0.253623,0.303521,0.292692,0.275796,0.330826,0.334976,0.357981,0.315808,0.348117
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.057,0.0185,0.07975,0.017,0.08975,0.01975,0.026,0.102,0.06675,0.2645,0.0595,0.06075
50%,0.2735,0.142,0.2885,0.105,0.271,0.144,0.141,0.2955,0.247,0.612,0.247,0.2965
75%,0.55125,0.39525,0.5765,0.315,0.53575,0.4505,0.365,0.6355,0.612,1.0,0.56525,0.67075
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Values of the chromagram features fall between 0 and 1. 

Spectral

In [34]:
# Names of the spectral feature columns to analyse together.
temp = ['_Spectralcentroid_Mean', '_Spectralspread_Mean',
       '_Spectralskewness_Mean', '_Spectralkurtosis_Mean',
       '_Spectralflatness_Mean', '_EntropyofSpectrum_Mean']

# Select the columns directly rather than assigning them into an empty frame;
# .copy() keeps `spectral` independent of `turkish`.
spectral = turkish[temp].copy()



In [35]:
# Summary statistics of the spectral feature columns.
spectral.describe()

Unnamed: 0,_Spectralcentroid_Mean,_Spectralspread_Mean,_Spectralskewness_Mean,_Spectralkurtosis_Mean,_Spectralflatness_Mean,_EntropyofSpectrum_Mean
count,400.0,400.0,400.0,400.0,400.0,400.0
mean,2581.167267,3082.394695,1.870035,7.348953,0.048523,0.872607
std,863.520318,767.648035,0.881635,8.621386,0.026492,0.03726
min,606.524,814.817,0.39,1.93,0.006,0.74
25%,1981.55775,2506.7685,1.32725,3.8815,0.029,0.853
50%,2547.678,3150.949,1.687,5.216,0.047,0.879
75%,3182.56975,3684.32525,2.1825,7.849,0.062,0.899
max,5326.379,4721.479,7.855,121.996,0.209,0.942


No particular meaning. 

In [30]:
# Names of the harmonic-change-detection-function columns to analyse together.
temp = ['_HarmonicChangeDetectionFunction_Mean',
       '_HarmonicChangeDetectionFunction_Std',
       '_HarmonicChangeDetectionFunction_Slope',
       '_HarmonicChangeDetectionFunction_PeriodFreq',
       '_HarmonicChangeDetectionFunction_PeriodAmp',
       '_HarmonicChangeDetectionFunction_PeriodEntropy']

# Select the columns directly rather than assigning them into an empty frame;
# .copy() keeps `harmonic` independent of `turkish`.
harmonic = turkish[temp].copy()


In [31]:
# Summary statistics of the harmonic-change-detection-function columns.
harmonic.describe()

Unnamed: 0,_HarmonicChangeDetectionFunction_Mean,_HarmonicChangeDetectionFunction_Std,_HarmonicChangeDetectionFunction_Slope,_HarmonicChangeDetectionFunction_PeriodFreq,_HarmonicChangeDetectionFunction_PeriodAmp,_HarmonicChangeDetectionFunction_PeriodEntropy
count,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.328213,0.192997,-0.000157,1.762288,0.76969,0.966712
std,0.05552,0.047092,0.104743,0.930352,0.072107,0.003841
min,0.112,0.06,-0.285,0.187,0.53,0.939
25%,0.29075,0.16,-0.058,0.961,0.725,0.965
50%,0.333,0.19,-0.002,1.682,0.786,0.967
75%,0.36725,0.226,0.06325,2.243,0.824,0.969
max,0.488,0.34,0.442,4.486,0.908,0.977


`_HarmonicChangeDetectionFunction_Slope` likely encodes the direction of the change in its sign, which would explain the negative values.

The following features are not described in the attribute information above. 

In [26]:
# Columns for which the notebook gives no attribute description.
undescribed = ['_RMSenergy_Mean', '_Lowenergy_Mean', '_Fluctuation_Mean', '_Roughness_Mean', '_Roughness_Slope',
       '_Zero-crossingrate_Mean', '_AttackTime_Mean', '_AttackTime_Slope',
       '_Rolloff_Mean', '_Eventdensity_Mean', '_Pulseclarity_Mean',
       '_Brightness_Mean']

# Select the columns directly rather than assigning them into an empty frame;
# .copy() keeps undescribeddf independent of `turkish`.
undescribeddf = turkish[undescribed].copy()

In [28]:
# Summary statistics of the columns that have no attribute description.
undescribeddf.describe()

Unnamed: 0,_RMSenergy_Mean,_Lowenergy_Mean,_Fluctuation_Mean,_Roughness_Mean,_Roughness_Slope,_Zero-crossingrate_Mean,_AttackTime_Mean,_AttackTime_Slope,_Rolloff_Mean,_Eventdensity_Mean,_Pulseclarity_Mean,_Brightness_Mean
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,0.13465,0.553605,7.145932,527.681365,0.072038,997.252315,0.031305,-0.00289,5691.069637,2.78482,0.249387,0.434158
std,0.064368,0.05075,2.280145,521.218943,0.174301,524.895867,0.016801,0.14992,2293.401839,1.326889,0.155335,0.131517
min,0.01,0.302,3.58,0.941,-0.525,149.49,0.01,-0.465,887.151,0.234,0.011,0.053
25%,0.085,0.523,5.8595,169.18875,-0.027,592.275,0.023,-0.094,3933.55275,1.737,0.12775,0.3525
50%,0.128,0.553,6.734,367.578,0.068,893.491,0.027,0.0075,5648.628,2.773,0.218,0.448
75%,0.174,0.58325,7.8235,734.3725,0.174,1303.49275,0.033,0.089,7355.88625,3.6925,0.32725,0.52725
max,0.431,0.703,23.475,3899.847,0.584,3147.907,0.165,0.599,11508.298,7.952,0.856,0.737
