# AI Machine Learning Practice 

# Information

- Dataset: Music dataset

- Objectives: Classification

- Time Limits: 1 min

- Score: Classification Accuracy (Test Data)

- Please read all markdown cells carefully

- About Dataset: Music Style Data
    - 348 float type music features (frequency, tone, tempo, timbre...)
    - Label: Music Style
        - 1: Melancholy
        - 2: Romantic
        - 3: Rhythmical
    

## [Step 0] Importing Packages

You must specify all the packages you use in this practice in the cell below.



In [23]:
from __future__ import print_function
import os
data_path = ['data']

from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score, f1_score, recall_score, precision_score
import numpy as np

## [Step 1] Read Data

Train dataset is in the 'data' directory


In [2]:
import pandas as pd

# Build the CSV location from the configured data directory, then load it
filepath = os.path.join(*data_path, 'music_train_data.csv')
data = pd.read_csv(filepath)

In [3]:
# Show the first row to check how the CSV columns were parsed
data.head(1)

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f340,f341,f342,f343,f344,f345,f346,f347,f348,answer
0,-0.166614,0.284691,-0.011022,-1.028812,0.101653,0.498247,-0.314566,1.208697,-1.503008,-1.457764,...,2.136721,-1.193955,0.040614,1.127366,0.741521,-0.70773,0.077748,0.832992,-1.291423,2


In [4]:
# Report the table dimensions and the dtype pandas inferred for each column
print(data.shape)
print(data.dtypes)

(650, 349)
f1        float64
f2        float64
f3        float64
f4        float64
f5        float64
           ...   
f345      float64
f346      float64
f347      float64
f348      float64
answer      int64
Length: 349, dtype: object


In [5]:
# Every column except the last one is a model input; 'answer' holds the label
features = data.columns[:-1]
X_data, y_data = data[features], data['answer']

In [6]:
# Share of each label, ordered by label value => imbalanced!
label_share = y_data.value_counts(normalize=True)
print(label_share.sort_index())

1    0.516923
2    0.369231
3    0.113846
Name: answer, dtype: float64


## [Step 2] Data Preprocessing

* Generate several datasets by combining different preprocessing steps
* First, calculate each feature column's correlation with the other columns, and generate a new dataset with the highly correlated feature columns dropped
* Second, generate datasets with MinMaxScaler and StandardScaler
* Third, for the MinMax-scaled data, check the skew of each column, and generate a new dataset by applying a log transform to the columns with high skew values
* The generated datasets are:
1. original-data
2. dropped-data
3. minmaxscaled-data 
4. standardscaled-data
5. minmaxscaled-skewlogged-data
6. minmaxscaled-dropped-data
7. standardscaled-dropped-data
8. minmaxscaled-skewlogged-dropped-data

In [7]:
# Decision trees and ensemble methods do not require feature scaling to be performed:
# they are not sensitive to the variance in the data

In [8]:
# Registry of the experimental training datasets, keyed by a descriptive name
X_data_dict = dict(X_data=X_data)

In [9]:
# Absolute Pearson correlation between every pair of feature columns.
# The whole matrix is computed in one vectorised call instead of one
# corrwith() pass per column.
abs_corr = X_data.corr().abs()
# Keep only the strict lower triangle (row position > column position): each
# pair is then counted once, and the diagonal and upper triangle are zeroed.
lower_triangle = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)
X_data_corr = abs_corr.where(lower_triangle, 0.)
X_data_corr

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
f1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f2,0.986154,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f3,0.141059,0.137554,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f4,0.338368,0.352530,0.067998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
f5,0.675279,0.678184,0.110398,0.435819,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
f344,0.401183,0.392504,0.294139,0.048916,0.070123,0.172394,0.196303,0.687886,0.006476,0.361006,...,0.005967,0.017944,0.041522,0.080583,0.693674,0.000000,0.000000,0.000000,0.000000,0.0
f345,0.203257,0.198477,0.352966,0.029907,0.032824,0.302407,0.054753,0.263086,0.037035,0.213334,...,0.132826,0.020030,0.077922,0.010812,0.333309,0.364506,0.000000,0.000000,0.000000,0.0
f346,0.028590,0.024365,0.026298,0.019282,0.209997,0.014193,0.014462,0.214913,0.004694,0.174357,...,0.078074,0.143545,0.069250,0.014253,0.065581,0.139374,0.051037,0.000000,0.000000,0.0
f347,0.282348,0.297530,0.200277,0.197937,0.574398,0.089421,0.377089,0.454674,0.129858,0.467068,...,0.026722,0.078784,0.035134,0.036331,0.769421,0.261522,0.113366,0.031478,0.000000,0.0


In [10]:
# Build a reduced dataset: a feature is removed when any entry in its column of
# the correlation table is equal to or larger than the threshold (0.8)
drop_col = [col for col in X_data_corr.columns
            if (X_data_corr[col] >= 0.8).any()]
# Remove all flagged columns in one call; X_data itself is left untouched
X_data_drop = X_data.drop(columns=drop_col)
X_data_dict['X_data_drop'] = X_data_drop
print(drop_col)
X_data_drop

['f1', 'f14', 'f17', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f39', 'f41', 'f45', 'f47', 'f48', 'f49', 'f50', 'f51', 'f54', 'f55', 'f59', 'f65', 'f66', 'f77', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f316', 'f322', 'f323', 'f324', 'f325']


Unnamed: 0,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,...,f339,f340,f341,f342,f343,f344,f345,f346,f347,f348
0,0.284691,-0.011022,-1.028812,0.101653,0.498247,-0.314566,1.208697,-1.503008,-1.457764,0.330351,...,-0.196257,2.136721,-1.193955,0.040614,1.127366,0.741521,-0.707730,0.077748,0.832992,-1.291423
1,0.651829,-1.754836,-0.512545,-1.063596,1.434039,-1.404162,-0.745222,0.054440,0.153028,-0.404961,...,0.012412,-0.803444,-0.327357,1.022505,-1.083422,-0.714399,-1.407135,0.846917,-1.562645,0.365577
2,0.353967,0.749125,-0.123296,-0.809340,0.042847,-0.688971,-0.741502,0.059323,0.995969,0.382129,...,-1.324741,0.423848,-0.200293,-0.265410,-1.182842,-0.416810,0.234092,-1.061020,-1.595486,-0.867771
3,0.151251,1.017266,-0.809429,-0.558230,-0.649047,-0.790528,-0.798548,1.471307,0.045307,1.159451,...,0.214091,0.601954,-0.092321,-1.319499,-0.673082,-0.816716,-4.172430,1.093709,1.106629,0.476974
4,-0.761447,0.138536,2.406821,0.735203,-0.058044,0.358249,1.027844,-0.043828,-1.221248,0.532904,...,0.218412,-0.673082,-0.100535,-0.623208,1.571379,0.270780,-0.792427,-0.402796,1.349432,0.174976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,0.244564,2.173754,0.034836,0.180263,-2.834412,-0.392688,0.684052,-0.745517,0.918739,-1.070555,...,-0.203314,-0.796906,0.086825,-1.014114,0.221252,0.200125,-0.153418,1.596107,0.641362,1.504907
646,-0.940471,-0.717929,0.888751,0.787062,0.897648,1.208942,-0.863827,0.117906,0.355986,1.415710,...,-0.152532,-0.746864,-0.600670,-1.552048,-0.825841,-0.372074,0.319805,-0.981512,-0.956493,0.741365
647,-0.970338,-0.388963,-0.555268,0.342415,0.414480,0.010077,-0.959610,-0.218170,0.607368,0.265552,...,-0.126745,-0.768960,-1.382635,0.147514,-0.699404,-1.162207,1.279939,0.647133,0.191886,0.421133
648,-0.186426,-0.949760,-1.151156,-0.052513,0.516760,0.909057,-0.147741,-0.781873,-0.224130,-0.345255,...,-0.536003,0.103654,1.257128,0.111375,-1.456077,-1.084994,-0.250769,-1.140101,-0.592954,0.498912


## [Step 3] Model Training

* First, train a single decision tree classifier with default parameters to obtain its max depth and the length of its feature importances. Then, for each dataset, grid-search the decision tree parameters max_depth and max_features, using balanced accuracy as the scoring method and 5-fold stratified cross-validation
* Second, for the number of trees increasing from 15 to 500, calculate the out-of-bag (OOB) errors of the bagging, random forest, and extra random trees models. For the model with the lowest OOB error, calculate the mean balanced accuracy from 5-fold stratified cross-validation

In [11]:
# Function for error measures
def measure_error(y_true, y_pred, label='', average='micro'):
    """Collect common classification scores into one named Series.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    label : str, default ''
        Name given to the returned Series.
    average : str, default 'micro'
        Averaging mode forwarded to precision, recall and F1. The default
        keeps the previous behaviour. With micro averaging on a single-label
        multiclass problem, precision, recall and F1 all equal plain accuracy;
        pass 'macro' or 'weighted' to make them reflect per-class performance.

    Returns
    -------
    pandas.Series
        Entries 'accuracy', 'precision', 'recall', 'balanced accuracy'
        and 'f1', in that order.
    """
    return pd.Series({'accuracy': accuracy_score(y_true, y_pred),
                      'precision': precision_score(y_true, y_pred, average=average),
                      'recall': recall_score(y_true, y_pred, average=average),
                      'balanced accuracy': balanced_accuracy_score(y_true, y_pred),
                      'f1': f1_score(y_true, y_pred, average=average)},
                     name=label)

### 1. Decision Tree

In [12]:
# Fit one unconstrained tree; its depth and feature count give the gridsearch ranges
dt = DecisionTreeClassifier().fit(X_data, y_data)
dt.tree_.max_depth, len(dt.feature_importances_)

(16, 348)

In [13]:
# Gridsearch each dataset with stratified cross validation, check balanced accuracy.
# The loop variable is deliberately NOT called X_data, so the original full
# feature table bound to that name is not overwritten by the last dataset.
for data_name, X_candidate in X_data_dict.items():
    # Seeded like the ensemble models in this notebook, so the random feature
    # subsets drawn for max_features < n_features are reproducible
    dtc = DecisionTreeClassifier(random_state=20184757)
    GS_dtc = GridSearchCV(dtc,
                          param_grid={
                              'max_depth': range(1, dt.tree_.max_depth+1),
                              # Bounded by this dataset's own column count:
                              # max_features above n_features makes the fit fail
                              'max_features': range(1, X_candidate.shape[1]+1),
                          },
                          scoring='balanced_accuracy',
                          refit=True,
                          cv=StratifiedKFold(n_splits=5)
                         )
    GS_dtc.fit(X_candidate, y_data)
    print(data_name, GS_dtc.best_params_, GS_dtc.best_score_)

X_data {'max_depth': 6, 'max_features': 117} 0.784860431734883
X_data_drop {'max_depth': 10, 'max_features': 139} 0.7560432083281073


### 2. Bagging, Random Forest, Extra Random Trees

In [14]:
# Silence the warnings about too few trees emitted by the early, small models
import warnings
for noisy_category in (UserWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

In [15]:
# Bagging

In [16]:
# Check oob error with number of trees
tree_counts = [15, 20, 30, 40, 50, 100, 150, 200, 300, 400, 500]
bc_oob_columns = {}
# The loop variable is not named X_data, so the original feature table keeps that name
for data_name, X_candidate in X_data_dict.items():
    bc = BaggingClassifier(oob_score=True, warm_start=False, random_state=20184757)
    oob_errors = []

    for n_trees in tree_counts:
        # Refit from scratch with the new ensemble size and record 1 - OOB accuracy
        bc.set_params(n_estimators=n_trees)
        bc.fit(X_candidate, y_data)
        oob_errors.append(1 - bc.oob_score_)

    # One column per dataset; the tree counts stay integers in the index
    bc_oob_columns[data_name] = pd.Series(oob_errors,
                                          index=pd.Index(tree_counts, name='n_trees'))
bc_oob_df = pd.DataFrame(bc_oob_columns)
bc_oob_df

Unnamed: 0_level_0,X_data,X_data_drop
n_trees,Unnamed: 1_level_1,Unnamed: 2_level_1
15.0,0.135385,0.152308
20.0,0.130769,0.136923
30.0,0.124615,0.136923
40.0,0.115385,0.132308
50.0,0.115385,0.130769
100.0,0.107692,0.123077
150.0,0.106154,0.130769
200.0,0.106154,0.127692
300.0,0.104615,0.121538
400.0,0.103077,0.118462


In [32]:
# Cross validate (stratified) bagging best model
# Seeded with the same random_state as the OOB search so the score is reproducible
bc = BaggingClassifier(n_estimators=400, random_state=20184757)
np.mean(cross_val_score(bc, X_data_dict['X_data'], y_data, cv=StratifiedKFold(n_splits=5), scoring='balanced_accuracy'))

0.760109466672241

In [17]:
# Random Forest

In [26]:
# Check oob error with number of trees
tree_counts = [15, 20, 30, 40, 50, 100, 150, 200, 300, 400, 500]
rfc_oob_columns = {}
# The loop variable is not named X_data, so the original feature table keeps that name
for data_name, X_candidate in X_data_dict.items():
    rfc = RandomForestClassifier(oob_score=True, warm_start=False, bootstrap=True, random_state=20184757)
    oob_errors = []

    for n_trees in tree_counts:
        # Refit from scratch with the new forest size and record 1 - OOB accuracy
        rfc.set_params(n_estimators=n_trees)
        rfc.fit(X_candidate, y_data)
        oob_errors.append(1 - rfc.oob_score_)

    # One column per dataset; the tree counts stay integers in the index
    rfc_oob_columns[data_name] = pd.Series(oob_errors,
                                           index=pd.Index(tree_counts, name='n_trees'))
rfc_oob_df = pd.DataFrame(rfc_oob_columns)
rfc_oob_df

Unnamed: 0_level_0,X_data,X_data_drop
n_trees,Unnamed: 1_level_1,Unnamed: 2_level_1
15.0,0.14,0.166154
20.0,0.121538,0.156923
30.0,0.127692,0.141538
40.0,0.12,0.129231
50.0,0.113846,0.129231
100.0,0.104615,0.127692
150.0,0.107692,0.123077
200.0,0.106154,0.124615
300.0,0.103077,0.129231
400.0,0.107692,0.129231


In [33]:
# Cross validate (stratified) random forest best model
# Seeded with the same random_state as the OOB search so the score is reproducible
rfc = RandomForestClassifier(n_estimators=500, bootstrap=True, random_state=20184757)
np.mean(cross_val_score(rfc, X_data_dict['X_data'], y_data, cv=StratifiedKFold(n_splits=5), scoring='balanced_accuracy'))

0.744225302061123

In [19]:
# Extra Random Trees

In [20]:
# Check oob error with number of trees
tree_counts = [15, 20, 30, 40, 50, 100, 150, 200, 300, 400, 500]
etc_oob_columns = {}
# The loop variable is not named X_data, so the original feature table keeps that name
for data_name, X_candidate in X_data_dict.items():
    etc = ExtraTreesClassifier(oob_score=True, warm_start=False, bootstrap=True, random_state=20184757)
    oob_errors = []

    for n_trees in tree_counts:
        # Refit from scratch with the new ensemble size and record 1 - OOB accuracy
        etc.set_params(n_estimators=n_trees)
        etc.fit(X_candidate, y_data)
        oob_errors.append(1 - etc.oob_score_)

    # One column per dataset; the tree counts stay integers in the index
    etc_oob_columns[data_name] = pd.Series(oob_errors,
                                           index=pd.Index(tree_counts, name='n_trees'))
etc_oob_df = pd.DataFrame(etc_oob_columns)
etc_oob_df

Unnamed: 0_level_0,X_data,X_data_drop
n_trees,Unnamed: 1_level_1,Unnamed: 2_level_1
15.0,0.149231,0.206154
20.0,0.123077,0.186154
30.0,0.123077,0.164615
40.0,0.123077,0.164615
50.0,0.133846,0.155385
100.0,0.123077,0.146154
150.0,0.116923,0.143077
200.0,0.12,0.138462
300.0,0.12,0.136923
400.0,0.118462,0.14


In [34]:
# Cross validate (stratified) extra random trees best model
# Seeded with the same random_state as the OOB search so the score is reproducible
etc = ExtraTreesClassifier(n_estimators=400, bootstrap=True, random_state=20184757)
np.mean(cross_val_score(etc, X_data_dict['X_data'], y_data, cv=StratifiedKFold(n_splits=5), scoring='balanced_accuracy'))

0.6804979305154898

## Analysis 

* From the training code above, select the models with the highest balanced accuracy
* Print the confusion matrix and various error measures, including accuracy, precision, recall, balanced accuracy, and F1 score

In [None]:
# Evaluate the best models on the whole training data

In [43]:
# DT best model 1 (0.784860431734883)
# Seeded so the random feature subsets (max_features < number of columns) are reproducible.
# NOTE: predictions are made on the same rows the model was fitted on, so the
# figures below are training-set scores, not an estimate of test performance.
model = DecisionTreeClassifier(max_depth=6, max_features=117, random_state=20184757)
model.fit(X_data_dict['X_data'], y_data)
y_pred = model.predict(X_data_dict['X_data'])
confusion_matrix(y_true=y_data, y_pred=y_pred), measure_error(y_true=y_data, y_pred=y_pred)

(array([[329,   3,   4],
        [  4, 236,   0],
        [ 12,   1,  61]]),
 accuracy             0.963077
 precision            0.963077
 recall               0.963077
 balanced accuracy    0.928941
 f1                   0.963077
 Name: , dtype: float64)

In [36]:
# Bagging best model (0.760109466672241)
# Seeded so the bootstrap samples, and therefore the reported scores, are reproducible.
# NOTE: predictions are made on the same rows the model was fitted on, so the
# figures below are training-set scores, not an estimate of test performance.
model = BaggingClassifier(n_estimators=400, random_state=20184757)
model.fit(X_data_dict['X_data'], y_data)
y_pred = model.predict(X_data_dict['X_data'])
confusion_matrix(y_true=y_data, y_pred=y_pred), measure_error(y_true=y_data, y_pred=y_pred)

(array([[336,   0,   0],
        [  0, 240,   0],
        [  0,   0,  74]]),
 accuracy             1.0
 precision            1.0
 recall               1.0
 balanced accuracy    1.0
 f1                   1.0
 Name: , dtype: float64)