### Flat classification for 50 genres

In [2]:
import pandas as pd
import numpy as np
import ast

# train_test_split is provided by sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

from util_koyak import col_selector_hiera, grid_estimator

# Silences every warning category for the rest of the session, which also
# hides deprecation notices emitted by the libraries imported above.
import warnings; warnings.simplefilter('ignore')



#### Importing features and selecting them.

In [3]:
# Load the pre-computed feature table; the first CSV column is used as the index.
df_genre = pd.read_csv(
    "features_genre_HIERA_min_100.csv",
    index_col=0,
)

# col_selector_hiera is a project helper from util_koyak. Presumably it keeps
# the columns of the named feature families together with the label column
# (the next cell reads 'genre_selected' from its result) -- confirm in util_koyak.
df_mfcc_cont = col_selector_hiera(
    ['mfcc', 'contrast'],
    df_genre,
)

In [4]:
# Target vector: the genre label column.
y = df_mfcc_cont["genre_selected"]
# Feature matrix: every remaining column of the selected table.
X = df_mfcc_cont.drop(["genre_selected"], axis="columns")

In [5]:
# Hold out 10% of the rows for testing. The split is stratified on the genre
# label and uses a fixed seed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=4444,
)

In [6]:
# Fit the scaler on the training rows only, then apply the same transformation
# to both splits so no statistics from the test rows are used.
sca = StandardScaler().fit(X_train)
X_train_sca = sca.transform(X_train)
X_test_sca = sca.transform(X_test)

In [6]:
# Baseline run: the grid is empty, so presumably grid_estimator (util_koyak)
# evaluates the forest with its default hyperparameters only.
# The forest is created without a random_state, so scores vary between runs.
param_grid = dict()
grid_estimator(
    RandomForestClassifier(),
    param_grid,
    X_train_sca, y_train,
    X_test_sca, y_test,
)

Accuracy training: 0.48100664767331436
Accuracy test: 0.4906988436400201
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [7]:
# Hyperparameter grid: 3 x 4 x 3 = 36 combinations of forest size, depth limit
# and leaf-count limit. 'n_jobs' has a single value, so it is a fixed setting
# rather than a searched one.
param_grid = {
    'n_jobs': [4],
    'n_estimators': [10, 100, 1000],
    'max_depth': [50, 100, 200, 400],
    'max_leaf_nodes': [10000, 100000, 1000000],
}
grid_estimator(
    RandomForestClassifier(),
    param_grid,
    X_train_sca, y_train,
    X_test_sca, y_test,
)

Accuracy training: 0.5149991620579856
Accuracy test: 0.5168426344896934
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=100000,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


### With oversampling (SMOTE applied to the training split)

In [7]:
# Oversample with SMOTE using the scaled training split only; the held-out
# test split is left untouched.
# imbalanced-learn renamed fit_sample to fit_resample in 0.4 and removed the
# old name in 0.8, so call whichever method this installation provides.
smote = SMOTE(random_state=42)
_resample = getattr(smote, "fit_resample", None) or smote.fit_sample
X_smoted, y_smoted = _resample(X_train_sca, y_train)

In [8]:
# Same default forest as the baseline, trained on the SMOTE-resampled rows and
# scored against the original (non-resampled) test split.
# NOTE(review): if grid_estimator cross-validates on the resampled rows,
# synthetic samples and the real samples they were built from can land in
# different folds, which would inflate the reported training score -- confirm.
param_grid = dict()
grid_estimator(
    RandomForestClassifier(),
    param_grid,
    X_smoted, y_smoted,
    X_test_sca, y_test,
)

Accuracy training: 0.9745540162424077
Accuracy test: 0.3743086978381096
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
