### Hierarchical classification

In [17]:
import pandas as pd
import numpy as np
import ast

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20. Prefer the current location and
# fall back to the legacy module only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

from sklearn.metrics import classification_report

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT

from util_koyak import col_selector_hiera, grid_estimator
from tqdm import tqdm_notebook

import warnings; warnings.simplefilter('ignore')

#### Importing features and selecting them.

In [3]:
# Load the feature table (the first CSV column becomes the index) and pass it
# through the project helper with the 'mfcc' and 'contrast' group names.
# NOTE(review): col_selector_hiera comes from util_koyak; presumably it keeps
# the columns of the named feature groups plus the `genre_selected` label that
# the next cell reads -- confirm against the helper.
df_genre = pd.read_csv(
    "features_genre_HIERA_min_100.csv",
    index_col=0,
)
df_mfcc_cont = col_selector_hiera(["mfcc", "contrast"], df_genre)

In [4]:
# Separate the label column from the feature matrix; `drop` returns a new
# frame, so df_mfcc_cont itself is left untouched.
y = df_mfcc_cont["genre_selected"]
X = df_mfcc_cont.drop(["genre_selected"], axis="columns")

In [5]:
# Hold out 10% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=4444,
)

In [6]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the statistics.
sca = StandardScaler().fit(X_train)
X_train_sca = sca.transform(X_train)
X_test_sca = sca.transform(X_test)

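#### Creating the class hierarchy.

Each row of `genres.csv` gives a `genre_id` and its `parent`; the cells below turn that table into the parent → children mapping expected by `HierarchicalClassifier`.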

In [7]:
# Genre table; the hierarchy-building cell below reads its "genre_id" and
# "parent" columns.
df_genres = pd.read_csv("genres.csv")

In [8]:
# Build the {parent_id: [child_id, ...]} mapping that is later passed to
# HierarchicalClassifier as `class_hierarchy`.
class_hierarchy = {}
for parent_id, genre_id in zip(df_genres["parent"].values, df_genres["genre_id"].values):
    # setdefault creates the child list on first sight of a parent. This
    # replaces a bare `except:` that was used for control flow and would also
    # have hidden unrelated errors raised inside the `try`.
    class_hierarchy.setdefault(parent_id, []).append(genre_id)

# Re-key the children stored under parent id 0 to the library's ROOT sentinel.
# Presumably 0 marks "no parent" (top-level genres) in genres.csv -- confirm.
# pop() raises KeyError if no row has parent 0, same as the previous lookup.
class_hierarchy[ROOT] = class_hierarchy.pop(0)

### Hierarchical classification with RandomForest

In [11]:
def Hierarchical_fit(estimator, class_hierarchy, X_train_sca, y_train, X_test_sca, y_test, stopping_criteria =0.5):
    """Fit a HierarchicalClassifier and return its train and test accuracy.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Wrapped in a single-step pipeline and handed to the hierarchical
        classifier as `base_estimator`.
    class_hierarchy : dict
        Parent -> list-of-children mapping passed through unchanged.
    X_train_sca, y_train : training features and labels used for fitting.
    X_test_sca, y_test : held-out features and labels used for scoring.
    stopping_criteria : float, default 0.5
        Forwarded to HierarchicalClassifier together with
        prediction_depth="nmlnp" (non-mandatory leaf-node prediction); see the
        library documentation for how the threshold stops the descent.

    Returns
    -------
    tuple
        (train_accuracy, test_accuracy) as computed by `accuracy_score`.
    """
    hierarchical_clf = HierarchicalClassifier(
        base_estimator=make_pipeline(estimator),
        class_hierarchy=class_hierarchy,
        prediction_depth="nmlnp",
        stopping_criteria=stopping_criteria,
    )
    hierarchical_clf.fit(X_train_sca, y_train)

    # Predict on both splits so that over-fitting shows up as a train/test gap.
    test_predictions = hierarchical_clf.predict(X_test_sca)
    train_predictions = hierarchical_clf.predict(X_train_sca)

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    return train_accuracy, test_accuracy

In [None]:
# Sweep the stopping threshold; each entry is
# [stopping_criteria, (train_accuracy, test_accuracy)].
# To scan a finer grid, iterate over np.arange(0.01, 1.05, 0.05) instead.
results = []
for stopping_criteria in [0.2, 0.5, 0.8]:
    # Fixed random_state (same seed as the train/test split) so re-running the
    # cell reproduces the same accuracies rather than a new random forest.
    forest = RandomForestClassifier(random_state=4444)
    scores = Hierarchical_fit(forest,
                              class_hierarchy, X_train_sca, y_train, X_test_sca,
                              y_test, stopping_criteria=stopping_criteria)
    results.append([stopping_criteria, scores])

In [13]:
# Each row is [stopping_criteria, (train_accuracy, test_accuracy)].
results

[[0.2, (0.5891290989330205, 0.43162393162393164)],
 [0.5, (0.5924529355901905, 0.43036701860231275)],
 [0.8, (0.5494106474498631, 0.3929110105580694)]]

In [12]:
# Single run with a much larger forest at the default 0.5 threshold; returns
# (train_accuracy, test_accuracy). random_state is fixed (same seed as the
# train/test split) so the reported scores can be reproduced.
Hierarchical_fit(RandomForestClassifier(max_depth=100, max_leaf_nodes=1000000, n_estimators=1000,
                                        random_state=4444),
                 class_hierarchy, X_train_sca, y_train, X_test_sca,
                 y_test, stopping_criteria=0.5)

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 46
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 79
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 86
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 92
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 102
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 130
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 3
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 4
_train_l

(0.604519300597732, 0.4318753142282554)

### Now with TruncatedSVD

In [20]:
def Hierarchical_fit_TruncatedSVD(estimator, class_hierarchy, X_train_sca, y_train, X_test_sca,
                     y_test, stopping_criteria =0.5, n_components=20, random_state=None):
    """Fit a HierarchicalClassifier on SVD-reduced features and return accuracies.

    The base estimator used for the local classifiers is the pipeline
    ``TruncatedSVD(n_components) -> estimator``.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Final step of the base pipeline.
    class_hierarchy : dict
        Parent -> list-of-children mapping passed through unchanged.
    X_train_sca, y_train : training features and labels used for fitting.
    X_test_sca, y_test : held-out features and labels used for scoring.
    stopping_criteria : float, default 0.5
        Forwarded to HierarchicalClassifier together with
        prediction_depth="nmlnp" (non-mandatory leaf-node prediction).
    n_components : int, default 20
        Number of components kept by TruncatedSVD.
    random_state : int, RandomState instance or None, default None
        Forwarded to TruncatedSVD. Its default solver is randomized, so pass
        an int to make the projection reproducible. None keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    tuple
        (train_accuracy, test_accuracy) as computed by `accuracy_score`.
    """
    svd_step = TruncatedSVD(n_components=n_components, random_state=random_state)
    clf = HierarchicalClassifier(
        base_estimator=make_pipeline(svd_step, estimator),
        class_hierarchy=class_hierarchy,
        prediction_depth="nmlnp",
        stopping_criteria=stopping_criteria,
    )
    clf.fit(X_train_sca, y_train)

    # Score both splits so the train/test gap is visible to the caller.
    y_pred = clf.predict(X_test_sca)
    y_pred_train = clf.predict(X_train_sca)
    return accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred)

In [None]:
# Same threshold sweep as above, but with the TruncatedSVD pipeline; each entry
# is [stopping_criteria, (train_accuracy, test_accuracy)].
# To scan a finer grid, iterate over np.arange(0.01, 1.05, 0.05) instead.
#
# This call passes no seed for the SVD step, so the SVD draws from NumPy's
# global random state. Seed that global state once here, and give the forest
# an explicit random_state, so re-running the cell reproduces the same scores.
np.random.seed(4444)
results = []
for stopping_criteria in [0.2, 0.5, 0.8]:
    forest = RandomForestClassifier(random_state=4444)
    scores = Hierarchical_fit_TruncatedSVD(forest,
                                           class_hierarchy, X_train_sca, y_train, X_test_sca,
                                           y_test, stopping_criteria=stopping_criteria)
    results.append([stopping_criteria, scores])

In [22]:
# Each row is [stopping_criteria, (train_accuracy, test_accuracy)] for the
# TruncatedSVD pipeline.
results

[[0.2, (0.5766996257192336, 0.42609351432880843)],
 [0.5, (0.5794648343667952, 0.42433383609854197)],
 [0.8, (0.5371487626389587, 0.3984414278531926)]]