### Hierarchical classification

In [10]:
import pandas as pd
import numpy as np
import ast

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT

from util_koyak import col_selector_hiera, grid_estimator
from tqdm import tqdm_notebook

import warnings; warnings.simplefilter('ignore')

In [20]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the "seaborn" matplotlib style sheet to the figures drawn below.
# NOTE(review): recent matplotlib releases renamed this style to
# "seaborn-v0_8" — confirm against the installed matplotlib version.
plt.style.use("seaborn")
# Render figures inline in the notebook output.
%matplotlib inline

In [21]:
def conf_plot(y, y_pred):
    """Draw the confusion matrix of ``y`` versus ``y_pred`` as an annotated heatmap.

    Parameters
    ----------
    y : array-like
        Actual labels (plotted along the vertical axis).
    y_pred : array-like
        Predicted labels (plotted along the horizontal axis).

    A new 180-dpi figure is opened for the plot; nothing is returned.
    """
    plt.figure(dpi=180)
    matrix = confusion_matrix(y, y_pred)
    # sns.heatmap draws on the current axes and hands them back, so the
    # labels can be set on that same axes object.
    ax = sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        square=True,
        cmap=plt.cm.Blues,
        annot_kws={"size": 15},
    )
    ax.set_xlabel('prediction')
    ax.set_ylabel('actual')

#### Importing features and selecting them.

In [11]:
# Load the precomputed feature table; the first CSV column becomes the index.
df_genre = pd.read_csv("features_genre_HIERA_min_100.csv", index_col=0)
# Presumably keeps only the 'mfcc' and 'contrast' feature groups (plus the
# "genre_selected" label used below) — confirm against util_koyak.col_selector_hiera.
df_mfcc_cont = col_selector_hiera(['mfcc','contrast'], df_genre)

In [12]:
# Feature matrix: every column except the "genre_selected" label.
X = df_mfcc_cont.drop("genre_selected",axis=1)
# Target vector: the "genre_selected" label column.
y = df_mfcc_cont["genre_selected"]

In [13]:
# Hold out 10% of the rows as a test set, stratified on y so both splits keep
# the label proportions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.10, random_state=4444)

In [14]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set information leaks in.
sca = StandardScaler().fit(X_train)
X_train_sca = sca.transform(X_train)
X_test_sca = sca.transform(X_test)

#### Creating the class hierarchy

In [15]:
# Genre table; its "genre_id" and "parent" columns are used below to build
# the class hierarchy.
df_genres = pd.read_csv("genres.csv")

In [16]:
# Build the parent -> [children] mapping expected by HierarchicalClassifier:
# each key is a parent genre id and its value lists the genre ids whose
# "parent" column points at it, in the row order of df_genres.
class_hierarchy = {}
for parent_id, genre_id in zip(df_genres["parent"].values, df_genres["genre_id"].values):
    # setdefault creates the child list the first time a parent is seen;
    # unlike a bare `except:`, it cannot hide unrelated errors.
    class_hierarchy.setdefault(parent_id, []).append(genre_id)

# Genres recorded under parent id 0 are re-keyed under the library's ROOT
# sentinel (presumably 0 marks top-level genres in genres.csv — confirm).
# A KeyError here means no row has parent == 0.
class_hierarchy[ROOT] = class_hierarchy.pop(0)

### Hierarchical classification with LogisticRegression

In [17]:
def Hierarchical_fit(estimator, class_hierarchy, X_train_sca, y_train, X_test_sca, y_test, stopping_criteria=0.5):
    """Fit a HierarchicalClassifier built around ``estimator`` and score it.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Local classifier; wrapped in a single-step pipeline and used as the
        ``base_estimator`` of the hierarchical classifier.
    class_hierarchy : dict
        Mapping passed through as ``class_hierarchy``.
    X_train_sca, y_train
        Training features and labels used for fitting.
    X_test_sca, y_test
        Held-out features and labels used for evaluation.
    stopping_criteria : float, default 0.5
        Passed through as ``stopping_criteria``; used together with
        ``prediction_depth="nmlnp"``.

    Returns
    -------
    tuple of float
        ``(train_accuracy, test_accuracy)`` computed with ``accuracy_score``.
    """
    hierarchical_clf = HierarchicalClassifier(
        base_estimator=make_pipeline(estimator),
        class_hierarchy=class_hierarchy,
        prediction_depth="nmlnp",
        stopping_criteria=stopping_criteria,
    )
    hierarchical_clf.fit(X_train_sca, y_train)

    test_predictions = hierarchical_clf.predict(X_test_sca)
    train_predictions = hierarchical_clf.predict(X_train_sca)

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    return train_accuracy, test_accuracy

In [18]:
# Sweep the stopping threshold with a default LogisticRegression.
# Each row of `results` is [stopping_criteria, (train_accuracy, test_accuracy)].
results = []
for stopping_criteria in [0.2, 0.5, 0.8]:
    accuracies = Hierarchical_fit(
        LogisticRegression(),
        class_hierarchy,
        X_train_sca, y_train,
        X_test_sca, y_test,
        stopping_criteria=stopping_criteria,
    )
    results.append([stopping_criteria, accuracies])

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 46
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 79
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 86
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 92
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 102
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 130
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 3
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 4
_train_l

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 14
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 19
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 181
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 182
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 468
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 65
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 6
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 16
_train

In [19]:
# Each row is [stopping_criteria, (train_accuracy, test_accuracy)].
results

[[0.2, (0.4328529132450701, 0.42961287078934135)],
 [0.5, (0.4334953354561198, 0.4306184012066365)],
 [0.8, (0.41606614155633764, 0.4034690799396682)]]

In [22]:
# Grid over the LogisticRegression regularisation strength C (six values,
# log-spaced from 1e-2 to 1e1) and the stopping threshold.
# Each row of `results` is [C, stopping_criteria, (train_accuracy, test_accuracy)].
results = []
for C in np.logspace(-2, 1, 6):
    for stopping_criteria in [0.2, 0.5, 0.8]:
        accuracies = Hierarchical_fit(
            LogisticRegression(C=C),
            class_hierarchy,
            X_train_sca, y_train,
            X_test_sca, y_test,
            stopping_criteria=stopping_criteria,
        )
        results.append([C, stopping_criteria, accuracies])

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 46
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 79
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 86
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 92
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 102
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 130
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 3
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 4
_train_l

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 14
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 19
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 181
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 182
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 468
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 65
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 6
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 16
_train

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 4
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 9
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 651
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 109
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 26
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 45
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 53
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 85
_train_l

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 16
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 763
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 46
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 79
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 86
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 92
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 102
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 130
_trai

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 85
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 13
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 14
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 19
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 181
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 182
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 468
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 65
_trai

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 130
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 3
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 4
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 9
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 651
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 109
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 26
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 45
_train_l

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 65
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 6
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 16
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 763
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 46
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 79
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 86
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 92
_train_l

In [23]:
# Each row is [C, stopping_criteria, (train_accuracy, test_accuracy)].
results

[[0.01, 0.2, (0.4243617674990224, 0.42483660130718953)],
 [0.01, 0.5, (0.4248924641081504, 0.4240824534942182)],
 [0.01, 0.8, (0.3443103737221384, 0.3378582202111614)],
 [0.039810717055349734, 0.2, (0.4274900843528294, 0.42735042735042733)],
 [0.039810717055349734, 0.5, (0.4278531925590749, 0.4276018099547511)],
 [0.039810717055349734, 0.8, (0.40718395620356407, 0.4004524886877828)],
 [0.15848931924611134, 0.2, (0.43036701860231275, 0.42860734037204623)],
 [0.15848931924611134, 0.5, (0.4310653036143232, 0.4291101055806938)],
 [0.15848931924611134, 0.8, (0.4140830121222278, 0.4032176973353444)],
 [0.630957344480193, 0.2, (0.4322942852354617, 0.43011563599798897)],
 [0.630957344480193, 0.5, (0.4331322272498743, 0.4311211664152841)],
 [0.630957344480193, 0.8, (0.41598234735489636, 0.4042232277526395)],
 [2.5118864315095797, 0.2, (0.43321602145131555, 0.42860734037204623)],
 [2.5118864315095797, 0.5, (0.4334953354561198, 0.42936148818501757)],
 [2.5118864315095797, 0.8, (0.4158706217529747

In [25]:
def Hierarchical_fit_TruncatedSVD(estimator, class_hierarchy, X_train_sca, y_train, X_test_sca,
                     y_test, stopping_criteria=0.5, n_components=20, random_state=None):
    """Fit and score a hierarchical classifier whose local models see SVD-reduced features.

    Each local classifier is a pipeline of ``TruncatedSVD(n_components)``
    followed by ``estimator``. Fitting and scoring are delegated to
    ``Hierarchical_fit`` so both entry points share one evaluation path.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Classifier applied after the dimensionality reduction.
    class_hierarchy : dict
        Mapping passed through to ``Hierarchical_fit``.
    X_train_sca, y_train
        Training features and labels.
    X_test_sca, y_test
        Held-out features and labels.
    stopping_criteria : float, default 0.5
        Passed through to ``Hierarchical_fit``.
    n_components : int, default 20
        Number of SVD components kept by each local pipeline.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``TruncatedSVD``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for repeatable scores.

    Returns
    -------
    tuple of float
        ``(train_accuracy, test_accuracy)`` as returned by ``Hierarchical_fit``.
    """
    reduced_estimator = make_pipeline(
        TruncatedSVD(n_components=n_components, random_state=random_state),
        estimator,
    )
    # Hierarchical_fit wraps its estimator in a one-step pipeline; nesting the
    # SVD pipeline inside it applies the same two transformations in order.
    return Hierarchical_fit(
        reduced_estimator,
        class_hierarchy,
        X_train_sca, y_train,
        X_test_sca, y_test,
        stopping_criteria=stopping_criteria,
    )

In [None]:
# Sweep the number of SVD components (10, 30, ..., 150) at fixed C and
# stopping threshold.
# Each row of `results` is [n_components, (train_accuracy, test_accuracy)].
results = []
for n_components in np.arange(10, 151, 20):
    accuracies = Hierarchical_fit_TruncatedSVD(
        LogisticRegression(C=0.631),
        class_hierarchy,
        X_train_sca, y_train,
        X_test_sca, y_test,
        stopping_criteria=0.5,
        n_components=n_components,
    )
    results.append([n_components, accuracies])

In [None]:
# Each row is [n_components, (train_accuracy, test_accuracy)].
results