### Hierarchical classification

In [1]:
import pandas as pd
import numpy as np
import ast

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT

from util_koyak import col_selector_hiera, grid_estimator
from tqdm import tqdm_notebook
from imblearn.over_sampling import SMOTE

from features_adapted import compute_features

import warnings; warnings.simplefilter('ignore')



#### Importing features and selecting them.

In [3]:
# Read the pre-computed feature table (first CSV column becomes the index),
# then keep only the 'mfcc' and 'contrast' feature groups.
# NOTE(review): col_selector_hiera comes from util_koyak; presumably it also
# keeps the "genre_selected" target column, since the next cell reads it - confirm.
feature_groups = ['mfcc','contrast']
df_genre = pd.read_csv("features_genre_HIERA_min_100.csv", index_col=0)
df_mfcc_cont = col_selector_hiera(feature_groups, df_genre)

In [4]:
# Split the selected table into the label vector and the feature matrix.
target_col = "genre_selected"
y = df_mfcc_cont[target_col]
X = df_mfcc_cont.drop(target_col, axis=1)

In [5]:
# Hold out 10% of the rows for testing. Stratifying on y keeps the genre
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.10,
    random_state=4444,
    stratify=y,
)

In [6]:
# Learn the scaling statistics on the training partition only, then apply the
# same fitted scaler to both partitions so no test information leaks in.
sca = StandardScaler().fit(X_train)
X_train_sca = sca.transform(X_train)
X_test_sca = sca.transform(X_test)

#### Creating the class hierarchy

In [7]:
# Genre metadata table; the cells below read its "genre_id" and "parent" columns.
df_genres = pd.read_csv(filepath_or_buffer="genres.csv")

In [8]:
# Build the parent -> [children] mapping that HierarchicalClassifier receives
# as `class_hierarchy`, from the flat (genre_id, parent) rows of df_genres.
class_hierarchy = {}
for parent_id, genre_id in zip(df_genres["parent"], df_genres["genre_id"]):
    # setdefault creates the child list the first time a parent is seen.
    # This replaces a bare `except:` that was used for the missing-key case
    # and would also have silently swallowed any unrelated error.
    class_hierarchy.setdefault(parent_id, []).append(genre_id)

# Rows whose parent is 0 are re-keyed under the library's ROOT sentinel.
# pop() raises KeyError if no such rows exist, exactly as the original lookup did.
class_hierarchy[ROOT] = class_hierarchy.pop(0)

In [9]:
# Inspect the parent -> children mapping built in the previous cell.
class_hierarchy

{-1: [2, 3, 4, 5, 8, 9, 10, 12, 13, 14, 15, 17, 20, 21, 38, 1235],
 2: [46, 77, 79, 86, 92, 102, 117, 118, 130, 171, 172, 176, 177, 232, 504],
 3: [567],
 4: [37, 74, 97, 178, 179, 906],
 5: [187, 322, 441, 442, 443, 444, 659],
 6: [16, 360],
 9: [63, 137, 169, 651],
 10: [76, 362],
 12: [25, 26, 27, 31, 36, 45, 58, 66, 70, 85, 88, 98, 314, 359, 440],
 13: [170, 311, 810],
 14: [11, 19],
 15: [42, 181, 182, 183, 184, 185, 236, 286, 296, 297, 337, 468, 495, 695],
 16: [763],
 17: [33, 49, 94, 103, 180],
 18: [538],
 19: [377],
 20: [7, 65, 138, 188, 374, 378, 428, 465],
 21: [83, 100, 539, 542, 580, 693, 811],
 25: [64, 71, 89, 109, 111],
 26: [113],
 31: [101, 167, 439],
 38: [1, 6, 22, 30, 32, 41, 47, 125, 186, 224, 247, 250, 456, 514],
 45: [53],
 46: [502, 808, 1060],
 53: [90],
 65: [43, 166, 189],
 79: [602],
 85: [404],
 86: [173, 174, 175],
 92: [81, 214],
 102: [1032],
 109: [361],
 130: [524, 619, 741, 1156],
 181: [401],
 182: [400],
 297: [240],
 468: [491],
 651: [493],
 76

### Check best classifier 

In [None]:
# Baseline: hierarchical classifier trained on the scaled (imbalanced) training set.
# A logistic regression wrapped in a pipeline is passed as the base estimator.
base_estimator = make_pipeline(LogisticRegression(C=0.631))

# NOTE(review): per the sklearn-hierarchical-classification docs, "nmlnp" means
# non-mandatory leaf-node prediction and stopping_criteria=0.5 is the threshold
# used to decide whether to descend further - confirm against the installed version.
hierarchical_params = dict(
    base_estimator=base_estimator,
    class_hierarchy=class_hierarchy,
    prediction_depth="nmlnp",
    stopping_criteria=0.5,
)
clf = HierarchicalClassifier(**hierarchical_params)
clf.fit(X_train_sca, y_train)

# Predict on both partitions so train and test accuracy can be compared below.
y_pred_train = clf.predict(X_train_sca)
y_pred = clf.predict(X_test_sca)

In [26]:
# (train accuracy, test accuracy) for the baseline model.
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred)
(train_acc, test_acc)

(0.43316015865035473, 0.4311211664152841)

In [37]:
# Wrap the training-set predictions in a DataFrame (single column labelled 0)
# so the distribution of predicted classes can be inspected.
df_pred = pd.DataFrame(data=y_pred_train)

In [41]:
# Four most frequently predicted classes on the training set.
predicted_class_counts = df_pred[0].value_counts()
predicted_class_counts.head(4)

1     33767
8       479
66      421
12      205
Name: 0, dtype: int64

### Now with SMOTE

In [9]:
# Oversample the scaled training partition only; the test set is left untouched.
# NOTE(review): newer imbalanced-learn releases rename fit_sample to
# fit_resample - confirm against the installed version before upgrading.
smote = SMOTE(random_state=42)
X_smoted, y_smoted = smote.fit_sample(X_train_sca, y_train)

In [10]:
# Same model configuration as the baseline, this time fitted on the
# SMOTE-resampled training data.
base_estimator = make_pipeline(LogisticRegression(C=0.631))

# NOTE(review): per the sklearn-hierarchical-classification docs, "nmlnp" means
# non-mandatory leaf-node prediction and stopping_criteria=0.5 is the threshold
# used to decide whether to descend further - confirm against the installed version.
smote_clf_params = dict(
    base_estimator=base_estimator,
    class_hierarchy=class_hierarchy,
    prediction_depth="nmlnp",
    stopping_criteria=0.5,
)
clf = HierarchicalClassifier(**smote_clf_params)
clf.fit(X_smoted, y_smoted)

# Training-side predictions are made on the resampled data the model was fitted
# on; test-side predictions use the original held-out partition.
y_pred_train = clf.predict(X_smoted)
y_pred = clf.predict(X_test_sca)

_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 46
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 79
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 86
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 92
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 102
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 130
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 3
_train_local_classifier() - not enough training data available to train, classification in branch will terminate at node 4
_train_l

In [12]:
# (accuracy on the resampled training data, accuracy on the held-out test set).
smote_train_acc = accuracy_score(y_smoted, y_pred_train)
smote_test_acc = accuracy_score(y_test, y_pred)
(smote_train_acc, smote_test_acc)

(0.3572278714256466, 0.09074912016088486)