### Flat classification for 50 genres

In [21]:
import pandas as pd
import numpy as np
import ast

# train_test_split is imported from sklearn.model_selection (the same module that
# already provides GridSearchCV below). The older sklearn.cross_validation module
# was deprecated and then removed from scikit-learn, so importing from it fails
# on current releases.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

from util_koyak import col_selector_hiera, grid_estimator

# Silence all warnings for the rest of the notebook.
import warnings; warnings.simplefilter('ignore')

In [9]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the "seaborn" matplotlib style sheet to every figure drawn after this cell.
# NOTE(review): newer matplotlib releases renamed the bundled seaborn style sheets —
# confirm that this style name still resolves in the target environment.
plt.style.use("seaborn")
%matplotlib inline

In [10]:
def conf_plot(y, y_pred):
    """Draw the confusion matrix of ``y`` against ``y_pred`` as an annotated heatmap.

    A new 180-dpi figure is opened and the integer counts are written into
    each cell on a blue colour scale.

    Parameters
    ----------
    y : array-like
        Labels plotted along the vertical axis (labelled 'actual').
    y_pred : array-like
        Labels plotted along the horizontal axis (labelled 'prediction').
    """
    # Open the figure first so the heatmap lands on a fresh high-resolution canvas.
    plt.figure(dpi=180)

    matrix = confusion_matrix(y, y_pred)
    heatmap_options = dict(
        cmap=plt.cm.Blues,
        annot=True,
        square=True,
        fmt='d',  # the cells hold integer counts
        annot_kws={"size": 15},
    )
    sns.heatmap(matrix, **heatmap_options)

    plt.xlabel('prediction')
    plt.ylabel('actual')

#### Importing features and selecting them.

In [3]:
# Read the feature table, using the first CSV column as the index.
df_genre = pd.read_csv(
    "features_genre_HIERA_min_100.csv",
    index_col=0,
)

# Narrow the table down via the project helper, passing the 'mfcc' and 'contrast' keys.
# NOTE(review): col_selector_hiera lives in util_koyak and is not visible here; the next
# cell relies on its result still containing a "genre_selected" column.
df_mfcc_cont = col_selector_hiera(["mfcc", "contrast"], df_genre)

In [4]:
# Target: the "genre_selected" column. Features: every remaining column.
y = df_mfcc_cont["genre_selected"]
X = df_mfcc_cont.drop(labels="genre_selected", axis=1)

In [5]:
# Hold out 10% of the rows as a test set, stratified on the target so both splits
# keep the same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=4444,
)

In [6]:
# Learn the scaling statistics from the training split only, then apply that same
# transformation to both splits so no test-set information feeds into the scaler.
sca = StandardScaler().fit(X_train)
X_train_sca = sca.transform(X_train)
X_test_sca = sca.transform(X_test)

In [7]:
# Baseline run: a default LogisticRegression handed to the project helper with an
# empty parameter grid, i.e. no candidate hyperparameter values are supplied.
# NOTE(review): grid_estimator is defined in util_koyak and not visible here; the
# recorded output suggests it reports train/test accuracy and the estimator used.
param_grid = {}
grid_estimator(
    LogisticRegression(),
    param_grid,
    X_train_sca,
    y_train,
    X_test_sca,
    y_test,
)

Accuracy training: 0.5001396570024021
Accuracy test: 0.49421820010055306
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [None]:
# Fit a logistic regression with default settings on the scaled training data
# (fit() returns the estimator itself), then predict labels for the scaled test data.
LR = LogisticRegression().fit(X_train_sca, y_train)
y_pred = LR.predict(X_test_sca)

In [12]:
# Accuracy of the current predictions on the held-out test labels.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.49421820010055306

In [13]:
# Score of the fitted model on the data it was trained on, for comparison with the test score.
LR.score(X=X_train_sca, y=y_train)

0.5226244343891403

In [17]:
# Coarse sweep over the regularisation parameter C: six values spaced
# logarithmically from 1e-2 to 1e1. Each row of `results` is
# [C, test accuracy, training accuracy].
# NOTE(review): the values of C are compared on test-set accuracy, so any C picked
# from this table has been tuned against the test split.
results = []
for C in np.logspace(-2, 1, 6):
    LR = LogisticRegression(C=C).fit(X_train_sca, y_train)
    y_pred = LR.predict(X_test_sca)
    results.append(
        [
            C,
            accuracy_score(y_test, y_pred),
            LR.score(X_train_sca, y_train),
        ]
    )

In [18]:
# Each row is [C, test accuracy, training accuracy].
results

[[0.01, 0.47863247863247865, 0.48838053740014525],
 [0.039810717055349734, 0.48893916540975363, 0.5029327970504441],
 [0.15848931924611134, 0.497737556561086, 0.514831573655103],
 [0.630957344480193, 0.4959778783308195, 0.5213116585665605],
 [2.5118864315095797, 0.4937154348919055, 0.5253896430367019],
 [10.0, 0.4917043740573152, 0.52726104686889]]

In [19]:
# Finer sweep over C: values starting at 0.05 and increasing in steps of 0.1 while
# below 0.6. `results` is rebuilt from scratch; each row is
# [C, test accuracy, training accuracy].
# NOTE(review): as in any sweep scored on X_test_sca / y_test, the value of C that
# looks best here has been selected using the test split.
results = []
for C in np.arange(0.05, 0.6, 0.1):
    LR = LogisticRegression(C=C).fit(X_train_sca, y_train)
    y_pred = LR.predict(X_test_sca)
    results.append(
        [
            C,
            accuracy_score(y_test, y_pred),
            LR.score(X_train_sca, y_train),
        ]
    )

In [20]:
# Each row is [C, test accuracy, training accuracy].
results

[[0.05, 0.4906988436400201, 0.5053907602927211],
 [0.15000000000000002, 0.4974861739567622, 0.5142729456454946],
 [0.25000000000000006, 0.497737556561086, 0.5171219484944976],
 [0.35000000000000003, 0.4964806435394671, 0.5191330093290878],
 [0.45000000000000007, 0.49673202614379086, 0.5203061281492654],
 [0.5500000000000002, 0.4962292609351433, 0.5207250991564717]]

### Logistic regression with SMOTE oversampling of the training set

In [9]:
# Oversample the scaled training split with SMOTE (fixed seed for repeatability).
# Only the training data is resampled; the test split is left as it is.
#
# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed the old
# name, so use the new method when it exists and fall back to the old one on
# older installations.
smote = SMOTE(random_state=42)
if hasattr(smote, "fit_resample"):
    X_smoted, y_smoted = smote.fit_resample(X_train_sca, y_train)
else:
    X_smoted, y_smoted = smote.fit_sample(X_train_sca, y_train)

In [10]:
# Refit the logistic regression (C=0.25) on the SMOTE-resampled training data and
# predict on the original, un-resampled test split.
LR = LogisticRegression(C=0.25, n_jobs=8).fit(X_smoted, y_smoted)
y_pred = LR.predict(X_test_sca)

# Displayed tuple: (accuracy on the test split, accuracy on the resampled training data).
(
    accuracy_score(y_test, y_pred),
    LR.score(X_smoted, y_smoted),
)

(0.16716943187531422, 0.6333733706408244)