In [2]:
import numpy as np
from sklearn import model_selection, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


random_state = 42

In [3]:
# Use the same method of loading the data as in previous sub problems

input_file = "Data/features_3_sec.csv"

with open(input_file) as f:
    f.readline()
    data = np.loadtxt(f, delimiter=',', dtype=np.object_)

labels = [
    'blues', 'classical', 'country', 'disco', 'hiphop', 
    'jazz', 'metal', 'pop', 'reggae', 'rock'
]

X = data[:, 1:-1].astype(np.float32)

min_max_scaler = preprocessing.MinMaxScaler((-1, 1))
X = min_max_scaler.fit_transform(X)

y = [labels.index(l) for l in data[:, -1]]


train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y, test_size=0.2, random_state=random_state)

In [12]:
# Grid Search with Random Forest

parameter_grid = {
    'n_estimators': [256],
    'max_depth': [24],
    'min_samples_split': [2],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1],
}

grid_search = model_selection.GridSearchCV(
    RandomForestClassifier(random_state=random_state), 
    parameter_grid, 
    cv=5, 
    verbose=5
)

grid_search.fit(train_X, train_y)

print(f"Best score: {grid_search.best_score_:.4f}")
print(f"Best parameters: {grid_search.best_params_}")

test_y_pred = grid_search.predict(test_X)
print(f"Accuracy: {accuracy_score(test_y, test_y_pred):.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END max_depth=24, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=256;, score=0.862 total time=   7.1s
[CV 2/5] END max_depth=24, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=256;, score=0.857 total time=   7.0s
[CV 3/5] END max_depth=24, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=256;, score=0.875 total time=   7.1s
[CV 4/5] END max_depth=24, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=256;, score=0.860 total time=   7.2s
[CV 5/5] END max_depth=24, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=256;, score=0.857 total time=   7.0s
Best score: 0.8624
Best parameters: {'max_depth': 24, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 256}
Accuracy: 0.8909


### Random Forest

Throwing together an untuned forest classifier against the 3 second features performed very well even to begin with, resulting in an 88% accuracy. Performing a grid search on that essentially told us that none of the parameters resulted in significant improvement - we weren't able to improve the performance beyond 89.09%.

In [17]:
# Grid search with SVM

parameter_grid_svm = {
    'C': [2.6],
    'gamma': [1.0],
    'kernel': ['rbf'],
}

grid_search_svm = model_selection.GridSearchCV(
    SVC(random_state=random_state),
    parameter_grid_svm,
    cv=5,
    verbose=5
)

grid_search_svm.fit(train_X, train_y)

print(f"Best score: {grid_search_svm.best_score_:.4f}")
print(f"Best parameters: {grid_search_svm.best_params_}")

test_y_pred = grid_search_svm.predict(test_X)
print(f"Accuracy: {accuracy_score(test_y, test_y_pred):.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ......C=2.6, gamma=1.0, kernel=rbf;, score=0.922 total time=   1.9s
[CV 2/5] END ......C=2.6, gamma=1.0, kernel=rbf;, score=0.936 total time=   1.9s
[CV 3/5] END ......C=2.6, gamma=1.0, kernel=rbf;, score=0.917 total time=   1.9s
[CV 4/5] END ......C=2.6, gamma=1.0, kernel=rbf;, score=0.919 total time=   1.9s
[CV 5/5] END ......C=2.6, gamma=1.0, kernel=rbf;, score=0.932 total time=   1.9s
Best score: 0.9250
Best parameters: {'C': 2.6, 'gamma': 1.0, 'kernel': 'rbf'}
Accuracy: 0.9414


### SVM

While the first attempt at SVM didn't go quite as well as the first attempt as Random Forest, it still gave us a very respectable 73% accuracy. A grid search for the C and gamma hyperparameters and swapping the kernel to rbf got that up to 94.14%.