In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv('audio_features.csv', delimiter='\t')

In [16]:
df.head(10)

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre
0,7LVrrX8pkzI9fMF88nGabt,0.742,0.662,5.0,-4.651,1.0,0.0647,0.578,0.0,0.126,0.486,84.006,200000.0,4.0,indie viet
1,3wn8HJNjkY4wzTBy35ZvQ6,0.429,0.364,7.0,-10.006,1.0,0.0547,0.455,0.0,0.0907,0.0857,173.63,300000.0,3.0,viet lo-fi
2,32K6zBYFUUxKrdsoZd7k3v,0.838,0.494,1.0,-7.51,0.0,0.142,0.78,2.9e-05,0.123,0.636,127.945,138973.0,4.0,vietnamese hip hop
3,3nw5vYaH4qVx8joq3qhtNx,0.891,0.28,1.0,-19.182,1.0,0.181,0.0197,0.0,0.0966,0.336,92.003,139276.0,4.0,vietnamese hip hop
4,6m1oexLqQpf9nbrkrHXOWc,0.698,0.587,6.0,-4.318,0.0,0.0569,0.717,2e-06,0.192,0.569,88.006,246818.0,4.0,v-pop
5,4Ba99csVDlRFQILO5cSi7I,0.34,0.255,11.0,-13.713,1.0,0.0611,0.882,6.4e-05,0.161,0.46,67.905,202625.0,4.0,v-pop
6,1CyYcBdukmFjFibDZaDlHc,0.648,0.543,0.0,-6.781,1.0,0.0399,0.143,0.0,0.187,0.382,119.926,201507.0,4.0,v-pop
7,7AS3f6iaLQ4NZCMZKuhWmy,0.729,0.618,2.0,-7.546,1.0,0.316,0.432,2.7e-05,0.268,0.345,89.234,200000.0,4.0,v-pop
8,696w9BHmEY62myVCSDyrYW,0.656,0.331,2.0,-9.88,1.0,0.0302,0.363,0.0,0.201,0.5,105.015,247429.0,3.0,indie viet
9,4HBZA5flZLE435QTztThqH,0.597,0.45,8.0,-6.658,1.0,0.0418,0.223,0.0,0.382,0.537,178.765,228482.0,3.0,dance pop


In [80]:
data = df[df['id'] != 'genre'].drop(['id'], axis=1)

In [81]:
data = data.loc[(data['genre'].isin(list(data['genre'].value_counts().nlargest(10).index)))]

In [77]:
label = data.iloc[:, -1]
label = label.sort_values()

In [78]:
encoder = LabelEncoder()
y = encoder.fit_transform(label)
len(y)

1858

In [79]:
scaler = StandardScaler()
x = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))
x.shape

(1858, 13)

In [65]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=20)
x_train.shape

(1300, 13)

-------

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import plotly.express as px

In [67]:
knn_model = KNeighborsClassifier()
dec_model = DecisionTreeClassifier()
mlp_model = MLPClassifier()
svm_model = SVC()
bagging_model = BaggingClassifier()
lgs_model = LogisticRegression()

In [68]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(knn_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
n_scores.mean()

0.31794871794871804

In [69]:
lgs_model.fit(x_train, y_train)
knn_model.fit(x_train, y_train)
dec_model.fit(x_train, y_train)
mlp_model.fit(x_train, y_train)
svm_model.fit(x_train, y_train)
bagging_model.fit(x_train, y_train)


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



In [70]:
lgs_acc = lgs_model.score(x_test, y_test)
knn_acc = knn_model.score(x_test, y_test)
dec_acc = dec_model.score(x_test, y_test)
mlp_acc = mlp_model.score(x_test, y_test)
svm_acc = svm_model.score(x_test, y_test)
bagging_acc = bagging_model.score(x_test, y_test)

print("Logistic Regression Accuracy", lgs_acc)
print("K-Nearest-Neighbors Accuracy:", knn_acc)
print("Decision Tree Accuracy:", dec_acc)
print("Neural Network Accuracy:", mlp_acc)
print("Bagging Algorithms Accuracy:", bagging_acc)
print("Support Vector Machine Accuracy:", svm_acc)

Logistic Regression Accuracy 0.2974910394265233
K-Nearest-Neighbors Accuracy: 0.34946236559139787
Decision Tree Accuracy: 0.35304659498207885
Neural Network Accuracy: 0.35842293906810035
Bagging Algorithms Accuracy: 0.3673835125448029
Support Vector Machine Accuracy: 0.3387096774193548


In [71]:
accuracy_score(y_test, knn_model.predict(x_test))

0.34946236559139787

In [72]:
f1_score(y_test, knn_model.predict(x_test), average='weighted')

0.32327206135919645

In [73]:
fig = px.bar(
    x=["Logistic Regression", "K-Nearest-Neighbors", "Decision Tree", "Neural Network", "Bagging Algorithms", "Support Vector Machine"],
    y=[lgs_acc, knn_acc, dec_acc, mlp_acc, bagging_acc, svm_acc],
    color=["Logistic Regression", "K-Nearest-Neighbors", "Decision Tree", "Neural Network", "Bagging Algorithms", "Support Vector Machine"],
    labels={'x': "Model", 'y': "Accuracy"},
    title="Model Accuracy Comparison"
)

fig.show()