In [1]:
# Import required libraries
import pandas
import sklearn
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset file
with open("musicclean.csv", 'r') as csvfile:
    df1 = pandas.read_csv(csvfile)

In [3]:
# Display The names of all the columns in the data.
print(df1.columns.values)

['artistHotttnesss' 'artistId' 'artistName' 'artistMbtags'
 'artistMbtagsCount' 'barsConfidence' 'barsStart' 'beatsConfidence'
 'beatsStart' 'duration' 'endOfFadeIn' 'familiarity' 'key' 'keyConfidence'
 'latitude' 'location' 'longitude' 'loudness' 'mode' 'modeConfidence'
 'releaseId' 'releaseName' 'similar' 'songHotttnesss' 'songId'
 'startOfFadeOut' 'tatumsConfidence' 'tatumsStart' 'tempo' 'terms'
 'termsFreq' 'timeSignature' 'timeSignatureConfidence' 'title' 'year']


In [4]:
# Display how many rows is in the dataset
len(df1)

9990

In [5]:
# Choose genres
value_list = ['country rock', 'latin jazz', 'post-grunge', 'dance pop', 'gangster rap']
#Grab DataFrame rows where column has values from "value_list" defined above
df1 = df1[df1.terms.isin(value_list)]

In [6]:
# Choose features (audio features) for training and testing (all 11 features)
x = df1[['artistHotttnesss', 'barsStart','beatsStart',
        'duration', 'endOfFadeIn', 'familiarity', 'key','loudness',
        'mode','startOfFadeOut','tatumsStart','tempo',
        'timeSignature']]
# Choose the most affecting features (Uncomment the line below)
#x = df1[['artistHotttnesss', 'familiarity','loudness','startOfFadeOut']]

# Choose class (genre of music) for training and testing
y = df1[['terms']]

In [7]:
# Check the number of examples after feature selection
len(df1)

727

In [8]:
# Use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [9]:
# Create a random forest classifier
# clf means classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10000, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [10]:
# Print the name and gini importance of each feature
for feature in zip(x, clf.feature_importances_):
    print(feature)

('artistHotttnesss', 0.29507378108942522)
('familiarity', 0.2801703571484776)
('loudness', 0.22556766031737607)
('startOfFadeOut', 0.19918820144472082)


In [11]:
# Check the number of testing examples
print(X_train.shape)

(545, 4)


In [12]:
# Classification (KNN)
# Create KNN Classifier
clf = KNeighborsClassifier(n_neighbors=23)
# Train the classifier
clf.fit(X_train, y_train.values.ravel()) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=23, p=2,
           weights='uniform')

In [13]:
# Test accuracy of KNN algorithm
a1 = accuracy_score(clf.predict(X_test), y_test)
print('Accuracy score for KNN algorithm =',a1*100,'%')

Accuracy score for KNN algorithm = 29.6703296703 %


In [14]:
# Decision Tree Classification
# Create Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=0)
# Evaluate a score by cross-validation
cross_val_score(clf, X_train, y_train.values.ravel(), cv=10)

array([ 0.58928571,  0.60714286,  0.5       ,  0.56363636,  0.52727273,
        0.61818182,  0.66037736,  0.52830189,  0.50943396,  0.67924528])

In [15]:
# Train the classifier
clf.fit(X_train, y_train.values.ravel())

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [16]:
# Test accuracy of Decision Trees algorithm
a1 = accuracy_score(clf.predict(X_test), y_test)
print('Accuracy score for Decision Trees algorithm =',a1*100,'%')

Accuracy score for Decision Trees algorithm = 58.2417582418 %


In [17]:
# Classification (Random Forest)
# Create Random Forest Classifier
clf = RandomForestClassifier(random_state=0)
# Train the classifier
clf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [18]:
# Test accuracy of Random Forest algorithm
a1 = accuracy_score(clf.predict(X_test), y_test)
print('Accuracy score for Random Forest algorithm =',a1*100,'%')

Accuracy score for Random Forest algorithm = 67.032967033 %
