In [1]:
%matplotlib inline

import os

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import librosa
import librosa.display

import utils

# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': (17, 5)})

In [2]:
# Location of the mp3 files, read from the environment (None when the variable is unset).
# NOTE(review): 'fma_small' looks like a directory name rather than an
# environment-variable name -- confirm which variable is actually exported.
AUDIO_DIR = os.environ.get('fma_small', None)

# Parse the four metadata tables, in the same order as before.
tracks, genres, features, echonest = (
    utils.load(csv_path)
    for csv_path in (
        'fma_metadata/tracks.csv',
        'fma_metadata/genres.csv',
        'fma_metadata/features.csv',
        'fma_metadata/echonest.csv',
    )
)

# Sanity checks: features and tracks must share the exact same index, and every
# echonest row must refer to a known track.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

# Show the (rows, columns) shape of each table as the cell output.
tuple(table.shape for table in (tracks, genres, features, echonest))

((106574, 52), (163, 4), (106574, 518), (13129, 249))

In [3]:
# Boolean row masks over `tracks`: membership in the 'small' subset and the
# predefined training / validation / test split.
small = tracks[('set', 'subset')] <= 'small'
train, val, test = (
    tracks[('set', 'split')] == split_name
    for split_name in ('training', 'validation', 'test')
)

# Targets are the top-level genre; inputs are the 'mfcc' feature columns.
y_train, y_test = (
    tracks.loc[small & mask, ('track', 'genre_top')] for mask in (train, test)
)
X_train, X_test = (features.loc[small & mask, 'mfcc'] for mask in (train, test))

# Report the size of the resulting problem.
n_features = len(X_train.columns)
n_classes = np.unique(y_train).size
print('{} training examples, {} testing examples'.format(y_train.size, y_test.size))
print('{} features, {} classes'.format(n_features, n_classes))

6400 training examples, 800 testing examples
140 features, 8 classes


In [None]:
# Be sure training samples are shuffled (fixed seed keeps the run reproducible).
X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
# The scaler is fitted on the training set only and then applied to both sets.
# The transformed values are assigned explicitly: StandardScaler(copy=False)
# does not guarantee in-place scaling for non-ndarray input such as a
# DataFrame, so discarding the return value could silently leave the data
# unscaled. The results are wrapped back into DataFrames so that X_train and
# X_test keep their index and column labels.
scaler = skl.preprocessing.StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test = pd.DataFrame(
    scaler.transform(X_test), index=X_test.index, columns=X_test.columns
)

# k-nearest-neighbours classification (k = 3), scored by accuracy on the test split.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('Accuracy: {:.2%}'.format(score))

Fundamentals of Data Science - Final Project
The code in this file is taken from the class PowerPoint "9A - Decision Trees in Python" on iCollege.

In [None]:
# All the code below is taken from and referenced from the class
# PowerPoint "9A - Decision Trees in Python" on iCollege, adapted to the
# metadata tables used in this notebook.
#
# This cell relies on `tracks`, `genres` and `features` from the
# metadata-loading cell and on the `small` / `train` masks from the
# subset-selection cell, so those cells must be run first.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  # was `matplotlib.plot`, which does not exist

# Reuse the tables already parsed by utils.load() instead of re-reading the
# CSVs with a bare pd.read_csv(): the bare file names ("tracks.csv", ...) do
# not match the 'fma_metadata/...' paths used when loading, and the loaded
# `features` frame has hierarchical columns (selecting 'mfcc' yields 140
# columns) that a default pd.read_csv() call would not reconstruct.
# The original variable names are kept as aliases.
readTracks = tracks
readGenre = genres
readFeatures = features

# quick look at each table: first rows, then (rows, columns)
for table in (readTracks, readGenre, readFeatures):
    print(table.head())
    print(table.shape)

# top-level feature families explored below
cols = ["chroma_cens", "chroma_cqt", "chroma_stft", "mfcc", "rmse", "spectral_bandwidth",
        "spectral_centroid", "spectral_contrast", "spectral_rolloff", "tonnetz", "zcr"]

# counting the number of features: columns per family as a bar chart, total as text
# (the original indexed a non-existent '' column and printed the Axes object)
featuresPerFamily = readFeatures.columns.get_level_values(0).value_counts()
fig, ax = plt.subplots()
featuresPerFamily.plot(kind="bar", ax=ax)
ax.set(title="Feature columns per family", xlabel="feature family", ylabel="number of columns")
plt.show()

numFeatures = readFeatures.shape[1]
print("The number of features are = " + str(numFeatures))

# plotting the histogram for the features: one panel per family, pooling the
# values of all of that family's columns (the original drew every histogram
# onto the same implicit axes, so they overlapped)
nPanelCols = 4
nPanelRows = -(-len(cols) // nPanelCols)  # ceiling division
fig, axes = plt.subplots(nPanelRows, nPanelCols, figsize=(17, 4 * nPanelRows))
for ax, name in zip(axes.flat, cols):
    values = readFeatures[name].to_numpy().ravel()
    values = values[np.isfinite(values)]  # a histogram range cannot contain NaN/inf
    ax.hist(values, bins=10)
    ax.set(title=name, xlabel="value", ylabel="count")
for ax in axes.flat[len(cols):]:
    ax.set_visible(False)  # hide unused panels in the grid
fig.tight_layout()
plt.show()

# using seaborn to make a pairplot for our features
# The original passed the full feature table with hue="TrainingClass", a
# column that does not exist here, and left `cols` unused. Instead, build a
# small frame with one summary column per family (row-wise mean over the
# family's columns) for the small training subset, coloured by top-level genre.
# NOTE(review): the row-wise mean is only a coarse summary; pick a specific
# column per family if a particular statistic is of interest.
sns.set(style='whitegrid', context='notebook')
pairRows = small & train
pairData = pd.DataFrame(
    {name: readFeatures.loc[pairRows, name].mean(axis=1) for name in cols}
)
# cast to str so that only the genres present in the subset appear in the legend
pairData["genre_top"] = readTracks.loc[pairRows, ('track', 'genre_top')].astype(str)
sns.pairplot(pairData, vars=cols, hue="genre_top", height=2.0);
plt.show()
