# Saving, Reading Files and Models

In [56]:
# For this notebook, we will just be using numpy, pandas, and scikit learn.
#The goal is to learn to manipulate data, and save and read from files in different ways.
import numpy as np
import pandas as pd
import sklearn

In [57]:
array = np.random.rand(500,10)
print(array.shape)
array

(500, 10)


array([[0.67746718, 0.5096883 , 0.70758998, ..., 0.46266417, 0.42348991,
        0.32474884],
       [0.44216742, 0.44609224, 0.32901458, ..., 0.30540781, 0.72220629,
        0.54469237],
       [0.08059496, 0.90392876, 0.29211787, ..., 0.24548088, 0.87886185,
        0.41578092],
       ...,
       [0.62540083, 0.79654494, 0.62508658, ..., 0.58520341, 0.94427071,
        0.84361504],
       [0.68516112, 0.93620323, 0.31259907, ..., 0.19049685, 0.00595479,
        0.53009262],
       [0.32973431, 0.88376546, 0.71805075, ..., 0.52903615, 0.33732599,
        0.56184093]])

In [58]:
df = pd.DataFrame(array)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.677467,0.509688,0.707590,0.057028,0.626685,0.137019,0.110339,0.462664,0.423490,0.324749
1,0.442167,0.446092,0.329015,0.205422,0.754569,0.945691,0.637116,0.305408,0.722206,0.544692
2,0.080595,0.903929,0.292118,0.594333,0.990000,0.458197,0.682490,0.245481,0.878862,0.415781
3,0.478495,0.756760,0.546258,0.389027,0.065202,0.551556,0.846113,0.143062,0.217625,0.690052
4,0.046914,0.156225,0.244266,0.595211,0.241328,0.860169,0.792456,0.131174,0.182643,0.866740
5,0.810560,0.518907,0.647678,0.336709,0.190755,0.617081,0.826219,0.135302,0.191167,0.242359
6,0.093540,0.810438,0.251637,0.754182,0.415910,0.315851,0.636005,0.647768,0.423704,0.731750
7,0.708361,0.377819,0.596926,0.145909,0.752150,0.819567,0.941203,0.500256,0.245274,0.965423
8,0.574685,0.918109,0.064504,0.911016,0.837742,0.579198,0.655209,0.318193,0.094880,0.432543
9,0.867086,0.420383,0.755501,0.208065,0.522503,0.240784,0.705626,0.895582,0.373973,0.672556


# Save numpy array directly to file

In [70]:
np.save(arr = array,file= 'foo',allow_pickle=True)

In [73]:
ls

 [0m[01;34mData[0m/               Intro.ipynb            Perceptron.ipynb
 Day1.ipynb          KNN.ipynb              Preprocessing.ipynb
 Day2.ipynb          LibrosaTut.ipynb       ReadMe.md
 Day3_2.ipynb        [01;34mLongfiles[0m/             Scikit-learn.ipynb
 Day3.ipynb         'Main Notebook.ipynb'   SciPy.ipynb
 Day3_oldest.ipynb   Matplotlib.ipynb       SVM.ipynb
 Day4.ipynb          MIDI.ipynb            'Torch Intro.ipynb'
 Day5b.ipynb         NumPy.ipynb           [01;34m'Untitled Folder'[0m/
 foo.npy             Pandas.ipynb


In [74]:
np.fromfile('./foo.npy')

array([1.87585069e-309, 1.17119999e+171, 5.22741680e-037, ...,
       5.29036152e-001, 3.37325994e-001, 5.61840928e-001])

# Save Pandas dataframe and read csv, tsv, excel files

In [61]:
# write to a csv file
df.to_csv('foo.csv', index=False)

In [62]:
# list files in the current working directory

In [63]:
ls

 [0m[01;34mData[0m/               foo.npy                Pandas.ipynb
 Day1.ipynb          Intro.ipynb            Perceptron.ipynb
 Day2.ipynb          KNN.ipynb              Preprocessing.ipynb
 Day3_2.ipynb        LibrosaTut.ipynb       ReadMe.md
 Day3.ipynb          [01;34mLongfiles[0m/             Scikit-learn.ipynb
 Day3_oldest.ipynb  'Main Notebook.ipynb'   SciPy.ipynb
 Day4.ipynb          Matplotlib.ipynb       SVM.ipynb
 Day5b.ipynb         MIDI.ipynb            'Torch Intro.ipynb'
 foo.csv             NumPy.ipynb           [01;34m'Untitled Folder'[0m/


In [64]:
# read file back in
path = './foo.csv'
newDf = pd.read_csv(path)
newDf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.677467,0.509688,0.70759,0.057028,0.626685,0.137019,0.110339,0.462664,0.42349,0.324749
1,0.442167,0.446092,0.329015,0.205422,0.754569,0.945691,0.637116,0.305408,0.722206,0.544692
2,0.080595,0.903929,0.292118,0.594333,0.99,0.458197,0.68249,0.245481,0.878862,0.415781
3,0.478495,0.75676,0.546258,0.389027,0.065202,0.551556,0.846113,0.143062,0.217625,0.690052
4,0.046914,0.156225,0.244266,0.595211,0.241328,0.860169,0.792456,0.131174,0.182643,0.86674


In [65]:
# remove the file
import os
os.remove(path)

In [66]:
# can also do Excel
df.to_excel('foo.xlsx', sheet_name='Sheet1')

In [67]:
newDf2 = pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
newDf2.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,0.677467,0.509688,0.70759,0.057028,0.626685,0.137019,0.110339,0.462664,0.42349,0.324749
1,1,0.442167,0.446092,0.329015,0.205422,0.754569,0.945691,0.637116,0.305408,0.722206,0.544692
2,2,0.080595,0.903929,0.292118,0.594333,0.99,0.458197,0.68249,0.245481,0.878862,0.415781
3,3,0.478495,0.75676,0.546258,0.389027,0.065202,0.551556,0.846113,0.143062,0.217625,0.690052
4,4,0.046914,0.156225,0.244266,0.595211,0.241328,0.860169,0.792456,0.131174,0.182643,0.86674


In [68]:
os.remove('foo.xlsx')

# Save Scikit Model 

In [76]:
#Load data from the dataset as we worked on on day 3
traindata = pd.read_excel('dataset.xlsx')
traindata

Unnamed: 0.1,Unnamed: 0,ZC,SpecCen,SpecCon,RMS,SpecFlat,Label
0,0,-0.820426,-1.036326,-0.629936,0.129125,2.055580,0
1,1,-0.953396,-1.375536,-0.516714,-0.381884,1.715251,0
2,2,-1.008865,-1.129275,-0.288536,-0.485144,0.677316,0
3,3,-1.096578,-1.594815,-0.211995,1.484972,0.543615,0
4,4,-0.938593,-1.238997,-0.378333,-0.494040,1.160168,0
5,5,-1.024181,-1.166060,-1.505613,-0.442675,0.754057,0
6,6,-0.987558,-1.065897,-0.963941,-0.584177,0.770680,0
7,7,-0.931890,-1.041607,-1.123719,-0.433951,1.902529,0
8,8,-0.974168,-1.006897,-0.634290,-0.363039,0.796517,0
9,9,-1.022050,-1.125361,-0.494611,-0.579642,0.606424,0


In [83]:
from pathlib import Path
import librosa

kick_signals = [ librosa.load(p, mono=True)[0] for p in Path().glob('Data/drum_samples/kick*.wav') ]

#Repeating the same for snare samples
snare_signals = [ librosa.load(p, mono=True)[0] for p in Path().glob('Data/drum_samples/snare*.wav') ]

#Repeating the same for cymbal
cymbal_signals = [ librosa.load(p, mono=True)[0] for p in Path().glob('Data/drum_samples/cymbal*.wav')]

def extract_features(signal):

    return [
        np.mean(librosa.feature.zero_crossing_rate(signal)),
        np.mean(librosa.feature.spectral_centroid(signal)),
        np.mean(librosa.feature.spectral_contrast(signal)),
        np.mean(librosa.feature.rmse(signal)),
        np.mean(librosa.feature.spectral_flatness(signal)),
    ]


#Extracting our the 5 scalar features for all kick samples.
#Ee are using another inline for loop (this is very convenient when working with lists).
#Now we can store the data on an Numpy array because the size of the data is consistent,
#indeed we will have 5 numbers (features) per sample
#to be precise, we are still storing data into a list [], and then we use the function np.array
#to convert the list into an array (we need Numpy arrays for our ML algorithm, not lists)
kick_features = np.array([extract_features(x) for x in kick_signals])

#Repearing the same for the snare samples.
snare_features = np.array([extract_features(x) for x in snare_signals])

#repeating the same for cymbal samples
cymbal_features = np.array([extract_features(x) for x in cymbal_signals])

#Displaying the size of the Numpy arrays (this time we use the .shape attribute)
#Check if the printed numbers are the expected ones (what's on the rows and what's on the columns?)
print('Size of Numpy arrays for kick and snare features:')
print(kick_features.shape, snare_features.shape, cymbal_features.shape)

#Now we create an array of labels, we can use zeros for the kicks and ones for the snare (or any other number).
#This will help us to discriminate set of featires associated with kicks and snares
#We can opt for "text" labels but this is not convenient,
#It wont work well with neural networks, and we put "text" labels in Numpy arrays

#Create a row of zeroes as long as the number of kick samples
kicklabels = np.zeros(kick_features.shape[0])

#Create a row of ones as long as the number of snare samples
snarelabels = np.ones(snare_features.shape[0])

#Create a row of twos as long as the number of cymbal samples
cymballabels = np.full(cymbal_features.shape[0],2)

#Now we concatenate (attach) the numeric labels into a single array,
#and we also concatenate the two set of features
labels = np.concatenate((kicklabels,snarelabels,cymballabels))
features = np.concatenate((kick_features,snare_features,cymbal_features))

#Before proceeding with training and testing of the classifiers
#we split the data in training and testing set using a 70/30 partitioning.
#This is done a useful function in scikit-learn (called train_test_split)
#The partitioning is done randomly but starting from a seed you can specify (random_state)
#Every time you change the starting random state value, you will experience
#a different partitioning and (likely) a different classification result.

#Importing the tool,
#alternatively you can call sklearn.model_selection.train_test_split(..,..,..,)
#but that will be too long
from sklearn.model_selection import train_test_split

#splitting the dataset in training and testing parts
feat_train, feat_test, lab_train, lab_test = train_test_split(features, labels, test_size=0.3, random_state=214)

Size of Numpy arrays for kick and snare features:
(42, 5) (42, 5) (18, 5)


In [84]:
#Import the classifier
from sklearn.neural_network import MLPClassifier

##Creating an instance of a MLP classifier
#and setting it some option (max mum epoch, verbose on, activation of neurons)
mlp = MLPClassifier(hidden_layer_sizes=(2,2), max_iter=200, verbose=True, activation='relu')

#train the model
mlp.fit(feat_train, lab_train)

#applying the the model on the test data (features)
lab_predict = mlp.predict(feat_test)

Iteration 1, loss = 1.22135945
Iteration 2, loss = 1.22083042
Iteration 3, loss = 1.22030238
Iteration 4, loss = 1.21977536
Iteration 5, loss = 1.21924937
Iteration 6, loss = 1.21872443
Iteration 7, loss = 1.21820055
Iteration 8, loss = 1.21767775
Iteration 9, loss = 1.21715605
Iteration 10, loss = 1.21663545
Iteration 11, loss = 1.21611598
Iteration 12, loss = 1.21559764
Iteration 13, loss = 1.21508045
Iteration 14, loss = 1.21456443
Iteration 15, loss = 1.21404958
Iteration 16, loss = 1.21353591
Iteration 17, loss = 1.21302344
Iteration 18, loss = 1.21251218
Iteration 19, loss = 1.21200213
Iteration 20, loss = 1.21149331
Iteration 21, loss = 1.21098572
Iteration 22, loss = 1.21047937
Iteration 23, loss = 1.20997428
Iteration 24, loss = 1.20947043
Iteration 25, loss = 1.20896786
Iteration 26, loss = 1.20846655
Iteration 27, loss = 1.20796651
Iteration 28, loss = 1.20746776
Iteration 29, loss = 1.20697028
Iteration 30, loss = 1.20647409
Iteration 31, loss = 1.20597919
Iteration 32, los



In [88]:
import pickle

filename = 'model.sav'
pickle.dump(mlp, open(filename, 'wb'))


# load the model from disk
model = pickle.load(open(filename, 'rb'))

result = mlp.predict(feat_test)
# model.score(X_test, Y_test)
print(result)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0.]


In [89]:
ls

 [0m[01;34mData[0m/                 foo.npy                Pandas.ipynb
 dataset.xlsx          Intro.ipynb            Perceptron.ipynb
 Day1.ipynb            KNN.ipynb              Preprocessing.ipynb
 Day2.ipynb            LibrosaTut.ipynb       ReadMe.md
 Day3_2.ipynb          [01;34mLongfiles[0m/             Scikit-learn.ipynb
 Day3.ipynb           'Main Notebook.ipynb'   SciPy.ipynb
 Day3_oldest.ipynb     Matplotlib.ipynb       SVM.ipynb
 Day4.ipynb            MIDI.ipynb            'Torch Intro.ipynb'
 Day5b.ipynb           model.sav             [01;34m'Untitled Folder'[0m/
 finalized_model.sav   NumPy.ipynb
