# CONVERT DATA FROM WAV TO MFCC

making a decision between WAV loader and mfcc creator

## 0. IMPORTS

In [29]:
import numpy as np
import matplotlib.pyplot as plt

from IPython.core.display import HTML, display
from IPython.display import display, Audio

import sys
import base64
import struct  
import librosa
import librosa.display

import scipy.io.wavfile as wav
import python_speech_features as p

## 1. DATA 

In [15]:
fullpath = "/home/rob/Dropbox/UCL/DIS/apple-test/kDS/data/ldc93s1/"

filename = fullpath+"LDC93S1.wav"



## 2. PROBLEM 1: Loaders give different values

In [25]:
## LIBROSA

audio1, fs1 = librosa.load(filename, sr=16000)
print("Shape and type of the Audio data x:", audio1.shape, type(audio1),"SR: ", fs1)
tot_secs_len = len(audio1)/fs1
tot_mins_len = tot_secs_len/60
print("Seconds:", tot_secs_len, "Mins:", tot_mins_len)

audio1

('Shape and type of the Audio data x:', (46797,), <type 'numpy.ndarray'>, 'SR: ', 16000)
('Seconds:', 2, 'Mins:', 0)


array([  3.05175781e-05,  -3.05175781e-05,   6.10351562e-05, ...,
        -3.05175781e-05,  -1.52587891e-04,  -2.44140625e-04], dtype=float32)

In [26]:
## SCIPY

fs2, audio2 = wav.read(filename)
print("Shape and type of the Audio data x:", audio2.shape, type(audio2), "SR: ", fs2)

tot_secs_len = len(audio2)/fs2
tot_mins_len = tot_secs_len/60

print("Seconds:", tot_secs_len, "Mins:", tot_mins_len)

audio2

('Shape and type of the Audio data x:', (46797,), <type 'numpy.ndarray'>, 'SR: ', 16000)
('Seconds:', 2, 'Mins:', 0)


array([ 1, -1,  2, ..., -1, -5, -8], dtype=int16)

## LIBROSA vs SCIPY

Librosa defaults to 22k SR so has to be forced to 16k, it gives floats vs SCIPY gives int16. Choose SCIPY as INTS can be compressed and is a cleaner represenation.

In [27]:
# load a NumPy array
Audio(x, rate=fs)

In [28]:
Audio(audio, rate=fs2)

## 3. PROBLEM 2: LIBROSA MFCC vs python_speech_features MFCC

In [33]:
##librosa

libmfcc2 = librosa.feature.mfcc(audio2, sr=fs2, n_mfcc=26)
print("Librosa mfcc:",libmfcc2)
print("Librosa mfcc dims:",libmfcc2.shape)

##py

pymfcc2 = p.mfcc(audio2,samplerate=fs2, numcep=26)
print("Py Speech mfcc:", pymfcc2)
print("Py Speech mfcc dims:", pymfcc2.shape)

('Librosa mfcc:', array([[  3.10878159e+02,   3.21280480e+02,   3.52246850e+02, ...,
          4.67273689e+02,   4.34064055e+02,   3.95668812e+02],
       [  3.39975646e+01,   4.15417412e+01,   6.26155418e+01, ...,
          5.90154969e+01,   4.25220255e+01,   4.31393198e+01],
       [  7.95981059e+00,   4.90073931e+00,  -5.73262763e+00, ...,
         -6.72576945e+01,  -6.42231152e+01,  -5.07805881e+01],
       ..., 
       [  5.43565871e+00,   3.29820687e+00,  -2.60467072e-01, ...,
         -4.27004540e+00,   7.21372715e+00,   9.63581343e+00],
       [  3.74167668e+00,   3.81097755e+00,   2.53648998e+00, ...,
         -4.75553064e+00,   2.57592875e+00,   2.43274625e+00],
       [  4.64921916e+00,   3.18906455e+00,   2.60010602e+00, ...,
          4.73735015e+00,   6.94270050e+00,   1.76299818e+00]]))
('Librosa mfcc dims:', (26, 92))
('Py Speech mfcc:', array([[  7.07865742, -25.0219473 ,  -6.32589368, ...,   0.25937519,
          0.70851522,   1.43421034],
       [  7.16654419, -25.24