In [224]:
import textgrid as tg
import os
import numpy as np
import pandas as pd

from pydub import AudioSegment

import librosa

import h5py

import math
import datetime

import soundfile as sf

from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank

In [225]:
class FeatureExtractor:
    def __init__(self, marks):
        self.marks = marks
        self.features = np.empty((0,187))
        self.labels = np.empty(0)
        self.df = pd.DataFrame(columns=["label", "features"])
    
    def __loadWav(self, path):
#         sig = AudioSegment.from_file(path, format="wav")
        X, sample_rate = sf.read(path, dtype='float32')

        return X, sample_rate
    
    def __extractFeatures(self,segment, sample_rate):
        samples = np.array(segment.get_array_of_samples())
        frame_rate = segment.frame_rate

        mfcc_feat = mfcc(samples, frame_rate, nfft=1103)
        d_mfcc_feat = delta(mfcc_feat, 2)
        fbank_feat = logfbank(samples, frame_rate, nfft=1103)

        return np.concatenate((np.array(mfcc_feat),np.array(d_mfcc_feat),np.array(fbank_feat)), axis=1)

    def __extractFeaturesLibrosa(self,X, sample_rate):
        X = X.T

        # short term fourier transform
        stft = np.abs(librosa.stft(X))

        # mfcc
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)

        # chroma
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)

        # melspectrogram
        mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)

        # spectral contrast
        contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)

#         tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
        return mfccs,chroma,mel,contrast

    
    def extract(self, path):
        for i, file in enumerate(os.listdir(path)):        
            if file.endswith(".TextGrid"): 
                wav_path = os.path.join(path, os.path.splitext(file)[0] + '.wav')
                if os.path.isfile(wav_path): 
                    sig, sample_rate = self.__loadWav(wav_path)
                    
                    textgrid_path = os.path.join(path, file)
                    textgrid = tg.TextGrid.fromFile(textgrid_path)

                    humorous_tier = textgrid[1]
                    for interval in humorous_tier: #humorous tier
                        if interval.mark in self.marks:
                            start = int(interval.minTime * 1000)
                            end = int(interval.maxTime * 1000)
                            trimmed_sig = sig[start:end]
                            label = self.marks.index(interval.mark)
                            
                            mfccs, chroma, mel, contrast = self.__extractFeaturesLibrosa(trimmed_sig, sample_rate)
                            ext_features = np.hstack([mfccs,chroma,mel,contrast])
                            self.features = np.vstack([self.features,ext_features])
                            
                            self.labels = np.append(self.labels, label)
                            
            
        return np.array(self.features), np.array(self.labels, dtype = np.int)
    
#     def saveFeaturesHDF(self, path):
#         filename = 'features{}.h5'.format(datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
#         self.df.to_hdf(os.path.join(path, filename),'df',mode='w',format='fixed',data_columns=True, compression='zlib')
    
#     def saveFeaturesCSV(self, path):
#         filename = 'features{}.csv'.format(datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
#         self.df.to_csv(os.path.join(path, filename),mode='w')

    def saveNumpy(self, path):
        np.save(os.path.join(path, 'feat.npy'), self.features)
        np.save(os.path.join(path, 'label.npy'), self.labels)

In [226]:
extractor = FeatureExtractor(['H', 'N'])

extractor.extract('data/')

(array([[-498.00571094,  115.22830751,  -17.71747072, ...,   16.59821214,
           20.49534195,   21.61198308],
        [-519.22341526,  103.46712998,  -15.10817037, ...,   16.76542554,
           18.82820224,   18.97010032],
        [-417.47021286,  146.68699207,  -34.42571881, ...,   17.69312003,
           15.66631595,   22.15552396],
        ...,
        [-401.12140477,  148.16541655,  -36.35623607, ...,   16.76183975,
           16.63571731,   23.81582308],
        [-495.78024875,  106.45667549,  -42.10897193, ...,   24.28949132,
           32.09515495,   34.24728628],
        [-450.51683619,  172.94091484,   11.55364805, ...,   21.974382  ,
           16.99298938,   21.65583656]]),
 array([1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
        1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1]))

In [228]:
extractor.saveNumpy('data/')