In [1]:
#Concise feature extraction
import os
from functools import reduce

import librosa
import numpy as np
from librosa import feature
from sklearn.preprocessing import normalize

def get_percussion(y):
    """Return the percussive part of the magnitude spectrogram of ``y``.

    The signal is transformed with ``librosa.stft``, reduced to magnitudes
    with ``np.abs`` and split with ``librosa.decompose.hpss`` using
    ``margin=16``. Only the second element of the pair returned by ``hpss``
    (the percussive component) is returned; the first is discarded.

    Parameters
    ----------
    y : audio time series accepted by ``librosa.stft``.

    Returns
    -------
    The percussive component produced by ``hpss``.
    """
    magnitude = np.abs(librosa.stft(y))
    # hpss yields (harmonic, percussive); the harmonic half is not needed.
    _harmonic, percussive = librosa.decompose.hpss(magnitude, margin=16)
    return percussive


# Extractors that get_feature_vector calls with both the signal and the
# sample rate. The order here is the order of the resulting feature values,
# so it must stay in step with the CSV header used when saving the dataset.
# (librosa.feature.chroma_stft is intentionally left out for now.)
fn_list_i = [
    librosa.feature.spectral_centroid,
    librosa.feature.spectral_bandwidth,
    librosa.feature.spectral_rolloff,
    librosa.onset.onset_strength,
    librosa.feature.mfcc,
]

# Extractors that get_feature_vector calls with the signal only.
# (librosa.feature.rmse is intentionally left out for now.)
fn_list_ii = [
    librosa.feature.zero_crossing_rate,
    get_percussion,
]


def get_feature_vector(y, sr, sr_funcs=None, signal_funcs=None):
    """Summarise an audio signal as a flat list of per-feature means.

    Every extractor is applied to the signal and its whole output is
    collapsed to a single scalar with ``np.mean`` (so multi-row outputs
    are averaged over every row and frame).

    Parameters
    ----------
    y : audio time series handed to every extractor.
    sr : sample rate, forwarded to the extractors in ``sr_funcs``.
    sr_funcs : optional iterable of callables invoked as ``f(y=y, sr=sr)``.
        Defaults to the module-level ``fn_list_i``.
    signal_funcs : optional iterable of callables invoked as ``f(y)``.
        Defaults to the module-level ``fn_list_ii``.

    Returns
    -------
    list
        One mean per extractor: all ``sr_funcs`` results first, followed by
        all ``signal_funcs`` results, each in list order.
    """
    if sr_funcs is None:
        sr_funcs = fn_list_i
    if signal_funcs is None:
        signal_funcs = fn_list_ii

    # Pass y/sr by keyword: recent librosa releases (>= 0.10) declare these
    # parameters keyword-only, so the positional form funct(y, sr) raises
    # TypeError there. The keyword form is accepted by older releases too.
    feat_vect_i = [np.mean(funct(y=y, sr=sr)) for funct in sr_funcs]
    feat_vect_ii = [np.mean(funct(y)) for funct in signal_funcs]
    return feat_vect_i + feat_vect_ii


In [2]:
#Classify features
# The naming convention for files:
#Anything with *D.wav is a water leak - ht, medium, slow, fast drips
#Anything with *N.wav is white noise / ambient noise
#Anything with *R?.wav or HW is running water or running tap or falling from a height

# Class codes, i.e. the text after the last "_" in a file name (extension
# removed). Only `leak` is consulted by classify_datafile at the moment.
run = ['RW', 'RT', 'HW']
leak = ['FD', 'MD', 'SD', 'SP', 'HD']
noise = ['WN', 'AN']


def classify_datafile(datafile):
    """Derive the training label for a recording from its file name.

    The class code is whatever follows the last underscore of the file name
    once the directory part and the extension are removed, e.g.
    ``"tap1_FD.wav"`` -> ``"FD"``. A name without any underscore is treated
    as consisting solely of the code instead of raising ``ValueError``.

    Parameters
    ----------
    datafile : str
        File name (a leading directory part is ignored).

    Returns
    -------
    list of str
        ``['leak']`` when the code is listed in ``leak``; otherwise
        ``['running water/noise']``. A single-element list is returned so
        the caller can concatenate it onto a feature row.
    """
    stem, _ext = os.path.splitext(os.path.basename(datafile))
    # rpartition never fails: with no "_" present the whole stem is the code.
    cl_str = stem.rpartition("_")[2]
    if cl_str in leak:
        return ['leak']
    # Running-water codes are deliberately not split out into their own
    # label; everything that is not a leak shares one class.
    return ['running water/noise']


In [3]:
#main cell 
#reads each file, extracts features
#and saves them as 'dataset_1.csv' in the directory given by out_file_path (set below)
#this dataset should be used for training  
###---------------------------------------------
from tqdm import tqdm
import os
import pandas as pd

# read each file
# extract features in a dict
# put them in a list
# convert to data frame
import warnings
warnings.filterwarnings('ignore')

# Build the training dataset: one row per recording holding its sample
# number, the mean feature values and the label derived from the file name.
#
# NOTE(review): both locations are absolute, machine-specific paths and
# should eventually come from a configurable data directory.
data_file_path = '/Users/ns/development/iisc/WLDS2/data/extra/'
out_file_path = '/Users/ns/development/iisc/WLDS2/data/extra/'
header = ['Sample#', 'spectral_centroid', 'spectral_bandwidth',
          'spectral_rolloff', 'onset_strength',
          'mfcc', 'zero_crossing_rate', 'percussion', 'label']
# Feature columns only: everything between 'Sample#' and 'label'.
# (The former slice [1:7] silently left out 'percussion'.)
f_header = header[1:-1]

# Only audio recordings are processed. The CSV is written into the same
# directory it reads from, so without this filter a second run would hand
# dataset_1.csv (or any other stray file) to librosa.load.
AUDIO_EXTENSIONS = ('.wav',)
# os.listdir returns names in arbitrary order; sorting keeps Sample# stable
# from one run to the next.
samples = sorted(
    name for name in os.listdir(data_file_path)
    if name.lower().endswith(AUDIO_EXTENSIONS)
)

feature_list = []
# enumerate() wraps tqdm (not the reverse) so the bar still knows the total.
for Sno, datafile in enumerate(tqdm(samples)):
    data_f = os.path.join(data_file_path, datafile)
    print('Extracting Features for  ', data_f)
    y, sr = librosa.load(data_f)
    feature_vector = get_feature_vector(y, sr)
    label = classify_datafile(datafile)
    feature_list.append([Sno] + feature_vector + label)

# Assemble all rows at once and save the dataset.
feature_df = pd.DataFrame(feature_list, columns=header)
# Normalisation of the feature columns is currently switched off:
#feature_df[f_header] = normalize(feature_df[f_header].to_numpy())
feature_df.to_csv(os.path.join(out_file_path, 'dataset_1.csv'), index=False)

##end


0it [00:00, ?it/s]


<!--
 Copyright 2022 ns
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
     http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

