Project details and resources are collected in this document:
https://docs.google.com/document/d/1F-KYQ5nRnDAUVDUrEbTFYGn7TEVsT8EByuM9xJLT4gA/edit

In [1]:
import warnings                        # To ignore any warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%pylab inline
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display
import glob 
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Mount Google Drive at /content/drive so the paths under that prefix used by
# later cells are readable from this Colab session.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:

# Root folder on the mounted Drive; later cells read set_a/, set_b/ and the
# set_a.csv / set_a_timing.csv / set_b.csv files from underneath it.
INPUT_DIR = '/content/drive/MyDrive/AI ML Things/University of Turku Research Internship'
# NOTE(review): the bare string below is an inert expression statement holding an
# earlier path setup. It refers to `base_path`, which is not defined in the cells
# shown, and has no effect when this cell runs.
'''
set_a_path=base_path+'/set_a'
set_a_metadata_path=base_path+'/set_a.csv'
set_b_path=base_path+'/set_b'
set_b_metadata_path=base_path+'/set_b.csv'
set_a_metadata = pd.read_csv(set_a_metadata_path)
'''
# NOTE(review): dataset_path and metadata are not referenced again in the cells shown.
dataset_path = INPUT_DIR+'/set_a'
metadata = pd.read_csv(INPUT_DIR+'/set_a.csv')

# NOTE(review): SAMPLE_RATE is not referenced in the cells shown; load_file_data
# carries its own sr=16000 default.
SAMPLE_RATE = 16000
# Clip length in seconds; passed as `duration` to load_file_data when loading set-a and set-b.
MAX_SOUND_CLIP_DURATION=12  

In [4]:
# Load the set-a metadata table and preview its first rows.
set_a=pd.read_csv(INPUT_DIR+"/set_a.csv")
set_a.head()

Unnamed: 0,dataset,fname,label,sublabel
0,a,set_a/artifact__201012172012.wav,artifact,
1,a,set_a/artifact__201105040918.wav,artifact,
2,a,set_a/artifact__201105041959.wav,artifact,
3,a,set_a/artifact__201105051017.wav,artifact,
4,a,set_a/artifact__201105060108.wav,artifact,


In [5]:
# Load the set-a timing table (one row per fname / cycle / sound / location)
# and preview its first rows.
set_a_timing=pd.read_csv(INPUT_DIR+"/set_a_timing.csv")
set_a_timing.head()

Unnamed: 0,fname,cycle,sound,location
0,set_a/normal__201102081321.wav,1,S1,10021
1,set_a/normal__201102081321.wav,1,S2,20759
2,set_a/normal__201102081321.wav,2,S1,35075
3,set_a/normal__201102081321.wav,2,S2,47244
4,set_a/normal__201102081321.wav,3,S1,62992


In [6]:
# Load the set-b metadata table and preview its first rows.
set_b=pd.read_csv(INPUT_DIR+"/set_b.csv")
set_b.head()

Unnamed: 0,dataset,fname,label,sublabel
0,b,set_b/Btraining_extrastole_127_1306764300147_C...,extrastole,
1,b,set_b/Btraining_extrastole_128_1306344005749_A...,extrastole,
2,b,set_b/Btraining_extrastole_130_1306347376079_D...,extrastole,
3,b,set_b/Btraining_extrastole_134_1306428161797_C...,extrastole,
4,b,set_b/Btraining_extrastole_138_1306762146980_B...,extrastole,


In [7]:
# Stack the set-a and set-b metadata tables row-wise into a single frame and
# summarise its columns. Each table keeps its own row index, because
# ignore_index is not passed to pd.concat.
frames = [set_a, set_b]
train_ab=pd.concat(frames)
train_ab.describe()

Unnamed: 0,dataset,fname,label,sublabel
count,832,832,585,149
unique,2,832,5,2
top,b,set_b/Btraining_normal_Btraining_noisynormal_2...,normal,noisynormal
freq,656,1,351,120


In [8]:
# List the distinct values of the `label` column and report the dataset size.
# NOTE: despite its name, nb_classes holds the array of distinct label values,
# not a count. NaN (rows without a label) is one of those values, so it is
# included in the "Number of classes" figure printed below.
nb_classes=train_ab.label.unique()

print("Number of training examples=", train_ab.shape[0], "  Number of classes=", len(train_ab.label.unique()))
print (nb_classes)

Number of training examples= 832   Number of classes= 6
['artifact' 'extrahls' 'murmur' 'normal' nan 'extrastole']


Note: 'nan' indicates the unclassified, unlabelled test files.

# Extracting Features of Data in Audio Domain

## Sound Feature: MFCCs

In [9]:
# Sanity check: compute MFCCs for one example recording from set-a.
example_file=INPUT_DIR+"/set_a/normal__201106111136.wav"
# To analyse only a slice of the recording instead, offset/duration can be passed:
#y, sr = librosa.load(example_file, offset=7, duration=7)
# No sr argument is given, so the audio is loaded at librosa.load's default sampling rate.
y, sr = librosa.load(example_file)
# n_mfcc is not given here, so librosa's default number of coefficients is used.
mfccs = librosa.feature.mfcc(y=y, sr=sr)
print (mfccs)

[[-2.4546130e+02 -2.9111914e+02 -4.0244803e+02 ... -3.6787161e+02
  -3.7070840e+02 -3.7246997e+02]
 [ 8.8640701e+01  9.8866135e+01  1.3325652e+02 ...  1.5128104e+02
   1.5726184e+02  1.5245227e+02]
 [ 1.0397891e+02  8.5698502e+01  2.3744316e+01 ...  2.9162670e+01
   2.9376698e+01  3.7446297e+01]
 ...
 [-1.3324329e+01 -8.1943016e+00 -1.1898929e+00 ...  9.6601009e-02
   6.2489367e-01  1.1953502e+00]
 [ 3.1632262e-01 -4.9924183e-01 -1.5885925e-01 ...  2.2277083e+00
  -2.1508545e-01  4.8489895e+00]
 [ 3.3431330e+00 -9.8901147e-01 -2.7710872e+00 ...  2.6507771e+00
  -1.3875097e+00  1.8579394e+00]]


In [10]:
# Shape of the MFCC matrix computed for the example file.
mfccs.shape

(20, 214)

In [11]:
# Recompute with n_mfcc=40 to get more coefficients per frame; only the first
# dimension of the result changes (compare the shape shown for the default call).
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
mfccs.shape

(40, 214)

# Loading Data

In [12]:
# Recap of the metadata size before loading audio; the class count includes NaN (unlabelled rows).
print("Number of training examples=", train_ab.shape[0], "  Number of classes=", len(train_ab.label.unique()))

Number of training examples= 832   Number of classes= 6


In [13]:
def audio_norm(data):
    """Min-max scale ``data`` and centre it around zero.

    The values are shifted by the minimum and divided by the value range plus
    0.0001 (the small constant keeps the division defined for constant input),
    then 0.5 is subtracted, so the result lies in roughly [-0.5, 0.5).
    """
    lowest = np.min(data)
    # Same grouping as (max - min) + 0.0001, so results match bit for bit.
    value_span = np.max(data) - lowest + 0.0001
    return (data - lowest) / value_span - 0.5

# Load audio without padding or truncation (each file is read in full).
def load_file_data_without_change(folder,file_names, duration=3, sr=16000):
    """Load each audio file in full and return one time-averaged MFCC vector per file.

    Parameters
    ----------
    folder : str
        Directory prefix. It is joined to each file name by plain string
        concatenation, so it must end with a path separator.
    file_names : iterable of str
        File names (relative to ``folder``) to load.
    duration, sr :
        Accepted so the signature matches ``load_file_data`` but not used:
        ``librosa.load`` is called without them, so every file is read in full
        at librosa's default sampling rate.

    Returns
    -------
    list of numpy.ndarray
        One column vector of shape (40, 1) per successfully processed file,
        in input order. A file that cannot be loaded or processed is reported
        and skipped, so the list can be shorter than ``file_names``; build any
        label list from the returned data rather than from ``file_names``.
    """
    data = []
    for file_name in file_names:
        try:
            sound_file = folder + file_name
            print ("load file ",sound_file)
            # kaiser_fast trades a little resampling quality for faster loading
            X, file_sr = librosa.load(sound_file, res_type='kaiser_fast')
            # average the 40 MFCCs over all frames -> one fixed-size vector per file
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=file_sr, n_mfcc=40).T, axis=0)
        except Exception as e:
            # Report the offending file and skip it. The previous code printed an
            # undefined name here and would then have appended stale features.
            print("Error encountered while parsing file: ", file_name, "-", e)
            continue
        data.append(np.array(mfccs).reshape([-1,1]))
    return data


# Load audio at a fixed duration: shorter files are padded, longer files are truncated.
def load_file_data (folder,file_names, duration=12, sr=16000):
    """Load audio clips at a fixed duration and return one time-averaged MFCC vector per file.

    Each file is read with ``librosa.load`` limited to ``duration`` seconds (so
    longer recordings are truncated). Clips that come back shorter than
    ``duration`` are padded to the full length with ``librosa.util.fix_length``
    before the MFCCs are computed.

    Parameters
    ----------
    folder : str
        Directory prefix. It is joined to each file name by plain string
        concatenation, so it must end with a path separator.
    file_names : iterable of str
        File names (relative to ``folder``) to load.
    duration : int or float
        Target clip length in seconds.
    sr : int
        Sampling rate passed to ``librosa.load``.

    Returns
    -------
    list of numpy.ndarray
        One column vector of shape (40, 1) per successfully processed file,
        in input order. A file that cannot be loaded or processed is reported
        and skipped, so the list can be shorter than ``file_names``; build any
        label list from the returned data rather than from ``file_names``.
    """
    data = []
    for file_name in file_names:
        try:
            sound_file = folder + file_name
            print ("load file ",sound_file)
            # kaiser_fast trades a little resampling quality for faster loading
            X, file_sr = librosa.load(sound_file, sr=sr, duration=duration, res_type='kaiser_fast')
            # Target length in samples, derived from the rate the file was actually loaded at.
            input_length = int(round(file_sr * duration))
            # Compare sample counts directly; rounding the duration in seconds
            # let clips up to half a second short slip through unpadded.
            if len(X) < input_length:
                print ("fixing audio lenght :", file_name)
                # `size` is passed by keyword, which both older and newer librosa accept.
                X = librosa.util.fix_length(X, size=input_length)
            # NOTE: the padded signal is what gets analysed (previously the padded
            # copy was discarded), so the padding frames take part in the time
            # average and features differ from ones computed on unpadded audio.
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=file_sr, n_mfcc=40).T, axis=0)
        except Exception as e:
            # Report the offending file and skip it. The previous code printed an
            # undefined name here and would then have appended stale features.
            print("Error encountered while parsing file: ", file_name, "-", e)
            continue
        data.append(np.array(mfccs).reshape([-1,1]))
    return data

In [14]:
# Load dataset-a one class at a time; the per-class lists are kept separate for testing purposes.
# Label codes: 1 = artifact, 2 = normal, 3 = extrahls, 4 = murmur, -1 = unlabelled test file.
import os, fnmatch

A_folder=INPUT_DIR+'/set_a/'
# List the folder once and filter the same listing for every class.
A_all_files = os.listdir(A_folder)

# set-a
# Every label list is sized from the *loaded* sounds, so features and labels
# always line up with what load_file_data actually returned.
A_artifact_files = fnmatch.filter(A_all_files, 'artifact*.wav')
A_artifact_sounds = load_file_data(folder=A_folder,file_names=A_artifact_files, duration=MAX_SOUND_CLIP_DURATION)
A_artifact_labels = [1 for items in A_artifact_sounds]

A_normal_files = fnmatch.filter(A_all_files, 'normal*.wav')
A_normal_sounds = load_file_data(folder=A_folder,file_names=A_normal_files, duration=MAX_SOUND_CLIP_DURATION)
A_normal_labels = [2 for items in A_normal_sounds]

A_extrahls_files = fnmatch.filter(A_all_files, 'extrahls*.wav')
A_extrahls_sounds = load_file_data(folder=A_folder,file_names=A_extrahls_files, duration=MAX_SOUND_CLIP_DURATION)
A_extrahls_labels = [3 for items in A_extrahls_sounds]

A_murmur_files = fnmatch.filter(A_all_files, 'murmur*.wav')
A_murmur_sounds = load_file_data(folder=A_folder,file_names=A_murmur_files, duration=MAX_SOUND_CLIP_DURATION)
A_murmur_labels = [4 for items in A_murmur_sounds]

# test files
A_unlabelledtest_files = fnmatch.filter(A_all_files, 'Aunlabelledtest*.wav')
A_unlabelledtest_sounds = load_file_data(folder=A_folder,file_names=A_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION)
A_unlabelledtest_labels = [-1 for items in A_unlabelledtest_sounds]

print ("loaded dataset-a")

load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106031558.wav
fixing audio lenght : artifact__201106031558.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106010559.wav
fixing audio lenght : artifact__201106010559.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201105280851.wav
fixing audio lenght : artifact__201105280851.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106040933.wav
fixing audio lenght : artifact__201106040933.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201106030612.wav
fixing audio lenght : artifact__201106030612.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_a/artifact__201105051017.wav
fixing audio lenght : artifact__201

In [15]:
%%time
# load dataset-b, keep them separate for testing purpose 
B_folder=INPUT_DIR+'/set_b/'
# set-b
B_normal_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'normal*.wav')  # include noisy files
B_normal_sounds = load_file_data(folder=B_folder,file_names=B_normal_files, duration=MAX_SOUND_CLIP_DURATION)
B_normal_labels = [2 for items in B_normal_sounds]

B_murmur_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'murmur*.wav')  # include noisy files
B_murmur_sounds = load_file_data(folder=B_folder,file_names=B_murmur_files, duration=MAX_SOUND_CLIP_DURATION)
B_murmur_labels = [4 for items in B_murmur_files]

B_extrastole_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'extrastole*.wav')
B_extrastole_sounds = load_file_data(folder=B_folder,file_names=B_extrastole_files, duration=MAX_SOUND_CLIP_DURATION)
B_extrastole_labels = [5 for items in B_extrastole_files]

#test files
B_unlabelledtest_files = fnmatch.filter(os.listdir(INPUT_DIR+'/set_b'), 'Bunlabelledtest*.wav')
B_unlabelledtest_sounds = load_file_data(folder=B_folder,file_names=B_unlabelledtest_files, duration=MAX_SOUND_CLIP_DURATION)
B_unlabelledtest_labels = [-1 for items in B_unlabelledtest_sounds]
print ("loaded dataset-b")

load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b/normal_noisynormal_123_1306331925797_B.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b/normal_noisynormal_123_1306331925797_C.wav
fixing audio lenght : normal_noisynormal_123_1306331925797_C.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b/normal__209_1308162216750_A.wav
fixing audio lenght : normal__209_1308162216750_A.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b/normal__202_1308145175747_C.wav
fixing audio lenght : normal__202_1308145175747_C.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b/normal__146_1306778707532_D3.wav
fixing audio lenght : normal__146_1306778707532_D3.wav
load file  /content/drive/MyDrive/AI ML Things/University of Turku Research Internship/set_b/normal__227_1308594233667_B.wav

In [16]:
# Combine the labelled set-a and set-b classes into the training arrays, and the
# unlabelled files of both sets into the test arrays. Sounds and labels are
# concatenated in the same class order so row i of x_data matches y_data[i].
x_data = np.concatenate((A_artifact_sounds, A_normal_sounds,A_extrahls_sounds,A_murmur_sounds, 
                         B_normal_sounds,B_murmur_sounds,B_extrastole_sounds))

y_data = np.concatenate((A_artifact_labels, A_normal_labels,A_extrahls_labels,A_murmur_labels,
                         B_normal_labels,B_murmur_labels,B_extrastole_labels))

test_x = np.concatenate((A_unlabelledtest_sounds,B_unlabelledtest_sounds))
test_y = np.concatenate((A_unlabelledtest_labels,B_unlabelledtest_labels))

# Fail loudly if features and labels ever get out of step, instead of saving misaligned arrays.
assert len(x_data) == len(y_data), "training features and labels differ in length"
assert len(test_x) == len(test_y), "test features and labels differ in length"

# First line reports label counts (train, test); second reports feature counts (train, test).
print ("combined training data record: ",len(y_data), len(test_y))
print ("combined training data record: ",len(x_data), len(test_x))

combined training data record:  584 247
combined training data record:  584 247


In [17]:
import os 

# Use a folder on Google Drive as the working directory, so files written with
# relative paths are stored on Drive and survive a notebook timeout.

# Base Google Drive directory; must end with "/" because paths are built by string concatenation.
# NOTE(review): INPUT_DIR spells this mount "MyDrive" while this path uses "My Drive" —
# confirm both resolve to the same folder in this environment.
root_dir = "/content/drive/My Drive/"

# Folder, relative to root_dir, where the extracted MFCC feature files are saved.
project_folder = "AI ML Things/University of Turku Research Internship/MFCC_features/"

def create_and_set_working_directory(project_folder, root=None):
  """Make ``root + project_folder`` the current working directory, creating it if needed.

  Parameters
  ----------
  project_folder : str
      Folder path relative to the root. It is appended to the root by plain
      string concatenation, so the root must end with a path separator.
  root : str, optional
      Base directory. Defaults to the module-level ``root_dir``.

  Side effects
  ------------
  Creates the folder (including missing parent folders) when it does not exist,
  changes the process working directory to it, and creates or touches the
  marker file ``new_file_in_working_directory.txt`` inside it.
  """
  base_dir = root_dir if root is None else root
  target_dir = base_dir + project_folder

  # check if your project folder exists. if not, it will be created.
  if not os.path.isdir(target_dir):
    # makedirs also creates missing parents; os.mkdir fails on a nested path like this one
    os.makedirs(target_dir, exist_ok=True)
    print(target_dir + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(target_dir)

  # create a test file to make sure it shows up in the right place
  # (pure-Python equivalent of the shell `touch`, so no IPython shell escape is needed)
  marker_file = 'new_file_in_working_directory.txt'
  with open(marker_file, 'a'):
    pass
  os.utime(marker_file, None)
  print('\nYour working directory was changed to ' + target_dir + \
        "\n\nAn empty text file was created there. You can also run !pwd to confirm the current working directory." )

# Switch into the feature folder so the np.save calls with relative file names write there.
create_and_set_working_directory(project_folder)


Your working directory was changed to /content/drive/My Drive/AI ML Things/University of Turku Research Internship/MFCC_features/

An empty text file was created there. You can also run !pwd to confirm the current working directory.


In [18]:
# Confirm the current working directory.
pwd

'/content/drive/My Drive/AI ML Things/University of Turku Research Internship/MFCC_features'

In [19]:
# Save the training and test feature/label arrays as .npy files.
# The paths are relative, so the files land in the current working directory.
np.save('train_x.npy',x_data)
np.save('train_y.npy',y_data)
np.save('test_x.npy',test_x)
np.save('test_y.npy',test_y)

In [20]:
# Append the unlabelled test rows (label -1) after the training rows, giving one
# feature array and one label array that cover every loaded file.
x_data_mfcc = np.concatenate((x_data, test_x))
y_data_mfcc = np.concatenate((y_data, test_y))

In [21]:
# Shape of the combined feature array.
x_data_mfcc.shape

(831, 40, 1)

In [22]:
# Shape of the combined label array; its length should match the feature array's first dimension.
y_data_mfcc.shape

(831,)

In [23]:
# Save the combined (training + unlabelled test) features and labels.
# The feature file name previously contained a stray space before ".npy",
# so it did not follow the naming of its y_data_mfcc.npy counterpart.
np.save('x_data_mfcc.npy',x_data_mfcc)
np.save('y_data_mfcc.npy',y_data_mfcc)