In [148]:
import numpy as np
import pandas as pd
import IPython.display as ipd
import scipy.io.wavfile as wav
from scipy.fftpack import dct
from zipfile import ZipFile
import matplotlib.pyplot as plt
from scipy.stats import skew
import os,glob
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

#### The function below returns a DataFrame containing every audio file path and its corresponding label

In [4]:
def generate_path(base_path):
    """Collect the .wav files of every dialect folder into one table.

    For each entry of the module-level ``dialects`` list, the folder name is
    formed by appending the entry directly to ``base_path`` and every
    ``*.wav`` file inside it is listed.

    Parameters
    ----------
    base_path : str
        Path prefix the dialect label is appended to.

    Returns
    -------
    pandas.DataFrame
        Columns ``FILE`` (path of the audio file) and ``OUTPUT`` (the dialect
        label it was found under), with a fresh 0..n-1 index.
    """
    per_dialect = []
    for label in dialects:
        wav_files = glob.glob(base_path + str(label) + '/*.wav')
        frame = pd.DataFrame()
        frame["FILE"] = wav_files
        # Every file found in this folder carries the folder's label.
        frame["OUTPUT"] = [label] * len(wav_files)
        per_dialect.append(frame)
    combined = pd.concat(per_dialect)
    return combined.reset_index(drop=True)

In [21]:
def generate_audio_frames(fs,audio,frameSize,frameOverlap):
    """Slice a 1-D signal into fixed-length frames taken at a regular step.

    Parameters
    ----------
    fs : int or float
        Sample rate in samples per second.
    audio : array-like, 1-D
        Signal samples.
    frameSize : float
        Frame duration in seconds (converted to ``round(fs * frameSize)`` samples).
    frameOverlap : float
        Despite the name, this is used as the spacing in seconds between the
        starts of consecutive frames (``round(fs * frameOverlap)`` samples).

    Returns
    -------
    frames : numpy.ndarray, shape (numberOfFrames, frameLen)
        Float array; the tail of the signal is zero-padded just enough for
        the last frame to be complete.
    frameLen : int
        Number of samples per frame.

    Raises
    ------
    ValueError
        If the frame length or the frame step rounds to less than one sample.
    """
    frameLen, frameLap = int(round(fs*frameSize)), int(round(fs*frameOverlap))
    if frameLen <= 0 or frameLap <= 0:
        raise ValueError("frame length and frame step must both be at least one sample")

    audio = np.asarray(audio)
    audioLen = len(audio)
    if audioLen <= frameLen:
        # Signal shorter than one frame: pad up to exactly one full frame
        # instead of returning zero frames.
        paddingLen = frameLen - audioLen
    else:
        # Pad only up to the next step boundary; nothing is added when the
        # signal already fits a whole number of steps.
        paddingLen = (frameLen - audioLen) % frameLap

    # Concatenating with float zeros also promotes integer PCM samples to float.
    paddedAudio = np.concatenate((audio, np.zeros(paddingLen)), axis = 0)
    numberOfFrames = (len(paddedAudio) - frameLen)//frameLap + 1

    # Row i of `indices` holds the sample positions of frame i.
    starts = np.arange(numberOfFrames) * frameLap
    indices = starts[:, None] + np.arange(frameLen)[None, :]
    frames = paddedAudio[indices]
    return frames, frameLen

In [6]:
def apply_hamming(frames,frameLen):
    """Taper every frame with a Hamming window of ``frameLen`` points.

    The window is multiplied against ``frames`` by broadcasting, so the last
    axis of ``frames`` has to hold ``frameLen`` samples. A new array is
    returned; the input is left untouched.
    """
    window = np.hamming(frameLen)
    return frames * window

In [25]:
def power_spectrum(frames, nfft):
    """Return the power spectrum of each frame.

    A real FFT of length ``nfft`` is taken along the last axis, and the
    squared magnitude of every bin is divided by ``nfft``.
    """
    spectrum = np.fft.rfft(frames, nfft)
    magnitude = np.absolute(spectrum)
    return np.square(magnitude) / nfft

In [8]:
def generate_filter_bank(fs,num_banks,nfft):
    """Build a bank of triangular filters spaced evenly on the mel scale.

    Parameters
    ----------
    fs : int or float
        Sample rate; the filters span 0 Hz up to ``fs / 2``.
    num_banks : int
        Number of triangular filters.
    nfft : int
        FFT size used for the spectrum the filters will be applied to.

    Returns
    -------
    numpy.ndarray, shape (num_banks, floor(nfft / 2 + 1))
        One row per filter, one column per FFT bin.
    """
    # Upper edge of the bank: mel value of fs / 2. The lower edge is 0 mel.
    top_mel = 2595 * np.log10(1 + (fs / 2) / 700)
    edges_mel = np.linspace(0, top_mel, num_banks + 2)
    edges_hz = 700 * (np.power(10, edges_mel / 2595) - 1)
    # FFT bin (kept as float) that each filter edge falls into.
    edge_bins = np.floor((nfft + 1) * edges_hz / fs)

    n_bins = int(np.floor(nfft / 2 + 1))
    bank = np.zeros((num_banks, n_bins))
    for row in range(num_banks):
        lo, mid, hi = edge_bins[row], edge_bins[row + 1], edge_bins[row + 2]

        # Rising slope: bins from the left edge up to (not including) the centre.
        rising = np.arange(int(lo), int(mid))
        if rising.size:
            bank[row, rising] = (rising - lo) / (mid - lo)

        # Falling slope: bins from the centre up to (not including) the right edge.
        falling = np.arange(int(mid), int(hi))
        if falling.size:
            bank[row, falling] = (hi - falling) / (hi - mid)
    return bank

In [9]:
def apply_filter_banks(power_frames,fbank):
    """Project power spectra onto the filter bank and take a scaled log.

    Each frame in ``power_frames`` is multiplied against the transposed
    ``fbank``; the result is returned as ``20 * log10(energy)``.
    """
    energies = np.dot(power_frames, fbank.T)
    # Exact zeros would give -inf under the log, so swap them for machine epsilon.
    tiny = np.finfo(float).eps
    safe_energies = np.where(energies == 0, tiny, energies)
    return np.log10(safe_energies) * 20

In [27]:
def generate_mfcc(filterBanks,num_ceps):
    """Turn log filter-bank energies into cepstral coefficients.

    An orthonormal type-2 DCT is taken along axis 1. Coefficient 0 is
    discarded and the following ``num_ceps`` coefficients are returned, so
    the result has at most ``num_ceps`` columns.
    """
    cepstra = dct(filterBanks, type = 2, axis = 1, norm = 'ortho')
    kept = slice(1, num_ceps + 1)
    return cepstra[:, kept]

In [23]:
def audio_feature_extraction(audio_path,frameLen,frameOverlap,nfft,num_fbanks,num_ceps):
    """Run the whole MFCC pipeline on a single WAV file.

    The file is read with ``scipy.io.wavfile.read`` and then passed, in
    order, through framing, Hamming windowing, power spectrum, filter-bank
    projection and the DCT step.

    Parameters
    ----------
    audio_path : str
        Location of the WAV file.
    frameLen, frameOverlap : float
        Forwarded to ``generate_audio_frames`` as its ``frameSize`` and
        ``frameOverlap`` arguments.
    nfft : int
        FFT size for the power spectrum and the filter bank.
    num_fbanks : int
        Number of filters in the bank.
    num_ceps : int
        Number of cepstral coefficients requested from ``generate_mfcc``.

    Returns
    -------
    The array produced by ``generate_mfcc`` (one row per frame).
    """
    sample_rate, samples = wav.read(audio_path)
    frames, samples_per_frame = generate_audio_frames(sample_rate, samples, frameLen, frameOverlap)
    windowed = apply_hamming(frames, samples_per_frame)
    spectrum = power_spectrum(windowed, nfft)
    mel_filters = generate_filter_bank(sample_rate, num_fbanks, nfft)
    log_energies = apply_filter_banks(spectrum, mel_filters)
    return generate_mfcc(log_energies, num_ceps)

In [162]:
# Frame duration in seconds; generate_audio_frames multiplies it by the sample rate.
framelength = 0.025
# Passed as frameOverlap, which generate_audio_frames uses as the spacing (in seconds)
# between the starts of consecutive frames.
frameoverlap = 0.015
# FFT size handed to power_spectrum and generate_filter_bank.
nfft = 512
# Number of triangular filters built by generate_filter_bank.
num_fbanks = 40
# Number of cepstral coefficients requested per frame from generate_mfcc.
num_ceps = 20
# NOTE(review): N is not referenced by any of the visible cells — confirm it is still needed.
N = 2
# Labels iterated by generate_path; each one is appended to dataset_path to form a folder name.
dialects = [1,2,3,4,5,6,7,8,9]
# Path prefix; generate_path appends the label directly, e.g. "dataset/IDR" + "9".
dataset_path = "dataset/IDR"

In [169]:
# Index every .wav file found under the dialect folders (columns: FILE, OUTPUT).
dataset = generate_path(dataset_path)
# Peek at the last rows to check that the final dialect folder was picked up.
dataset.tail()

Unnamed: 0,FILE,OUTPUT
598,dataset/IDR9/18.wav,9
599,dataset/IDR9/59.wav,9
600,dataset/IDR9/17.wav,9
601,dataset/IDR9/35.wav,9
602,dataset/IDR9/61.wav,9


In [58]:
# Smoke test: run the full feature pipeline on a single file of the index.
num = 0
test = dataset["FILE"][num]
mfcc = audio_feature_extraction(test,framelength,frameoverlap,nfft,num_fbanks,num_ceps)

In [150]:
# Number of coefficients in the first frame, followed by the size of the file index.
# NOTE(review): the saved output shows 12 while num_ceps is set to 20 in the config
# cell — the output looks like it came from an earlier run; re-run to confirm.
print(len(mfcc[0]))
dataset.shape

12


(603, 2)

In [181]:
def _per_column(reducer):
    """Wrap a NumPy reducer so that it is always applied along axis 0."""
    def apply(x):
        return reducer(x, axis=0)
    return apply


# Summary statistics computed for every MFCC column of a file. The insertion
# order of the keys fixes the order of the generated feature columns.
features = {
    "mean" : _per_column(np.mean),
    "min" : _per_column(np.min),
    "max" : _per_column(np.max),
    "std_dev" : _per_column(np.std),
    "median" : _per_column(np.median),
}

In [182]:
def generate_model_data(df,size=100,random_state=None):
    """Build the per-file feature table used to train the classifier.

    The rows of ``df`` are shuffled, the first ``size`` of them are kept, and
    for each kept file the MFCC matrix is reduced column-wise with every
    statistic in the module-level ``features`` dict. The framing and MFCC
    settings are read from the module-level configuration variables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``FILE`` (audio path) and ``OUTPUT`` (label).
    size : int, default 100
        Maximum number of files to process after shuffling.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value to get the same
        shuffle (and therefore the same subset and row order) on every run;
        the default ``None`` keeps the shuffle random.

    Returns
    -------
    pandas.DataFrame
        One row per processed file. Feature columns are named
        ``<statistic><coefficient index>`` (e.g. ``mean1`` .. ``median20``),
        followed by an ``OUTPUT`` column holding the label.
    """
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # After the index reset, the first `size` positions are the rows to keep.
    subset = shuffled.iloc[:max(int(size), 0)]

    # Column names follow the same statistic-major order as the concatenation below.
    col = [key + str(i) for key in features for i in range(1, num_ceps + 1)]

    rows = []
    y = []
    for path, label in zip(subset["FILE"], subset["OUTPUT"]):
        mfcc = audio_feature_extraction(path,framelength,frameoverlap,nfft,num_fbanks,num_ceps)
        rows.append(np.concatenate([stat(mfcc) for stat in features.values()]))
        y.append(label)

    # Passing the names at construction keeps an empty selection valid
    # (an empty frame with the right columns) instead of raising.
    feat = pd.DataFrame(rows, columns=col)
    feat["OUTPUT"] = y
    return feat

In [183]:
# Extract the feature table for (up to) 602 randomly ordered files.
# NOTE(review): the file index was reported as 603 rows, so size=602 leaves one
# randomly chosen file out — confirm whether that is intended.
df = generate_model_data(dataset,602)

In [184]:
# Rows = processed files; columns = generated feature columns plus OUTPUT.
df.shape

(602, 101)

In [185]:
# Preview the first ten feature rows together with their labels.
df.head(10)

Unnamed: 0,mean1,mean2,mean3,mean4,mean5,mean6,mean7,mean8,mean9,mean10,...,median12,median13,median14,median15,median16,median17,median18,median19,median20,OUTPUT
0,96.105814,35.659211,31.947512,5.271002,-4.942159,-26.103701,-12.069546,4.803206,-9.014876,6.191544,...,-8.792896,-5.051871,-9.717639,-6.860387,1.913253,2.481,-2.579916,-1.415435,-1.865956,3
1,112.153247,42.361315,26.638163,-8.232559,6.052879,-15.713844,-10.765235,1.095043,-8.108846,-5.471334,...,-5.934792,-8.281207,-6.418569,-3.604072,1.073097,0.551462,3.014841,0.826201,4.20275,8
2,114.619305,18.359297,15.874928,-12.08946,-11.840493,-24.281016,-6.279295,7.301229,0.087926,-4.27005,...,-10.282062,-7.911186,-6.249562,-0.816586,0.65556,-0.242446,-0.945582,1.800636,5.499386,3
3,94.862938,-3.804672,16.268195,-3.742158,-7.277673,-3.190895,1.872777,-1.385073,0.660522,-2.464818,...,1.275249,-2.739007,-12.393611,-1.460398,-0.96552,-2.239507,0.184897,3.164308,0.23006,6
4,86.885441,29.019849,33.784252,12.603544,12.129251,-9.454431,-0.51133,2.40832,-2.119199,-0.158092,...,-1.593917,-4.838419,-11.600838,-5.093907,-4.416873,-0.492907,-3.500187,0.848095,-5.765922,9
5,98.951095,27.835802,12.799852,-1.229706,-26.647396,1.835163,-17.298287,-13.607219,6.311309,-8.151298,...,-14.142438,-4.064457,-1.986249,4.216432,3.734326,6.353738,1.959758,4.461541,3.742916,2
6,109.668851,7.643882,20.165221,5.126716,0.414881,1.975192,-15.693223,-2.974378,-1.84784,-8.245727,...,-9.428364,-2.999093,-8.200524,-8.453538,-0.644155,0.887107,-3.261713,0.429562,-4.614444,9
7,118.209336,23.444513,45.50105,0.060234,-9.259417,-17.057902,-0.788409,-5.208525,-19.082823,5.854077,...,-7.659059,-9.909797,-18.483485,-6.659902,-3.190911,-4.173672,-5.834776,-2.239423,-3.952699,3
8,105.670879,26.954744,9.287845,-6.248522,-28.940836,-15.460778,-8.929155,-9.430842,-4.717956,-0.185971,...,-11.219678,-5.945586,-8.984128,5.92763,3.624589,0.454481,7.618336,8.439105,8.23779,2
9,95.149153,24.733778,11.254078,-4.302771,-13.420981,-15.99363,-10.077809,-0.860684,-6.900108,-2.574305,...,-3.717224,-0.4725,-12.222422,0.249381,0.063199,0.503055,-0.793519,5.022891,0.515883,8


In [186]:
# Hold out 30% of the rows for evaluation. No seed is passed, so the
# partition differs from run to run.
data_train, data_test = train_test_split(df, test_size=0.3)

# Feature matrices: every column except the label.
x_train = data_train.drop(columns=['OUTPUT'])
x_test = data_test.drop(columns=['OUTPUT'])

# Targets, kept as single-column DataFrames.
y_train = data_train[['OUTPUT']]
y_test = data_test[['OUTPUT']]

In [187]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same transform is then applied to the held-out rows.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

clf = SVC(gamma='auto')
# The estimator expects a 1-D label vector. np.ravel flattens a single-column
# frame (and leaves an already 1-D vector as is), which avoids the
# column-vector conversion warning raised during fit.
clf.fit(x_train, np.ravel(y_train))

y_pred = clf.predict(x_test)
# Mean accuracy on the held-out rows.
clf.score(x_test, np.ravel(y_test))

  y = column_or_1d(y, warn=True)


0.988950276243094