In [76]:
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython
import IPython.display as ipd  # To play sound in the notebook

In [77]:
def audio_norm(data):
    """Peak-normalise a signal so its largest absolute value is just under 0.5.

    Parameters
    ----------
    data : array-like supporting NumPy arithmetic
        Samples to rescale. The input is not modified.

    Returns
    -------
    The rescaled samples, ``data / (peak + 1e-6) * 0.5`` where ``peak`` is
    the largest absolute value found in ``data``.
    """
    peak = np.abs(data).max()
    # The small epsilon keeps an all-zero signal from dividing by zero.
    return data / (peak + 1e-6) * 0.5

In [78]:
class Config(object):
    """Container for the audio, feature and training settings of one run.

    Besides storing the constructor arguments as attributes, two values are
    derived:

    * ``audio_length`` -- ``sampling_rate * audio_duration``.
    * ``dim`` -- ``(n_mfcc, 1 + floor(audio_length / 512), 1)`` when
      ``use_mfcc`` is true, otherwise ``(audio_length, 1)``.
    """

    def __init__(
        self,
        sampling_rate=16000,
        audio_duration=2,
        n_classes=41,
        use_mfcc=False,
        n_folds=10,
        learning_rate=0.0001,
        max_epochs=50,
        n_mfcc=40,
        datagen_num=2,
    ):
        # Plain copies of the constructor arguments.
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_classes = n_classes
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.datagen_num = datagen_num

        # Derived values.
        self.audio_length = self.sampling_rate * self.audio_duration
        if not self.use_mfcc:
            self.dim = (self.audio_length, 1)
            return
        # NOTE(review): 512 presumably mirrors the hop length used when the
        # MFCCs are computed -- confirm against the feature extraction code.
        n_frames = 1 + int(np.floor(self.audio_length / 512))
        self.dim = (self.n_mfcc, n_frames, 1)

In [79]:
def prepare_data(df, config, data_dir, stretch_rate=1.1):
    """Build an augmented MFCC training set from the clips listed in ``df``.

    Each clip is loaded at ``config.sampling_rate``, peak-normalised with
    ``audio_norm`` and expanded into ``config.datagen_num`` variants. Every
    variant is randomly time-stretched, randomly cropped or zero-padded to
    ``config.audio_length`` samples and converted to MFCCs.

    MFCCs are always computed here, whatever ``config.use_mfcc`` says, and the
    output buffer is sized from ``config.dim``.
    NOTE(review): this presumably requires a config built with
    ``use_mfcc=True`` so that ``config.dim`` matches the MFCC shape -- confirm.

    The augmentation uses NumPy's global random state; seed ``np.random``
    beforehand for reproducible output.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by audio file name, with a ``label_idx`` column.
    config : object
        Must provide ``sampling_rate``, ``audio_length``, ``datagen_num``,
        ``n_mfcc`` and ``dim``.
    data_dir : str
        Prefix prepended to each file name by plain string concatenation, so
        it must already end with a path separator.
    stretch_rate : float, optional
        Largest speed-up factor. The actual rate of each variant is drawn
        log-uniformly from ``[1 / stretch_rate, stretch_rate]``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(len(df) * datagen_num, config.dim[0], config.dim[1], 1)``.
    y : numpy.ndarray
        1-D float array of label indices. Row ``i * datagen_num + j`` of both
        arrays belongs to variant ``j`` of the ``i``-th row of ``df``.

    Raises
    ------
    ValueError
        If ``stretch_rate`` is not positive.
    """
    if stretch_rate <= 0:
        raise ValueError("stretch_rate must be positive")

    n_files = df.shape[0]
    n_variants = config.datagen_num
    input_length = config.audio_length

    X = np.empty(shape=(n_files * n_variants, config.dim[0], config.dim[1], 1))
    y = np.empty(n_files * n_variants)

    # Labels are paired with file names positionally; indexing the Series with
    # an integer would be treated as a label lookup on a string index.
    for i, (fname, label) in enumerate(zip(df.index, df['label_idx'])):
        print(fname+' {0}/{1})'.format(i, n_files))
        data, _ = librosa.load(data_dir + fname, sr=config.sampling_rate)
        data = audio_norm(data)

        for j in range(n_variants):
            # Disabled augmentation, kept for reference -- pitch shift:
            #bpo = 24 #how many steps per octave
            #pr = 0.3 #pitch shift range
            #ps = int(np.random.uniform(-pr * bpo, pr * bpo) + 0.5) #how many (fractional) half-steps to shift y
            #shifted_data = librosa.effects.pitch_shift(data, config.sampling_rate, n_steps = ps, bins_per_octave = bpo)

            # Time stretch. Using stretch_rate itself as the base makes the
            # draw log-uniform over exactly [1/stretch_rate, stretch_rate], so
            # speeding up and slowing down are equally likely.
            rate = stretch_rate ** np.random.uniform(-1.0, 1.0)
            # ``rate`` is passed by keyword: newer librosa releases reject it
            # as a positional argument.
            shifted_data = librosa.effects.time_stretch(data, rate=rate)

            # Disabled augmentation, kept for reference -- white noise:
            #wnvr = 0.05 # white noise volume range
            #wnv  = np.random.uniform(0, wnvr) # white noise volume, random
            #shifted_data += np.random.uniform(-wnv, wnv, shifted_data.shape)

            # Random offset / padding to exactly ``input_length`` samples.
            surplus = len(shifted_data) - input_length
            if surplus > 0:
                offset = np.random.randint(surplus)
                shifted_data = shifted_data[offset:offset + input_length]
            else:
                missing = -surplus
                # randint(0) is invalid, so an exact fit needs no random draw.
                offset = np.random.randint(missing) if missing > 0 else 0
                shifted_data = np.pad(shifted_data, (offset, missing - offset), "constant")

            # MFCC; the signal is passed as ``y=`` because newer librosa
            # releases make it keyword-only.
            features = librosa.feature.mfcc(y=shifted_data, sr=config.sampling_rate,
                                            n_mfcc=config.n_mfcc)
            row = i * n_variants + j
            X[row] = np.expand_dims(features, axis=-1)
            y[row] = label
    return X, y

In [80]:
def prepare_test_data(config, data_dir='../input/audio_test/', csv_path='../input/test.csv'):
    """Compute one MFCC feature map per clip listed in the test CSV.

    Each clip is loaded at ``config.sampling_rate``, peak-normalised with
    ``audio_norm``, randomly cropped or zero-padded to ``config.audio_length``
    samples and converted to MFCCs. Files that load as empty are replaced by
    silence of the target length.

    The crop/pad offset comes from NumPy's global random state, so results are
    only reproducible if ``np.random`` is seeded beforehand.

    Parameters
    ----------
    config : object
        Must provide ``sampling_rate``, ``audio_length``, ``n_mfcc`` and
        ``dim``.
    data_dir : str, optional
        Prefix prepended to each file name by plain string concatenation, so
        it must already end with a path separator.
    csv_path : str, optional
        CSV file with an ``fname`` column listing the clips to process.

    Returns
    -------
    numpy.ndarray
        Shape ``(n_clips, config.dim[0], config.dim[1], 1)``, rows in the same
        order as the CSV.
    """
    df = pd.read_csv(csv_path)
    input_length = config.audio_length
    test_data = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1], 1))

    for i, fname in enumerate(df['fname']):
        print(fname)
        data, _ = librosa.load(data_dir + fname, sr=config.sampling_rate)
        if len(data) == 0:
            # Empty file: substitute silence that already has the target
            # length, whatever sampling rate / duration the config uses.
            data = np.zeros(input_length)
        data = audio_norm(data)

        # Random offset / padding to exactly ``input_length`` samples.
        surplus = len(data) - input_length
        if surplus > 0:
            offset = np.random.randint(surplus)
            data = data[offset:offset + input_length]
        else:
            missing = -surplus
            # randint(0) is invalid, so an exact fit needs no random draw.
            offset = np.random.randint(missing) if missing > 0 else 0
            data = np.pad(data, (offset, missing - offset), "constant")

        # The signal is passed as ``y=`` because newer librosa releases make
        # it keyword-only.
        features = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        test_data[i] = np.expand_dims(features, axis=-1)
    return test_data

In [81]:
# Run settings: 2-second clips at 44.1 kHz, 40 MFCC coefficients and
# 5 augmented variants per training file.
config = Config(
    sampling_rate=44100,
    audio_duration=2,
    n_folds=5,
    learning_rate=0.001,
    use_mfcc=True,
    n_mfcc=40,
    datagen_num=5,
)

# Training metadata: one row per clip, with `fname` and `label` columns.
train = pd.read_csv("../input/train.csv")

# Give every distinct label an integer id, in order of first appearance.
LABELS = list(train.label.unique())
label_idx = dict(zip(LABELS, range(len(LABELS))))

# Index by file name and attach the integer label id to each clip.
train = train.set_index("fname")
train["label_idx"] = train.label.map(lambda name: label_idx[name])

In [82]:
#test_data = prepare_test_data(config, data_dir='../input/audio_test/')

In [83]:
#np.save('../data_gen/test_mfcc.npy',test_data)
#del test_data

In [None]:
# Build the augmented MFCC training arrays; one progress line is printed per file.
X_train, y = prepare_data(df=train, config=config, data_dir='../input/audio_train/')

00044347.wav
001ca53d.wav
002d256b.wav
0033e230.wav
00353774.wav
003b91e8.wav
003da8e5.wav
0048fd00.wav
004ad66f.wav
0063ab88.wav
006f2f32.wav
0075d39c.wav
00780200.wav
0079d310.wav
0091fc7f.wav
0097160c.wav
00ad7068.wav
00c5808a.wav
00c82919.wav
00c934d7.wav
00c9e799.wav
00cb787c.wav
00ce569f.wav
00d1fe46.wav
00d3bba3.wav
00d40fa2.wav
00d9fa61.wav
00e2b4cd.wav
00f88dc5.wav
00fbb28b.wav
00fcbab2.wav
010aa387.wav
011a2185.wav
0120d246.wav
01235a12.wav
01257aad.wav
01302128.wav
013264d3.wav
013c3135.wav
01506d76.wav
015cf474.wav
0160d55e.wav
01638f61.wav
0172a2a5.wav
017ea24e.wav
01811e48.wav
0184c390.wav
018863f5.wav
018a10bb.wav
018b1df6.wav
018d1dc4.wav
0193042e.wav
01974c7c.wav
019aae9d.wav
019d2a2c.wav
01a36643.wav
01a39e95.wav
01a59a61.wav
01a59c11.wav
01a5dc85.wav
01b9f44a.wav
01c2f88b.wav
01d2475c.wav
01d4dafd.wav
01df7ada.wav
01e723f5.wav
01ec7a01.wav
01ee18fd.wav
01f2e70b.wav
01fc4661.wav
020eb9f6.wav
021f8009.wav
022092bc.wav
02267a1a.wav
02274ee8.wav
022a3507.wav
022cc908.wav

11e3ffde.wav
11e63946.wav
11e9db25.wav
11f33ae4.wav
11f7fc13.wav
11f8b3dd.wav
11fab0b0.wav
11fbd6c2.wav
120536d6.wav
1207a675.wav
12091a74.wav
120e0846.wav
121110b1.wav
12169b86.wav
121749b9.wav
12198b88.wav
121d766e.wav
1220471f.wav
122b0e88.wav
12323e04.wav
12371b67.wav
1238c3a4.wav
123d0d60.wav
124360b8.wav
124eadc9.wav
125194b0.wav
125e64b7.wav
125e6ccf.wav
1264b8c7.wav
129439fd.wav
129dc328.wav
12a38093.wav
12a650a4.wav
12cb4097.wav
12d3cdd8.wav
12d43904.wav
12d4ec7c.wav
12e4c120.wav
12f72b79.wav
12fcf7a7.wav
1301b8e3.wav
13043bab.wav
1304d545.wav
1314cdad.wav
131b1516.wav
1329c048.wav
13314233.wav
134df575.wav
1356f987.wav
1365348d.wav
1373fdab.wav
1374fcda.wav
13798190.wav
13847ff3.wav
1387ba89.wav
13903b82.wav
1391883f.wav
1393c9e0.wav
1397ddc8.wav
139c42cb.wav
13ac472f.wav
13b2325b.wav
13ca1116.wav
13da16ae.wav
13db0700.wav
13e15a49.wav
13ee6c44.wav
13f08982.wav
140027c6.wav
1401c7ff.wav
14041f9a.wav
141291fc.wav
1415b8b9.wav
14225f23.wav
142cb2c4.wav
142dac40.wav
14303ca5.wav

22cafa2d.wav
22cc1654.wav
22cf5ded.wav
22d8346b.wav
22e1c02c.wav
22e814ee.wav
22e83dcb.wav
22f85886.wav
231b7c90.wav
23219d68.wav
23226fdd.wav
232cc8b2.wav
2336285e.wav
233fc9e5.wav
234a0656.wav
234f4672.wav
234f95ec.wav
23663788.wav
236648d4.wav
236b8093.wav
236cbab1.wav
23704fa9.wav
2371f627.wav
2372eeb4.wav
23786ab2.wav
237b9e2a.wav
237e1078.wav
2383be9f.wav
238b16a5.wav
238fb6b8.wav
23912d0e.wav
23925f36.wav
2392ad36.wav
2398cede.wav
23a1a82e.wav
23a3984c.wav
23b6d45c.wav
23b7d8f5.wav
23b8788a.wav
23bc06b9.wav
23c0de2c.wav
23c11bee.wav
23cb1f38.wav
23cdff07.wav
23e8ac90.wav
23ebf391.wav
23f38a7e.wav
23fb24e8.wav
240590c8.wav
2405de39.wav
24202032.wav
242113be.wav
24218e25.wav
2424ad60.wav
2439dedd.wav
244025d2.wav
24503135.wav
245a503a.wav
245e58ea.wav
2462c802.wav
246ad992.wav
246e8414.wav
24779e2d.wav
24791b50.wav
247ac56f.wav
2492a915.wav
249912b5.wav
2499fad5.wav
24a3c430.wav
24addbbc.wav
24b90fac.wav
24ba0ef2.wav
24bae325.wav
24cb1e29.wav
24dfdfa5.wav
24e1d9ad.wav
24e250f8.wav

3486d1db.wav
34881c5f.wav
349ff282.wav
34a1086e.wav
34a90030.wav
34b2a4f7.wav
34b432be.wav
34bb94e8.wav
34c01286.wav
34c5db34.wav
34d0a285.wav
34d3e8bc.wav
34dacafb.wav
34dd9426.wav
34e04b95.wav
34e4d187.wav
34f10443.wav
35039ae1.wav
3505e582.wav
3518605b.wav
351d0b53.wav
35270d67.wav
35271da5.wav
352875e7.wav
353b359f.wav
354bd271.wav
354c7009.wav
354cabb9.wav
35571af7.wav
3564bae2.wav
35669b76.wav
356f6f26.wav
35717b0f.wav
35721b6a.wav
357a5b9a.wav
357f7e95.wav
358139e5.wav
35839982.wav
35844050.wav
3586bc2d.wav
3592e056.wav
35950597.wav
35a38d01.wav
35accc11.wav
35ad5cd8.wav
35bd35da.wav
35d75ab7.wav
35d7ac24.wav
35e0b4ea.wav
35e97db3.wav
35f05c5d.wav
35f0e452.wav
35f2639c.wav
35f589b8.wav
35f6b357.wav
35fb9b2e.wav
35fd6638.wav
3600d08e.wav
360174e1.wav
3601d225.wav
361230e1.wav
36177faa.wav
3618cb03.wav
3623e99e.wav
3626eabd.wav
36315bea.wav
36342286.wav
363c41b3.wav
3640c702.wav
3649d60c.wav
3650faee.wav
36514399.wav
365bee4a.wav
365d9809.wav
3662e614.wav
366664bb.wav
36698953.wav

4517a4af.wav
45243c3d.wav
45250e90.wav
4526e386.wav
452a9812.wav
4541aa66.wav
45449f84.wav
4546e002.wav
4549ffaf.wav
4551f231.wav
455377bf.wav
45539540.wav
45539bb8.wav
455aee34.wav
45665f2e.wav
456cc727.wav
458b1992.wav
458ea6cc.wav
4590ba37.wav
45910ae0.wav
45a8823e.wav
45a8d396.wav
45b9bc91.wav
45bf81f1.wav
45c077da.wav
45ce1b24.wav
45d405ca.wav
45d71502.wav
45da8553.wav
45de3233.wav
45e241c1.wav
45e5a952.wav
45e63fdd.wav
45e6e129.wav
45ea531e.wav
45f3e225.wav
4601c0f2.wav
46076458.wav
460d7b62.wav
461558c3.wav
4619c0dd.wav
461c6cd1.wav
4629e77b.wav
462b676f.wav
4636fd73.wav
464c651e.wav
465cf6eb.wav
46657625.wav
46670055.wav
466a45d5.wav
46732e56.wav
46768bc1.wav
46772c0d.wav
467953c6.wav
46798d97.wav
467ad774.wav
467c86b7.wav
468226fa.wav
468f9c4c.wav
469e4ebe.wav
46a95319.wav
46b52f97.wav
46b66e03.wav
46c0a7cd.wav
46d3877b.wav
46d7d98d.wav
46e13bac.wav
46fbedcf.wav
470e36b8.wav
470f4da3.wav
4713dac4.wav
4715ec32.wav
4715f79a.wav
4716c05b.wav
47178b5e.wav
471c8fa5.wav
471ecdc0.wav

54f4ed65.wav
54f86efe.wav
54f89ec3.wav
54feec8f.wav
5503dbff.wav
5503e01b.wav
55040d8b.wav
5506629f.wav
55087d4d.wav
550cae67.wav
5531a5e1.wav
553245ef.wav
5543631a.wav
5554d95d.wav
5558f338.wav


In [None]:
# Persist the generated features and their labels so later notebooks can
# load them without recomputing the augmentation.
np.save(file='../data_gen/train_mfcc_stretch=1.1_num=5.npy', arr=X_train)
np.save(file='../data_gen/label_stretch=1.1_num=5.npy', arr=y)
