In [102]:
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython
import IPython.display as ipd  # To play sound in the notebook

In [103]:
def audio_norm(data):
    """Peak-normalise a signal to an amplitude of (just under) 0.5.

    Args:
        data: numpy array of samples.

    Returns:
        ``data`` divided by its largest absolute sample (plus ``1e-6``, which
        keeps the division finite for an all-zero signal) and then halved.
    """
    peak = np.abs(data).max()
    return data / (peak + 1e-6) * 0.5

In [104]:
class Config(object):
    """Bundle of hyper-parameters plus the derived input shape.

    Derived attributes:
        audio_length: ``sampling_rate * audio_duration`` (samples per clip).
        dim: ``(n_mfcc, 1 + floor(audio_length / 512), 1)`` when ``use_mfcc``
            is true, otherwise ``(audio_length, 1)``.
    """

    def __init__(self,
                 sampling_rate=16000, audio_duration=2, n_classes=41,
                 use_mfcc=False, n_folds=10, learning_rate=0.0001,
                 max_epochs=50, n_mfcc=40, datagen_num = 2):
        # Audio / feature settings.
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.use_mfcc = use_mfcc
        self.n_mfcc = n_mfcc

        # Training settings.
        self.n_classes = n_classes
        self.n_folds = n_folds
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs

        # Number of augmented copies generated per training file.
        self.datagen_num = datagen_num

        # Derived: clip length in samples and the resulting input shape.
        self.audio_length = self.sampling_rate * self.audio_duration
        if not self.use_mfcc:
            self.dim = (self.audio_length, 1)
        else:
            # 512 presumably mirrors librosa's default MFCC hop length -- TODO confirm.
            n_frames = 1 + int(np.floor(self.audio_length/512))
            self.dim = (self.n_mfcc, n_frames, 1)

In [105]:
def _random_crop_or_pad(signal, target_length):
    """Return ``signal`` cropped or zero-padded to exactly ``target_length`` samples.

    A longer signal is cropped starting at a random offset; a shorter one is
    zero-padded, with a random share of the padding placed in front.
    """
    n_samples = len(signal)
    if n_samples > target_length:
        offset = np.random.randint(n_samples - target_length)
        return signal[offset:offset + target_length]
    if n_samples < target_length:
        offset = np.random.randint(target_length - n_samples)
    else:
        offset = 0
    return np.pad(signal, (offset, target_length - n_samples - offset), "constant")


def prepare_data(df, config, data_dir):
    """Build pitch-shift-augmented MFCC features and labels for training.

    Each file named in ``df.index`` is loaded from ``data_dir``, peak-normalised
    with ``audio_norm`` and turned into ``config.datagen_num`` samples. Every
    sample is pitch-shifted by a random whole number of 1/24-octave steps in
    [-3, 3], randomly cropped or zero-padded to ``config.audio_length`` samples
    and converted to MFCCs.

    Args:
        df: DataFrame indexed by file name with a ``label_idx`` column.
        config: ``Config`` instance; ``config.dim`` must describe the MFCC
            shape (i.e. it was built with ``use_mfcc=True``).
        data_dir: directory prefix, concatenated directly with the file name
            (so it needs its trailing separator).

    Returns:
        ``(X, y)`` where ``X`` has shape
        ``(len(df) * datagen_num, dim[0], dim[1], 1)`` and ``y`` holds the
        matching ``label_idx`` values; copies of one file occupy consecutive
        rows.
    """
    n_copies = config.datagen_num
    n_files = df.shape[0]
    X = np.empty(shape=(n_files * n_copies, config.dim[0], config.dim[1], 1))
    y = np.empty(n_files * n_copies)
    input_length = config.audio_length

    # Pitch-shift augmentation settings (loop-invariant).
    bins_per_octave = 24        # resolution of one shift step
    pitch_range = 3.0 / 24      # max shift in octaves; float literal so Python 2 does not floor it to 0
    max_steps = pitch_range * bins_per_octave

    labels = df["label_idx"]
    for i, fname in enumerate(df.index):
        print(fname+' ({0}/{1})'.format(i,df.shape[0]))
        file_path = data_dir + fname
        data, _ = librosa.load(file_path, sr=config.sampling_rate)
        data = audio_norm(data)
        # Positional lookup: df is indexed by file name, so plain labels[i]
        # would rely on pandas' deprecated integer-position fallback.
        label = labels.iloc[i]
        for j in range(n_copies):
            # Round to the nearest whole step. floor(x + 0.5) rounds correctly
            # for negative values, whereas int(x + 0.5) truncates toward zero,
            # which over-samples 0 and never yields -max_steps.
            n_steps = int(np.floor(np.random.uniform(-max_steps, max_steps) + 0.5))
            # Keyword arguments: librosa >= 0.10 no longer accepts these positionally.
            shifted_data = librosa.effects.pitch_shift(
                y=data, sr=config.sampling_rate,
                n_steps=n_steps, bins_per_octave=bins_per_octave)
            # Random offset / padding to the fixed clip length.
            shifted_data = _random_crop_or_pad(shifted_data, input_length)
            # MFCC features with a trailing channel axis.
            features = librosa.feature.mfcc(
                y=shifted_data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
            row = i * n_copies + j
            X[row, :] = np.expand_dims(features, axis=-1)
            y[row] = label
    return X, y

In [106]:
def prepare_test_data(config, data_dir='../input/audio_test/', csv_path='../input/test.csv'):
    """Build MFCC features for every file listed in the test CSV.

    Each file is loaded, peak-normalised with ``audio_norm``, randomly cropped
    or zero-padded to ``config.audio_length`` samples and converted to MFCCs.
    Files that load as empty are replaced by silence of the target length.

    Args:
        config: ``Config`` instance; ``config.dim`` must describe the MFCC
            shape (i.e. it was built with ``use_mfcc=True``).
        data_dir: directory prefix, concatenated directly with the file name
            (so it needs its trailing separator).
        csv_path: CSV file with an ``fname`` column listing the test files.

    Returns:
        Array of shape ``(n_files, dim[0], dim[1], 1)`` in CSV row order.
    """
    df = pd.read_csv(csv_path)
    test_data = np.empty(shape=(df.shape[0], config.dim[0], config.dim[1], 1))
    input_length = config.audio_length
    for i, fname in enumerate(df['fname']):
        print(fname)
        file_path = data_dir + fname
        data, _ = librosa.load(file_path, sr=config.sampling_rate)
        if len(data) == 0:
            # Empty recording: substitute silence sized from the config
            # rather than a hard-coded 88200 samples.
            data = np.zeros(input_length)
        data = audio_norm(data)
        # Random offset / padding to the fixed clip length.
        n_samples = len(data)
        if n_samples > input_length:
            offset = np.random.randint(n_samples - input_length)
            data = data[offset:offset + input_length]
        else:
            if n_samples < input_length:
                offset = np.random.randint(input_length - n_samples)
            else:
                offset = 0
            data = np.pad(data, (offset, input_length - n_samples - offset), "constant")
        # Keyword arguments: librosa >= 0.10 no longer accepts the signal positionally.
        data = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        test_data[i] = np.expand_dims(data, axis=-1)
    return test_data

In [107]:
# Settings for this run: MFCC features, 5 augmented copies per training file.
config = Config(
    sampling_rate=44100,
    audio_duration=2,
    n_folds=5,
    learning_rate=0.001,
    use_mfcc=True,
    n_mfcc=40,
    datagen_num=5,
)

# Training metadata, keyed by file name.
train = pd.read_csv("../input/train.csv")

# Label vocabulary in order of first appearance, and its name -> integer id mapping.
LABELS = list(train["label"].unique())
label_idx = dict(zip(LABELS, range(len(LABELS))))

train = train.set_index("fname")
train["label_idx"] = train["label"].map(label_idx)

In [108]:
#test_data = prepare_test_data(config, data_dir='../input/audio_test/')

In [109]:
#np.save('../data_gen/test_mfcc.npy',test_data)
#del test_data

In [None]:
# Build the augmented training features and labels (prints one progress line per file).
X_train, y = prepare_data(df=train, config=config, data_dir='../input/audio_train/')

00044347.wav (0/9473)
001ca53d.wav (1/9473)
002d256b.wav (2/9473)
0033e230.wav (3/9473)
00353774.wav (4/9473)
003b91e8.wav (5/9473)
003da8e5.wav (6/9473)
0048fd00.wav (7/9473)
004ad66f.wav (8/9473)
0063ab88.wav (9/9473)
006f2f32.wav (10/9473)
0075d39c.wav (11/9473)
00780200.wav (12/9473)
0079d310.wav (13/9473)
0091fc7f.wav (14/9473)
0097160c.wav (15/9473)
00ad7068.wav (16/9473)
00c5808a.wav (17/9473)
00c82919.wav (18/9473)
00c934d7.wav (19/9473)
00c9e799.wav (20/9473)
00cb787c.wav (21/9473)
00ce569f.wav (22/9473)
00d1fe46.wav (23/9473)
00d3bba3.wav (24/9473)
00d40fa2.wav (25/9473)
00d9fa61.wav (26/9473)
00e2b4cd.wav (27/9473)
00f88dc5.wav (28/9473)
00fbb28b.wav (29/9473)
00fcbab2.wav (30/9473)
010aa387.wav (31/9473)
011a2185.wav (32/9473)
0120d246.wav (33/9473)
01235a12.wav (34/9473)
01257aad.wav (35/9473)
01302128.wav (36/9473)
013264d3.wav (37/9473)
013c3135.wav (38/9473)
01506d76.wav (39/9473)
015cf474.wav (40/9473)
0160d55e.wav (41/9473)
01638f61.wav (42/9473)
0172a2a5.wav (43/9473

09bc3033.wav (346/9473)
09ca1e09.wav (347/9473)
09ea5276.wav (348/9473)
0a037e96.wav (349/9473)
0a0a8d4c.wav (350/9473)
0a15b36b.wav (351/9473)
0a277f11.wav (352/9473)
0a2a5c05.wav (353/9473)
0a2b4c80.wav (354/9473)
0a32271b.wav (355/9473)
0a366772.wav (356/9473)
0a484e9f.wav (357/9473)
0a49afad.wav (358/9473)
0a54c770.wav (359/9473)
0a6bba04.wav (360/9473)
0a6dbf2c.wav (361/9473)
0a82b4d6.wav (362/9473)
0a8ac55d.wav (363/9473)
0a8d300c.wav (364/9473)
0a925754.wav (365/9473)
0a98104d.wav (366/9473)
0aa17642.wav (367/9473)
0aa32376.wav (368/9473)
0aa7edd0.wav (369/9473)
0aab722d.wav (370/9473)
0aad0a16.wav (371/9473)
0aaed52b.wav (372/9473)
0ac443ad.wav (373/9473)
0ac83c13.wav (374/9473)
0acba147.wav (375/9473)
0ad0bf22.wav (376/9473)
0ad5618e.wav (377/9473)
0ad666e3.wav (378/9473)
0ade0819.wav (379/9473)
0af32053.wav (380/9473)
0afca134.wav (381/9473)
0afcf36b.wav (382/9473)
0aff3cde.wav (383/9473)
0b05208c.wav (384/9473)
0b142b38.wav (385/9473)
0b1f22c3.wav (386/9473)
0b2a3ef0.wav (38

1393c9e0.wav (688/9473)
1397ddc8.wav (689/9473)
139c42cb.wav (690/9473)
13ac472f.wav (691/9473)
13b2325b.wav (692/9473)
13ca1116.wav (693/9473)
13da16ae.wav (694/9473)
13db0700.wav (695/9473)
13e15a49.wav (696/9473)
13ee6c44.wav (697/9473)
13f08982.wav (698/9473)
140027c6.wav (699/9473)
1401c7ff.wav (700/9473)
14041f9a.wav (701/9473)
141291fc.wav (702/9473)
1415b8b9.wav (703/9473)
14225f23.wav (704/9473)
142cb2c4.wav (705/9473)
142dac40.wav (706/9473)
14303ca5.wav (707/9473)
1430f062.wav (708/9473)
14344787.wav (709/9473)
143f37eb.wav (710/9473)
1441c25b.wav (711/9473)
14577fc0.wav (712/9473)
14683a1c.wav (713/9473)
14684ee8.wav (714/9473)
1468f23e.wav (715/9473)
146d5e25.wav (716/9473)
146fd7fd.wav (717/9473)
1472a525.wav (718/9473)
147f4395.wav (719/9473)
147f61ff.wav (720/9473)
14819552.wav (721/9473)
148a199b.wav (722/9473)
1498eb6e.wav (723/9473)
149ca673.wav (724/9473)
14a12f6c.wav (725/9473)
14ac7cbd.wav (726/9473)
14b489cd.wav (727/9473)
14b4f917.wav (728/9473)
14c22798.wav (72

1c520cae.wav (1029/9473)
1c590c46.wav (1030/9473)
1c603b9f.wav (1031/9473)
1c67a5b9.wav (1032/9473)
1c6b07b8.wav (1033/9473)
1c720526.wav (1034/9473)
1c76a229.wav (1035/9473)
1c78595a.wav (1036/9473)
1c8943c0.wav (1037/9473)
1c8cddc2.wav (1038/9473)
1c9a423f.wav (1039/9473)
1ca0bd2f.wav (1040/9473)
1ca4d107.wav (1041/9473)
1cc13604.wav (1042/9473)
1cc55db4.wav (1043/9473)
1cccb873.wav (1044/9473)
1cd6919e.wav (1045/9473)
1ce38807.wav (1046/9473)
1ce4a00e.wav (1047/9473)
1ce70b78.wav (1048/9473)
1ced478c.wav (1049/9473)
1cf3ee46.wav (1050/9473)
1d08bd6d.wav (1051/9473)
1d0c860b.wav (1052/9473)
1d119cfc.wav (1053/9473)
1d1d0d72.wav (1054/9473)
1d23c74b.wav (1055/9473)
1d23fb6d.wav (1056/9473)
1d24518b.wav (1057/9473)
1d26c5e5.wav (1058/9473)
1d2808dd.wav (1059/9473)
1d28b9cb.wav (1060/9473)
1d395faa.wav (1061/9473)
1d3a05ab.wav (1062/9473)
1d491960.wav (1063/9473)
1d542616.wav (1064/9473)
1d5a6f5f.wav (1065/9473)
1d5d63d1.wav (1066/9473)
1d73907a.wav (1067/9473)
1d8502f5.wav (1068/9473)


255db1a8.wav (1357/9473)
256bc358.wav (1358/9473)
256ce8b0.wav (1359/9473)
25794ec7.wav (1360/9473)
258777c8.wav (1361/9473)
2588c966.wav (1362/9473)
258b6166.wav (1363/9473)
259c3671.wav (1364/9473)
25a3f8ba.wav (1365/9473)
25a4dda4.wav (1366/9473)
25a82ad1.wav (1367/9473)
25a8c380.wav (1368/9473)
25ad6ed7.wav (1369/9473)
25bca056.wav (1370/9473)
25c11b5d.wav (1371/9473)
25c89ec0.wav (1372/9473)
25c99ad7.wav (1373/9473)
25cc1f6a.wav (1374/9473)
25cd1d74.wav (1375/9473)
25eef09a.wav (1376/9473)
25f26095.wav (1377/9473)
25f4273e.wav (1378/9473)
260161b5.wav (1379/9473)
260da1b4.wav (1380/9473)
260ffd74.wav (1381/9473)
260fffaf.wav (1382/9473)
261ac300.wav (1383/9473)
261b8615.wav (1384/9473)
2620d32b.wav (1385/9473)
2624b75b.wav (1386/9473)
26262fd9.wav (1387/9473)
2627affd.wav (1388/9473)
2632db24.wav (1389/9473)
2634b73c.wav (1390/9473)
264301a3.wav (1391/9473)
264574a5.wav (1392/9473)
2647f749.wav (1393/9473)
2648b632.wav (1394/9473)
2655d941.wav (1395/9473)
2658f202.wav (1396/9473)


2e762cca.wav (1685/9473)
2e7a4dc2.wav (1686/9473)
2e8579ad.wav (1687/9473)
2e8d6e34.wav (1688/9473)
2e95fa3d.wav (1689/9473)
2e97a149.wav (1690/9473)
2e992414.wav (1691/9473)
2ea318e7.wav (1692/9473)
2ea5702d.wav (1693/9473)
2eb1e5f9.wav (1694/9473)
2eb83a0b.wav (1695/9473)
2ed3c63d.wav (1696/9473)
2ed52423.wav (1697/9473)
2ed7a267.wav (1698/9473)
2ed838e9.wav (1699/9473)
2ee0f61c.wav (1700/9473)
2ee73a9d.wav (1701/9473)
2eeac352.wav (1702/9473)
2f06441d.wav (1703/9473)
2f17a4e8.wav (1704/9473)
2f1bb802.wav (1705/9473)
2f22d8b0.wav (1706/9473)
2f2429ad.wav (1707/9473)
2f2d2e7f.wav (1708/9473)
2f3515c2.wav (1709/9473)
2f3ba7ab.wav (1710/9473)
2f3ca1dc.wav (1711/9473)
2f45db21.wav (1712/9473)
2f4d09b2.wav (1713/9473)
2f4f25e0.wav (1714/9473)
2f50de25.wav (1715/9473)
2f5d12f2.wav (1716/9473)
2f5fe8ea.wav (1717/9473)
2f6c4dec.wav (1718/9473)
2f6f44fa.wav (1719/9473)
2f77f7c4.wav (1720/9473)
2f84b9ec.wav (1721/9473)
2f9c5f62.wav (1722/9473)
2fb2de13.wav (1723/9473)
2fb7c05e.wav (1724/9473)


In [None]:
# Persist the generated features and labels.
# NOTE(review): each file name contains a literal backslash ('3\\24' is the
# three characters 3, \, 24). On Windows that is presumably treated as a path
# separator -- confirm this is the intended location before renaming anything.
for _path, _array in (
    ('../data_gen/train_mfcc_pitch=3\\24_num=5.npy', X_train),
    ('../data_gen/label_pitch=3\\24_num=5.npy', y),
):
    np.save(_path, _array)
