In [30]:
import librosa
import math
import numpy as np
import os
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
import tensorflow as tf

In [12]:
def featurise_data(folder, time_slice=30, skip_slices=2):
    """Build a table of per-slice MFCC matrices for every MP3 under ``folder``.

    ``folder`` must contain one sub-directory per class; the sub-directory
    name is used as the label. Each song is cut into consecutive
    ``time_slice``-second windows; the first ``skip_slices`` windows and any
    trailing partial window are dropped.

    Parameters
    ----------
    folder : str
        Root directory holding one sub-directory per class.
    time_slice : int, optional
        Window length in seconds (default 30).
    skip_slices : int, optional
        Number of leading windows to ignore for every song (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per window. Column ``0`` is the label (sub-directory name),
        column ``1`` the MFCC matrix returned by ``librosa.feature.mfcc``.
        The frame has shape ``(0, 2)`` when no audio is found.
    """
    records = []
    for genre in os.listdir(folder):
        dir_path = os.path.join(folder, genre)
        if not os.path.isdir(dir_path):
            continue
        for file in os.listdir(dir_path):
            # Match on the extension only: a substring test would also pick
            # up names such as 'song.mp3.part' or 'song.mp3.txt'.
            if not file.lower().endswith('.mp3'):
                continue
            songname = os.path.join(dir_path, file)
            print(f'Featurising song: {songname}')
            duration = librosa.get_duration(filename=songname)
            # Floor division drops the trailing partial window.
            samples = int(duration // time_slice)
            for s in range(skip_slices, samples):
                y, sr = librosa.load(songname, mono=True, offset=s * time_slice, duration=time_slice)
                mfcc = librosa.feature.mfcc(y=y, sr=sr)
                # Keep (label, matrix) as a plain tuple: wrapping the pair in
                # np.array() builds a ragged array, which recent NumPy rejects.
                records.append((genre, mfcc))
    return pd.DataFrame(records, columns=[0, 1])

In [13]:
# Featurise the two pre-split song collections (one sub-folder per raaga).
TRAIN_DIR = '../songs/train/'
TEST_DIR = '../songs/test/'

train = featurise_data(TRAIN_DIR)
test = featurise_data(TEST_DIR)

Featurising song: ../songs/train/mohanam/02-enduku_bAga_teliyadu-mOhanam.mp3
Featurising song: ../songs/train/mohanam/01-ninnu_kOriyunnAnurA-VARNAM-mOhanam.mp3
Featurising song: ../songs/train/mohanam/04-rArA_rAjIva_lOcana_rAma-mOhanam.mp3
Featurising song: ../songs/train/mohanam/09-swagatham_krishna-mohanam.mp3
Featurising song: ../songs/train/mohanam/02-En_paLLI_koNDIrayya-mOhanam-aruNAcala_kavi.mp3
Featurising song: ../songs/train/mohanam/04-pAhi_mAm_pArvati_paramEshvari-mOhanam.mp3
Featurising song: ../songs/train/mohanam/01-Maruvaka_daya_mOhananga_nApai-mOhanam.mp3
Featurising song: ../songs/train/mohanam/06-dhim_dhim_kitataka_dhimta-mOhanam.mp3
Featurising song: ../songs/train/mohanam/06-sadA_pAlaya_sArasAkSi-mOhanam.mp3
Featurising song: ../songs/train/mohanam/04-mayil_vAhanA_vaLLi-mOhanam.mp3
Featurising song: ../songs/train/mohanam/07_giridhara_gOpAla_mOhana_pApanAsam_shivan.mp3
Featurising song: ../songs/train/mohanam/13-jaya_mangaLam_nitya-mOhanam.mp3
Featurising song: ../so

In [14]:
# Peek at the first rows: column 0 is the raaga label, column 1 the MFCC matrix.
train.head()

Unnamed: 0,0,1
0,mohanam,"[[-134.61331, -139.08217, -146.79865, -147.290..."
1,mohanam,"[[-117.09872, -112.08309, -120.157104, -133.98..."
2,mohanam,"[[-119.64694, -138.22232, -177.64699, -209.485..."
3,mohanam,"[[-134.68448, -157.24037, -186.11882, -186.297..."
4,mohanam,"[[-76.544205, -66.21905, -73.61639, -77.88258,..."


In [25]:
# Shape of a single MFCC matrix (row 23 of the feature column).
train[1][23].shape

(20, 1292)

In [18]:
# Column 0 carries the raaga name; everything after it is the feature data.
lab_train = train[0]
lab_test = test[0]
X_original = train.iloc[:, 1:]
X_test = test.iloc[:, 1:]

# Fit the encoder on the training labels only, then apply it to both splits
# so that train and test share the same integer class ids.
model_y = LabelEncoder()
model_y.fit(lab_train)
y_original = model_y.transform(lab_train)
y_test = model_y.transform(lab_test)

# Rows were appended song by song, so shuffle features and labels together
# (fixed seed for reproducibility) before mini-batching.
X_train, y_train = shuffle(X_original, y_original, random_state=42)

In [26]:
# Hyper-parameters for the CNN built in the next cell.
#
# Input geometry: each example is one MFCC matrix of shape
# (buckets, window_length), matching the (20, 1292) shape printed above.
buckets = 20
window_length=1292
channels = 1
# Flattened input size; not referenced by the graph cell visible in this notebook.
n_inputs = buckets * window_length * channels

# Four convolution layers, all stride 1 with 'SAME' padding, so only the
# pooling layers change the spatial size.
conv1_filters = 64
conv1_ksize = 4
conv1_stride = 1
conv1_padding = 'SAME'

conv2_filters = 64
conv2_ksize = 4
conv2_stride = 1
conv2_padding = 'SAME'

conv3_filters = 128
conv3_ksize = 4
conv3_stride = 1
conv3_padding = 'SAME'

conv4_filters = 64
conv4_ksize = 3
conv4_stride = 1
conv4_padding = 'SAME'

# Pool sizes are (rows, columns). With 'VALID' padding the feature map goes
# 20x1292 -> 20x323 -> 20x80 -> 10x20 -> 5x5 across the four pools.
pool1_psize = (1, 4)
pool1_stride = (1, 4)
pool1_padding = 'VALID'

pool2_psize = (1, 4)
pool2_stride = (1, 4)
pool2_padding = 'VALID'

pool3_psize = (2, 4)
pool3_stride = (2, 4)
pool3_padding = 'VALID'

pool4_psize = (2, 4)
pool4_stride = (2, 4)
pool4_padding = 'VALID'

# Widths of the fully connected layers and of the output (logit) layer.
n_fc1 = 256
n_fc2 = 128
n_fc3 = 64
# NOTE(review): hard-coded class count; presumably must equal the number of
# classes seen by the label encoder -- confirm when adding raagas.
n_outputs = 2

# Start from an empty default graph so re-running the graph cell does not
# pile duplicate ops/variables onto the previous one.
tf.reset_default_graph()

In [27]:
# Graph inputs: a batch of MFCC matrices and their integer class ids.
X = tf.placeholder(tf.float32, shape=(None, buckets, window_length), name='X')
# conv2d expects a trailing channel axis, so add one of size `channels`.
X_reshaped = tf.reshape(X, shape=(-1, buckets, window_length, channels))
y = tf.placeholder(tf.int64, shape=(None), name='y')
# Dropout switch. tf.layers.dropout is an identity op unless `training` is
# true, so without this flag the dropout layers below could never be active.
# It defaults to False (inference behaviour); feed {training: True} together
# with X and y when running `training_op` to enable dropout.
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope('cnn'):
    conv1 = tf.layers.conv2d(X_reshaped, filters=conv1_filters, kernel_size=conv1_ksize, strides=conv1_stride,
                             padding=conv1_padding, activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(conv1, pool_size=pool1_psize, strides=pool1_stride, padding=pool1_padding)
    conv2 = tf.layers.conv2d(pool1, filters=conv2_filters, kernel_size=conv2_ksize, strides=conv2_stride,
                             padding=conv2_padding, activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(conv2, pool_size=pool2_psize, strides=pool2_stride, padding=pool2_padding)
    drop1 = tf.layers.dropout(pool2, rate=0.2, training=training)
    conv3 = tf.layers.conv2d(drop1, filters=conv3_filters, kernel_size=conv3_ksize, strides=conv3_stride,
                             padding=conv3_padding, activation=tf.nn.relu)
    pool3 = tf.layers.max_pooling2d(conv3, pool_size=pool3_psize, strides=pool3_stride, padding=pool3_padding)
    conv4 = tf.layers.conv2d(pool3, filters=conv4_filters, kernel_size=conv4_ksize, strides=conv4_stride,
                             padding=conv4_padding, activation=tf.nn.relu)
    pool4 = tf.layers.max_pooling2d(conv4, pool_size=pool4_psize, strides=pool4_stride, padding=pool4_padding)
    drop2 = tf.layers.dropout(pool4, rate=0.2, training=training)
    # Derive the flattened size from the static shape (height * width *
    # filters) instead of hard-coding 5 * 5, so changing the input geometry
    # or pooling configuration does not silently break the reshape.
    flat_dim = int(np.prod(pool4.get_shape().as_list()[1:]))
    drop2_flat = tf.reshape(drop2, shape=(-1, flat_dim))

with tf.name_scope('fc'):
    dense1 = tf.layers.dense(drop2_flat, n_fc1, activation=tf.nn.relu, name='fc1')
    dense2 = tf.layers.dense(dense1, n_fc2, activation=tf.nn.relu, name='fc2')
    dense3 = tf.layers.dense(dense2, n_fc3, activation=tf.nn.relu, name='fc3')
    drop3 = tf.layers.dropout(dense3, rate=0.5, training=training)

with tf.name_scope('output'):
    # Raw class scores; softmax is exposed separately for probabilities.
    logits = tf.layers.dense(drop3, n_outputs, name='output')
    y_prob = tf.nn.softmax(logits, name='y_prob')

with tf.name_scope('train'):
    # The sparse variant takes integer class ids directly (no one-hot needed).
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.AdamOptimizer()
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    # An example is correct when its true class has the highest logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

W0724 14:31:28.687176 4396492224 deprecation.py:323] From <ipython-input-27-f025b35086c4>:7: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
W0724 14:31:28.689610 4396492224 deprecation.py:506] From /Users/prroy/Documents/MachineLearning/speech/speech_env/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0724 14:31:28.886847 4396492224 deprecation.py:323] From <ipython-input-27-f025b35086c4>:8: max_pooling2d (from tensorflow.python.layers.pooling) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.MaxPooling2D instead.
W0724 14:31:29.003696 439649

In [58]:
def get_next_batch(starting_index, batch_size):
    """Return the ``(features, labels)`` mini-batch starting at ``starting_index``.

    Reads the notebook-level ``X_train_final`` and ``y_train``. The final
    batch may be shorter than ``batch_size``: slicing stops at the end of
    each sequence.
    """
    stop = starting_index + batch_size
    features = X_train_final[starting_index:stop]
    labels = y_train[starting_index:stop]
    return (features, labels)

In [56]:
# Column 1 is a pandas column of equally shaped 2-D MFCC matrices; collect
# them into one dense array per split so they can be fed to the placeholder.
X_train_final = np.array(list(X_train[1]))
X_test_final = np.array(list(X_test[1]))

In [59]:
n_epochs = 20
batch_size = 100

with tf.Session() as sess:
    sess.run(init)
    # Number of mini-batches per epoch (the last one may be partial).
    n_batches = math.ceil(len(X_train_final) / batch_size)
    for epoch in range(n_epochs):
        # One pass over the training data in fixed-size steps.
        for batch_start in range(0, n_batches * batch_size, batch_size):
            X_batch, y_batch = get_next_batch(batch_start, batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Accuracy over the whole training set after each epoch.
        acc_train = sess.run(accuracy, feed_dict={X: X_train_final, y: y_train})
        print('epoch: {}, training accuracy: {}'.format(epoch, acc_train))
    # Held-out accuracy, measured once after the final epoch.
    acc_test = sess.run(accuracy, feed_dict={X: X_test_final, y: y_test})
    print('Test set accuracy : {}'.format(acc_test))

epoch: 0, training accuracy: 0.4854312241077423
epoch: 1, training accuracy: 0.6672494411468506
epoch: 2, training accuracy: 0.693473219871521
epoch: 3, training accuracy: 0.7604895234107971
epoch: 4, training accuracy: 0.8525640964508057
epoch: 5, training accuracy: 0.866550087928772
epoch: 6, training accuracy: 0.8904429078102112
epoch: 7, training accuracy: 0.8904429078102112
epoch: 8, training accuracy: 0.9696969985961914
epoch: 9, training accuracy: 0.9801864624023438
epoch: 10, training accuracy: 0.9324009418487549
epoch: 11, training accuracy: 0.9836829900741577
epoch: 12, training accuracy: 0.9714452028274536
epoch: 13, training accuracy: 0.8916084170341492
epoch: 14, training accuracy: 0.939976692199707
epoch: 15, training accuracy: 0.9790209531784058
epoch: 16, training accuracy: 0.9912587404251099
epoch: 17, training accuracy: 0.9737762212753296
epoch: 18, training accuracy: 0.9900932312011719
epoch: 19, training accuracy: 0.9807692170143127
epoch: 20, training accuracy: 0.9

In [13]:
# Gradient-boosting baseline on summary features.
#
# X_train / X_test hold one MFCC matrix per row (column 1), which
# StandardScaler cannot consume. Reduce every matrix to the per-coefficient
# median over time -- the same feature vector predict_raaga() feeds to this
# model -- so that fitting and song-level prediction agree.
X_train_gb = np.array([np.median(mfcc, axis=1) for mfcc in X_train[1]])
X_test_gb = np.array([np.median(mfcc, axis=1) for mfcc in X_test[1]])

model = Pipeline([('scaler', StandardScaler()),
                  ('gb', GradientBoostingClassifier(learning_rate=0.05, n_estimators=200, random_state=42))])
model.fit(X_train_gb, y_train)
# Slice-level accuracy on the held-out songs.
np.mean(model.predict(X_test_gb) == y_test)

0.6707317073170732

In [18]:
def predict_raaga(model, songname, time_slice=30, skip_slices=2):
    """Predict the class of one song by majority vote over its slices.

    The song is cut into consecutive ``time_slice``-second windows; the first
    ``skip_slices`` windows and any trailing partial window are ignored. Each
    remaining window is summarised as the per-coefficient median of its MFCC
    matrix and classified by ``model``.

    Parameters
    ----------
    model : object
        Fitted estimator with a ``predict`` method that returns non-negative
        integer class ids.
    songname : str
        Path to the audio file.
    time_slice : int, optional
        Window length in seconds (default 30).
    skip_slices : int, optional
        Number of leading windows to ignore (default 2).

    Returns
    -------
    int
        The class id predicted for the most windows (smallest id on ties).

    Raises
    ------
    ValueError
        If the song is too short to yield any window.
    """
    duration = librosa.get_duration(filename=songname)
    # Floor division drops the trailing partial window.
    samples = int(duration // time_slice)
    features = []
    for s in range(skip_slices, samples):
        y, sr = librosa.load(songname, mono=True, offset=s * time_slice, duration=time_slice)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        features.append(np.median(mfcc, axis=1))
    if not features:
        # Previously this surfaced as an opaque argmax-of-empty-sequence error.
        raise ValueError(
            f'{songname!r} is too short: need more than '
            f'{skip_slices * time_slice} seconds of audio to predict')
    # Classify all windows in one call, then take the most frequent class id.
    preds = model.predict(np.array(features))
    return np.argmax(np.bincount(preds))

In [19]:
def run_predictions(model, folder):
    """Print the actual class and the predicted class id for every MP3 under ``folder``.

    ``folder`` must contain one sub-directory per class; the sub-directory
    name is reported as the actual class, and the prediction is whatever
    ``predict_raaga`` returns for the file.

    Parameters
    ----------
    model : object
        Fitted estimator passed through to ``predict_raaga``.
    folder : str
        Root directory holding one sub-directory per class.
    """
    for genre in os.listdir(folder):
        dir_path = os.path.join(folder, genre)
        if not os.path.isdir(dir_path):
            continue
        for file in os.listdir(dir_path):
            # Match on the extension only: a substring test would also pick
            # up names such as 'song.mp3.part'.
            if not file.lower().endswith('.mp3'):
                continue
            pred = predict_raaga(model, os.path.join(dir_path, file))
            print(f'Actual = {genre}, Prediction = {pred}')

In [20]:
# Song-level check on the held-out songs: prints the folder name (actual
# raaga) next to the predicted class id for each file.
run_predictions(model, '../songs/test/')

Actual = mohanam, Prediction = 1
Actual = mohanam, Prediction = 0
Actual = mohanam, Prediction = 1
Actual = mohanam, Prediction = 1
Actual = mohanam, Prediction = 1
Actual = mohanam, Prediction = 1
Actual = mohanam, Prediction = 1
Actual = hindolam, Prediction = 1
Actual = hindolam, Prediction = 0
Actual = hindolam, Prediction = 1
Actual = hindolam, Prediction = 0
Actual = hindolam, Prediction = 1
Actual = hindolam, Prediction = 1


In [24]:
import pickle

# Persist the fitted pipeline to the working directory.
with open('raaga_model_1.pkl', 'wb') as f:
    pickle.dump(model, f)

In [27]:
# Reload the pickled pipeline as `p_mod`.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or otherwise trust.
with open('raaga_model_1.pkl', 'rb') as f:
    p_mod = pickle.load(f)