In [11]:
import os
import pandas as pd
import warnings
import librosa
import matplotlib.pyplot as plt
import numpy as np
import random
import shutil
import keras.backend as K
from pydub import AudioSegment
from keras.preprocessing.image import ImageDataGenerator
from keras import layers
from keras.layers import (Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, 
                          Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D, Dropout)
from keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from keras.initializers import glorot_uniform

warnings.filterwarnings('ignore')

In [12]:
os.makedirs('audio/genres', exist_ok=True)

df_tracks = pd.read_csv('gdrive/My Drive/tracks.csv', header=1)
df_tracks.rename(columns = {df_tracks.columns[0]:df_tracks.iloc[0][0]}, inplace=True)
df_tracks.drop(labels=0, axis=0, inplace=True)
df_tracks = df_tracks[['track_id', 'genre_top']]
df_tracks['track_id'] = df_tracks['track_id'].astype('str')

lst_track_id = list()
for folder in os.listdir('audio/fma_small'):
    if folder.isnumeric():
        for filename in os.listdir(os.path.join('audio/fma_small', folder)):
            if filename.split('.')[0].isnumeric():
                lst_track_id.append(str(int(filename.split('.')[0])))
df_track_id = pd.DataFrame({'track_id':lst_track_id})

df_track_genres = pd.merge(df_track_id, df_tracks, on='track_id')

genres = 'Hip-Hop Electronic Instrumental International Folk Rock Experimental Pop'.split()

for g in genres:
    os.makedirs(os.path.join('audio/genres', g), exist_ok=True)

lst_track_id = list()
for folder in os.listdir('audio/fma_small'):
    if folder.isnumeric():
        for filename in os.listdir(os.path.join('audio/fma_small', folder)):
            if filename.split('.')[0].isnumeric():
                
                g = df_track_genres.loc[df_track_genres['track_id'] == str(int(filename.split('.')[0]))]['genre_top'].iloc[0]
              
                try:  
                    song = AudioSegment.from_mp3(os.path.join('audio/fma_small', folder, filename))
                except:
                  pass
                
                for w in range(10):
                    
                    t1 = 3*w*1000  
                    t2 = 3*(w+1)*1000
                    
                    songSegment = song[t1:t2]

                    songSegment.export(f'audio/genres/{g}/{filename}_{str(w)}.wav', format='wav')                

In [None]:
os.makedirs('audio/spectrograms', exist_ok=True)

for g in genres:
  os.makedirs(f'audio/spectrograms/{g}', exist_ok=True)

for g in genres:
    
    i = 0
    for audio_3s_file in os.listdir(f'audio/genres/{g}'):
        try:
          i+=1
          
          y, sr=librosa.load(os.path.join(f'audio/genres/{g}', audio_3s_file), duration=3)
        except:
          pass
        
        try:
          mels = librosa.feature.melspectrogram(y=y, sr=sr)
        except:
          pass
        
        fig = plt.Figure()
        p = plt.imshow(librosa.power_to_db(mels,ref=np.max))

        plt.savefig(f'audio/spectrograms/{g}/' + audio_3s_file.split('.')[0] + '.png')
        
        if i == 250:
          break
      

In [29]:
os.makedirs('audio/train', exist_ok=True)
os.makedirs('audio/test', exist_ok=True)

for g in genres:
  os.makedirs(f'audio/train/{g}', exist_ok=True)

for g in genres:
  for spec_file in os.listdir(f'audio/spectrograms/{g}'):
    shutil.copy(os.path.join(f'audio/spectrograms/{g}', spec_file), os.path.join(f'audio/train/{g}/', spec_file))

for g in genres:
  os.makedirs(f'audio/test/{g}', exist_ok=True)
  spectrogram_files = os.listdir(f'audio/train/{g}')
  random.shuffle(spectrogram_files)
  test_spec_files = spectrogram_files[0:int(0.1*len(spectrogram_files))]
  for spec_file in test_spec_files:
    shutil.move(os.path.join(f'audio/train/{g}', spec_file), f'audio/test/{g}')

In [38]:
train_dir = "audio/train/"
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(train_dir,target_size=(288,432),color_mode="rgba",class_mode='categorical',batch_size=128)

validation_dir = "audio/test/"
vali_datagen = ImageDataGenerator(rescale=1./255)
vali_generator = vali_datagen.flow_from_directory(validation_dir,target_size=(288,432),color_mode='rgba',class_mode='categorical',batch_size=128)


Found 1610 images belonging to 8 classes.
Found 175 images belonging to 8 classes.


In [39]:
def GenreModel(input_shape = (288,432,4),classes=8):
  
  X_input = Input(input_shape)

  X = Conv2D(8,kernel_size=(3,3),strides=(1,1))(X_input)
  X = BatchNormalization(axis=3)(X)
  X = Activation('relu')(X)
  X = MaxPooling2D((2,2))(X)
  
  X = Conv2D(16,kernel_size=(3,3),strides = (1,1))(X)
  X = BatchNormalization(axis=3)(X)
  X = Activation('relu')(X)
  X = MaxPooling2D((2,2))(X)
  
  X = Conv2D(32,kernel_size=(3,3),strides = (1,1))(X)
  X = BatchNormalization(axis=3)(X)
  X = Activation('relu')(X)
  X = MaxPooling2D((2,2))(X)

  X = Conv2D(64,kernel_size=(3,3),strides=(1,1))(X)
  X = BatchNormalization(axis=-1)(X)
  X = Activation('relu')(X)
  X = MaxPooling2D((2,2))(X)
  
  X = Conv2D(128,kernel_size=(3,3),strides=(1,1))(X)
  X = BatchNormalization(axis=-1)(X)
  X = Activation('relu')(X)
  X = MaxPooling2D((2,2))(X)

  
  X = Flatten()(X)
  
  X = Dropout(rate=0.3)(X)

  X = Dense(classes, activation='softmax', name='fc' + str(classes))(X)

  model = Model(inputs=X_input,outputs=X,name='GenreModel')

  return model

In [None]:
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val
  
model = GenreModel(input_shape=(288,432,4),classes=8)
opt = Adam(learning_rate=0.0005)
model.compile(optimizer = opt,loss='categorical_crossentropy',metrics=['accuracy',get_f1]) 

model.fit_generator(train_generator,epochs=70,validation_data=vali_generator)