In [1]:
import os
import numpy as np
import pandas as pd



In [2]:
import librosa

In [3]:
import matplotlib.pyplot as plt

In [4]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [5]:
from tensorflow.keras.models import Sequential


In [6]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [7]:
from tensorflow.keras.utils import to_categorical


In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
def extract_features(file_path, n_mfcc=40):
    """Summarise one audio file as a fixed-length MFCC vector.

    The file is decoded with ``librosa.load`` (``kaiser_fast`` resampling),
    ``n_mfcc`` Mel-Frequency Cepstral Coefficients are computed, and the
    coefficients are averaged so that every clip yields one vector.
    Used here on the .wav clips of the GTZAN dataset.

    Parameters
    ----------
    file_path : path of the audio file to read.
    n_mfcc : number of MFCC coefficients to compute (default 40).

    Returns
    -------
    1-D array holding the mean of each coefficient.
    """
    signal, sr = librosa.load(file_path, res_type='kaiser_fast')
    coefficients = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    # Transpose, then reduce over axis 0: one mean per coefficient.
    transposed = coefficients.T
    return np.mean(transposed, axis=0)

In [10]:
# Walk the GTZAN folder layout (<dataset_path>/<genre>/<clip>.wav) and build a
# list of [mfcc_feature_vector, genre_name] records, one per audio clip.
dataset_path = './Data/genres_original/'
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
data = []
for genre in genres:
    genre_dir = os.path.join(dataset_path, genre)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order -- and therefore the seeded train/test split -- reproducible.
    for file in sorted(os.listdir(genre_dir)):
        # Skip anything that is not a .wav clip (e.g. .DS_Store or notebook
        # checkpoint folders) so a stray entry cannot crash feature extraction.
        if not file.lower().endswith('.wav'):
            continue
        file_path = os.path.join(genre_dir, file)
        features = extract_features(file_path)
        data.append([features, genre])



In [11]:
data

[[array([-1.1359882e+02,  1.2157067e+02, -1.9162262e+01,  4.2363941e+01,
         -6.3622661e+00,  1.8621931e+01, -1.3699734e+01,  1.5339802e+01,
         -1.2274304e+01,  1.0970945e+01, -8.3260612e+00,  8.8020878e+00,
         -3.6699412e+00,  5.7446756e+00, -5.1627831e+00,  7.5170636e-01,
         -1.6878542e+00, -4.0872991e-01, -2.3026767e+00,  1.2224671e+00,
         -3.5328746e+00, -1.1397806e+00, -4.2828279e+00, -4.2260842e+00,
          9.1518766e-01,  9.1406381e-01, -5.7618070e+00, -3.9328036e+00,
          1.5094346e+00,  2.6984656e+00,  5.6560731e+00, -3.2690079e+00,
         -2.0606375e-01, -2.9996979e+00,  4.4763169e+00, -4.7685498e-01,
          6.0062852e+00, -5.9690375e-02, -3.4585848e+00, -1.8418322e+00],
        dtype=float32),
  'blues'],
 [array([-2.0752383e+02,  1.2398514e+02,  8.9470186e+00,  3.5867149e+01,
          2.9095948e+00,  2.1519472e+01, -8.5565128e+00,  2.3370676e+01,
         -1.0103608e+01,  1.1899242e+01, -5.5588241e+00,  5.3778763e+00,
         -2.23

In [12]:
# Build a two-column DataFrame from the extracted records: 'feature' holds each
# clip's MFCC vector (one NumPy array per cell) and 'genre' holds its genre name.
df = pd.DataFrame(data,columns=['feature','genre'])

In [13]:
# Persist the DataFrame to Music_Data.csv in the working directory.
# NOTE(review): each 'feature' cell is a NumPy array, so it is written as its
# string representation, and the row index is written as an extra unnamed
# column -- confirm this file is only meant for inspection, not for reloading
# the features.
df.to_csv("Music_Data.csv")

In [14]:
#Let us encode the labels into numerical values using LabelEncoder

In [15]:
# Encoder that maps genre names to integer class ids; the prediction helpers
# reuse it to turn a predicted class id back into a genre name.
label_encoder = LabelEncoder()

In [16]:
# Fit the encoder on the genre names and replace them with integer class ids.
# NOTE(review): this overwrites df['genre'] in place, so re-running the cell
# refits the encoder on the integer ids and inverse_transform would then return
# numbers instead of genre names -- restart the kernel and run all if that happens.
df['genre'] = label_encoder.fit_transform(df['genre'])

In [17]:
df

Unnamed: 0,feature,genre
0,"[-113.59882, 121.57067, -19.162262, 42.36394, ...",0
1,"[-207.52383, 123.98514, 8.947019, 35.86715, 2....",0
2,"[-90.757164, 140.44087, -29.084547, 31.686693,...",0
3,"[-199.57513, 150.0861, 5.663404, 26.855278, 1....",0
4,"[-160.35417, 126.20948, -35.581394, 22.139256,...",0
...,...,...
994,"[-153.63004, 109.904785, -23.091644, 59.014496...",9
995,"[-142.41621, 116.20546, -32.160263, 49.11151, ...",9
996,"[-124.989655, 115.18234, -47.985886, 52.820343...",9
997,"[-225.03336, 123.657265, -9.745124, 56.613846,...",9


In [18]:
# Prepare the data for the CNN.
# Stack the per-clip MFCC vectors into one 2-D matrix (one row per clip) and
# collect the integer genre ids.
X = np.array(df['feature'].tolist())
y = np.array(df['genre'].tolist())
# One-hot encode the labels: the column of the true genre is 1, all others 0.
y = to_categorical(y)

# Hold out 20% of the clips for evaluation. Stratifying on the integer genre
# ids keeps every genre proportionally represented in both splits, which
# matters with only ~100 clips per genre.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df['genre']
)
# Conv2D expects (height, width, channels) per sample, so append two singleton
# axes: (n_samples, n_mfcc) -> (n_samples, n_mfcc, 1, 1).
X_train = X_train[..., np.newaxis, np.newaxis]
X_test = X_test[..., np.newaxis, np.newaxis]

In [19]:
y

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [20]:
X.shape

(999, 40)

In [45]:
#Now let us build the CNN model 

In [21]:
model = Sequential([
    # Architecture: four Conv2D -> MaxPooling2D -> Dropout stages with 32, 64,
    # 128 and 256 filters, followed by a dense classifier head.
    #  * Conv2D learns local patterns in the MFCC vector; ReLU adds the
    #    non-linearity.
    #  * padding='same' is used everywhere because the second spatial axis has
    #    size 1: without padding a 3x3 kernel or 2x2 pool would not fit and
    #    Keras would report a negative output dimension.
    #  * MaxPooling2D((2, 2)) with 'same' padding halves the MFCC axis
    #    (rounding up) at every stage.
    #  * Dropout(0.3) zeroes 30% of the activations during training to limit
    #    overfitting.

    # Declare the input with an explicit Input object instead of passing
    # input_shape to the first layer, which Keras warns against for Sequential
    # models. Each sample has shape (n_mfcc, 1, 1).
    tf.keras.Input(shape=(X_train.shape[1], 1, 1)),

    Conv2D(32, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2), padding='same'),
    Dropout(0.3),

    Conv2D(64, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2), padding='same'),
    Dropout(0.3),

    Conv2D(128, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2), padding='same'),
    Dropout(0.3),

    Conv2D(256, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2), padding='same'),
    Dropout(0.3),

    # Flatten the feature maps into one vector per sample for the dense head.
    Flatten(),
    # Fully connected layer of 256 units.
    Dense(256, activation="relu"),
    Dropout(0.3),
    # Output layer: one unit per genre; softmax turns the scores into a
    # probability distribution over the genres.
    Dense(len(genres), activation="softmax")
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
# Configure training: Adam optimiser, categorical cross-entropy (the labels
# were one-hot encoded with to_categorical), and accuracy as the reported metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [23]:

model.summary()

In [24]:
# Train the model for 30 epochs in mini-batches of 32 samples.
# The held-out test split doubles as the validation set, so the val_* metrics
# are measured on the same data that would otherwise serve as the final test set.
# NOTE(review): no TensorFlow random seed is set in the visible cells, so the
# reported metrics will differ from run to run.
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.1392 - loss: 2.2922 - val_accuracy: 0.2050 - val_loss: 2.1554
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2196 - loss: 2.0984 - val_accuracy: 0.2650 - val_loss: 1.9754
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3066 - loss: 1.8947 - val_accuracy: 0.3300 - val_loss: 1.8706
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3301 - loss: 1.8700 - val_accuracy: 0.3450 - val_loss: 1.7941
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3554 - loss: 1.7372 - val_accuracy: 0.3650 - val_loss: 1.7542
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3526 - loss: 1.7401 - val_accuracy: 0.4350 - val_loss: 1.6670
Epoch 7/30
[1m25/25[0m [32m━━━━

In [25]:
# Predicting the genre
def predict_genre(file_path):
    """Return the predicted genre name for the audio file at ``file_path``.

    Relies on three notebook globals: ``extract_features`` to compute the MFCC
    vector, ``model`` to score it, and ``label_encoder`` to map the winning
    class index back to a genre name.
    """
    mfcc_vector = extract_features(file_path)
    # Reshape to (1, n_features, 1, 1): a batch of one sample in the layout the CNN expects.
    batch = mfcc_vector.reshape(1, -1, 1, 1)
    class_scores = model.predict(batch)
    best_class = np.argmax(class_scores)
    return label_encoder.inverse_transform([best_class])[0]

In [34]:
# Try the trained model on a local MP3 file and print the genre it predicts.
# The path is relative to the notebook's working directory.
file_path = './#Sarpattaparambarai-Neeya oli 4K Video song.mp3'
predicted_genre = predict_genre(file_path)
print(f'The predicted genre is: {predicted_genre}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
The predicted genre is: hiphop


In [57]:
# Wrap the whole training pipeline in a single function that the website can call:
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import pickle
def music_predictor(dataset_path='./Data/genres_original/',
                    model_path='./model_final.pickle',
                    epochs=30, batch_size=32):
    """Train the genre-classification CNN end to end and pickle it to disk.

    Steps: extract a frame-averaged MFCC vector from every .wav clip under
    ``dataset_path/<genre>/``, encode the genre labels, split the data 80/20,
    train the CNN and write the fitted model to ``model_path``.

    Parameters
    ----------
    dataset_path : root folder containing one sub-folder per genre.
    model_path : file the trained model is pickled to.
    epochs : number of training epochs.
    batch_size : mini-batch size used during training.

    Returns
    -------
    None. The only result is the pickle file written to ``model_path``.

    Notes
    -----
    The label encoder is not saved. LabelEncoder numbers classes in sorted
    label order, which is the order of the (alphabetical) ``genres`` list
    below, so class index ``i`` corresponds to ``genres[i]``.
    """
    def music_features(file_path, n_mfcc=40):
        """Return the frame-averaged MFCC vector (length ``n_mfcc``) of one clip."""
        audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0)

    genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
    data = []
    for genre in genres:
        genre_dir = os.path.join(dataset_path, genre)
        # os.listdir order is arbitrary; sorting keeps the row order (and so
        # the seeded split below) reproducible between runs and machines.
        for file in sorted(os.listdir(genre_dir)):
            # Ignore stray non-audio entries so they cannot crash librosa.
            if not file.lower().endswith('.wav'):
                continue
            file_path = os.path.join(genre_dir, file)
            data.append([music_features(file_path), genre])

    # One row per clip: MFCC vector plus genre name.
    df = pd.DataFrame(data, columns=['feature', 'genre'])

    # Encode the genre names as integer class ids.
    label_encoder = LabelEncoder()
    df['genre'] = label_encoder.fit_transform(df['genre'])

    # Feature matrix (one row per clip) and one-hot encoded labels.
    X = np.array(df['feature'].tolist())
    y = to_categorical(np.array(df['genre'].tolist()))

    # 80/20 split, stratified on the integer genre ids so every genre is
    # proportionally represented in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=df['genre']
    )
    # Conv2D expects (height, width, channels) per sample, so append two
    # singleton axes: (n_samples, n_mfcc) -> (n_samples, n_mfcc, 1, 1).
    X_train = X_train[..., np.newaxis, np.newaxis]
    X_test = X_test[..., np.newaxis, np.newaxis]

    # CNN: four Conv2D -> MaxPooling2D -> Dropout stages (32/64/128/256
    # filters) followed by a dense head. padding='same' is required because
    # the second spatial axis has size 1, so an unpadded 3x3 kernel or 2x2
    # pool would not fit. Dropout(0.3) zeroes 30% of activations in training.
    model = Sequential([
        # Explicit Input object instead of input_shape on the first layer,
        # which Keras warns against for Sequential models.
        tf.keras.Input(shape=(X_train.shape[1], 1, 1)),

        Conv2D(32, (3, 3), activation='relu', padding='same'),
        MaxPooling2D((2, 2), padding='same'),
        Dropout(0.3),

        Conv2D(64, (3, 3), activation='relu', padding='same'),
        MaxPooling2D((2, 2), padding='same'),
        Dropout(0.3),

        Conv2D(128, (3, 3), activation='relu', padding='same'),
        MaxPooling2D((2, 2), padding='same'),
        Dropout(0.3),

        Conv2D(256, (3, 3), activation='relu', padding='same'),
        MaxPooling2D((2, 2), padding='same'),
        Dropout(0.3),

        # Flatten the feature maps into one vector per sample.
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.3),
        # One output per genre; softmax yields a probability distribution.
        Dense(len(genres), activation="softmax")
    ])

    # Categorical cross-entropy matches the one-hot labels.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train; the held-out split is used as validation data.
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_test, y_test))

    # Save the model. Pickle is kept because the loading cell reads the file
    # with pickle.load; the context manager makes sure the file is flushed
    # and closed before anything tries to read it back.
    with open(model_path, mode='wb') as model_file:
        pickle.dump(model, model_file)

    
    
    
    

    

In [58]:
music_predictor()

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.1350 - loss: 2.3407 - val_accuracy: 0.1600 - val_loss: 2.1753
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1875 - loss: 2.1460 - val_accuracy: 0.2550 - val_loss: 2.0401
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2446 - loss: 1.9615 - val_accuracy: 0.2450 - val_loss: 1.9456
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3096 - loss: 1.9020 - val_accuracy: 0.3450 - val_loss: 1.8122
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3202 - loss: 1.7958 - val_accuracy: 0.3450 - val_loss: 1.7742
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3547 - loss: 1.7454 - val_accuracy: 0.4250 - val_loss: 1.6751
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━

In [59]:
# Load the CNN that music_predictor() pickled. This rebinds the global `model`
# that the prediction helpers use.
# NOTE: unpickling can execute arbitrary code -- only load a file that this
# notebook wrote itself. The context manager closes the file handle promptly.
with open('./model_final.pickle', mode='rb') as model_file:
    model = pickle.load(model_file)

In [60]:
def song_genre(file_path):
    """Predict the genre name of the audio file at ``file_path``.

    Uses the notebook globals ``extract_features`` (MFCC vector),
    ``model`` (the CNN currently bound to that name) and ``label_encoder``
    (class index -> genre name).
    """
    # Batch of one sample, shaped (1, n_features, 1, 1) for the CNN.
    cnn_input = extract_features(file_path).reshape(1, -1, 1, 1)
    probabilities = model.predict(cnn_input)
    winner = np.argmax(probabilities)
    decoded = label_encoder.inverse_transform([winner])
    return decoded[0]

In [66]:
song_genre('./Naanum Rowdy Dhaan - Thangamey  Official Video  Anirudh  Vijay Sethupathi  Vignesh Shivan.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


'rock'

In [67]:
song_genre('./fur-elise-music-box-70375.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


'classical'

In [68]:
song_genre('./Indian 2 - Kadharalz Lyric Video  Kamal Haasan  Shankar  Anirudh  Subaskaran  Lyca.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


'hiphop'

In [69]:
song_genre('./Maan Karate - Un Vizhigalil Video  Anirudh  Sivakarthikeyan.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


'country'

In [70]:
song_genre('./new-edm-music-beet-mr-sandeep-rock-141616.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


'hiphop'

In [71]:
song_genre('./sunflower-street-drumloop-85bpm-163900.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


'hiphop'

In [72]:
song_genre('./Vivegam - Thalai Viduthalai Official Song Video - Ajith Kumar  Anirudh  Siva.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


'hiphop'

In [73]:
song_genre('./We Wish You a Merry Christmas with Lyrics  Christmas Carol & Song.mp3')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


'disco'