In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import librosa
import numpy as np

In [2]:
# Load the raw feature table; a later cell fits the StandardScaler on it.
unscaled = pd.read_csv('data.csv')
# Display the frame for a visual check of its columns and values.
unscaled

Unnamed: 0.1,Unnamed: 0,tempo,beats_mean,beats_var,zero_crossings_mean,zero_crossings_var,spectral_centroids_mean,spectral_centroids_var,spectral_rolloff_mean,spectral_rolloff_var,...,mfcc_36_var,mfcc_37_mean,mfcc_37_var,mfcc_38_mean,mfcc_38_var,mfcc_39_mean,mfcc_39_var,mfcc_40_mean,mfcc_40_var,genre
0,hiphop.00023.wav,92.285156,615.644444,127869.962469,0.106795,0.095389,2240.289987,2.507330e+05,4728.444932,9.157069e+05,...,47.585796,1.154457,50.316574,-1.381446,23.372005,1.305299,36.742570,0.337945,71.727540,0
1,hiphop.00005.wav,71.777344,620.352941,128424.110727,0.124813,0.109234,2709.171522,2.480529e+05,5794.946346,4.929356e+05,...,27.654715,-2.486334,21.648327,-3.563625,24.654837,0.979186,25.516537,-1.214071,20.588590,0
2,hiphop.00038.wav,184.570312,696.183908,125604.012155,0.097494,0.087989,2208.657920,7.392277e+05,4416.680187,3.317985e+06,...,55.536156,2.605921,40.015804,0.314736,37.543396,-1.903816,69.482570,-1.689978,56.672527,0
3,hiphop.00089.wav,123.046875,629.274194,134176.682882,0.117709,0.103853,2669.947537,3.381432e+05,5526.430800,9.360238e+05,...,27.342222,2.616406,26.674131,3.882169,31.033888,0.000797,29.586660,-3.057104,27.250423,0
4,hiphop.00011.wav,135.999178,626.742424,131923.494261,0.094250,0.085367,2095.493684,4.303503e+05,4581.942544,1.717747e+06,...,22.264261,-1.381398,29.754260,-3.853168,43.618480,-1.329632,43.874313,-1.523081,44.711205,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,reggae.00077.wav,143.554688,631.126761,127756.814918,0.060978,0.057259,1778.172234,3.300590e+05,3821.785501,1.442120e+06,...,47.298325,-0.170242,25.619661,-0.525215,28.561947,-1.983141,33.000610,-3.230880,27.734670,9
995,reggae.00088.wav,107.666016,642.000000,133582.264151,0.189495,0.153587,3880.960743,4.861260e+05,8249.673214,7.519219e+05,...,21.731133,-1.073448,29.302908,-2.115449,43.037712,-2.562399,40.909020,-1.027565,76.040300,9
996,reggae.00061.wav,92.285156,624.244444,129301.695802,0.133775,0.115880,3123.715746,1.222962e+06,6474.566211,4.403254e+06,...,51.249687,0.210070,55.459160,-2.398417,50.192474,-1.948333,65.469580,-1.779201,110.525980,9
997,reggae.00009.wav,161.499023,627.828947,129036.799688,0.060649,0.056971,1609.837286,1.052387e+06,3270.931858,4.916736e+06,...,36.730537,-1.229882,42.058662,0.953682,45.284046,-1.704754,38.796140,-2.359887,60.576107,9


In [3]:
# Load the table the classifiers below are trained on.
# NOTE(review): presumably the scaled counterpart of data.csv -- confirm how it was generated.
df = pd.read_csv('normalized_data.csv')

In [4]:
# Feature matrix: every column except 'Unnamed: 0' and the 'genre' target.
X = df.drop(columns=['Unnamed: 0', 'genre'])

In [5]:
# Target vector: the 'genre' column.
y = df['genre']

In [6]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# `svm` is imported here and reused by the RBF cell further down.
from sklearn import svm

# Train an SVM model (linear kernel, C=0.2) on the training split
clf = svm.SVC(kernel='linear', C=0.2)
clf.fit(X_train, y_train)

# Evaluate the model on the test data (SVC.score reports mean accuracy)
accuracy = clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.73


In [8]:
def split_audio(y_norm, sr, piece_seconds=30):
    """Split an audio signal into consecutive fixed-length pieces.

    Parameters
    ----------
    y_norm : array-like
        Audio samples. A 1-D array is treated as a mono signal. For arrays
        with more dimensions the last axis is taken to be time (assumes the
        ``(channels, samples)`` layout -- TODO confirm for non-librosa input).
    sr : int or float
        Sampling rate in samples per second.
    piece_seconds : int or float, optional
        Duration of each piece in seconds. Defaults to 30.

    Returns
    -------
    pieces : list of np.ndarray
        Slices along the time axis, in order. The last piece is shorter when
        the signal length is not an exact multiple of the piece length; an
        empty signal yields an empty list.
    sr
        The sampling rate, returned unchanged.

    Raises
    ------
    ValueError
        If the piece length works out to less than one sample.
    """
    y_norm = np.asarray(y_norm)
    n_samples = int(piece_seconds * sr)  # Number of samples per piece
    if n_samples <= 0:
        raise ValueError("piece_seconds * sr must be at least one sample")

    # Step along the time axis only. The first dimension of a mono signal is
    # its sample count, not a channel count, so it must not scale the step.
    total_samples = y_norm.shape[-1]
    pieces = [
        y_norm[..., start:start + n_samples]
        for start in range(0, total_samples, n_samples)
    ]

    return pieces, sr

In [9]:
def normalize_volume(file_path):
    """Load an audio file and rescale its samples.

    The file is read with ``librosa.load`` using the library defaults, and
    the samples are passed through ``librosa.util.normalize`` along axis 0.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.

    Returns
    -------
    tuple
        ``(normalized_samples, sampling_rate)``.
    """
    samples, sample_rate = librosa.load(file_path)
    return librosa.util.normalize(samples, axis=0), sample_rate

In [10]:
def extract_features(y_norm, sr):
    """Compute the flat feature list for one audio clip.

    The values are appended in this order:

    1. tempo, then mean and variance of the beat positions from
       ``librosa.beat.beat_track``
    2. mean and variance of the zero-crossing indicator
    3. mean and variance of the spectral centroid
    4. mean and variance of the spectral rolloff
    5. for each of the 40 MFCC coefficients: its mean, then its variance

    That is 9 + 2 * 40 = 89 values in total.

    Parameters
    ----------
    y_norm : np.ndarray
        Audio samples of the clip.
    sr : int
        Sampling rate of ``y_norm``.

    Returns
    -------
    list
        The 89 feature values in the order described above.
    """
    features = []

    # Tempo and beats
    tempo, beats = librosa.beat.beat_track(y=y_norm, sr=sr)
    # Depending on the librosa release, tempo may come back as a one-element
    # array rather than a scalar; collapse it so the feature list stays flat
    # and np.array(features) produces a plain numeric vector.
    tempo = float(np.atleast_1d(tempo)[0])
    features.extend((tempo, beats.mean(), beats.var()))

    # Zero crossings (boolean indicator per sample)
    zero_crossings = librosa.zero_crossings(y=y_norm, pad=False)
    features.extend((zero_crossings.mean(), zero_crossings.var()))

    # Spectral centroid ([0] selects the single returned row)
    spectral_centroids = librosa.feature.spectral_centroid(y=y_norm, sr=sr)[0]
    features.extend((spectral_centroids.mean(), spectral_centroids.var()))

    # Spectral rolloff ([0] selects the single returned row)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y_norm, sr=sr)[0]
    features.extend((spectral_rolloff.mean(), spectral_rolloff.var()))

    # MFCCs: one row per coefficient, summarized as (mean, var)
    mfccs = librosa.feature.mfcc(y=y_norm, sr=sr, n_mfcc=40)
    for mfcc in mfccs:
        features.extend((mfcc.mean(), mfcc.var()))

    return features

In [11]:
def predict_genre(file_path):
    """Predict a genre code for each consecutive piece of an audio file.

    The file is loaded with ``normalize_volume``, cut into pieces with
    ``split_audio``, and each piece is turned into a feature row, scaled and
    classified.

    Relies on the notebook-level ``clf`` (fitted classifier) and ``scaler``
    (fitted StandardScaler), so it must be called after both cells have run.

    Parameters
    ----------
    file_path : str
        Path of the audio file to classify.

    Returns
    -------
    list
        One ``clf.predict`` result (a one-element array) per piece, in order.
    """
    y_new, sr = normalize_volume(file_path)
    # split_audio returns a (pieces, sr) tuple; unpack it so the loop runs
    # over the pieces instead of over the tuple itself.
    pieces, sr = split_audio(y_new, sr)

    # Reuse the feature names the scaler was fitted with, when available, so
    # scikit-learn sees consistently named columns.
    feature_names = getattr(scaler, 'feature_names_in_', None)

    predictions = []
    for piece in pieces:
        piece_features = np.asarray(extract_features(piece, sr)).reshape(1, -1)
        piece_features = pd.DataFrame(piece_features, columns=feature_names)
        # Scale exactly as predict_song does before handing rows to the model.
        piece_scaled = pd.DataFrame(scaler.transform(piece_features), columns=feature_names)
        predictions.append(clf.predict(piece_scaled))
    return predictions

In [12]:
# Load and normalize the first example track.
y_new, sr = normalize_volume('Slipknot - Psychosocial [OFFICIAL VIDEO] [HD].wav')

In [13]:
# Keep samples 30*sr up to 60*sr, i.e. the span from 30 s to 60 s of the track.
y_new_clip = y_new[30 * sr: 30 * sr *2]

In [14]:
# Inspect the clipped samples.
y_new_clip

array([ 0.07429586,  0.10781234,  0.09167857, ..., -0.40762517,
       -0.36363664, -0.18119341], dtype=float32)

In [15]:
# Compute the feature list for the clip.
y_new_clip_features = extract_features(np.array(y_new_clip), sr)

In [16]:
# Convert the feature list to an array. The result is 1-D (see the shape
# check below), so the transpose leaves it unchanged.
y_new_clip_features = np.array(y_new_clip_features).T
y_new_clip_features

array([ 1.35999178e+02,  6.38348485e+02,  1.32236651e+05,  1.53939531e-01,
        1.30242152e-01,  2.90468882e+03,  3.20752185e+05,  5.97350555e+03,
        1.20646185e+06,  2.50628614e+00,  1.25076892e+03,  7.20101624e+01,
        3.74355133e+02, -1.33805809e+01,  1.93212814e+02,  3.68437576e+01,
        1.47537872e+02,  6.74254119e-01,  1.50141235e+02,  1.64846992e+01,
        5.10680504e+01, -3.99426603e+00,  5.22837410e+01,  1.26674080e+01,
        4.77942162e+01, -5.62663507e+00,  3.30876961e+01,  1.00120392e+01,
        3.62085152e+01, -6.42077017e+00,  3.46047783e+01,  1.03205500e+01,
        3.13195877e+01, -4.21915340e+00,  2.43507652e+01,  4.04138803e+00,
        2.38231373e+01, -9.31008148e+00,  2.38516998e+01,  5.73434687e+00,
        2.31838932e+01, -9.45936203e+00,  2.18566895e+01,  4.30589533e+00,
        2.62099094e+01, -6.74682999e+00,  2.71604462e+01,  6.85018063e+00,
        2.07810841e+01, -9.81876373e+00,  2.28279762e+01,  3.45144701e+00,
        2.47978992e+01, -

In [17]:
# Check the shape before reshaping.
y_new_clip_features.shape

(89,)

In [18]:
# Reshape to a single row of 89 features (one sample) for the scaler and model.
y_new_clip_features = y_new_clip_features.reshape(1, 89)

In [19]:
# Confirm the reshape produced one row of 89 columns.
y_new_clip_features.shape

(1, 89)

In [20]:
# Feature names, listed in the same order extract_features appends its values:
# nine summary statistics first, then a (mean, var) pair per MFCC coefficient.
columns = [
    'tempo',
    'beats_mean',
    'beats_var',
    'zero_crossings_mean',
    'zero_crossings_var',
    'spectral_centroids_mean',
    'spectral_centroids_var',
    'spectral_rolloff_mean',
    'spectral_rolloff_var',
]
columns += [
    f'mfcc_{i+1}_{stat}'
    for i in range(40)
    for stat in ('mean', 'var')
]

In [21]:
# Inspect the reshaped feature row.
y_new_clip_features

array([[ 1.35999178e+02,  6.38348485e+02,  1.32236651e+05,
         1.53939531e-01,  1.30242152e-01,  2.90468882e+03,
         3.20752185e+05,  5.97350555e+03,  1.20646185e+06,
         2.50628614e+00,  1.25076892e+03,  7.20101624e+01,
         3.74355133e+02, -1.33805809e+01,  1.93212814e+02,
         3.68437576e+01,  1.47537872e+02,  6.74254119e-01,
         1.50141235e+02,  1.64846992e+01,  5.10680504e+01,
        -3.99426603e+00,  5.22837410e+01,  1.26674080e+01,
         4.77942162e+01, -5.62663507e+00,  3.30876961e+01,
         1.00120392e+01,  3.62085152e+01, -6.42077017e+00,
         3.46047783e+01,  1.03205500e+01,  3.13195877e+01,
        -4.21915340e+00,  2.43507652e+01,  4.04138803e+00,
         2.38231373e+01, -9.31008148e+00,  2.38516998e+01,
         5.73434687e+00,  2.31838932e+01, -9.45936203e+00,
         2.18566895e+01,  4.30589533e+00,  2.62099094e+01,
        -6.74682999e+00,  2.71604462e+01,  6.85018063e+00,
         2.07810841e+01, -9.81876373e+00,  2.28279762e+0

In [22]:
from sklearn.preprocessing import StandardScaler

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler on the feature columns of the raw table (the file-name column
# 'Unnamed: 0' and the 'genre' column are dropped). `scaler` is reused below to
# scale features of new clips; the transformed array `scaled` is not referenced
# again in the cells shown here.
scaled = scaler.fit_transform(unscaled.drop(columns=['Unnamed: 0', 'genre']))

In [23]:
# Scale the clip's features with the fitted scaler. The row is labelled with
# `columns` before and after scaling, as the second example does, so both the
# scaler and the classifier receive the feature names they were fitted with.
y_clip_norm = pd.DataFrame(
    scaler.transform(pd.DataFrame(y_new_clip_features, columns=columns)),
    columns=columns,
)



In [24]:
# Inspect the scaled feature row.
y_clip_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,88
0,0.595093,0.205854,0.351198,1.197409,1.205979,0.981209,-0.371985,0.889671,-0.447298,1.554603,...,1.482198,-1.307874,-1.545134,-1.399069,1.143481,-1.237163,-1.855199,-1.050008,0.817807,-1.017176


In [25]:
# Predict the genre code of the scaled clip with the linear SVM.
clf.predict(y_clip_norm)



array([3])

In [26]:
# Load and normalize the second example track (this rebinds `sr`).
y_2, sr = normalize_volume('song_input.wav')

In [27]:
# Keep samples 30*sr up to 60*sr, i.e. the span from 30 s to 60 s of the track.
y_2_clip = y_2[30 * sr: 30 * sr *2]

In [28]:
# Compute the feature list for the clip.
y_2_clip_features = extract_features(np.array(y_2_clip), sr)

In [29]:
# Convert the feature list to an array; the transpose has no effect on a 1-D array.
y_2_clip_features = np.array(y_2_clip_features).T

In [30]:
# Reshape to a single row of 89 features; the np.array wrapper is redundant
# here because the value is already an array.
y_2_clip_features = np.array(y_2_clip_features).reshape(1, 89)

In [31]:
# Confirm the reshape produced one row of 89 columns.
y_2_clip_features.shape

(1, 89)

In [32]:
# Label the feature row with the column names built earlier in `columns`.
y_2_clip_features = pd.DataFrame(y_2_clip_features, columns=columns)

In [33]:
# Inspect the labelled, still unscaled feature row.
y_2_clip_features

Unnamed: 0,tempo,beats_mean,beats_var,zero_crossings_mean,zero_crossings_var,spectral_centroids_mean,spectral_centroids_var,spectral_rolloff_mean,spectral_rolloff_var,mfcc_1_mean,...,mfcc_36_mean,mfcc_36_var,mfcc_37_mean,mfcc_37_var,mfcc_38_mean,mfcc_38_var,mfcc_39_mean,mfcc_39_var,mfcc_40_mean,mfcc_40_var
0,99.384014,660.734694,132699.419409,0.013252,0.013076,430.269077,71857.545656,605.696337,458943.955196,-355.190552,...,-2.705887,57.514336,-1.265992,70.413933,-0.61601,56.971466,-0.333326,42.124027,-2.113365,47.629322


In [34]:
# Scale the row with the fitted scaler; transform returns a plain array, so
# the column names are re-attached.
y_2_clip_norm = pd.DataFrame(scaler.transform(y_2_clip_features), columns=columns)

In [35]:
# Inspect the scaled feature row.
y_2_clip_norm

Unnamed: 0,tempo,beats_mean,beats_var,zero_crossings_mean,zero_crossings_var,spectral_centroids_mean,spectral_centroids_var,spectral_rolloff_mean,spectral_rolloff_var,mfcc_1_mean,...,mfcc_36_mean,mfcc_36_var,mfcc_37_mean,mfcc_37_var,mfcc_38_mean,mfcc_38_var,mfcc_39_mean,mfcc_39_var,mfcc_40_mean,mfcc_40_var
0,-0.709195,0.850835,0.389331,-2.1629,-2.420445,-2.476043,-0.993131,-2.520024,-0.972103,-3.247052,...,-0.352611,0.4587,0.093523,1.051274,0.497674,0.21734,0.282185,-0.446295,-0.145526,-0.344306


In [36]:
# Predict the genre code of the second clip with the linear SVM.
prediction = clf.predict(y_2_clip_norm)

In [37]:
# predict returns an array with one entry per row; take the single entry.
prediction[0]

4

In [38]:
def predict_song(file_path, model, start_seconds=30, clip_seconds=30, feature_scaler=None):
    """Predict the genre name of one clip taken from an audio file.

    The file is loaded with ``normalize_volume``, a window of ``clip_seconds``
    starting at ``start_seconds`` is cut out, its features are extracted with
    ``extract_features``, scaled, and passed to ``model``.

    Parameters
    ----------
    file_path : str
        Path of the audio file to classify.
    model : fitted estimator
        Object with a ``predict`` method returning integer genre codes.
    start_seconds : int or float, optional
        Offset of the clip from the start of the track. Defaults to 30.
    clip_seconds : int or float, optional
        Length of the clip. Defaults to 30.
    feature_scaler : fitted scaler, optional
        Object with a ``transform`` method. When omitted, the notebook-level
        ``scaler`` is used, so that cell must have run first.

    Returns
    -------
    str
        Genre name looked up from the predicted code.

    Raises
    ------
    ValueError
        If the requested window contains no samples (track too short).
    KeyError
        If the model predicts a code that is not in the genre mapping.
    """
    number_to_genre = {0:'hiphop',
                     1:'classical',
                     2:'blues',
                     3:'metal',
                     4:'jazz',
                     5:'country',
                     6:'pop',
                     7:'rock',
                     8:'disco',
                     9:'reggae'}

    # Feature names in the order extract_features appends its values.
    columns = ['tempo', 'beats_mean', 'beats_var',
    'zero_crossings_mean', 'zero_crossings_var',
    'spectral_centroids_mean', 'spectral_centroids_var', 'spectral_rolloff_mean',
    'spectral_rolloff_var']
    for i in range(40):
        columns.extend((f'mfcc_{i+1}_mean', f'mfcc_{i+1}_var'))

    active_scaler = scaler if feature_scaler is None else feature_scaler

    X_norm, sr = normalize_volume(file_path)
    clip_start = int(start_seconds * sr)
    clip_stop = clip_start + int(clip_seconds * sr)
    X_clip = X_norm[clip_start:clip_stop]
    if len(X_clip) == 0:
        # Fail with a clear message instead of an error from deep inside librosa.
        raise ValueError(
            f"No audio samples between {start_seconds}s and "
            f"{start_seconds + clip_seconds}s in {file_path!r}"
        )

    # One row, one column per feature name.
    X_clip_features = np.array(extract_features(np.array(X_clip), sr)).reshape(1, len(columns))
    X_clip_features = pd.DataFrame(X_clip_features, columns=columns)
    X_clip_norm = pd.DataFrame(active_scaler.transform(X_clip_features), columns=columns)
    prediction = model.predict(X_clip_norm)[0]

    return number_to_genre[prediction]

In [39]:
# Linear SVM prediction for the first example track.
predict_song('Slipknot - Psychosocial [OFFICIAL VIDEO] [HD].wav', clf)

'metal'

In [40]:
# Linear SVM prediction for the second example track.
predict_song('song_input.wav', clf)

'jazz'

In [41]:
# Linear SVM prediction for the third example track.
predict_song('Shania Twain - Man! I Feel Like A Woman (Official Music Video).wav',clf)

'disco'

In [42]:
# Linear SVM prediction for the fourth example track.
predict_song('Kenny Rogers - The Gambler.wav',clf)

'blues'

In [55]:
# Train a second SVM model (RBF kernel, C=3) on the same training split.
# Reuses `svm` and the X_train/X_test split created in earlier cells.
rbf = svm.SVC(kernel='rbf', C=3)
rbf.fit(X_train, y_train)

# Evaluate the model on the test data (this overwrites the earlier `accuracy`)
accuracy = rbf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.73


In [56]:
# RBF SVM prediction for the first example track.
predict_song('Slipknot - Psychosocial [OFFICIAL VIDEO] [HD].wav', rbf)

'metal'

In [57]:
# RBF SVM prediction for the second example track.
predict_song('song_input.wav', rbf)

'blues'

In [58]:
# RBF SVM prediction for the third example track.
predict_song('Shania Twain - Man! I Feel Like A Woman (Official Music Video).wav',rbf)

'country'

In [59]:
# RBF SVM prediction for the fourth example track.
predict_song('Kenny Rogers - The Gambler.wav',rbf)

'reggae'