Step 1 - Read full features file

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the features CSV file
file_path = "../../Misc/features_3_sec.csv"
full_features_df = pd.read_csv(file_path)

# Display basic information about the dataset
print(full_features_df.head())         # Display the first few rows
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line after the report).
full_features_df.info()                # Check for null values and data types
print(full_features_df.describe())     # Summary statistics for numerical columns


Step 2 - Check for missing values

In [None]:

# Count missing values per column once, then clean the frame that every later
# step reads (`full_features_df`). Previously the cleaned rows were stored in
# an unused variable, so rows with missing values still reached training.
missing_per_column = full_features_df.isnull().sum()

if missing_per_column.sum() > 0:
    print(missing_per_column)
    # Option 1: Remove rows with missing values
    full_features_df = full_features_df.dropna().reset_index(drop=True)

    # Option 2 (alternative): impute numeric columns with the column mean
    # full_features_df = full_features_df.fillna(full_features_df.mean(numeric_only=True))
else:
    print("No missing values found in the dataset")

# Kept so that `data` is defined on both branches (it used to exist only when
# missing values were found).
data = full_features_df

# Verify there are no missing values left
assert full_features_df.isnull().sum().sum() == 0, "Missing values remain in full_features_df"


Step 3 - Train an XGBoost classifier to identify the top 'n' most important features in the full features file.
There is no need to split the dataset at this point.

In [None]:


# Convert the genre strings in 'label' to integer class ids and store them in
# a new 'label_encoded' column of the full feature table.
label_encoder = LabelEncoder()
full_features_df['label_encoded'] = label_encoder.fit_transform(full_features_df['label'])

# 'filename' and the raw 'label' text are not model inputs, so drop them
processed_df = full_features_df.drop(columns=['filename', 'label'])

# Target (y) is the encoded label; every remaining column is a feature (X)
y = processed_df['label_encoded']
X = processed_df.drop(columns=['label_encoded'])

# Fit on the complete dataset: this model is only used to rank features,
# so no train/test split is made here.
model = XGBClassifier(eval_metric="mlogloss", random_state=42)
model.fit(X, y)

# Rank features from most to least important
feature_names = X.columns
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

sorted_importances = importances[indices]
sorted_feature_names = list(feature_names[indices])

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 8))
plt.barh(sorted_feature_names, sorted_importances, color='blue')
plt.title("Feature Importance Analysis (Sorted)")
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.gca().invert_yaxis()  # Most important feature at the top
plt.show()



Step 4 - Extract top 'n' features and drop the rest

In [None]:
# Keep the 15 highest-ranked features from the importance ordering above
top_n = 15
top_indices = indices[:top_n]
top_features = list(feature_names[top_indices])

print("Top Features:")
for rank, feature_idx in enumerate(top_indices, 1):
    print(f"{rank}. {feature_names[feature_idx]}: {importances[feature_idx]:.4f}")

# Restrict the feature matrix to the selected columns
X_reduced = X[top_features]

# Put the encoded target back next to the reduced features (aligned on index)
X_reduced_with_label = pd.concat([X_reduced, y], axis=1)

# Write the reduced table to disk without the index column
reduced_file_path = "reduced_features.csv"
X_reduced_with_label.to_csv(reduced_file_path, index=False)

print(f"Reduced features CSV saved at: {reduced_file_path}")


Step 5 - Encode the categorical label (genre) and scale the features of the reduced features file, then save the fitted scaler and label encoder

In [None]:
from joblib import dump, load
# Reload the reduced feature table saved in the previous step
reduced_features_df = pd.read_csv(reduced_file_path)

# Target column first, everything else is the feature matrix
y = reduced_features_df["label_encoded"]
X = reduced_features_df.drop(columns=["label_encoded"])

# Fit a fresh label encoder on the target column.
# NOTE(review): `y` is the 'label_encoded' column written in Step 4, i.e. it
# is already numeric, so this encoder is fitted on numbers rather than on
# genre names -- confirm that this is intended.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Standardise the features.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split done in Step 6, so the test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Persist both fitted transformers for use at prediction time
for fitted_obj, target_file in ((scaler, 'scaler.joblib'), (label_encoder, 'label_encoder.joblib')):
    dump(fitted_obj, target_file)

print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")


Step 6 - Split the reduced features dataset into Train and Test

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; stratifying on the encoded label keeps
# the class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Report the resulting shapes as a quick sanity check
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Training Labels Shape: {y_train.shape}")
print(f"Testing Labels Shape: {y_test.shape}")


Step 7 - Run XGBoost Classifier on Train and Test

In [None]:
from xgboost import XGBClassifier
import seaborn as sns

# Build and fit the classifier on the training split
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions for the held-out test split
y_pred_xgb = xgb_model.predict(X_test)

# Text summary: overall accuracy followed by the per-class report
test_accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Classifier")
print(f"Accuracy: {test_accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

# Confusion matrix heatmap; tick labels are the classes known to the encoder
class_labels = label_encoder.classes_
test_confusion = confusion_matrix(y_test, y_pred_xgb)

plt.figure(figsize=(8, 6))
sns.heatmap(
    test_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()


Step 8 - Save the XGBoost Model

In [None]:
from joblib import dump, load

# Serialise the trained classifier to disk with joblib so the prediction
# cell can reload it without retraining.
model_file = 'genre_class_model_xgboost.joblib'
dump(xgb_model, model_file)




Final Step - Test using a sample file

In [None]:
import librosa
import numpy as np
from joblib import load
import pandas as pd

# Step 1: Load the pre-trained XGBoost model, scaler, and label encoder
# (in that order) from the joblib files written earlier in this notebook.
xgb_model, scaler, label_encoder = [
    load(artifact_file)
    for artifact_file in (
        'genre_class_model_xgboost.joblib',
        'scaler.joblib',
        'label_encoder.joblib',
    )
]

# Genre names, positioned so that list index == encoded label
genre_names = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]

# Step 2: Function to extract specified features from a .wav file
def extract_features_from_wav(wav_file):
    """Compute the 15 reduced-set audio features for one audio file.

    Parameters
    ----------
    wav_file : str or path-like
        Path of the audio file; passed straight to ``librosa.load``.

    Returns
    -------
    list
        Fifteen scalar values in this fixed order: perceptr_var,
        spectral_bandwidth_mean, chroma_stft_mean, mfcc4_mean,
        chroma_stft_var, mfcc1_var, rolloff_var, rms_var, rolloff_mean,
        mfcc1_mean, spectral_centroid_var, mfcc5_var, mfcc6_mean, tempo,
        harmony_mean.

    Notes
    -----
    NOTE(review): the audio is loaded at its native sample rate (``sr=None``)
    and analysed as a whole, while the training table is named
    ``features_3_sec.csv`` -- confirm that the training features were computed
    with the same sample rate and clip length, otherwise the values fed to the
    model are not comparable with what it was trained on.
    """
    # Load audio file using librosa; sr=None preserves the original sample rate
    y, sr = librosa.load(wav_file, sr=None)

    # Spectral Bandwidth
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spectral_bandwidth_mean = np.mean(spectral_bandwidth)

    # Chroma STFT (Short-Time Fourier Transform)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_stft_mean = np.mean(chroma_stft)
    chroma_stft_var = np.var(chroma_stft)

    # MFCCs (Mel-frequency cepstral coefficients); only the first six are
    # needed, and row k-1 holds coefficient number k.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=6)
    mfcc1_mean = np.mean(mfcc[0])
    mfcc1_var = np.var(mfcc[0])
    mfcc4_mean = np.mean(mfcc[3])
    mfcc5_var = np.var(mfcc[4])
    mfcc6_mean = np.mean(mfcc[5])

    # Roll-off (Spectral roll-off point)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
    rolloff_mean = np.mean(rolloff)
    rolloff_var = np.var(rolloff)

    # Root Mean Square (RMS) energy
    rms = librosa.feature.rms(y=y)
    rms_var = np.var(rms)

    # Spectral Centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_centroid_var = np.var(spectral_centroid)

    # Tempo (beats per minute). Depending on the librosa version the tempo is
    # returned either as a plain float or as a one-element array; normalise to
    # a float so the feature row stays a flat list of scalars.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo = float(np.atleast_1d(tempo)[0])

    # Harmonic / percussive source separation. The harmonic part is the same
    # signal librosa.effects.harmonic(y) produces.
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    harmony_mean = np.mean(y_harmonic)

    # Previously this was np.var(rms), i.e. an exact duplicate of rms_var.
    # NOTE(review): the 'perceptr_*' columns of the training CSV presumably
    # come from the percussive HPSS component (they sit next to 'harmony_*')
    # -- confirm against the script that generated the CSV.
    perceptr_var = np.var(y_percussive)

    # Combine all extracted features (order must match the column list used
    # when the values are turned into a DataFrame)
    features = [
        perceptr_var, spectral_bandwidth_mean, chroma_stft_mean, mfcc4_mean, chroma_stft_var,
        mfcc1_var, rolloff_var, rms_var, rolloff_mean, mfcc1_mean,
        spectral_centroid_var, mfcc5_var, mfcc6_mean, tempo, harmony_mean
    ]

    return features

# Step 3: Extract features from a sample input .wav file
wav_file = 'khisco.00008.wav'  # Replace with your file path
extracted_features = extract_features_from_wav(wav_file)

# Step 4: Convert the features into a DataFrame with the correct column names
# (same order as the list returned by extract_features_from_wav)
reduced_feature_columns = [
    "perceptr_var", "spectral_bandwidth_mean", "chroma_stft_mean", "mfcc4_mean",
    "chroma_stft_var", "mfcc1_var", "rolloff_var", "rms_var", "rolloff_mean",
    "mfcc1_mean", "spectral_centroid_var", "mfcc5_var", "mfcc6_mean", "tempo",
    "harmony_mean"
]

# Ensure the features list has the correct number of features
if len(extracted_features) != len(reduced_feature_columns):
    raise ValueError(
        f"Expected {len(reduced_feature_columns)} features, "
        f"got {len(extracted_features)}"
    )
extracted_features_df = pd.DataFrame([extracted_features], columns=reduced_feature_columns)

# If the scaler remembers the column names it was fitted with, make sure the
# hard-coded list above still covers them and put the columns in that order.
# This fails loudly when the selected top features change after retraining.
fitted_columns = getattr(scaler, "feature_names_in_", None)
if fitted_columns is not None:
    not_extracted = [col for col in fitted_columns if col not in reduced_feature_columns]
    if not_extracted:
        raise ValueError(
            f"The scaler was fitted on features that are not extracted here: {not_extracted}"
        )
    extracted_features_df = extracted_features_df[list(fitted_columns)]

# Step 5: Scale the extracted features using the saved scaler
X_scaled = scaler.transform(extracted_features_df)

# Step 6: Use the trained model to predict the genre
predicted_label_encoded = xgb_model.predict(X_scaled)

# Step 7: Decode the predicted label with the saved label encoder
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

# Step 8: Map the decoded label to the actual genre name.
# The saved encoder may hold numeric classes (then the value is an index into
# genre_names) or genre strings (then the value already is the genre name).
decoded_label = predicted_label[0]
if isinstance(decoded_label, str):
    predicted_genre_name = decoded_label
else:
    predicted_genre_name = genre_names[int(decoded_label)]

# Step 9: Print the predicted genre
print(f"The predicted genre of the song is: {predicted_genre_name}")
