Preprocessing of the FMA dataset

In [6]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
import csv
from pymongo import MongoClient

def extract_features(audiofile):
    """Summarise one audio file as a single feature vector.

    The file is loaded with librosa at its native sampling rate
    (``sr=None``). Three descriptors are computed and averaged:

    * 13 MFCCs, averaged along axis 1,
    * the spectral centroid (first row of librosa's result),
    * the zero crossing rate (first row of librosa's result).

    Parameters
    ----------
    audiofile : str
        Path of the audio file to analyse.

    Returns
    -------
    tuple
        ``(filename, features)``: the base name of ``audiofile`` and a NumPy
        array holding the MFCC means followed by the spectral centroid mean
        and the zero crossing rate mean.
    """
    #the base name is what ends up in the csv
    filename = os.path.basename(audiofile)

    signal, rate = librosa.load(audiofile, sr=None)

    #mfccs: one mean per coefficient
    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
    mfcc_means = np.mean(mfcc_matrix, axis=1)

    #spectral centroid: mean of the first row
    centroid_row = librosa.feature.spectral_centroid(y=signal, sr=rate)[0]
    centroid_mean = np.mean(centroid_row)

    #zero crossing rate: mean of the first row
    zcr_row = librosa.feature.zero_crossing_rate(signal)[0]
    zcr_mean = np.mean(zcr_row)

    #mfcc means first, then the two scalar descriptors
    scalar_part = np.array([centroid_mean, zcr_mean])
    features = np.concatenate((mfcc_means, scalar_part))

    return filename, features

def getaudiofiles(directory):
    """Collect the paths of all ``.mp3`` files below ``directory``.

    The tree is traversed with ``os.walk``, so files in nested
    sub-directories are included. Only names ending in the exact,
    lower-case suffix ``.mp3`` are kept.

    Parameters
    ----------
    directory : str
        Root folder to search.

    Returns
    -------
    list of str
        Each matching file's name joined onto the folder it was found in,
        in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.mp3')
    ]

mainaudiofolder = 'fma_sample'

#(filename, raw feature vector) for every file that could be processed
allfeatures = []

#iterating over all subfolders
for folder in os.listdir(mainaudiofolder):
    subfolder = os.path.join(mainaudiofolder, folder)
    if os.path.isdir(subfolder):
        #obtaining all files from the subfolder
        audiofiles = getaudiofiles(subfolder)
        #going over each single file
        for audiofile in audiofiles:
            try:
                filename, features = extract_features(audiofile)
                allfeatures.append((filename, features))
            except Exception as e:
                #best effort: report the file and carry on with the rest.
                #the exception type is printed because some decoder errors
                #carry an empty message
                print(f"Error processing {audiofile}: {type(e).__name__}: {e}")

#the scaler cannot be fitted on an empty set, so fail with a clear message
if not allfeatures:
    raise ValueError(f"No audio features could be extracted from {mainaudiofolder!r}")

#normalizing all features (zero mean, unit variance per column)
scaler = StandardScaler()
normalizedfeatures = scaler.fit_transform(np.array([features for _, features in allfeatures]))

#row i of normalizedfeatures belongs to the i-th entry of allfeatures
filenames = [name for name, _ in allfeatures]

#writing data to csv
outputfile = 'normalizedfeaturessample.csv'
with open(outputfile, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write column names
    writer.writerow(['Audio File', *[f'MFCC{i}' for i in range(1, 14)], 'Spectral Centroid', 'Zero Crossing Rate'])
    # Write the normalized (not the raw) features
    for file_name, normalizedrow in zip(filenames, normalizedfeatures):
        writer.writerow([file_name, *normalizedrow.tolist()])

print(f"Normalized features have been saved to {outputfile}")

#connecting to mongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['audio_features']
collection = db['features']

#inserting the normalized data into mongodb in a single round trip
collection.insert_many([
    {'file name': file_name, 'features': normalizedrow.tolist()}
    for file_name, normalizedrow in zip(filenames, normalizedfeatures)
])

print("Normalized features have been stored in MongoDB")


  y, sr = librosa.load(audiofile, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing fma_sample/001/001486.mp3: 
Normalized features have been saved to normalizedfeaturessample.csv
Normalized features have been stored in MongoDB


Recommendation using the Spark DataFrame API on the preprocessed CSV file

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

#start (or reuse) the Spark session used by this cell
spark = (
    SparkSession.builder
    .appName("AudioFileSimilarity")
    .getOrCreate()
)

#without a schema every csv column is read as a string
rawdf = spark.read.csv("normalizedfeaturessample.csv", header=True)

#convert features to double type in one projection: the first column (the
#file name) is kept as is, every other column is cast and keeps its name.
#a single select avoids stacking one projection per column, which is what
#calling withColumn in a loop does
df = rawdf.select(*[
    col(name) if position == 0 else col(name).cast(DoubleType()).alias(name)
    for position, name in enumerate(rawdf.columns)
])

#function to calculate similarity as the Euclidean distance between two feature vectors
def calculatesimilarity(features1, features2):
    """Return the Euclidean distance between two feature sequences.

    The sequences are paired element by element with ``zip``, so any
    surplus elements of the longer one are ignored. A smaller return value
    means the inputs are closer to each other.

    Parameters
    ----------
    features1, features2 : iterable of numbers
        The two feature vectors to compare.

    Returns
    -------
    float
        Square root of the summed squared element-wise differences
        (``0.0`` when there is nothing to compare).
    """
    total = 0
    for left, right in zip(features1, features2):
        delta = left - right
        #accumulate the squared difference of this pair
        total = total + delta ** 2
    #square root of the accumulated sum
    return total ** 0.5

#calculating similarity of every pair in the folder
#collect once: every df.collect() call runs the Spark job again, so calling
#it inside the loop re-read the csv once per audio file
featurecolumns = df.columns[1:]
audiovectors = [
    (row["Audio File"], [row[name] for name in featurecolumns])
    for row in df.collect()
]

#number of recommendations kept per audio file
topn = 5

similarities = {}
for index1, (audiofile1, features1) in enumerate(audiovectors):
    #a file is always at distance 0 from itself, so comparing it with itself
    #would waste one of its recommendation slots; rows are matched by
    #position so that only the row itself is skipped
    similarities[audiofile1] = [
        (audiofile2, calculatesimilarity(features1, features2))
        for index2, (audiofile2, features2) in enumerate(audiovectors)
        if index2 != index1
    ]


outputdata = []
for audiofile, similarfiles in similarities.items():
    similarfiles.sort(key=lambda pair: pair[1])  #smallest distance (most similar) first
    similarfilesstr = ",".join(f"{file}:{similarity:.2f}" for file, similarity in similarfiles[:topn])  #only showing the top 5 similarity pairs
    outputdata.append((audiofile, similarfilesstr))

#storing the data into csv
outputdf = spark.createDataFrame(outputdata, ["Audio File", "Similar Files"])
#Spark writes a directory with this name; overwrite mode lets the cell be
#re-run without failing because the path already exists
outputdf.coalesce(1).write.mode("overwrite").csv("similaraudiosample.csv", header=True)

spark.stop()


24/05/12 15:10:59 WARN Utils: Your hostname, riyyan-HP-ENVY-x360-2-in-1-Laptop-13-bf0xxx resolves to a loopback address: 127.0.1.1; using 192.168.100.254 instead (on interface wlo1)
24/05/12 15:10:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/12 15:10:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                