In [5]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import maximum_filter, generate_binary_structure, binary_erosion
import soundfile as sf

In [7]:
def load_audio(file_path, sr=11025):
    """Load an audio file through ``librosa.load`` as a mono signal.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate handed to ``librosa.load`` (default 11025).

    Returns:
        tuple: ``(audio, sample_rate)`` exactly as produced by ``librosa.load``.
    """
    # librosa.load already yields the (signal, rate) pair callers expect.
    return librosa.load(file_path, sr=sr, mono=True)

import os

def extract_songId(file_path):
    """Return the song ID for a file: its base name without the final extension.

    Args:
        file_path (str): Path to an audio file; any directory part is dropped.

    Returns:
        str: The file name with only its last extension removed.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem

def compute_spectrogram(audio, n_fft=2048, hop_length=512):
    """Compute the magnitude of the short-time Fourier transform of ``audio``.

    Args:
        audio: Signal passed straight to ``librosa.stft``.
        n_fft (int): FFT window size forwarded to ``librosa.stft``.
        hop_length (int): Hop between frames forwarded to ``librosa.stft``.

    Returns:
        The element-wise absolute value of the STFT result.
    """
    complex_stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    return np.abs(complex_stft)

def find_peaks(spectrogram, threshold=20):
    """Locate local maxima of a 2-D spectrogram that exceed ``threshold``.

    Args:
        spectrogram: 2-D array of magnitudes (callers in this notebook pass
            the output of ``compute_spectrogram``).
        threshold: Minimum magnitude (strictly greater-than) for a peak to be kept.

    Returns:
        list: ``(row, column)`` index tuples of the surviving peaks, which the
        rest of the notebook treats as ``(freq, time)``.
    """
    # Fully connected 2-D structuring element shared by the max filter and the erosion.
    footprint = generate_binary_structure(2, 2)

    # A cell is a local maximum when it equals the maximum over its neighbourhood.
    neighbourhood_max = maximum_filter(spectrogram, footprint=footprint)
    is_local_max = neighbourhood_max == spectrogram

    # Exact-zero regions trivially equal their own maximum; erode that background
    # mask and XOR it out so flat silence is not reported as peaks.
    silent = spectrogram == 0
    silent_interior = binary_erosion(silent, structure=footprint, border_value=1)
    candidates = np.logical_xor(is_local_max, silent_interior)

    # Keep only candidates that are loud enough.
    loud = spectrogram > threshold
    hits = np.where(np.logical_and(candidates, loud))
    return list(zip(hits[0], hits[1]))


In [8]:

def plot_waveform(audio, sr):
    """Draw the time-domain waveform of ``audio`` in a new 10x4 figure and show it.

    Args:
        audio: Signal handed to ``librosa.display.waveshow``.
        sr: Sample rate of ``audio``, forwarded to ``waveshow``.

    NOTE(review): relies on ``librosa.display`` being reachable after a plain
    ``import librosa``; older librosa releases may need an explicit
    ``import librosa.display`` -- confirm against the installed version.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(audio, sr=sr, ax=ax)
    ax.set_title('Waveform')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Amplitude')
    fig.tight_layout()
    plt.show()

def plot_spectrogram(stft, sr, hop_length):
    """Show a magnitude spectrogram in dB (relative to its maximum) on a log-frequency axis.

    Args:
        stft: Magnitude spectrogram; converted with ``librosa.amplitude_to_db``.
        sr: Sample rate forwarded to ``librosa.display.specshow``.
        hop_length: Hop length forwarded to ``librosa.display.specshow``.

    NOTE(review): relies on ``librosa.display`` being reachable after a plain
    ``import librosa``; older librosa releases may need an explicit
    ``import librosa.display`` -- confirm against the installed version.
    """
    db_spectrogram = librosa.amplitude_to_db(stft, ref=np.max)

    fig, ax = plt.subplots(figsize=(10, 6))
    mesh = librosa.display.specshow(
        db_spectrogram,
        sr=sr,
        hop_length=hop_length,
        y_axis='log',
        x_axis='time',
        ax=ax,
    )
    ax.set_title('Spectrogram (Time v/s Frequency)')
    fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
    fig.tight_layout()
    plt.show()

def plot_peaks_on_spectrogram(stft, sr, hop_length, peaks):
    """Show the dB spectrogram with the detected peaks overlaid as green crosses.

    Args:
        stft: 2-D magnitude spectrogram; its first axis is used as the
            frequency-bin axis (``n_fft`` is recovered as ``2 * (rows - 1)``).
        sr: Sample rate used for both the spectrogram axes and the peak positions.
        hop_length: Hop length (in samples) between spectrogram frames.
        peaks: Either the list of ``(freq_bin, frame)`` tuples returned by
            ``find_peaks`` or an ``np.where``-style tuple
            ``(freq_bins_array, frames_array)``.
    """
    # Normalise both accepted peak layouts into two parallel float arrays.
    where_style = (
        isinstance(peaks, tuple)
        and len(peaks) == 2
        and all(isinstance(axis, np.ndarray) for axis in peaks)
    )
    if where_style:
        freq_bins = np.asarray(peaks[0], dtype=float)
        frames = np.asarray(peaks[1], dtype=float)
    else:
        # reshape(-1, 2) keeps an empty peak list valid (shape (0, 2)).
        pairs = np.asarray(list(peaks), dtype=float).reshape(-1, 2)
        freq_bins, frames = pairs[:, 0], pairs[:, 1]

    # The plot axes are in seconds and Hz, so bin/frame indices must be
    # converted before scattering; plotting raw bin indices misplaces every peak.
    n_fft = max(2 * (stft.shape[0] - 1), 1)
    peak_times = frames * hop_length / sr
    peak_freqs = freq_bins * sr / n_fft

    plt.figure(figsize=(10, 6))
    librosa.display.specshow(librosa.amplitude_to_db(stft, ref=np.max),
                             sr=sr, hop_length=hop_length,
                             y_axis='log', x_axis='time')
    plt.scatter(peak_times, peak_freqs, color='green', marker='x')
    plt.title('Spectrogram with Peaks')
    plt.colorbar(format='%+2.0f dB')
    plt.tight_layout()
    plt.show()

In [17]:
def generate_addresses(song_id, peaks, fanout=20, anchor_offset=3):
    """
    Build Shazam-style fingerprint addresses from a list of spectrogram peaks.

    Peaks are ordered by time (then frequency). For every position ``i`` from
    ``anchor_offset`` up to ``len(peaks) - fanout - 1`` the peak at
    ``i - anchor_offset`` acts as anchor and is paired with the ``fanout``
    peaks that follow position ``i``.

    Args:
        song_id (str): The ID of the song stored with each address.
        peaks (list of tuples): List of (frequency, time) tuples.
        fanout (int): Number of target peaks paired with each anchor.
        anchor_offset (int): How many positions the anchor sits before ``i``.

    Returns:
        dict: { (f1, f2, delta_t): (t_anchor, song_id) }. When the same
        address occurs more than once, the last occurrence wins.
    """
    ordered = sorted(peaks, key=lambda peak: (peak[1], peak[0]))
    fingerprints = {}
    target_steps = range(1, fanout + 1)

    for zone_idx in range(anchor_offset, len(ordered) - fanout):
        f_anchor, t_anchor = ordered[zone_idx - anchor_offset]
        entry = (t_anchor, song_id)

        for step in target_steps:
            f_target, t_target = ordered[zone_idx + step]
            dt = round(t_target - t_anchor, 5)
            if not dt >= 0:
                # Only forward-in-time pairs become addresses.
                continue
            # Plain assignment: a repeated address overwrites the earlier entry.
            fingerprints[(f_anchor, f_target, dt)] = entry

    return fingerprints



def print_addresses(address_dict):
    """
    Print every address in the dictionary, one per line, after a header.

    Args:
        address_dict (dict): Maps (f1, f2, delta_t) to (t_anchor, song_id).
    """
    print("Generated Addresses (Address -> (Anchor Time, Song ID)):\n")
    for address in address_dict:
        anchor_time, song_id = address_dict[address]
        print(f"Address: {address} -> Anchor Time: {anchor_time}, Song ID: {song_id}")

In [18]:

def initialize_fingerprint_database():
    """
    Create a fresh, empty fingerprint database.

    Returns:
        dict: A new empty dictionary; each call returns a distinct object.
    """
    empty_database = dict()
    return empty_database


def add_song_to_database(database, file_path):
    """
    Processes a single audio file and adds its fingerprints to the database.

    Insertion is idempotent: an ``(anchor_time, song_id)`` entry that is
    already stored under an address is not appended again, so re-running the
    call for the same file does not inflate later match counts.

    Args:
        database (dict): The fingerprint database to add to. Maps each address
            to a list of ``(anchor_time, song_id)`` tuples; mutated in place.
        file_path (str): Path to the audio file to process.

    Returns:
        tuple: (song_id, num_addresses_added), where the count covers only
        entries that were actually appended by this call.
    """
    # Extract song ID from file name
    song_id = extract_songId(file_path)

    # Fingerprint pipeline: waveform -> magnitude spectrogram -> peaks -> addresses.
    audio, _ = load_audio(file_path)
    spectrogram = compute_spectrogram(audio)
    peaks = find_peaks(spectrogram)
    addresses = generate_addresses(song_id, peaks)

    # Track how many addresses were added for this song
    addresses_added = 0

    # Several songs may share one address, so every address owns a list.
    # Distinct loop names keep the outer `song_id` from being shadowed.
    for address, (anchor_time, entry_song_id) in addresses.items():
        entry = (anchor_time, entry_song_id)
        bucket = database.setdefault(address, [])
        if entry in bucket:
            continue  # already stored by an earlier call
        bucket.append(entry)
        addresses_added += 1

    print(f"Added song '{song_id}' with {addresses_added} addresses")
    return song_id, addresses_added

In [19]:
# Shared in-memory fingerprint database for the cells below.
# Re-running this cell rebinds `db` to a new, empty dictionary.
db = initialize_fingerprint_database()


In [21]:
file_path = "shape_of_you.mp3"
# Named `song_id` (not `id`) so the builtin `id()` is not shadowed for the rest of the session.
song_id = extract_songId(file_path)
# Intermediate results are kept as notebook globals for inspection;
# add_song_to_database() below recomputes the same pipeline from file_path.
audio, sample_rate = load_audio(file_path)
spectrogram = compute_spectrogram(audio)
peaks = find_peaks(spectrogram)
addresses = generate_addresses(song_id, peaks)
# Index the reference song; the returned (song_id, count) tuple is the cell output.
add_song_to_database(db, file_path)


Added song 'shape_of_you' with 140010 addresses


('shape_of_you', 140010)

In [24]:
def match_sample(sample_file_path, database):
    """
    Identifies a song from a sample audio file using the fingerprint database.

    Each sample address found in the database counts once for every distinct
    song stored under it, so the confidence is the percentage of sample
    addresses matched by the best song and cannot exceed 100.

    Args:
        sample_file_path (str): Path to the sample audio file.
        database (dict): The fingerprint database to match against. Maps each
            address to a list of ``(anchor_time, song_id)`` tuples.

    Returns:
        tuple: (identified_song_id, confidence_score, match_details).
        ``identified_song_id`` is ``None`` when nothing matched;
        ``match_details`` holds ``total_sample_addresses``, ``total_matches``
        and the per-song ``song_matches`` counts.
    """
    # Process sample audio
    audio, _ = load_audio(sample_file_path)
    spectrogram = compute_spectrogram(audio)
    peaks = find_peaks(spectrogram)

    # The sample's own ID is never consulted during matching, so a local
    # placeholder is used instead of depending on a notebook-level global.
    sample_addresses = generate_addresses("__sample__", peaks)

    # Per-song number of sample addresses found in the database.
    song_matches = {}
    for address in sample_addresses:
        # dict.fromkeys de-duplicates the songs under this address while
        # keeping first-seen order, so tie-breaking stays deterministic.
        songs_here = dict.fromkeys(song_id for _, song_id in database.get(address, ()))
        for song_id in songs_here:
            song_matches[song_id] = song_matches.get(song_id, 0) + 1
    total_matches = sum(song_matches.values())

    # Find the song with the most matches (first one wins on a tie).
    most_common_song = max(song_matches, key=song_matches.get, default=None)
    max_count = song_matches.get(most_common_song, 0)

    # Calculate confidence score (percentage of matched addresses)
    confidence = (max_count / len(sample_addresses)) * 100 if sample_addresses else 0

    match_details = {
        "total_sample_addresses": len(sample_addresses),
        "total_matches": total_matches,
        "song_matches": song_matches
    }

    print(f"Best match: {most_common_song} with {max_count} matches ({confidence:.2f}% confidence)")
    return most_common_song, confidence, match_details

# Example usage: identify a recorded clip against the `db` dictionary populated earlier.
# The returned (song_id, confidence, match_details) tuple is the cell output.
match_sample(sample_file_path="shapeOfYou_recorded.wav",database=db)

Best match: shape_of_you with 294 matches (12.49% confidence)


('shape_of_you',
 12.489379779099405,
 {'total_sample_addresses': 2354,
  'total_matches': 330,
  'song_matches': {'shape_of_you': 294, 'The Emptiness Machine': 36}})