In [14]:
import array
import contextlib
import hashlib
import sys
import wave

import matplotlib
import numpy

from matplotlib import pyplot, mlab

from scipy.ndimage.filters import maximum_filter
from scipy.ndimage.morphology import generate_binary_structure, binary_erosion, iterate_structure

In [2]:
# Number of samples per FFT segment; get_spectrogram passes this as NFFT.
WINDOW_SIZE = 4096
# Number of samples shared by consecutive segments; get_spectrogram passes this as noverlap.
WINDOW_OVERLAP = 2048

In [3]:
def read_wav(file):
    """Read a 16-bit PCM Wave file into an array of samples.

    :param file: location of the Wave file (anything accepted by ``wave.open``)
    :return: tuple ``(samples, params)`` where ``samples`` is an
        ``array.array`` of typecode ``"h"`` holding every sample in file
        order and ``params`` is the value returned by ``getparams()``.
        Channels are not separated: for multi-channel files the samples
        stay interleaved as stored in the file.
    :raises ValueError: if the samples are not 2 bytes wide
    """
    with contextlib.closing(wave.open(file)) as f:
        params = f.getparams()
        sample_width = f.getsampwidth()
        # The "h" typecode below is a 2-byte signed integer; decoding any
        # other sample width with it would silently yield garbage values.
        if sample_width != 2:
            raise ValueError(
                "read_wav only supports 16-bit samples, got {0}-bit".format(8 * sample_width)
            )
        frames = f.readframes(f.getnframes())

    samples = array.array("h", frames)
    # Wave data is little-endian while array.array uses the native byte
    # order, so swap on big-endian hosts.
    if sys.byteorder == "big":
        samples.byteswap()
    return samples, params

In [4]:
def get_spectrogram(signal, window_size=None, window_overlap=None, sample_rate=44100):
    """Run FFT to get the spectrogram of a signal, in decibels.

    :param signal: array of amplitudes
    :param window_size: number of samples per FFT segment; defaults to the
        module-level ``WINDOW_SIZE`` (looked up at call time)
    :param window_overlap: number of samples shared by consecutive segments;
        defaults to the module-level ``WINDOW_OVERLAP`` (looked up at call time)
    :param sample_rate: sampling frequency of ``signal`` in Hz
    :return: numpy 2D array of ``10 * log10(power)``; rows are frequency
        bins and columns are time segments. Bins whose power is exactly
        zero are reported as 0 rather than ``-inf``.
    """
    # Resolve the defaults lazily so that re-running the configuration cell
    # takes effect without having to redefine this function.
    if window_size is None:
        window_size = WINDOW_SIZE
    if window_overlap is None:
        window_overlap = WINDOW_OVERLAP

    power = matplotlib.mlab.specgram(
        signal,
        NFFT=window_size,
        Fs=sample_rate,
        window=matplotlib.mlab.window_hanning,
        noverlap=window_overlap
    )[0]

    # log10(0) evaluates to -inf and emits a divide-by-zero warning; that
    # case is expected and handled on the next line, so silence the warning.
    with numpy.errstate(divide='ignore'):
        result = 10 * numpy.log10(power)
    result[numpy.isneginf(result)] = 0
    return result

In [5]:
def get_peaks(image, plot=False, neighborhood_iterations=20, min_amplitude=10):
    """Get the peaks from the 2D array, and do some filtering to reduce the number of peaks.

    :param image: 2D array representing the image; axis 0 is treated as
        frequency and axis 1 as time
    :param plot: when True, show the image with the retained peaks overlaid
    :param neighborhood_iterations: number of times the basic 4-connected
        structure is dilated to build the local-maximum neighborhood
    :param min_amplitude: peaks whose value is not strictly greater than
        this threshold are discarded
    :return: List of (time_index, frequency_index), in row-major order of
        the peak mask (ascending frequency index, then time index)
    """
    # http://stackoverflow.com/questions/3684484/peak-detection-in-a-2d-array
    structure = generate_binary_structure(2, 1)
    neighborhood = iterate_structure(structure, neighborhood_iterations)

    # A pixel is a local maximum when it equals the maximum of its neighborhood.
    local_max = maximum_filter(image, footprint=neighborhood) == image
    # Flat zero regions also satisfy the test above; XOR them back out.
    background = (image == 0)
    eroded_background = binary_erosion(background, structure=neighborhood, border_value=1)
    detected_peaks = numpy.bitwise_xor(local_max, eroded_background)

    # detected_peaks is a 2D mask; boolean indexing and numpy.where both
    # walk it in row-major order, so the three sequences line up.
    amplitudes = image[detected_peaks].flatten()
    freq, t = numpy.where(detected_peaks)

    # Keep only the peaks that are loud enough.
    filtered_peaks = [
        peak for peak in zip(t, freq, amplitudes) if peak[2] > min_amplitude
    ]

    # get indices for frequency and time
    time = [peak[0] for peak in filtered_peaks]
    frequency = [peak[1] for peak in filtered_peaks]

    # scatter of the peaks
    if plot:
        fig, ax = pyplot.subplots()
        ax.imshow(image)
        ax.scatter(time, frequency)
        ax.set_xlabel('Time')
        ax.set_ylabel('Frequency')
        ax.set_title("Spectrogram")
        ax.invert_yaxis()
        pyplot.show()

    return list(zip(time, frequency))

In [6]:
def create_fingerprints(peaks, fan_out=14):
    """Hash pairs of nearby peaks into fingerprints.

    Every peak is paired with up to ``fan_out`` peaks that follow it in
    ``peaks``. Each pair is hashed with SHA-1 over the concatenated decimal
    strings of both frequencies and their time difference.

    :param peaks: sequence of (time, frequency) entries
    :param fan_out: maximum number of following peaks paired with each peak
    :return: list of (sha1_hex_digest, time_of_first_peak) tuples
    """
    fingerprints = []
    for i in range(len(peaks)):
        t1 = peaks[i][0]
        freq1 = peaks[i][1]
        # Slicing clamps at the end of the list, so no bounds check is needed.
        for partner in peaks[i + 1:i + 1 + fan_out]:
            t2 = partner[0]
            freq2 = partner[1]
            delta = t2 - t1

            # NOTE(review): the three fields are joined without a separator,
            # so e.g. (1, 23, 4) and (12, 3, 4) hash identically. Left as-is
            # to keep digests compatible with existing fingerprints.
            key = str(freq1) + str(freq2) + str(delta)
            digest = hashlib.sha1(key.encode('utf-8')).hexdigest()

            fingerprints.append((digest, t1))

    return fingerprints

In [9]:
# Load the first recording: `data` holds the samples, `params` the Wave header parameters.
data, params = read_wav('../Audio-files/Violin.wav')

In [10]:
# Placeholder track metadata; no other cell visible in this notebook reads it.
metadata = {'title':'title', 'author':'author', 'album':'album'}

In [15]:
# Spectrogram of the first recording.
spec = get_spectrogram(data)

In [16]:
# Peak coordinates of the first spectrogram (plot defaults to False, so no figure is drawn).
peaks = get_peaks(spec)

In [18]:
# Fingerprints of the first recording: a list of (sha1 hex digest, time index) tuples.
fp1 = create_fingerprints(peaks)

In [21]:
# Display the fingerprints of the first recording.
fp1

[('3b928622f8479780e1593e188d5315edb7d6f786', 16),
 ('e303b7b8250f3a02cdbd79d73e95658424392d81', 16),
 ('dc8b57eaba544e461fe7c5650f5893605f8ce11f', 16),
 ('e584ae42b8ed52092696bcc75e3c4df2ab188252', 16),
 ('757578fbf23ffa4d748e0800dd7c424a46feb0cc', 16),
 ('5e6b18be1a1965de8c0a0584f8a562752e6d97ee', 16),
 ('efe629421443bb85b08ff60cf37affa2b3b448df', 16),
 ('7c8da4c128f59e06d632d9399c683eef58bf29a7', 16),
 ('a11717eeea81bf773b343cf698955d21f8fe3b7c', 16),
 ('f72eac7bf25ba6d14078fde862cd752731fb2303', 16),
 ('8ff04a560c6d9293caf967d4495bb9a551861f3e', 16),
 ('4bacef4f0792e65ce5b1e5a6c92442ba172d9871', 16),
 ('4abd266684d2612dff8a659969252abc9a25ea28', 16),
 ('0a46055d0d4442a765c05ba6aeba22d5428ca8b5', 16),
 ('b7553c6080f116f0115b1f080e03484b98292d8b', 114),
 ('899776728673ef5edc1b9050b3a4c8469bc5b740', 114),
 ('8a25788ac0f9439c596d51e0bb6d1c4dccdc707a', 114),
 ('6c361bd9fd7939910f17c16f4f2d3bac84213e83', 114),
 ('3977fd8d4fabf170200aa09e29777aecd20b7a01', 114),
 ('72c30aa4f3578a1cd4c183a

In [74]:
class Fingerprint(object):
    """A hash value together with the time offset at which it occurs.

    Two fingerprints compare equal when their hash values are equal; the
    time offset does not take part in equality or hashing.
    """

    def __init__(self, hashvalue, timeoffset):
        """Store the hash value and its time offset.

        :param hashvalue: hash identifying the fingerprint
        :param timeoffset: position at which the fingerprint occurs
        """
        self.hashvalue = hashvalue
        self.timeoffset = timeoffset

    def __eq__(self, other):
        """Compare by hash value only.

        Objects without a ``hashvalue`` attribute are not comparable, so
        NotImplemented is returned and Python falls back to its default
        comparison instead of raising AttributeError.
        """
        try:
            other_hashvalue = other.hashvalue
        except AttributeError:
            return NotImplemented
        return self.hashvalue == other_hashvalue

    def __hash__(self):
        """Hash consistently with __eq__.

        Defining __eq__ alone makes a class unhashable in Python 3; this
        keeps instances usable in sets and as dict keys.
        """
        return hash(self.hashvalue)

    def __repr__(self):
        return "Fingerprint({0!r}, {1!r})".format(self.hashvalue, self.timeoffset)

    def __str__(self):
        return "Fingerprint: {0} at offset {1}".format(self.hashvalue, self.timeoffset)

In [75]:
def find_matches(fingerprints_1, fingerprints_2):
    """Find the fingerprints shared by two collections.

    Both arguments must contain objects exposing ``hashvalue`` and
    ``timeoffset`` attributes (e.g. ``Fingerprint``). Note that
    ``create_fingerprints`` in this notebook returns plain
    ``(hash, offset)`` tuples, which need wrapping first, for example
    ``[Fingerprint(*pair) for pair in pairs]``.

    :param fingerprints_1: iterable of fingerprints
    :param fingerprints_2: iterable of fingerprints
    :return: dict mapping ``fp_1.timeoffset - fp_2.timeoffset`` to the
        shared hash value. When several matching pairs produce the same
        offset difference, the last pair found wins (outer order of
        ``fingerprints_1``, inner order of ``fingerprints_2``).
    """
    # Index the second collection by hash once, so each fingerprint of the
    # first collection is resolved with one lookup instead of a full scan.
    offsets_by_hash = {}
    for fp_2 in fingerprints_2:
        offsets_by_hash.setdefault(fp_2.hashvalue, []).append(fp_2.timeoffset)

    matches = {}
    for fp_1 in fingerprints_1:
        # The offsets are stored in their original order, so the
        # "last write wins" behavior matches a nested scan.
        for offset_2 in offsets_by_hash.get(fp_1.hashvalue, ()):
            matches[fp_1.timeoffset - offset_2] = fp_1.hashvalue

    return matches

In [22]:
# Load the second recording: `data2` holds the samples, `params2` the Wave header parameters.
data2, params2 = read_wav('RECORDING2.wav')

In [23]:
# Spectrogram of the second recording.
spec2 = get_spectrogram(data2)

In [24]:
# Peak coordinates of the second spectrogram.
peaks2 = get_peaks(spec2)

In [25]:
# Fingerprints of the second recording: a list of (sha1 hex digest, time index) tuples.
fp2 = create_fingerprints(peaks2)

In [28]:
# Display the fingerprints of the second recording.
fp2

[('40c483cdc22ec0410710eb8bcdf582c3333f8c0c', 477),
 ('7fef42a47a28d28672aa7619154ed8281e236ec2', 477),
 ('f62934cd4372e906a9d0e3e458e52023c3f82788', 477),
 ('1ff7c085d5141700d19137cc8b0d88ee8ddbd43a', 477),
 ('86ecf5f20f3e8ae59318bb431ad661f0fa785cfc', 477),
 ('7e5855ec14645440a2d707cebc1123dbcaafb5d2', 477),
 ('c9440be95ed53d9dc284175360f753e0e9bd512d', 477),
 ('b3586cd9e1afb38da6d6e5cffb9dbb755c8a0c48', 477),
 ('c4ea27dae579e170bd46383c5ff520a892b4e9fa', 477),
 ('603f600539fa3b7b2182949eca148e55c559b0ed', 477),
 ('ebba9fdb69c7448589741f81d8532d976291bc85', 477),
 ('9a6b59da46371d86eb75259c956aee4b580c0119', 477),
 ('e55c51226b32a82e2375801f68a215a1f170edee', 477),
 ('73d28516c42ee48bae8603c60ee1b531c2a1e005', 477),
 ('5ff78093655d38dd6857169731b121f014dbbf12', 11),
 ('ac8373a504222bffe1ae5c6750ef817f2fea668f', 11),
 ('fa9cf3b6deb16f127de06789bd6af3a2356ae0f0', 11),
 ('0c99842df15780bca357621bfd6f8eafaadd8a26', 11),
 ('0f708099e7bf4d1be4c4bcfdd6a4029d573d27b2', 11),
 ('596257044c3522