first commit

spotify · Jan 12, 2016 · aa0d963 · aa0d963
commit aa0d963
Show file tree

Hide file tree

Showing 117 changed files with 1,394 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Require at least CMake 2.6.
+CMAKE_MINIMUM_REQUIRED (VERSION 2.6)
+PROJECT (spotify-libechoprintserver)
+# Build libechoprintserver.c as a shared library named "echoprintserver".
+ADD_LIBRARY(echoprintserver SHARED libechoprintserver.c)
diff --git a/README.md b/README.md
@@ -0,0 +1,187 @@
+# echoprint-server #
+
+A C library, with Python bindings, for fast indexing and querying of
+echoprint data.
+
+
+## Usage ##
+
+To build, run `python setup.py install`.
+The following documents convenience scripts in the `bin/` directory.
+
+#### WARNING ####
+
+The library uses a custom binary format for speed. At this point,
+**ENDIANNESS IS NOT CHECKED** so moving index files between machines
+with different architectures might cause problems.
+
+
+### `echoprint-decode` ###
+
+Convert a codestring as output by `echoprint-codegen` into
+the corresponding list of codes represented as comma-separated integers.
+
+Usage:
+
+	echoprint-codegen song.ogg > codegen_output.json
+	cat codegen_output.json | jq -r '.[0].code' | echoprint-decode > codes.txt
+
+`codes.txt` will look like:
+
+`150555,1035718,621673,794882,40662,955768,96899,166055,...`
+
+*N.B. This script outputs only the echoprint codes, not the
+ offsets.*
+
+
+### `echoprint-inverted-index` ###
+
+Takes a series of echoprint strings (one per line) and
+an output path. Writes a compact index to disk.
+
+Usage:
+
+    cat ... | ./echoprint-inverted-index index.bin
+
+`index.bin` format is binary, see the implementation details below.
+
+If more than 65535 songs are indexed, the output will be split into
+blocks with the following naming scheme:
+
+    index.bin_0000
+	index.bin_0001
+	...
+
+Optionally, the `-p` (`--already-parsed`) switch changes the input format
+to a comma-separated list of integer codes (one song per line).
+
+### `echoprint-inverted-query` ###
+
+Takes a series of echoprint strings (one per line) and a list of index
+blocks. For each query outputs results on stdout as json-encoded
+objects.
+
+Usage:
+
+    cat ... | ./echoprint-inverted-query index-file-1 [index-file-2 ...]
+
+where the input is one echoprint string per line.
+
+Each output line looks like the following:
+
+```
+{
+  "results": [
+    {
+      "index": 0,
+      "score": 0.69340412080287933
+    },
+    {
+      "index": 8,
+      "score": 0.56301175890117883
+    },
+    {
+      "index": 120,
+      "score": 0.31826272477954626
+    },
+    ...
+```
+
+
+The `index` field represents the position of the matched song in the
+index.
+
+Optionally, the `-p` (`--already-parsed`) switch changes the input format
+to a comma-separated list of integer codes (one song per line).
+
+
+## REST service ##
+
+The `echoprint-rest-service` script listens for POST requests (by
+default on port 5678), with an echoprint string as the `echoprint`
+parameter. The `test-rest.sh` script shows how to query using `curl`.
+
+The request is made to `host:port/query/<METHOD>` with `<METHOD>` one of
+
+- `jaccard`
+- `set_int`
+- `set_int_norm_length_first`
+
+Usage:
+
+	echoprint-rest-service index-file-1 [index-file-2 ...]
+
+The optional `--ids-file` accepts a path to a text file where each
+line represents an id for the correspondingly-indexed track in the
+index. If specified, the returned results will have an `id` field.
+
+## Example: querying from audio ##
+
+Assuming `0005dad86d4d4c6fb592d42d767e117f.ogg` is in the current
+directory, let's cut it from 00:30 to 4:30 and re-encode it as 128
+kbps mp3 (to show that echoprint is robust to alterations in the
+file):
+
+
+	ffmpeg -i 0005dad86d4d4c6fb592d42d767e117f.ogg \
+		-ss 30 -t 240 \
+		0005dad86d4d4c6fb592d42d767e117f_cut_lowrate.mp3
+
+Run the echoprint codegen, extract the echoprint string:
+
+    ../echoprint-codegen/echoprint-codegen \
+        0005dad86d4d4c6fb592d42d767e117f_cut_lowrate.mp3 \
+        | jq -r '.[0].code' \
+        > 0005dad86d4d4c6fb592d42d767e117f_cut_lowrate.echoprint
+
+Query the service:
+
+    curl -s --data \
+        echoprint=`cat 0005dad86d4d4c6fb592d42d767e117f_cut_lowrate.echoprint` \
+        <server-path>:5678/query/jaccard
+
+Results should be similar to
+
+    {
+      "results": [
+        {
+          "id": "0005dad86d4d4c6fb592d42d767e117f",
+          "index": 0,
+          "score": 0.34932565689086914
+        },
+        {
+          "id": "ee59c151d679413a80ac4e49ac92c662",
+          "index": 698096,
+          "score": 0.033668458461761475
+        },
+        {
+          "id": "026526e6a02648668ff9f410faab15be",
+          "index": 312466,
+          "score": 0.015930989757180214
+        },
+        ...
+      ]
+    }
+
+
+## Building the standalone C library ##
+
+Build with CMake.
+
+Depending on the platform, `libechoprintserver.so` (linux) or
+`libechoprintserver.dylib` will be created. On linux it might be
+necessary to put the library file in `LD_LIBRARY_PATH`.
+
+
+## Implementation details ##
+
+### Similarity ###
+
+The similarity between two echoprints is computed on their
+**bag-of-words** representations. This means that the codes' offsets
+are not considered, nor are the codes' multiplicities.
+
+### Inverted index binary format ###
+
+The inverted index is serialized as a memory dump of all the fields of
+the `EchoprintInvertedIndex` struct defined in the header file.
diff --git a/bin/echoprint-decode b/bin/echoprint-decode
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+import sys
+import base64
+import zlib
+from itertools import izip_longest
+from echoprint_server import decode_echoprint
+
+if __name__ == '__main__':
+    for line in sys.stdin:
+        offsets, codes = decode_echoprint(line.strip())
+        print ','.join([str(c) for c in codes])
diff --git a/bin/echoprint-inverted-index b/bin/echoprint-inverted-index
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import sys
+import argparse
+from echoprint_server import load_inverted_index, create_inverted_index, \
+    parsed_code_streamer, parsing_code_streamer
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--already-parsed', action='store_true',
+                        help='input has been already parsed as a \
+                        comma-separated list of integer codes (no offset \
+                        information)')
+    parser.add_argument('indexfile', help='output path')
+    args = parser.parse_args()
+    streamer = parsed_code_streamer if args.already_parsed \
+               else parsing_code_streamer
+    create_inverted_index(streamer(sys.stdin), args.indexfile)
diff --git a/bin/echoprint-inverted-index-size b/bin/echoprint-inverted-index-size
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import argparse
+from echoprint_server import load_inverted_index, inverted_index_size
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('indexfiles', nargs='+', \
+                        help='inverted index files (in order)')
+    args = parser.parse_args()
+    inverted_index = load_inverted_index(args.indexfiles)
+    print inverted_index_size(inverted_index)
diff --git a/bin/echoprint-inverted-query b/bin/echoprint-inverted-query
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import argparse
+import sys
+import json
+from echoprint_server import \
+    load_inverted_index, query_inverted_index, \
+    parsed_code_streamer, parsing_code_streamer
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--already-parsed', action='store_true',
+                        help='input has been already parsed as a \
+                        comma-separated list of integer codes (no offset \
+                        information)')
+    parser.add_argument('indexfiles', nargs='+', \
+                        help='inverted index files (in order)')
+    args = parser.parse_args()
+    inverted_index = load_inverted_index(args.indexfiles)
+    streamer = parsed_code_streamer if args.already_parsed \
+               else parsing_code_streamer
+    for codes in streamer(sys.stdin):
+        print json.dumps(
+            {'results' : query_inverted_index(
+                codes, inverted_index, 'jaccard')})
diff --git a/bin/echoprint-rest-service b/bin/echoprint-rest-service
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# encoding: utf-8
+import argparse
+import sys
+from operator import itemgetter
+from flask import Flask, jsonify, request
+from echoprint_server import \
+    decode_echoprint, query_inverted_index, load_inverted_index
+
+use_tornado = False
+try:
+    from tornado.wsgi import WSGIContainer
+    from tornado.httpserver import HTTPServer
+    from tornado.ioloop import IOLoop
+    use_tornado = True
+except:
+    print 'cannot import tornado'
+
+app = Flask(__name__)
+
+
+@app.route('/query/<method>', methods=['POST'])
+def rest_query(method):
+    NRES = 20
+    echoprint_string = request.form['echoprint']
+    _, codes = decode_echoprint(str(echoprint_string))
+    results = query_inverted_index(codes, app.inverted_index, str(method))
+    # optionally augment results with gids
+    if app.gids is not None:
+        for r in results:
+            r['id'] = app.gids[r['index']]
+    return jsonify(results=results)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--flask', action='store_true')
+    parser.add_argument('-i', '--ids-file',
+                        help='ids_file contains track ids, one per line')
+    parser.add_argument('-p', '--port', type=int, default=5678,
+                        help='service port (default: 5678)')
+    parser.add_argument('inverted_index_paths', nargs='+')
+    args = parser.parse_args()
+
+    app.inverted_index = load_inverted_index(args.inverted_index_paths)
+    if app.inverted_index is None:
+        print >> sys.stderr, 'loading inverted index from %s failed' % \
+            args.inverted_index_dir
+        exit(1)
+    print 'loaded inverted index'
+
+    if args.ids_file is not None:
+        app.gids = [l.strip() for l in open(args.ids_file)]
+    else:
+        app.gids = None
+
+    if args.flask or (not use_tornado):
+        print 'starting app with flask'
+        app.run(debug=True, host='0.0.0.0', port=args.port)
+    else:
+        http_server = HTTPServer(WSGIContainer(app))
+        http_server.listen(args.port)
+        IOLoop.instance().start()
diff --git a/echoprint_server/__init__.py b/echoprint_server/__init__.py
@@ -0,0 +1,6 @@
+from .lib import \
+    decode_echoprint, create_inverted_index, \
+    parsed_code_streamer, parsing_code_streamer
+from echoprint_server_c import \
+    load_inverted_index, inverted_index_size, \
+    query_inverted_index
diff --git a/echoprint_server/lib.py b/echoprint_server/lib.py
@@ -0,0 +1,57 @@
+import base64
+import zlib
+import shutil
+import itertools
+from echoprint_server_c import _create_index_block
+
+
+def split_seq(iterable, size):
+    it = iter(iterable)
+    item = list(itertools.islice(it, size))
+    while item:
+        yield item
+        item = list(itertools.islice(it, size))
+
+
+def decode_echoprint(echoprint_b64_zipped):
+    '''
+    Decode an echoprint string as output by `echoprint-codegen`.
+    The function returns offsets and codes as list of integers.
+    '''
+    zipped = base64.urlsafe_b64decode(echoprint_b64_zipped)
+    unzipped = zlib.decompress(zipped)
+    N = len(unzipped)
+    offsets = [int(''.join(o), 16) for o in split_seq(unzipped[:N/2], 5)]
+    codes =  [int(''.join(o), 16) for o in split_seq(unzipped[N/2:], 5)]
+    return offsets, codes
+
+
+def create_inverted_index(songs, output_path):
+    '''
+    Create an inverted index from an iterable of song codes.
+    For large number of songs (>= 65535) several files will be created,
+    output_path_0001, output_path_0002, ...
+    '''
+    n_batches = 0
+
+    for batch_index, batch in enumerate(split_seq(songs, 65535)):
+        batch_output_path = output_path + ('_%04d' % batch_index)
+        _create_index_block(list(batch), batch_output_path)
+        n_batches += 1
+    if n_batches == 1:
+        shutil.move(batch_output_path, output_path)
+
+
+def parsed_code_streamer(fstream):
+    '''
+    Convenience generator for reading comma-separated list of integers
+    '''
+    for line in fstream:
+        yield [int(c) for c in line.strip().split(',')]
+
+def parsing_code_streamer(fstream):
+    '''
+    Convenience generator for converting echoprint strings into codes
+    '''
+    for line in fstream:
+        yield decode_echoprint(line.strip())[1]