Browse files

progress

  • Loading branch information...
1 parent 2d8ddc7 commit fc8cb264ad27b899f99364ac60db9712f1da6f14 @sdiehl committed Sep 21, 2012
Showing with 22,467 additions and 271 deletions.
  1. +18 −0 README.md
  2. +32 −6 example.py
  3. +95 −80 kaylee/client.py
  4. +203 −182 kaylee/server.py
  5. +8 −0 kaylee/utils.py
  6. +22,108 −0 mobydick.txt
  7. +3 −3 requirements.txt
View
18 README.md
@@ -1,3 +1,19 @@
+Motivation
+==========
+
+Kaylee is a small MapReduce implementation mostly meant as a
+proof of concept to illustrate the power of ZeroMQ and for
+educational purposes.
+
+My goal was not to write a Hadoop clone but to build a starting point
+that one could use to learn about MapReduce.
+
+The main bottleneck in this implementation is that the Shuffle
+phase requires all data to be moved to the ``server`` instance
+which is not generally a good idea for performance. But this lets
+us implement a simple shuffler using a Python defaultdict in
+just a few lines of code which is easy to understand.
+
Directions:
===========
@@ -9,6 +25,8 @@ For Arch Linux
For Ubuntu Linux
+ $ add-apt-repository ppa:chris-lea/zeromq
+ $ apt-get update
$ apt-get install zeromq-bin libzmq-dev libzmq0
For Macintosh:
View
38 example.py
@@ -1,15 +1,25 @@
import time
-from kaylee import Server
+import numpy
+import mmap
+from itertools import count
-# Example
-# -----------------------------------------------
+from kaylee import Server
+# Note, we never load the whole file into memory.
f = open('mobydick.txt')
+mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
-data = dict(enumerate(f.readlines()))
+# This just enumerates all lines in the file, but is able to
+# get data from disk into ZeroMQ much faster than read/writes.
+def datafn():
+ i = count(0)
+ total = mm.size()
+ while mm.tell() < total:
+ yield next(i), memoryview(mm.readline())
+ mm.close()
def mapfn(k, v):
- for w in v.split():
+ for w in v.bytes.split():
yield w, 1
def reducefn(k, v):
@@ -19,13 +29,29 @@ def reducefn(k, v):
s = Server()
s.connect()
+# yaml config
+# Datastore backend, Redis Moose kaylee://
+
+# kaylee
+# /key1
+# blob
+# blob
+# /key2
+# blob
+# blob
+# /key3
+
s.mapfn = mapfn
s.reducefn = reducefn
-s.data = data
+s.datafn = datafn
start = time.time()
s.start()
stop = time.time()
print stop-start
#print s.results()
+print sorted(s.results().iteritems(), key=lambda x: x[1], reverse=True)[1:25]
+
+# Use a multiprocessing Pool example! Not the general use case
+# though!
View
175 kaylee/client.py
@@ -1,25 +1,29 @@
import sys
import uuid
-import cPickle as pickle
+import numpy
import marshal
import types
import logging
+
import gevent
-from gevent_zeromq import zmq
-from utils import cat
+import zmq.green as zmq
+
+try:
+ import msgpack as srl
+except ImportError:
+ import cPickle as srl
+
from collections import defaultdict
-class Client:
+class Client(object):
def __init__(self):
self.worker_id = str(uuid.uuid4())
self.push_socket = None
self.pull_socket = None
- self.control_socket = None
- self.delim = '::'
+ self.ctrl_socket = None
- # only listen for instructions for this specific worker
self.threaded = False
self.have_bytecode = False
@@ -86,147 +90,158 @@ def connect(self, push_addr = None,
print addr
- self.control_socket = c.socket(zmq.SUB)
- self.control_socket.connect(addr)
- self.control_socket.setsockopt(zmq.SUBSCRIBE, self.worker_id)
-
- def spawn(self):
- self.threaded = True
+ self.ctrl_socket = c.socket(zmq.ROUTER)
+ self.ctrl_socket.setsockopt(zmq.IDENTITY, self.worker_id)
+ self.ctrl_socket.connect(addr)
def start(self):
''' Start processing work '''
self.logging.info('Started Worker %s' % self.worker_id)
self.collect()
def _kill(self):
- # Garbage collect the sockets to avoid weirdness
- self.control_socket.close()
+ self.ctrl_socket.close()
self.pull_socket.close()
self.push_socket.close()
- self.control_socket = None
- self.pull_socket = None
- self.push_socket = None
+ self.ctrl_socket = None
+ self.pull_socket = None
+ self.push_socket = None
self.logging.info('Stopped Worker')
- if self.threaded:
- gevent.getcurrent().kill()
- else:
- sys.exit(1)
+ sys.exit(0)
def collect(self):
- self.register()
poller = zmq.Poller()
poller.register(self.pull_socket, zmq.POLLIN)
- poller.register(self.control_socket, zmq.POLLIN)
+ poller.register(self.ctrl_socket, zmq.POLLIN)
# multiplex the pull and control ports
pull_socket = self.pull_socket
- control_socket = self.control_socket
+ ctrl_socket = self.ctrl_socket
while True:
- # Wait until the server pushes bytecode to use to
- # listening for data, ( this is a race condition
- # otherwise )
+
if self.have_bytecode:
+
try:
- socks = dict(poller.poll())
+ events = dict(poller.poll())
except zmq.ZMQError:
# Die gracefully if the user sends a SIGQUIT
self._kill()
break
- if pull_socket in socks and socks[pull_socket] == zmq.POLLIN:
- msg = self.pull_socket.recv()
+ if events.get(pull_socket) == zmq.POLLIN:
+
+ command = self.pull_socket.recv(flags=zmq.SNDMORE)
+ key = self.pull_socket.recv(flags=zmq.SNDMORE)
+ data = self.pull_socket.recv(copy=False)
+ payload = (key, data)
- if msg:
- command, data = msg.split(self.delim)
- self.process_command(command, data)
+ self.process_command(command, payload)
- if control_socket in socks and socks[control_socket] == zmq.POLLIN:
- msg = self.control_socket.recv()
+ if events.get(ctrl_socket) == zmq.POLLIN:
+ worker_id, command = self.ctrl_socket.recv_multipart()
+ self.process_command(command, data)
- if msg:
- worker, command, data = msg.split(self.delim)
- self.process_command(command, data)
else:
- msg = self.control_socket.recv()
+ self.logging.info('Waiting for server')
- if msg:
- worker, command, data = msg.split(self.delim)
- self.process_command(command, data)
+ msg = srl.dumps(('connect', self.worker_id))
+ self.push_socket.send_multipart(['connect', self.worker_id])
- def register(self):
- '''
- Register the node with the server.
- '''
- self.send_command('connect', self.worker_id)
+ worker_id, payload = self.ctrl_socket.recv_multipart()
+ command, (mapbc, reducebc) = srl.loads(payload)
+
+ assert command == 'bytecode'
+ self.set_bytecode(mapbc, reducebc)
+ self.logging.info('Received Bytecode')
def send_command(self, command, data=None):
'''
- Push a command to the sever.
+ Push a command to the server.
'''
- _d = self.delim
-
if data:
- pdata = pickle.dumps(data)
- self.push_socket.send(cat(command,_d,pdata))
- #logging.debug(command)
+ msg = srl.dumps((command, data))
+ self.push_socket.send(msg)
else:
- self.push_socket.send(cat(command,_d))
- #logging.debug(command)
+ msg = command
+ self.push_socket.send(msg)
- def set_bytecode(self, command, data):
+ def set_bytecode(self, mapbc, reducebc):
'''
Load the bytecode sent by the server and flag that we are
ready for work.
'''
- #self.logging.info('Received Bytecode')
- mapfn_bc, reducefn_bc = data
self.mapfn = types.FunctionType(
- marshal.loads(mapfn_bc),
+ marshal.loads(mapbc),
globals(),
'mapfn'
)
self.reducefn = types.FunctionType(
- marshal.loads(reducefn_bc),
+ marshal.loads(reducebc),
globals(),
'reducefn'
)
self.have_bytecode = True
def on_done(self, command=None, data=None):
- #self.logging.info('Done')
self._kill()
def call_mapfn(self, command, data):
- results = defaultdict(list)
+ #results = defaultdict(list)
key, value = data
for k, v in self.mapfn(key, value):
- results[k].append(v)
+ print 'mapping', k, v
+ # Probably don't actually want to do this, but
+        # instead collect up a temporary batch and then do a
+ # tight loop where we send everything.
+
+ self.push_socket.send('mapdone', flags=zmq.SNDMORE)
+ self.push_socket.send(key, flags=zmq.SNDMORE)
+ self.push_socket.send(k, flags=zmq.SNDMORE)
+ self.push_socket.send(srl.dumps(v))
+ #results[k].append(v)
- self.send_command('mapdone', (key, results))
+ self.push_socket.send('keydone', flags=zmq.SNDMORE)
+ self.push_socket.send(key)
+
+ #print 'mapping', key
+ #import pdb; pdb.set_trace()
+
+ #if isinstance(results, numpy.ndarray):
+ #self.push_socket.send(results, copy=False)
+ #else:
+ #self.push_socket.send(srl.dumps(results))
def call_reducefn(self, command, data):
key, value = data
- results = self.reducefn(key, value)
- self.send_command('reducedone', (key, results))
-
- def process_command(self, command, data=None):
- commands = {
- 'bytecode': self.set_bytecode,
- 'done': self.on_done,
- 'map': self.call_mapfn,
- 'reduce': self.call_reducefn,
- }
-
- if command in commands:
- if data:
- data = pickle.loads(data)
- commands[command](command, data)
+
+ from itertools import imap
+ it = imap(srl.loads, srl.loads(value))
+
+ results = self.reducefn(key, it)
+
+ print 'reducing', key
+ self.push_socket.send('reducedone', flags=zmq.SNDMORE)
+ self.push_socket.send(key, flags=zmq.SNDMORE)
+
+ if isinstance(results, numpy.ndarray):
+ self.push_socket.send(results, copy=False)
+ else:
+ self.push_socket.send(srl.dumps(results))
+
+ def process_command(self, command, payload=None):
+ self.commands[command](self, command, payload)
+
+ commands = {
+ 'done' : on_done,
+ 'map' : call_mapfn,
+ 'reduce' : call_reducefn,
+ }
if __name__ == "__main__":
c = Client()
View
385 kaylee/server.py
@@ -1,18 +1,24 @@
import random
import marshal
-import cPickle as pickle
import logging
import gevent
-from gevent_zeromq import zmq
-from utils import cat
+
+import zmq.green as zmq
from collections import defaultdict
-START = 0
-MAP = 1
-REDUCE = 2
-FINISHED = 3
+START = 0
+MAP = 1
+SHUFFLE = 2
+PARTITION = 3
+REDUCE = 4
+COLLECT = 5
+
+try:
+ import msgpack as srl
+except ImportError:
+ import cPickle as srl
-class Server:
+class Server(object):
def __init__(self):
@@ -21,259 +27,274 @@ def __init__(self):
self.mapfn = None
self.reducefn = None
- self.data = None
+ self.datafn = None
+
self.bytecode = None
self.started = False
self.completed = False
- self.delim = '::'
+
+ self.working_maps = {}
logging.basicConfig(logging=logging.DEBUG,
- format="%(asctime)s [%(levelname)s] %(message)s")
+ format="%(asctime)s [%(levelname)s] %(message)s")
logging.getLogger("").setLevel(logging.INFO)
self.logging = logging
- def connect(self, push_addr = None,
- pull_addr = None,
- control_addr = None):
-
- c = zmq.Context()
-
- # Pull tasks across manager
- if not pull_addr:
- prot = 'tcp://'
- ip = '127.0.0.1'
- port = '6666'
- addr = ''.join([prot,ip,':',port])
- elif len(pull_addr) > 1:
- prot, ip, port = pull_addr
- addr = ''.join([prot,ip,':',port])
- else:
- addr = pull_addr
-
- print addr
-
- self.pull_socket = c.socket(zmq.PULL)
- self.pull_socket.bind(addr)
-
- # Pull tasks across manager
- if not push_addr:
- prot = 'tcp://'
- ip = '127.0.0.1'
- port = '5555'
- addr = ''.join([prot,ip,':',port])
- elif len(push_addr) > 1:
- prot, ip, port = push_addr
- addr = ''.join([prot,ip,':',port])
- else:
- addr = push_addr
-
- print addr
-
- self.push_socket = c.socket(zmq.PUSH)
- self.push_socket.bind(addr)
-
- # Pull tasks across manager
- if not control_addr:
- prot = 'tcp://'
- ip = '127.0.0.1'
- port = '7777'
- addr = ''.join([prot,ip,':',port])
- elif len(control_addr) > 1:
- prot, ip, port = control_addr
- addr = ''.join([prot,ip,':',port])
- else:
- addr = control_addr
+ def main_loop(self):
+ self.started = True
+
+ poller = zmq.Poller()
- print addr
+ poller.register(self.pull_socket, zmq.POLLIN)
+ poller.register(self.push_socket, zmq.POLLOUT)
+ poller.register(self.ctrl_socket, zmq.POLLOUT)
- self.control_socket = c.socket(zmq.PUB)
- self.control_socket.bind(addr)
+ while self.started and not self.completed:
+ try:
+ events = dict(poller.poll())
+ except zmq.ZMQError:
+ self._kill()
+ break
+
+            # Specify number of nodes to request
+ if len(self.workers) > 0:
+ if events.get(self.push_socket) == zmq.POLLOUT:
+ self.start_new_task()
+ if events.get(self.ctrl_socket) == zmq.POLLIN:
+ self.manage()
+ if events.get(self.pull_socket) == zmq.POLLIN:
+ self.collect_task()
+ else:
+ if events.get(self.pull_socket) == zmq.POLLIN:
+ self.collect_task()
+ if events.get(self.ctrl_socket) == zmq.POLLIN:
+ self.manage()
+
+ def connect(self, push_addr = None, pull_addr = None, control_addr = None):
+ c = zmq.Context()
+
+ # Pull tasks across manager
+ if not pull_addr:
+ prot = 'tcp://'
+ ip = '127.0.0.1'
+ port = '6666'
+ addr = ''.join([prot,ip,':',port])
+ elif len(pull_addr) > 1:
+ prot, ip, port = pull_addr
+ addr = ''.join([prot,ip,':',port])
+ else:
+ addr = pull_addr
+ print addr
- def start(self, timeout=None):
+ self.pull_socket = c.socket(zmq.PULL)
+ self.pull_socket.bind(addr)
- self.started = True
- self.logging.info('Started Server')
+ if not push_addr:
+ prot = 'tcp://'
+ ip = '127.0.0.1'
+ port = '5555'
+ addr = ''.join([prot,ip,':',port])
+ elif len(push_addr) > 1:
+ prot, ip, port = push_addr
+ addr = ''.join([prot,ip,':',port])
+ else:
+ addr = push_addr
+
+ print addr
+
+ self.push_socket = c.socket(zmq.PUSH)
+ self.push_socket.bind(addr)
+
+ # Pull tasks across manager
+ if not control_addr:
+ prot = 'tcp://'
+ ip = '127.0.0.1'
+ port = '7777'
+ addr = ''.join([prot,ip,':',port])
+ elif len(control_addr) > 1:
+ prot, ip, port = control_addr
+ addr = ''.join([prot,ip,':',port])
+ else:
+ addr = control_addr
- try:
- if timeout:
- timeout = gevent.Timeout(timeout, gevent.Timeout)
+ print 'Control Socket', addr
- self.start_new_task()
- # Block until we collect all data
- gevent.spawn(self.collect).join(timeout=timeout)
+ self.ctrl_socket = c.socket(zmq.ROUTER)
+ self.ctrl_socket.bind(addr)
- except KeyboardInterrupt:
- self.started = False
- self.logging.info('Stopped Server')
+ def start(self, timeout=None):
+ self.gen_bytecode()
+ self.logging.info('Started Server')
- except gevent.Timeout:
- self.started = False
- self.logging.info('Timed out')
+ main = gevent.spawn(self.main_loop)
+ main.join()
self.done()
def done(self):
for worker in self.workers:
- self.send_control('done',None,worker)
+ self.ctrl_socket.send_multipart([worker, 'done'])
def _kill(self):
gevent.getcurrent().kill()
- def collect(self):
- while True:
- msg = self.pull_socket.recv()
-
- if msg:
- command, data = msg.split(self.delim)
- self.process_command(command, data)
-
def results(self):
if self.completed:
return self.reduce_results
else:
return None
- #@print_timing
- def send_control(self, command, data, worker):
- _d = self.delim
- self.logging.debug('Sending to: %s' % worker)
- if data:
- pdata = pickle.dumps(data)
- #logging.debug( "<- %s" % command)
- self.control_socket.send(cat(worker,_d,command,_d,pdata))
+ def send_datum(self, command, key, data):
+ self.push_socket.send(command, flags=zmq.SNDMORE)
+ self.push_socket.send(str(key), flags=zmq.SNDMORE)
+ # Do a multipart message since we want to do
+ # zero-copy of data.
+
+ if self.state == MAP:
+ self.push_socket.send(data, copy=False)
else:
- #logging.debug( "<- %s" % command)
- self.control_socket.send(cat(worker,_d,command ,_d))
-
- #@print_timing
- def send_command(self, command, data=None):
- _d = self.delim
- if data:
- pdata = pickle.dumps(data)
- #logging.debug( "<- %s" % command)
- self.push_socket.send(cat(command,_d, pdata))
+ self.push_socket.send(srl.dumps(data))
+
+ def send_command(self, command, payload=None):
+ if payload:
+ self.send_datum(command, *payload)
else:
- #logging.debug( "<- %s" % command)
- self.push_socket.send(cat(command ,_d))
+ self.push_socket.send(command)
def start_new_task(self):
- command, data = self.next_task()
- if command:
+ action = self.next_task()
+ if action:
+ command, data = action
self.send_command(command, data)
- #gevent.spawn(self.send_command, command, data)
def next_task(self):
if self.state == START:
- self.map_iter = iter(self.data)
- self.working_maps = {}
+ #self.job_id = 'foo'
+ self.map_iter = self.datafn()
self.map_results = defaultdict(list)
self.state = MAP
self.logging.info('Mapping')
if self.state == MAP:
+
try:
+ map_key, map_item = self.map_iter.next()
+ self.working_maps[str(map_key)] = map_item
+ #print 'sending', map_key
+ return 'map', (map_key, map_item)
+ except StopIteration:
+ self.logging.info('Shuffling')
+ self.state = SHUFFLE
- map_key = self.map_iter.next()
- map_item = map_key, self.data[map_key]
- self.working_maps[map_item[0]] = map_item[1]
- return 'map', map_item
+ if self.state == SHUFFLE:
+ self.reduce_iter = self.map_results.iteritems()
+ self.working_reduces = set()
+ self.reduce_results = {}
- except StopIteration:
- if len(self.working_maps) > 0:
- key = random.choice(self.working_maps.keys())
- return 'map', (key, self.working_maps[key])
- self.state = REDUCE
- self.reduce_iter = self.map_results.iteritems()
- self.working_reduces = {}
- self.reduce_results = {}
+ if len(self.working_maps) == 0:
self.logging.info('Reducing')
+ self.state = PARTITION
+ #else:
+ #self.logging.info('Still shuffling %s ' % len(self.working_maps))
+
+ if self.state == PARTITION:
+ self.state = REDUCE
if self.state == REDUCE:
+
try:
+ reduce_key, reduce_value = self.reduce_iter.next()
+ self.working_reduces.add(reduce_key)
+ return 'reduce', (reduce_key, reduce_value)
+ except StopIteration:
+ self.logging.info('Collecting')
+ self.state = COLLECT
- reduce_item = self.reduce_iter.next()
- self.working_reduces[reduce_item[0]] = reduce_item[1]
- return 'reduce', reduce_item
+ if self.state == COLLECT:
- except StopIteration:
+ if len(self.working_reduces) == 0:
+ self.completed = True
+ self.logging.info('Finished')
+ #else:
+ #self.logging.info('Still collecting %s' % len(self.working_reduces))
- if len(self.working_reduces) > 0:
- key = random.choice(self.working_reduces.keys())
- return 'reduce', (key, self.working_reduces[key])
+ def collect_task(self):
+ # Don't use the results if they've already been counted
+ command = self.pull_socket.recv(flags=zmq.SNDMORE)
- self.state = FINISHED
+ if command == 'connect':
+ payload = self.pull_socket.recv()
+ self.on_connect(payload)
- if self.state == FINISHED:
- self.completed = True
- # Destroy the collector thread
- self._kill()
+ elif command == 'keydone':
+ key = self.pull_socket.recv()
+ del self.working_maps[key]
- def map_done(self, data):
- # Don't use the results if they've already been counted
- key, value = data
- if key not in self.working_maps:
- return
+ elif command == 'mapdone':
+ key = self.pull_socket.recv(flags=zmq.SNDMORE)
+ tkey = self.pull_socket.recv(flags=zmq.SNDMORE)
+ value = self.pull_socket.recv()
- for k, v in value.iteritems():
- self.map_results[k].extend(v)
+ #print tkey, key, value
+ self.map_results[tkey].extend(value)
- del self.working_maps[key]
+ #del self.working_maps[key]
- def reduce_done(self, data):
- # Don't use the results if they've already been counted
- key, value = data
- if key not in self.working_reduces:
- return
+ elif command == 'reducedone':
+ key = self.pull_socket.recv(flags=zmq.SNDMORE)
+ value = srl.loads(self.pull_socket.recv())
+
+ # Don't use the results if they've already been counted
+ if key not in self.working_reduces:
+ return
+
+ self.reduce_results[key] = value
+ self.working_reduces.remove(key)
- self.reduce_results[key] = value
- del self.working_reduces[key]
+ else:
+ raise RuntimeError()
def on_map_done(self, command, data):
self.map_done(data)
- self.start_new_task()
def on_reduce_done(self, command, data):
self.reduce_done(data)
- self.start_new_task()
def gen_bytecode(self):
self.bytecode = (
marshal.dumps(self.mapfn.func_code),
marshal.dumps(self.reducefn.func_code),
)
- def on_connect(self, command, data):
- self.logging.info('Worker Registered: %s' % data)
- self.workers.add(data)
- worker_id = data
+ def on_connect(self, worker_id):
+ if worker_id not in self.workers:
+ self.logging.info('Worker Registered: %s' % worker_id)
+ self.workers.add(worker_id)
- # Store this so we don't call it for every worker
- if not self.bytecode:
- self.gen_bytecode()
+ payload = ('bytecode', self.bytecode)
+ self.ctrl_socket.send_multipart([worker_id, srl.dumps(payload)])
+ self.logging.info('Sending Bytecode')
+ else:
+ print worker_id
- self.send_control(
- 'bytecode',
- self.bytecode,
- worker_id
- )
+ def process_command(self, command, data=None):
+ self.commands[command](self, command, data)
- self.logging.info('Sending Bytecode')
- self.start_new_task()
+ commands = {
+ 'mapdone' : on_map_done,
+ 'reducedone' : on_reduce_done,
+ 'connect' : on_connect
+ }
- def process_command(self, command, data=None):
- commands = {
- 'mapdone': self.on_map_done,
- 'reducedone': self.on_reduce_done,
- 'connect': self.on_connect
- }
-
- if command in commands:
- if data:
- data = pickle.loads(data)
- commands[command](command, data)
- else:
- self.process_command(self, command, data)
+if __name__ == '__main__':
+
+ # Support Cython!
+ import sys
+ import imp
+
+ path = sys.argv[1]
+ imp.load_module(path)
View
8 kaylee/utils.py
@@ -1,4 +1,5 @@
import time
+import msgpack
def cat(*xs):
return "".join(xs)
@@ -11,3 +12,10 @@ def wrapper(*arg):
print '%s took %0.3f ms' % (func.func_name, (t2-t1)*1000.0)
return res
return wrapper
+
+def sub_subscription_prefix(worker_id, n=3):
+ """
+ Listen for n-tuples with the worker id prefix without
+ deserialization. Very fast.
+ """
+ return msgpack.dumps(tuple([worker_id] + [None]*(n-1)))[0:2]
View
22,108 mobydick.txt
22,108 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
6 requirements.txt
@@ -1,3 +1,3 @@
-gevent
-pyzmq
-gevent-zeromq
+gevent==0.13.8
+pyzmq>=2.2
+msgpack-python==0.2.1

0 comments on commit fc8cb26

Please sign in to comment.