In [11]:
#Both of the styles below are same.
#Style 1
a=[1,2,3]
def fun1():
    for i in a:
        yield i
        
f = fun1()
print(next(f))
print(next(f))
print(next(f))
print(next(f))

1
2
3


StopIteration: 

In [12]:
#Style 2
a=[1,2,3]
def fun2():
    yield from a
        
f = fun2()
print(next(f))
print(next(f))
print(next(f))
print(next(f))

1
2
3


StopIteration: 

# https://github.com/dabeaz/generators/tree/master/examples

In [13]:
# nongenlog.py
#
# Sum up the number of bytes transferred in an Apache log file
# using a simple for-loop.   We're not using generators here.

with open("access-log") as wwwlog:
    total = 0
    for line in wwwlog:
        bytes_sent = line.rsplit(None,1)[1]
        if bytes_sent != '-':
            total += int(bytes_sent)
    print("Total", total)

FileNotFoundError: [Errno 2] No such file or directory: 'access-log'

In [14]:
# genlog.py
#
# Sum up the bytes transferred in an Apache server log using
# generator expressions

with open("access-log") as wwwlog:
    bytecolumn = (line.rsplit(None,1)[1] for line in wwwlog)
    bytes_sent = (int(x) for x in bytecolumn if x != '-')
    print("Total", sum(bytes_sent))

FileNotFoundError: [Errno 2] No such file or directory: 'access-log'

In [15]:
# Make a big log file for testing

import sys

if len(sys.argv) != 2:
    print("Usage : makebig.py repetitions", file=sys.stderr)
    raise SystemExit(1)

data = open("access-log").read()

with open("big-access-log","w") as f:
    for i in range(int(sys.argv[1])):
        f.write(data)

Usage : makebig.py repetitions


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [16]:
# genfind.py
#
# A function that generates files that match a given filename pattern

from pathlib import Path

def gen_find(filepat, top):
    yield from Path(top).rglob(filepat)

# Example use

if __name__ == '__main__':
    lognames = gen_find("access-log*","www")
    for name in lognames:
        print(name)

In [19]:
# genopen.py
#
# Takes a sequence of filenames as input and yields a sequence of file
# objects that have been suitably open

import gzip, bz2

def gen_open(paths):
    for path in paths:
        if path.suffix == '.gz':
            yield gzip.open(path, 'rt')
        elif path.suffix == '.bz2':
            yield bz2.open(path, 'rt')
        else:
            yield open(path, 'rt')

# Example use

if __name__ == '__main__':
    from pathlib import Path
    lognames = Path('www').rglob('access-log*')
    logfiles = gen_open(lognames)
    for f in logfiles:
        print(f)

In [18]:
# gencat.py
#
# Concatenate multiple generators into a single sequence

def gen_cat(sources):
    for src in sources:
        yield from src

# Example use

if __name__ == '__main__':
    from pathlib import Path
    from genopen import gen_open

    lognames = Path('www').rglob('access-log*')
    logfiles = gen_open(lognames)
    loglines = gen_cat(logfiles)
    for line in loglines:
        print(line,end='')

ModuleNotFoundError: No module named 'genopen'

In [20]:
# gengrep.py
#
# Grep a sequence of lines that match a re pattern

import re
def gen_grep(pat, lines):
    patc = re.compile(pat)
    return (line for line in lines if patc.search(line))

# Example use

if __name__ == '__main__':
    from pathlib import Path
    from genopen import  gen_open
    from gencat  import  gen_cat

    lognames = Path('www').rglob('access-log*')
    logfiles = gen_open(lognames)
    loglines = gen_cat(logfiles)

    # Look for ply downloads (PLY is my own Python package)
    plylines = gen_grep(r'ply-.*\.gz',loglines)
    for line in plylines:
        print(line, end='')

ModuleNotFoundError: No module named 'genopen'

In [21]:
# bytesgen.py
#
# An example of chaining together different generators into a processing
# pipeline.    

from genfind import *
from genopen import *
from gencat import *
from gengrep import *

pat    = r'ply-.*\.gz'
logdir = 'www'

filenames = gen_find("access-log*",logdir)
logfiles  = gen_open(filenames)
loglines  = gen_cat(logfiles)
patlines  = gen_grep(pat,loglines)
bytecol   = (line.rsplit(None,1)[1] for line in patlines)
bytes_sent= (int(x) for x in bytecol if x != '-')

print("Total", sum(bytes_sent))

ModuleNotFoundError: No module named 'genfind'

In [22]:
# retuple.py
#
# Read a sequence of log lines and parse them into a sequence of tuples

loglines = open("access-log")

import re

logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
           r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat   = re.compile(logpats)

groups   = (logpat.match(line) for line in loglines)
tuples   = (g.groups() for g in groups if g)

if __name__ == '__main__':
    for t in tuples:
        print(t)

FileNotFoundError: [Errno 2] No such file or directory: 'access-log'

In [23]:
# redict.py
#
# Read a sequence of log lines and parse them into a sequence of dictionaries

loglines = open("access-log")

import re

logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
           r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat   = re.compile(logpats)

groups   = (logpat.match(line) for line in loglines)
tuples   = (g.groups() for g in groups if g)

colnames = ('host','referrer','user','datetime',
            'method', 'request','proto','status','bytes')

log      = (dict(zip(colnames, t)) for t in tuples)

if __name__ == '__main__':
    for x in log:
        print(x)

FileNotFoundError: [Errno 2] No such file or directory: 'access-log'

In [24]:
# fieldmap.py
#
# Take a sequence of dictionaries and remap one of the fields

def field_map(dictseq, name, func):
    for d in dictseq:
        d[name] = func(d[name])
        yield d

# Example

if __name__ == '__main__':

    loglines = open("access-log")

    import re

    logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
               r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

    logpat   = re.compile(logpats)

    groups   = (logpat.match(line) for line in loglines)
    tuples   = (g.groups() for g in groups if g)

    colnames = ('host','referrer','user','datetime',
                'method', 'request','proto','status','bytes')

    log      = (dict(zip(colnames, t)) for t in tuples)

    log      = field_map(log,"status",int)
    log      = field_map(log,"bytes",
                         lambda s: int(s) if s != '-' else 0)

    
    for x in log:
        print(x)

FileNotFoundError: [Errno 2] No such file or directory: 'access-log'

In [25]:
# linesdir.py
#
# Generate a sequence of lines from files in a directory

from pathlib import Path
from gencat import *
from genopen import *

def lines_from_dir(filepat, dirname):
    names = Path(dirname).rglob(filepat)
    files = gen_open(names)
    lines = gen_cat(files)
    return lines

# Example use

if __name__ == '__main__':
    loglines = lines_from_dir("access-log*","www")
    for line in loglines:
        print(line, end='')

ModuleNotFoundError: No module named 'gencat'

In [26]:
# apachelog.py
#
# Parse an apache log file into a sequence of dictionaries

from fieldmap import field_map

import re

logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
           r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat   = re.compile(logpats)

def apache_log(lines):
    groups = (logpat.match(line) for line in lines)
    tuples = (g.groups() for g in groups if g)
    
    colnames = ('host','referrer','user','datetime',
                'method', 'request','proto','status','bytes')

    log      = (dict(zip(colnames,t)) for t in tuples)
    log      = field_map(log,"status",int)
    log      = field_map(log,"bytes",
                         lambda s: int(s) if s != '-' else 0)

    return log

# Example use:

if __name__ == '__main__':
    from linesdir import lines_from_dir
    lines = lines_from_dir("access-log*","www")
    log = apache_log(lines)
    for r in log:
        print(r)


ModuleNotFoundError: No module named 'fieldmap'

In [27]:
# query404.py
#
# Find the set of all documents that 404 in a log file

from linesdir import lines_from_dir
from apachelog import apache_log

lines = lines_from_dir("access-log*","www")
log = apache_log(lines)

stat404 =  {  r['request'] for r in log
              if r['status'] == 404 }

for r in sorted(stat404):
    print(r)

ModuleNotFoundError: No module named 'linesdir'

In [28]:
# largefiles.py
#
# Find all transfers over a megabyte

from linesdir import *
from apachelog import *

lines = lines_from_dir("access-log*","www")
log = apache_log(lines)

large = (r for r in log
         if r['bytes'] > 1000000)

for r in large:
    print(r['request'],r['bytes'])

ModuleNotFoundError: No module named 'linesdir'

In [29]:
# largest.py
#
# Find the largest file

from linesdir import lines_from_dir
from apachelog import apache_log

lines = lines_from_dir("access-log*","www")
log = apache_log(lines)

print("%d %s" % max((r['bytes'],r['request'])
                    for r in log))

ModuleNotFoundError: No module named 'linesdir'

In [30]:
# hosts.py
#
# Find unique host IP addresses

from linesdir import lines_from_dir
from apachelog import apache_log

lines = lines_from_dir("access-log*","www")
log = apache_log(lines)

hosts = set(r['host'] for r in log)
for h in hosts:
    print(h)

ModuleNotFoundError: No module named 'linesdir'

In [31]:
# downloads.py
#
# Find out how many downloads of a specific request

from linesdir import lines_from_dir
from apachelog import apache_log

lines = lines_from_dir("access-log*","www")
log = apache_log(lines)

request = 'ply/ply-2.3.tar.gz'

total = sum(1 for r in log
              if r['request'] == '/ply/ply-2.3.tar.gz')

print("Total", total)

ModuleNotFoundError: No module named 'linesdir'

In [32]:
# robots.py
#
# Find out who has been hitting robots.txt

from linesdir import lines_from_dir
from apachelog import apache_log

lines = lines_from_dir("access-log*","www")
log = apache_log(lines)

addrs = { r['host'] for r in log
            if 'robots.txt' in r['request'] }

import socket
for addr in addrs:
    try:
        print(socket.gethostbyaddr(addr)[0])
    except socket.herror:
        print(addr)

ModuleNotFoundError: No module named 'linesdir'

In [33]:
# follow.py
#
# Follow a file like tail -f.

import time
import os

def follow(thefile):
    thefile.seek(0, os.SEEK_END)
    while True:
        line = thefile.readline()
        if not line:
            time.sleep(0.1)
            continue
        yield line

# Example use
# Note : This example requires the use of an apache log simulator.
# 
# Go to the directory run/foo and run the program 'logsim.py' from
# that directory.   Run this program as a background process and
# leave it running in a separate window.  We'll write program
# that read the output file being generated
# 

if __name__ == '__main__':
    logfile = open("run/foo/access-log","r")
    loglines = follow(logfile)
    for line in loglines:
        print(line, end='')

FileNotFoundError: [Errno 2] No such file or directory: 'run/foo/access-log'

In [34]:
# realtime404.py
#
# Print all 404 requests as they happen in the log

from apachelog import apache_log
from follow import follow

logfile  = open("run/foo/access-log")
loglines = follow(logfile)
log      = apache_log(loglines)

r404 = (r for r in log if r['status'] == 404)

for r in r404:
    print(r['host'], r['datetime'], r['request'])

ModuleNotFoundError: No module named 'apachelog'

In [None]:
# genreceive.py
#
# A generator that yields connections to a TCP socket

import socket
def receive_connections(addr):
    s = socket.socket(socket.AF_INET,socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR,1)
    s.bind(addr)
    s.listen(5)
    while True:
        client = s.accept()
        yield client

# Example use

if __name__ == '__main__':
    for c, a in receive_connections(("",9000)):
        print("Got connection from", a)
        c.send(b"Hello World\n")
        c.close()

In [None]:
# genmessages.py
#
# A generator that yields messages on a UDP socket

import socket
def receive_messages(addr,maxsize):
    s = socket.socket(socket.AF_INET,socket.SOCK_DGRAM)
    s.bind(addr)
    while True:
        msg = s.recvfrom(maxsize)
        yield msg

# Example use
# To send a message to this generator, use the code "msgtest.py"

if __name__ == '__main__':
    for msg, addr in receive_messages(("",10000),1024):
        print(msg, "from", addr)

In [None]:
# genpickle.py
#
# Turn a sequence of objects into a sequence of pickle strings

import pickle

def gen_pickle(source):
    for item in source:
        yield pickle.dumps(item)

def gen_unpickle(infile):
    while True:
        try:
            item = pickle.load(infile)
            yield item
        except EOFError:
            return

In [None]:
# sendto.py
#
# Send items to a remote machine

import socket
from genpickle import gen_pickle

def sendto(source,addr):
    s = socket.socket(socket.AF_INET,socket.SOCK_STREAM)
    s.connect(addr)
    for pitem in gen_pickle(source):
        s.sendall(pitem)
    s.close()

# Example use.   This requires you to run receivefrom.py
# in a different process/window

if __name__ == '__main__':
    from apachelog import apache_log
    from follow import follow

    lines = follow(open("run/foo/access-log"))
    log = apache_log(lines)
    sendto(log,("",15000))

In [None]:
# receivefrom.py
#
# Receive objects from a different machine

import socket
from genpickle import gen_unpickle

def receivefrom(addr):
    s = socket.socket(socket.AF_INET,socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET,socket.SO_REUSEADDR, 1)
    s.bind(addr)
    s.listen(5)
    c, a = s.accept()
    for item in gen_unpickle(c.makefile('rb')):
        yield item
    c.close()

# Example use:
if __name__ == '__main__':
    for r in receivefrom(("",15000)):
        print(r['host'], r['request'])

In [None]:
# genqueue.py
#
# Generate a sequence of items that put onto a queue

def genfrom_queue(thequeue):
    while True:
        item = thequeue.get()
        if item is StopIteration: 
            break
        yield item

def sendto_queue(items, thequeue):
    for item in items:
        thequeue.put(item)
    thequeue.put(StopIteration)

# Example
if __name__ == '__main__':
    import queue, threading
    def consumer(q):
        for item in genfrom_queue(q):
            print("Consumed", item)
        print("done")

    in_q = queue.Queue()
    con_thr = threading.Thread(target=consumer,args=(in_q,))
    con_thr.start()

    # Now, pipe a bunch of data into the queue
    sendto_queue(range(100), in_q)
    

In [None]:
# genmulti.py
#
# Generate items from multiple generators (multiplex)
#

import queue, threading
from genqueue import genfrom_queue, sendto_queue
from gencat import gen_cat

def multiplex(sources):
    in_q = queue.Queue()
    consumers = []
    for src in sources:
        thr = threading.Thread(target=sendto_queue,
                               args=(src, in_q))
        thr.start()
        consumers.append(genfrom_queue(in_q))
    return gen_cat(consumers)

def gen_multiplex(genlist):
    item_q = queue.Queue()
    def run_one(source):
        for item in source: 
            item_q.put(item)

    def run_all():
        thrlist = []
        for source in genlist:
            t = threading.Thread(target=run_one, args=(source,))
            t.start()
            thrlist.append(t)
        for t in thrlist: 
            t.join()
        item_q.put(StopIteration)

    threading.Thread(target=run_all).start()
    while True:
        item = item_q.get()
        if item is StopIteration: 
            return
        yield item


# Example use
#
# This example requires you to perform these setup steps:
#
# 1.  Go to run/foo and run logsim.py
# 2.  Go to run/bar and run logsim.py
#
# These two steps will start writing two different Apache log files.
# Now, we're going to read from both at the same time.

if __name__ == '__main__':
    from follow import follow
    
    log1 = follow(open("run/foo/access-log"))
    log2 = follow(open("run/bar/access-log"))
    
    log = multiplex([log1,log2])
    
    for line in log:
        print(line, end='')

In [None]:
# broadcast.py
#
# Broadcast a generator source to a collection of consumers

def broadcast(source, consumers):
    for item in source:
        for c in consumers:
            c.send(item)


# Example
if __name__ == '__main__':

    class Consumer(object):
        def send(self,item):
            print(self, "got", item)

    c1 = Consumer()
    c2 = Consumer()
    c3 = Consumer()

    from follow import follow
    lines = follow(open("run/foo/access-log"))
    broadcast(lines,[c1,c2,c3])

In [None]:
# netsend.py
#
# Consume items and send them to a remote machine

import socket, pickle

class NetConsumer(object):
    def __init__(self,addr):
        self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.s.connect(addr)
    def send(self,item):
        pitem = pickle.dumps(item)
        self.s.sendall(pitem)
    def close(self):
        self.s.close()

# Example use.  This requires you to run receivefrom.py first.

if __name__ == '__main__':
    from broadcast import broadcast
    from follow import follow
    from apachelog import apache_log

    # A class that sends 404 requests to another host
    class Stat404(NetConsumer):
        def send(self,item):
            if item['status'] == 404:
                NetConsumer.send(self,item)
    
    stat404 = Stat404(("",15000))
    
    lines = follow(open("run/foo/access-log"))
    log   = apache_log(lines)
    broadcast(log,[stat404])


In [None]:
# thrsend.py
#
# Send items to consumer threads

import queue, threading
class ConsumerThread(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.setDaemon(True)
        self.in_queue = queue.Queue()
        self.target = target

    def send(self,item):
        self.in_queue.put(item)

    def generate(self):
        while True:
            item = self.in_queue.get()
            yield item

    def run(self):
        self.target(self.generate())

# Example Use

if __name__ == '__main__':
    from follow import follow
    from apachelog import apache_log
    from broadcast import broadcast
    
    def find_404(log):
        r404 = (r for r in log if r['status'] == 404)
        for r in r404:
            print(r['status'],r['datetime'],r['request'])

    def bytes_transferred(log):
        total = 0
        for r in log:
            total += r['bytes']
            print("Total bytes", total)
            
    c1 = ConsumerThread(find_404)
    c1.start()
    c2 = ConsumerThread(bytes_transferred)
    c2.start()
    
    lines = follow(open("run/foo/access-log"))
    log   = apache_log(lines)
    broadcast(log,[c1,c2])

In [None]:
# gentrace.py
#
# Trace a generator by printing items received

def trace(source):
    for item in source:
        print(item)
        yield item

# Example use
if __name__ == '__main__':
    from apachelog import *

    lines = open("access-log")
    log =  trace(apache_log(lines))
    r404 = (r for r in log if r['status'] == 404)

    for r in r404:
        pass

In [None]:
# storelast.py
#
# An iterator that stores the last value returned.  

class storelast(object):
    def __init__(self,source):
        self.source = source
    def __next__(self):
        item = self.source.__next__()
        self.last = item
        return item
    def __iter__(self):
        return self

# Example
if __name__ == '__main__':
    from follow import follow
    from apachelog import apache_log

    lines = storelast(follow(open("run/foo/access-log")))
    log   = apache_log(lines)

    for r in log:
        print(r)
        print(lines.last)

In [None]:
# genshutdown.py
#
# Example of shutting down a generator
#
# Requires you to run run/foo/logsim.py to get a real-time source

from follow import *

lines = follow(open("run/foo/access-log"))
for i, line in enumerate(lines):
    print(line, end='')
    if i == 10: 
        lines.close()

In [None]:
# shutdownevt.py
#
# Example of a generator that uses an event to shut down

import time

def follow(thefile,shutdown=None):
    thefile.seek(0,2)
    while True:
        if shutdown and shutdown.isSet(): break
        line = thefile.readline()
        if not line:
           time.sleep(0.1)
           continue
        yield line

import threading

shutdown_event = threading.Event()

def run():
    lines = follow(open("run/foo/access-log"),shutdown_event)
    for line in lines:
        print line,

    print "Done"


# Run the above in a separate thread
t = threading.Thread(target=run)
t.start()

# Wait a while then shut down


time.sleep(60)
print "Shutting down"

shutdown_event.set()

In [None]:
####Coroutine Example

def grep(pattern):
    print("Searching for:: ", pattern)
    while True:
        line = (yield)
        if pattern in line:
            print(line)
            
search = grep('coroutine')
next(search)
# Output: Searching for coroutine
search.send("I love you")
search.send("Don't you love me?")
search.send("I love coroutines instead!")
# Output: I love coroutines instead!
search.send("Another line for coroutine.")
# Output: "Another line for coroutine.
search.close() #closing the coroutine 


# The sent values are accessed by yield. Why did we run next()? It is required in order to start the coroutine. 
# Just like generators, coroutines do not start the function immediately. 
# Instead they run it in response to the __next__() and .send() methods. 
# Therefore, you have to run next() so that the execution advances to the yield expression.

In [None]:
# recvcount.py
#
# Example of a co-routine

def recv_count():
    try:
        while True:
            n = (yield)
            print("T-minus", n)
    except GeneratorExit:
        print("Kaboom!")

r = recv_count()
r.send(None)
for i in range(5,0,-1):
    r.send(i)

r.close()

In [None]:
# consumer.py
#
# consumer decorator and co-routine example

def consumer(func):
    def start(*args,**kwargs):
        c = func(*args,**kwargs)
        c.send(None)
        return c
    return start

# Example
if __name__ == '__main__':

    @consumer
    def recv_count():
        try:
            while True:
                n = yield
                print("T-minus", n)
        except GeneratorExit:
            print("Kaboom!")

    r = recv_count()
    for i in range(5,0,-1):
        r.send(i)

    r.close()

In [1]:
# logcoroutine.py
#
# An example of using co-routines to define consumers for the Apache log data

from consumer import *
from apachelog import *
from follow import *
from broadcast import *

@consumer
def find_404():
    while True:
        r = (yield)
        if r['status'] == 404:
            print(r['status'],r['datetime'],r['request'])

@consumer
def bytes_transferred():
    total = 0
    while True:
        r = (yield)
        total += r['bytes']
        print("Total bytes", total)

lines = follow(open("run/foo/access-log"))
log   = apache_log(lines)

broadcast(log, [find_404(),bytes_transferred()])

ModuleNotFoundError: No module named 'consumer'