Browse files

Various speedups and optimizations

The largest single performance increase comes from a sleazy hack
in pages.py that makes virtual directories use the newly-cached
context.cache_page_children() when walking the real directory
hierarchy. This change drastically speeds up everything in
Wandering Thoughts that is based on virtual directories.

Other changes are smaller but include fiddling in storage.py and
pages.py to basically eliminate pointless lstat() calls (these are
a surprisingly hot path in many cases) and some code bumming.
  • Loading branch information...
1 parent 1e2a021 commit edc074e6ee8ce706e554533c442a10502c61b072 @siebenmann committed Jan 31, 2013
Showing with 124 additions and 38 deletions.
  1. +10 −9 htmlview.py
  2. +53 −18 pageranges.py
  3. +25 −2 pages.py
  4. +20 −8 storage.py
  5. +16 −1 utils.py
View
19 htmlview.py
@@ -34,6 +34,13 @@ class WebServices:
def __init__(self, cfg, model):
self.cfg = cfg
self.model = model
+ # .url_from_path() is such a hot path that it is worth
+ # pre-computing the rooturl rather than doing it every
+ # time.
+ ru = self.cfg.get('publicurl', self.cfg['rooturl'])
+ if ru[-1] != '/':
+ ru = ru + '/'
+ self.rooturl = ru
def prefDirView(self, page):
return self.model.pref_view_and_dir(page, pub_dir_views())[0]
@@ -51,20 +58,14 @@ def url_from_path(self, path, view = None, viewparams = None):
if page.type == "dir" and path:
path = path + '/'
url = urllib.quote(path)
- if not viewparams:
- viewparams = {}
if view:
+ if not viewparams:
+ viewparams = {}
t = ["?%s" % urllib.quote(view)]
for k, v in viewparams.items():
t.append("%s=%s" % (k, urllib.quote_plus(v)))
url = url + "&".join(t)
- if 'publicurl' in self.cfg:
- ru = self.cfg['publicurl']
- else:
- ru = self.cfg['rooturl']
- if ru[-1] != '/':
- ru = ru + '/'
- return ru + url
+ return self.rooturl + url
# uri_from_path() returns something that is usable in a
# redirection.
View
71 pageranges.py
@@ -3,7 +3,7 @@
# (Actually it's a list of (modtime, path) tuples, but small difference.)
import re
-import time, calendar
+import time, calendar, datetime
import derrors, utils
import htmlrends
@@ -131,17 +131,35 @@ def restriction(context):
else:
return rtype
-# Compare our calendar range to a modtime.
-def calendar_cmp(cargs, modtime):
- t = time.localtime(modtime)
- for i in (0, 1, 2):
- if cargs[i] is None:
- break
- res = cmp(t[i], cargs[i])
- if res != 0:
- return res
- # Must be equal, we haven't failed yet.
- return 0
+# Convert a calendar range to two datetime.date objects: the first
+# day and the last day of the range.
+# crange[0] = year, crange[1] = month, crange[2] = day.
+# None/zero means 'not set', i.e. the range covers the entire month or
+# the entire year. Year is always set.
+def crange_to_limits(crange):
+ if crange[2]:
+ t = datetime.date(crange[0], crange[1], crange[2])
+ return (t, t)
+ elif crange[1]:
+ _, dcnt = calendar.monthrange(crange[0], crange[1])
+ return (datetime.date(crange[0], crange[1], 1),
+ datetime.date(crange[0], crange[1], dcnt))
+ else:
+ return (datetime.date(crange[0], 1, 1),
+ datetime.date(crange[0], 12, 31))
+
+# basically cmp(modtime, (start, end)):
+# returns: 0 if modtime falls within start to end
+# 1 if modtime > end
+# -1 if modtime < start
+def calendar_cmp(start, end, modtime):
+ t = datetime.date.fromtimestamp(modtime)
+ if t < start:
+ return -1
+ elif t > end:
+ return 1
+ else:
+ return 0
# Filter a (modtime, path) list based on the restriction chosen through
# virtualization.
@@ -170,8 +188,9 @@ def filter_files(context, flist):
rl = []
just_before = None
just_later = None
+ r1, r2 = crange_to_limits(rargs)
for e in flist:
- r = calendar_cmp(rargs, e[0])
+ r = calendar_cmp(r1, r2, e[0])
if r > 0:
just_before = e[0]
elif r < 0:
@@ -319,8 +338,23 @@ def entriesIn(context, ctuple):
dl = context.cache_page_children(context.page.me())
if not dl:
return False
+ cstart, cend = crange_to_limits(ctuple)
+
+ # We search for days a lot; this case is worth optimizing
+ # specifically.
+ # NOTE: this is lame.
+ if cstart == cend:
+ for e in dl:
+ t = datetime.date.fromtimestamp(e[0])
+ if cstart == t:
+ return e[0]
+ elif t < cstart:
+ return False
+ return False
+
+ # general case.
for e in dl:
- res = calendar_cmp(ctuple, e[0])
+ res = calendar_cmp(cstart, cend, e[0])
if res == 0:
return e[0]
# If we've passed the time, we can stop now.
@@ -361,12 +395,13 @@ def genBar(context, scopelist):
def outsideRange(context, cstart, cend):
before = None
after = None
+ cstart1, _ = crange_to_limits(cstart)
+ _, cend2 = crange_to_limits(cend)
for e in context.cache_page_children(context.page.me()):
- r1 = calendar_cmp(cend, e[0])
- r2 = calendar_cmp(cstart, e[0])
- if r1 > 0:
+ r = calendar_cmp(cstart1, cend2, e[0])
+ if r > 0:
before = e[0]
- elif r2 < 0:
+ elif r < 0:
after = e[0]
return (before, after)
return (before, after)
View
27 pages.py
@@ -77,6 +77,12 @@ def __init__(self, path, model):
self.timestamp = self.pfile.timestamp()
self.modstamp = self.pfile.modstamp()
+ # Internal use only, for virtual pages that do not want to
+ # load a nonexistent file from the page store.
+ def _setup(self, path, model):
+ self.name, self.path = utils.canon_path(path)
+ self.model = model
+
# Internal use only:
def _get(self, page):
return self.model.get_page(page)
@@ -107,6 +113,18 @@ def curdir(self):
def children(self, whattype = None):
if self.type != "dir":
return []
+ if whattype:
+ # Worth optimizing, since the common case is
+ # whattype == "dir" and in a large directory
+ # of blog entries we'd otherwise unnecessarily
+ # set up a lot of file pages.
+ pathlist = [utils.pjoin(self.path, z)
+ for z in self.pfile.contents()]
+ clist = [self._get(x) for x in pathlist if
+ self.model.pstore.get_type(x) == whattype]
+ return clist
+
+ # no whattype
clist = [self._get(utils.pjoin(self.path, z)) for
z in self.pfile.contents()]
if whattype:
@@ -192,7 +210,7 @@ def is_util(self):
class VirtDir(Page):
def __init__(self, path, model, root):
- super(VirtDir, self).__init__(path, model)
+ super(VirtDir, self)._setup(path, model) # sorta evil
self.root = root
self.type = "dir"
self.timestamp = root.timestamp
@@ -222,5 +240,10 @@ def me(self):
return self.root
def descendants(self, context):
+ # It is evil to call into the context so that we can load
+ # descendant information from the caches, but it vastly
+ # speeds up handling virtual directories. We'll live with
+ # it for now.
+ # (This just shows that the interface is wrong.)
return pageranges.filter_files(context,
- self.root.descendants(context))
+ context.cache_page_children(self.root))
View
28 storage.py
@@ -35,11 +35,12 @@ def join2(a, b):
# Implicitly, a plain file object is not displayable if it has no contents
# at all.
sBad, sInconsist, sGood, sLocked, sNoRCS = range(5)
+noStat = object()
class FileObj(object):
type = "file"
- def __init__(self, fname, st = None):
+ def __init__(self, fname, st = noStat):
# fname is fully resolvable
- if not st:
+ if st is noStat:
st = fillStat(fname)
self.real = bool(st)
self.name = fname
@@ -95,6 +96,8 @@ def modstamp(self):
# The owner of a normal file is a pretty simple idea in
# theory, but kind of annoying to implement in practice.
+ # FIXME: pwd.getpwuid() is a hot path for Atom feed generation
+ # and should be cacheable somehow.
def owner(self):
if not self.real:
return None
@@ -255,12 +258,7 @@ def __init__(self, cfginfo):
def validname(self, relname):
if relname == '':
return True
- if utils.boguspath(relname):
- return False
- for d in relname.split("/"):
- if not utils.good_path_elem(d):
- return False
- return True
+ return utils.goodpath(relname)
def get(self, relname, missIsNone = False):
# We don't need to revalidate the damn name if it's
@@ -291,6 +289,8 @@ def flush(self):
def set_cache(self, state):
self.cache_on = state
+ # INTERNAL INTERFACE: takes a full path name, not a relative
+ # name. This is dangerous.
def getStat(self, fname):
if fname in self.stcache:
return self.stcache[fname]
@@ -299,6 +299,18 @@ def getStat(self, fname):
self.stcache[fname] = st
return st
+ # Get the type of a name as a string (or None if it doesn't exist).
+ # The name is *relative*.
+ def get_type(self, relname):
+ fname = join2(self.root, relname)
+ st = self.getStat(fname)
+ if not st:
+ return None
+ elif stat_isdir(st):
+ return "dir"
+ else:
+ return "file"
+
def fromdir(self, relname, fname, st):
__pychecker__ = "no-argsused"
return DirObj(fname, st)
View
17 utils.py
@@ -1,4 +1,5 @@
#
+import re
# The less helpful version of os.path.join, for two arguments only, and
# if the second argument is an absolute path we do not discard the first.
@@ -33,6 +34,7 @@ def name_path(path):
return path.split('/')[-1]
# Like os.walk, but on pages and does not return directories.
+# TODO: unused and worth removing?
def walk(page):
res = []
for child in page.children():
@@ -67,22 +69,35 @@ def canonpath(dirn, path):
# Is a path a good path?
# This is called SO OFTEN that it is worth some micro optimizations.
+# Note: good_path_elem() beats an RE-based matcher.
badElem = dict.fromkeys(('.', '..', '', 'RCS'))
def good_path_elem(pelem):
return not (pelem in badElem or \
pelem[0] == '.' or \
pelem[-1] == '~' or \
pelem[-2:] == ",v")
-def goodpath(path):
+def goodpath_old(path):
if path == '':
return True
pelems = [x for x in path.split('/') if not good_path_elem(x)]
return not bool(pelems)
+# Note that the '.'-at-start pattern takes out '..' as well.
+# A path that starts with a '/' is not good; the RE catches this
+# through its empty alternative, which matches '^<empty>/'.
+# This beats goodpath_old().
+badpath_re = re.compile(r"(^|/)(\.[^/]*|RCS||[^/]*~|[^/]*,v)(/|$)")
+def goodpath(path):
+ if badpath_re.search(path) and path != "":
+ return False
+ else:
+ return True
+
# A bogus path is one that has directory motion elements in it that
# make us grind our teeth.
# Note that split's behavior means that disallowing '' as a path
# element also disallows paths starting with '/'.
+# As surprising as it might be, this implementation beats a regexp.
def boguspath(path):
if path == '':
return False

0 comments on commit edc074e

Please sign in to comment.