Permalink
Browse files

Improved toolserver status API and added a few relative DOM stats

  • Loading branch information...
1 parent d0d1720 commit f969185dca5918039214089fb7a58b4ac9155816 @slaporte committed Oct 2, 2012
Showing with 69 additions and 13 deletions.
  1. +32 −4 all_revisions.py
  2. +16 −5 dashboard.py
  3. +1 −1 input_server.py
  4. +16 −1 inputs/dom.py
  5. +4 −2 loupe.py
View
@@ -25,6 +25,7 @@
DESIRED_PROPS = ["rev_sha1", "rev_len", "rev_timestamp", "rev_minor_edit", "rev_user_text", "rev_comment", "rev_deleted", "rev_user", "rev_id"]
+LOG_OUTSTANDING_LIMIT = 7200 # 2 hours
class AccessLogger(object):
def __init__(self, logfile):
@@ -34,13 +35,33 @@ def __init__(self, logfile):
write_handler = logging.FileHandler(logfile)
self.logger.addHandler(write_handler)
- def log(self, action, hostname, params):
- self.logger.info(json.dumps({'time': time.asctime(), 'action': action, 'hostname': hostname, 'params': params}))
def log(self, action, hostname, params, start_time=None):
    """Write one JSON-encoded event to the access log.

    The entry records the current ``time.time()`` under ``'time'`` together
    with the given ``action``, ``hostname``, ``params`` and ``start_time``.

    ``start_time`` defaults to ``None`` so that the earlier three-argument
    call form ``log(action, hostname, params)`` keeps working.
    """
    entry = {
        'time': time.time(),
        'action': action,
        'hostname': hostname,
        'params': params,
        'start_time': start_time,
    }
    self.logger.info(json.dumps(entry))
+
def outstanding(self, lines=30):
    """Count recent 'start' events that have no matching 'complete' event.

    Reads the last ``lines`` log entries via ``self.read`` and keeps those
    whose ``'age'`` is below ``LOG_OUTSTANDING_LIMIT``. A start and a
    complete event match when their hostname, params and start_time are
    all equal.

    ``lines`` defaults to the previously hard-coded window of 30 entries.

    Returns a dict of the form ``{'openlog': <count>}``.
    """
    # TODO: fix (note carried over from the original; the intended fix is
    # not specified there)

    def run_key(entry):
        # .get() so an entry missing one of the fields cannot raise KeyError
        return (entry.get('hostname'), entry.get('params'), entry.get('start_time'))

    recents = [entry for entry in self.read(lines)
               if entry['age'] < LOG_OUTSTANDING_LIMIT]
    starts = set(run_key(entry) for entry in recents
                 if entry.get('action') == 'start')
    completes = set(run_key(entry) for entry in recents
                    if entry.get('action') == 'complete')
    return {'openlog': len(starts - completes)}
def read(self, no):
    """Return the last ``no`` log entries, most recently written first.

    Each non-blank line of ``self.logfile`` is parsed as JSON. In the
    returned dicts ``'time'`` is replaced by its ``time.ctime`` string and
    an ``'age'`` key is added holding the rounded number of seconds between
    the entry's timestamp and now.

    Returns an empty list when ``no`` is not positive or the file holds no
    entries.
    """
    if no <= 0:
        # lines[-0:] would select the whole file rather than nothing
        return []

    # Context manager so the handle is closed even if parsing fails later
    with open(self.logfile, 'r') as history:
        lines = [raw for raw in history if raw.strip()]

    now = time.time()
    ret = []
    for raw in lines[-no:]:
        entry = json.loads(raw)
        written = entry['time']
        # NOTE(review): this expects a numeric timestamp; entries written in
        # the earlier format (time.asctime() strings) will raise here.
        entry['time'] = time.ctime(written)
        entry['age'] = round(now - written)
        ret.append(entry)
    ret.reverse()
    return ret
+
LOG = AccessLogger('access.log')
@@ -101,8 +122,9 @@ def write_log():
action = request.query.action
hostname = request.query.hostname
params = request.query.params
+ start_time = request.query.start_time
if action and hostname and params:
- LOG.log(action, hostname, params)
+ LOG.log(action, hostname, params, start_time)
return {'log': LOG.read(1), 'write': 'success'}
else:
return {'write': 'failure'}
@@ -120,6 +142,12 @@ def read_log(lines=10):
return {'log': LOG.read(lines)}
@route('/openlog')
@route('/openlog/')
def print_open():
    """Serve the outstanding-query summary produced by ``LOG.outstanding()``."""
    open_summary = LOG.outstanding()
    return open_summary
+
+
@route('/revisions/<title:path>')
def get_revisions(title):
article = ArticleHistory(title)
View
@@ -47,13 +47,16 @@ def __init__(self, louper, *args, **kwargs):
self.inputs = louper.input_classes
self.failed_stats = louper.failed_stats
self.fetch_failures = louper.fetch_failures
-
+ self.start_time = time.time()
self.tpool = None
self.toolserver_uptime = self.get_toolserver_uptime()
self.start_time = kwargs.get('start_time') or time.time()
self.start_cmd = kwargs.get('start_cmd') or ' '.join(sys.argv)
self.host_machine = kwargs.get('hostname') or socket.gethostname()
- self.send_toolserver_log('start')
+ self.open_toolserver_queries = self.get_toolserver_openlog()
+ if self.open_toolserver_queries > 0:
+ print '\nNote: there are', self.open_toolserver_queries, 'open queries on toolserver\n'
+ self.send_toolserver_log('start', start_time=self.start_time)
self.route('/', callback=self.render_dashboard, template='dashboard')
self.route('/summary', callback=self.get_summary_dict, template='summary')
self.route('/all_results', callback=self.get_all_results)
@@ -142,12 +145,20 @@ def get_report(self):
def get_toolserver_uptime(self):
    """Fetch the toolserver uptime report and attach the open-query count.

    Returns the JSON object from the ``/uptime`` endpoint with an added
    ``'open_queries'`` key. If the request (or updating the result) fails,
    the error is printed and whatever was obtained so far is returned: an
    empty dict when the fetch itself failed.
    """
    res = {}  # bound up front so a failed fetch cannot leave it undefined
    try:
        res = wapiti.get_json('http://ortelius.toolserver.org:8089/uptime')
        # getattr: this method may be called before open_toolserver_queries
        # has been assigned on the instance; report None in that case.
        res['open_queries'] = getattr(self, 'open_toolserver_queries', None)
    except Exception as e:
        print('Error getting toolserver stats: %s' % (e,))
    return res
- def send_toolserver_log(self, action):
- params = {'action': action, 'hostname': self.host_machine, 'params': self.start_cmd}
def get_toolserver_openlog(self):
    """Return the number of open queries reported by the toolserver.

    Reads the ``'openlog'`` value from the ``/openlog`` endpoint. This is a
    best-effort status check: if the request fails or the response has no
    ``'openlog'`` entry, the error is printed and 0 is returned.
    """
    try:
        res = wapiti.get_json('http://ortelius.toolserver.org:8089/openlog')
        return res['openlog']
    except Exception as e:
        print('Error getting toolserver stats: %s' % (e,))
        return 0
+
+ def send_toolserver_log(self, action, start_time=0):
+ params = {'action': action, 'hostname': self.host_machine, 'params': self.start_cmd, 'start_time': start_time}
try:
wapiti.get_url('http://ortelius.toolserver.org:8089/writelog/', params=params)
except Exception as e:
@@ -157,7 +168,7 @@ def render_dashboard(self, final=False):
ret = self.get_dict()
if final:
ret['toolserver_final'] = self.get_toolserver_uptime()
- self.send_toolserver_log('complete')
+ self.send_toolserver_log('complete', start_time=self.start_time)
else:
ret['toolserver_final'] = False
return ret
View
@@ -52,4 +52,4 @@ def do_input(input_name, page_title='', page_id=None):
if __name__ == '__main__':
bottle.debug(True)
- run(host='0.0.0.0', port=8700, server='gevent', reloader=True)
+ run(host='0.0.0.0', port=8701, reloader=True)
View
@@ -59,10 +59,17 @@ def xpath_search(f):
return xpath_search
def per_word(feature, f):
    """Return the number of ``feature`` matches per word of paragraph text.

    ``f`` is called with a selector string and must return an object that
    supports ``len()``; for the ``'p'`` selector it must also provide a
    ``text()`` method (presumably a PyQuery-style document; confirm against
    the caller). The word count is the number of whitespace-separated
    tokens in ``f('p').text()``.

    Always returns a float; 0.0 when there are no words.
    """
    words = len(f('p').text().split())
    if not words:
        return 0.0
    # float() keeps this a true division under Python 2 as well
    return len(f(feature)) / float(words)
+
class DOM(Input):
prefix = 'd'
-
+
def api_fetch(self):
"""
Deprecated fetch() that gets parsed content from the API.
@@ -89,6 +96,13 @@ def process(self, f_res):
'word_count': lambda f: len(f('p').text().split()),
'p': lambda f: dist_stats(paragraph_counts(f)),
+ # Key stats relative to word count
+ 'img_per_w': lambda f: per_word('.image', f),
+ 'cite_per_w': lambda f: per_word('li[id^="cite_note"]', f),
+ 'int_link_per_w': lambda f: per_word('p a[href^="/wiki/"]', f),
+ 'red_link_per_w': lambda f: per_word('.new', f),
+ 'ext_link_per_w': lambda f: per_word('.external', f),
+
# Section-based page structure stats
'h2': lambda f: section_stats(f('h2')),
'h3': lambda f: section_stats(f('h3')),
@@ -153,6 +167,7 @@ def process(self, f_res):
'unicode_count': lambda f: len(f('.unicode, .Unicode')),
# Template inspection, mostly fault detection
+ 'tmpl_general': lambda f: len(f('.ambox')),
'tmpl_delete': lambda f: len(f('.ambox-delete')),
'tmpl_autobiography': lambda f: len(f('.ambox-autobiography')),
'tmpl_advert': lambda f: len(f('.ambox-Advert')),
View
@@ -49,9 +49,10 @@ def add(self, grn, *args, **kwargs):
class ArticleLoupe(Greenlet):
- def __init__(self, title, page_id, input_classes=None, input_pool=None, *args, **kwargs):
+ def __init__(self, title, page_id, page_ns, input_classes=None, input_pool=None, *args, **kwargs):
self.title = title
self.page_id = page_id
+ self.page_ns = page_ns
if input_classes is None:
input_classes = DEFAULT_INPUTS
self.inputs = [i(title=self.title,
@@ -154,7 +155,7 @@ def run(self):
print 'Creating Loupes for', len(self.page_ds), 'articles...'
create_i = 0
for pd in self.page_ds:
- al = ArticleLoupe(pd.title, pd.page_id, input_pool=self.input_pool, input_classes=self.input_classes)
+ al = ArticleLoupe(pd.title, pd.page_id, pd.ns, input_pool=self.input_pool, input_classes=self.input_classes)
create_i += 1
al.create_i = create_i
al.link(self.on_loupe_complete)
@@ -181,6 +182,7 @@ def on_loupe_complete(self, loupe):
output_dict = loupe.results
output_dict['title'] = loupe.title
output_dict['id'] = loupe.page_id
+ output_dict['ns'] = loupe.page_ns
output_dict['times'] = loupe.times
self.output_file.write(json.dumps(output_dict, default=str))

0 comments on commit f969185

Please sign in to comment.