#!/usr/bin/env python
"""Generate growth data since launch using datastore admin backup files.

TODO: port to BigQuery queries.

Outputs TSV files for each entity kind and a growth.tsv file with daily counts
for all kinds and other features. It's ugly, inadequately commented, poorly
tested, etc. Don't use it for anything remotely important!

Warning, takes >6h to run (on e.g. a 2014 MBP) and GBs of memory!

I used this to generate the graphs in [URL lost in extraction]
(by importing the output into [spreadsheet tool; URL lost in extraction]).

Datastore admin backups are LevelDB log files. This code is based on:
[URL lost in extraction]
More details: [URL lost in extraction] (search for 3.)

To download the files:

gsutil cp -r gs://BUCKET/\* .
"""
import collections
import csv
import datetime
import glob
import itertools
import logging
import sys
import urlparse
from google.appengine.api.files import records
from google.appengine.datastore import entity_pb
from google.appengine.api import datastore
from google.appengine.api import datastore_errors
# Per-user silo account entity kinds. NOTE(review): the original definition of
# SOURCE_KINDS was lost in extraction; these are Bridgy's source kinds circa
# 2014 — TODO confirm against the datastore before trusting the output.
SOURCE_KINDS = ('Blogger', 'FacebookPage', 'Flickr', 'GooglePlusPage',
                'Instagram', 'Tumblr', 'Twitter', 'WordPress')

# All entity kinds counted in the growth data.
KINDS = SOURCE_KINDS + ('Response', 'BlogPost', 'Publish', 'BlogWebmention')

# Product features a source account may have enabled.
FEATURES = ('listen', 'publish', 'webmention')

# Entity properties to keep; everything else is dropped to limit memory use.
INCLUDE_PROPS = {'features', 'sent', 'unsent', 'error', 'failed', 'skipped',
                 'links', 'domains', 'created', 'updated'}

# maps string kind to list of entities (property dicts)
all_entities = collections.defaultdict(list)
# read app engine datastore admin backup files
for filename in glob.glob('datastore_backup_*/*/*'):
print filename
with open(filename, 'rb') as raw:
reader = records.RecordsReader(raw)
for record in reader:
entity_proto = entity_pb.EntityProto(contents=record)
entity = datastore.Entity.FromPb(entity_proto)
except datastore_errors.Error:
logging.error('!!! Skipped an entity !!! %s' % entity.key().to_path(),
kind = entity.kind()
if kind not in KINDS:
props = {k: ' '.join(v.splitlines()).encode('utf-8')
if isinstance(v, basestring) else v
for k, v in entity.items() if k in INCLUDE_PROPS}
# generate time series growth data for number of users, wms sent, etc. by day
# sort chronologically
for values in all_entities.values():
values.sort(key=lambda e: e['created'])
# domains that have successfully received a webmention
domains = set()
# walk days from launch to now, accumulate counts per day
with open('growth.tsv', 'w') as file:
writer = csv.writer(file, dialect='excel-tab')
columns = KINDS + FEATURES + ('links', 'webmentions', 'domains')
writer.writerow(('created',) + columns)
# maps string column to count
counts = {c: 0 for c in columns}
date =, 12, 1)
while date <
for kind in KINDS:
entities = all_entities[kind]
while entities and entities[0]['created'].date() == date:
counts[kind] += 1
e = entities.pop(0)
if kind in SOURCE_KINDS:
for f in e.get('features', []):
counts[f] += 1
elif kind in ('Response', 'BlogPost'):
sent = e.get('sent', [])
counts['webmentions'] += len(sent)
links = list(itertools.chain(*[e.get(field, []) for field in
'sent', 'unsent', 'error', 'failed', 'skipped']))
counts['links'] += len(links)
domains.update(urlparse.urlparse(l).netloc for l in sent)
counts['domains'] = len(domains)
writer.writerow([date] + [counts[c] for c in columns])
date += datetime.timedelta(days=1)
if == 1:
print date
for kind, entities in all_entities.items():
if entities:
print '%d %s entities left over! e.g. %s' % (len(entities), kind, entities[0])