From 13dfa4020dc8adce8eb1e4df581603c5e8454339 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Wed, 10 Feb 2016 22:40:18 -0800 Subject: [PATCH 01/11] Prep for http-based bootstrapping - run paster asynchronously - improve mongo installation - pull out mongo version --- bin/run.sh | 50 +++++++++++++++++++++++++++++---------------- mongodb_version.txt | 1 + 2 files changed, 33 insertions(+), 18 deletions(-) create mode 100644 mongodb_version.txt diff --git a/bin/run.sh b/bin/run.sh index 143245e75..0be4d26a6 100755 --- a/bin/run.sh +++ b/bin/run.sh @@ -5,7 +5,7 @@ set -e unset CDPATH cd "$( dirname "${BASH_SOURCE[0]}" )/.." -echo() { builtin echo "[SCITRAN] $@"; } +echo() { builtin echo -e "\033[1;34m\033[47mSCITRAN\033[0;0m\033[47m $@\033[0;0m"; } set -o allexport @@ -21,7 +21,7 @@ if [ "$#" -gt 1 ]; then fi -# Default config values +# Minimal default config values SCITRAN_RUNTIME_HOST=${SCITRAN_RUNTIME_HOST:-"127.0.0.1"} SCITRAN_RUNTIME_PORT=${SCITRAN_RUNTIME_PORT:-"8080"} SCITRAN_RUNTIME_PATH=${SCITRAN_RUNTIME_PATH:-"./runtime"} @@ -82,21 +82,13 @@ else fi if [ -d "$SCITRAN_RUNTIME_PATH" ]; then - echo "Virtualenv exists present at $SCITRAN_RUNTIME_PATH" + echo "Virtualenv exists at $SCITRAN_RUNTIME_PATH" else echo "Creating 'scitran' Virtualenv at $SCITRAN_RUNTIME_PATH" virtualenv -p `brew --prefix`/bin/python --prompt="(scitran)" $SCITRAN_RUNTIME_PATH echo "Created 'scitran' Virtualenv at $SCITRAN_RUNTIME_PATH" fi -if [ -f "$SCITRAN_RUNTIME_PATH/bin/mongod" ]; then - echo "MongoDB is installed" -else - echo "Installing MongoDB" - curl https://fastdl.mongodb.org/osx/mongodb-osx-x86_64-3.0.7.tgz | tar xz -C $SCITRAN_RUNTIME_PATH --strip-components 1 - echo "MongoDB installed" -fi - echo "Activating Virtualenv" source $SCITRAN_RUNTIME_PATH/bin/activate @@ -105,9 +97,29 @@ echo "Installing Python requirements" bin/install.sh -# Launch mongod +# Install and launch MongoDB +install_mongo() { + curl $MONGODB_URL | tar xz -C $VIRTUAL_ENV/bin --strip-components 2 + echo "MongoDB version $MONGODB_VERSION installed" +} + +MONGODB_VERSION=$(cat mongodb_version.txt) +MONGODB_URL="https://fastdl.mongodb.org/osx/mongodb-osx-x86_64-$MONGODB_VERSION.tgz" +if [ -x "$VIRTUAL_ENV/bin/mongod" ]; then + INSTALLED_MONGODB_VERSION=$($VIRTUAL_ENV/bin/mongod --version | grep "db version" | cut -d "v" -f 3) + echo "MongoDB version $INSTALLED_MONGODB_VERSION is installed" + if [ "$INSTALLED_MONGODB_VERSION" != "$MONGODB_VERSION" ]; then + echo "Upgrading MongoDB to version $MONGODB_VERSION" + install_mongo + fi +else + echo "Installing MongoDB" + install_mongo +fi + mongod --dbpath $SCITRAN_PERSISTENT_DB_PATH --smallfiles --port $SCITRAN_PERSISTENT_DB_PORT & -MONGO_PID=$! +MONGOD_PID=$! + # Set python path so scripts can work export PYTHONPATH=. @@ -159,12 +171,14 @@ ssl_pem=$SCITRAN_RUNTIME_SSL_PEM paste.app_factory = api.api:app_factory EOF -paster serve --reload $TEMP_INI_FILE +paster serve --reload $TEMP_INI_FILE & +PASTER_PID=$! + + +# Shutdown mongod and paster on SIGINT and SIGTERM +trap "{ echo 'Exit signal trapped'; kill $MONGOD_PID $PASTER_PID; wait; }" EXIT +wait # Clean up and exit out of the python virtualenv rm -f $TEMP_INI_FILE deactivate - -# Shutdown mongod on ctrl+C -kill $MONGO_PID -wait $MONGO_PID diff --git a/mongodb_version.txt b/mongodb_version.txt new file mode 100644 index 000000000..e4604e3af --- /dev/null +++ b/mongodb_version.txt @@ -0,0 +1 @@ +3.2.1 From f142f6dc4c88d72f00b7614ebd361caba4299cae Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Wed, 10 Feb 2016 22:44:45 -0800 Subject: [PATCH 02/11] Reword job logging --- api/jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/jobs.py b/api/jobs.py index 240097f71..2f4bb9edc 100644 --- a/api/jobs.py +++ b/api/jobs.py @@ -156,7 +156,7 @@ def queue_job(db, algorithm_id, input, tags=[], attempt_n=1, previous_job_id=Non result = db.jobs.insert_one(job) _id = result.inserted_id - log.info('Running %s as job %s to process %s %s' % (gear.name, str(_id), input.container_type, input.container_id)) + log.info('Enqueuing %s as job %s to process %s %s' % (gear.name, str(_id), input.container_type, input.container_id)) return _id def retry_job(db, j, force=False): From afef7b520dcf4ae9c3ba72ac024a3e3e4ad6255c Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Thu, 18 Feb 2016 22:02:56 -0600 Subject: [PATCH 03/11] Improve mongo and paster async behavior --- bin/run.sh | 59 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/bin/run.sh b/bin/run.sh index 0be4d26a6..ba9c2c0c2 100755 --- a/bin/run.sh +++ b/bin/run.sh @@ -124,7 +124,38 @@ MONGOD_PID=$! # Set python path so scripts can work export PYTHONPATH=. -# Boostrap users + +# Serve API with PasteScript +TEMP_INI_FILE=$(mktemp -t scitran_api) +cat << EOF > $TEMP_INI_FILE +[server:main] +use = egg:Paste#http +host = $SCITRAN_RUNTIME_HOST +port = $SCITRAN_RUNTIME_PORT +ssl_pem=$SCITRAN_RUNTIME_SSL_PEM + +[app:main] +paste.app_factory = api.api:app_factory +EOF + +paster serve --reload $TEMP_INI_FILE & +PASTER_PID=$! + + +# Set up exit and error trap to shutdown mongod and paster +trap "{ + echo 'Exit signal trapped'; + kill $MONGOD_PID $PASTER_PID; wait; + rm -f $TEMP_INI_FILE + deactivate +}" EXIT ERR + + +# Wait for everything to come up +sleep 1 + + +# Boostrap users and groups if [ $BOOTSTRAP_USERS -eq 1 ]; then echo "Bootstrapping users" bin/bootstrap.py users "$SCITRAN_RUNTIME_BOOTSTRAP" @@ -132,6 +163,8 @@ else echo "Database exists at $SCITRAN_PERSISTENT_PATH/db. Not bootstrapping users." fi + +# Boostrap test data TESTDATA_URL="https://github.com/scitran/testdata/archive/master.tar.gz" TESTDATA_VERSION=$(curl -sLI $TESTDATA_URL | grep ETag | tail -n 1 | cut -f 2 -d '"') if [ ! -d "$SCITRAN_PERSISTENT_PATH/testdata" ]; then @@ -158,27 +191,5 @@ else fi -# Serve API with PasteScript -TEMP_INI_FILE=$(mktemp -t scitran_api) -cat << EOF > $TEMP_INI_FILE -[server:main] -use = egg:Paste#http -host = $SCITRAN_RUNTIME_HOST -port = $SCITRAN_RUNTIME_PORT -ssl_pem=$SCITRAN_RUNTIME_SSL_PEM - -[app:main] -paste.app_factory = api.api:app_factory -EOF - -paster serve --reload $TEMP_INI_FILE & -PASTER_PID=$! - - -# Shutdown mongod and paster on SIGINT and SIGTERM -trap "{ echo 'Exit signal trapped'; kill $MONGOD_PID $PASTER_PID; wait; }" EXIT +# Wait for good or bad things to happen until exit or error trap catches wait - -# Clean up and exit out of the python virtualenv -rm -f $TEMP_INI_FILE -deactivate From e023dd742fa7ffe2c9e479fc33fe536c3a0550ce Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Tue, 23 Feb 2016 11:23:47 -0800 Subject: [PATCH 04/11] Remove some input vs mongo schema redundancy --- api/schemas/input/group.json | 6 +++--- api/schemas/input/user.json | 10 +++++----- api/schemas/mongo/group.json | 6 ------ api/schemas/mongo/user.json | 9 --------- api/validators.py | 7 +++++-- 5 files changed, 13 insertions(+), 25 deletions(-) diff --git a/api/schemas/input/group.json b/api/schemas/input/group.json index 280c68eee..ab80f128d 100644 --- a/api/schemas/input/group.json +++ b/api/schemas/input/group.json @@ -4,15 +4,15 @@ "type": "object", "properties": { "_id": { - "maxLength": 32, + "maxLength": 64, "minLength": 2, "pattern": "^[0-9a-z][0-9a-z.@_-]{0,30}[0-9a-z]$", "title": "ID", "type": "string" }, "name": { - "maxLength": 32, - "minLength": 2, + "maxLength": 64, + "minLength": 1, "pattern": "^[0-9A-Za-z][0-9A-Za-z .@_-]{0,30}[0-9A-Za-z]$", "title": "Name", "type": "string" diff --git a/api/schemas/input/user.json b/api/schemas/input/user.json index d8b0f20fd..578f27d34 100644 --- a/api/schemas/input/user.json +++ b/api/schemas/input/user.json @@ -4,21 +4,21 @@ "type": "object", "properties": { "_id": { - "maxLength": 32, + "maxLength": 64, "minLength": 2, "pattern": "^[0-9a-z.@_-]*$", "title": "ID", "type": "string" }, "firstname": { - "maxLength": 32, - "minLength": 2, + "maxLength": 64, + "minLength": 1, "title": "First Name", "type": "string" }, "lastname": { - "maxLength": 32, - "minLength": 2, + "maxLength": 64, + "minLength": 1, "title": "Last Name", "type": "string" }, diff --git a/api/schemas/mongo/group.json b/api/schemas/mongo/group.json index cbfaf5686..5c360ecbe 100644 --- a/api/schemas/mongo/group.json +++ b/api/schemas/mongo/group.json @@ -4,18 +4,12 @@ "type": "object", "properties": { "_id": { - "maxLength": 32, - "minLength": 2, - "pattern": "^[0-9a-z][0-9a-z.@_-]{0,30}[0-9a-z]$", "title": "ID", "type": "string" }, "created": {}, "modified": {}, "name": { - "maxLength": 32, - "minLength": 2, - "pattern": "^[0-9A-Za-z][0-9A-Za-z .@_-]{0,30}[0-9A-Za-z]$", "title": "Name", "type": "string" }, diff --git a/api/schemas/mongo/user.json b/api/schemas/mongo/user.json index 21690fcec..8e2b2fd55 100644 --- a/api/schemas/mongo/user.json +++ b/api/schemas/mongo/user.json @@ -4,34 +4,25 @@ "type": "object", "properties": { "_id": { - "maxLength": 32, - "minLength": 2, - "pattern": "^[0-9a-z.@_-]*$", "title": "ID", "type": "string" }, "created": {}, "modified": {}, "firstname": { - "maxLength": 32, - "minLength": 2, "title": "First Name", "type": "string" }, "lastname": { - "maxLength": 32, - "minLength": 2, "title": "Last Name", "type": "string" }, "email": { - "format": "email", "title": "Email", "type": "string" }, "avatars": { "$ref": "avatars.json"}, "avatar": { - "format": "uri", "title": "Avatar", "type": "string" }, diff --git a/api/validators.py b/api/validators.py index ede73c5a0..6475f1fcd 100644 --- a/api/validators.py +++ b/api/validators.py @@ -1,5 +1,6 @@ import os import copy +import glob import jsonschema from . import config @@ -56,13 +57,15 @@ class DBValidationException(Exception): mongo_schemas = set() input_schemas = set() # validate and cache schemas at start time -for schema_file in os.listdir(schema_path + '/schemas/mongo/'): +for schema_filepath in glob.glob(schema_path + '/schemas/mongo/*.json'): + schema_file = os.path.basename(schema_filepath) mongo_schemas.add(schema_file) resolver_mongo.resolve(schema_file) assert mongo_schemas == expected_mongo_schemas, '{} is different from {}'.format(mongo_schemas, expected_mongo_schemas) -for schema_file in os.listdir(schema_path + '/schemas/input/'): +for schema_filepath in glob.glob(schema_path + '/schemas/input/*.json'): + schema_file = os.path.basename(schema_filepath) input_schemas.add(schema_file) resolver_input.resolve(schema_file) From ef00d4aa36e2e945af34800173443b06ab0431c3 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Tue, 23 Feb 2016 14:30:52 -0800 Subject: [PATCH 05/11] Bootstrap users, groups, and data via http --- api/handlers/grouphandler.py | 2 +- bin/bootstrap.py | 179 ++++++++++++++++++----------------- bin/run.sh | 22 ++--- requirements.txt | 1 + 4 files changed, 100 insertions(+), 104 deletions(-) diff --git a/api/handlers/grouphandler.py b/api/handlers/grouphandler.py index db62be4e2..f4c70b586 100644 --- a/api/handlers/grouphandler.py +++ b/api/handlers/grouphandler.py @@ -75,7 +75,7 @@ def post(self): payload_validator = validators.payload_from_schema_file('group.json') payload_validator(payload, 'POST') payload['created'] = payload['modified'] = datetime.datetime.utcnow() - payload['roles'] = [{'_id': self.uid, 'access': 'admin', 'site': self.user_site}] + payload['roles'] = [{'_id': self.uid, 'access': 'admin', 'site': self.user_site}] if self.uid else [] result = mongo_validator(permchecker(self.storage.exec_op))('POST', payload=payload) if result.acknowledged: return {'_id': result.inserted_id} diff --git a/bin/bootstrap.py b/bin/bootstrap.py index 8111f4814..9d791c8f0 100755 --- a/bin/bootstrap.py +++ b/bin/bootstrap.py @@ -3,30 +3,63 @@ """This script helps bootstrap users and data""" import os +import sys import json import shutil import hashlib +import logging import zipfile import argparse import datetime import requests +import requests_toolbelt +from api import tempdir as tempfile -from api.dao import reaperutil -from api import util -from api import rules -from api import config +logging.basicConfig( + format='%(asctime)s %(name)16.16s %(filename)24.24s %(lineno)5d:%(levelname)4.4s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', + level=logging.DEBUG, +) +log = logging.getLogger('scitran.bootstrap') -log = config.log +logging.getLogger('requests').setLevel(logging.WARNING) # silence Requests library -def clean(args): - config.db.client.drop_database(config.db) +if 'SCITRAN_CORE_DRONE_SECRET' not in os.environ: + log.error('SCITRAN_CORE_DRONE_SECRET not configured') + sys.exit(1) -clean_desc = """ -example: -./bin/bootstrap.py clean -""" +if 'SCITRAN_RUNTIME_HOST' not in os.environ or 'SCITRAN_RUNTIME_PORT' not in os.environ: + log.error('SCITRAN_RUNTIME_HOST or SCITRAN_RUNTIME_PORT not configured') + sys.exit(1) +else: + API_URL = 'https://%s:%s/api' % (os.environ['SCITRAN_RUNTIME_HOST'], os.environ['SCITRAN_RUNTIME_PORT']) + +if 'SCITRAN_PERSISTENT_PATH' in os.environ and 'SCITRAN_PERSISTENT_DATA_PATH' not in os.environ: + os.environ['SCITRAN_PERSISTENT_DATA_PATH'] = os.path.join(os.environ['SCITRAN_PERSISTENT_PATH'], 'data') + +HTTP_HEADERS = {'X-SciTran-Auth': os.environ['SCITRAN_CORE_DRONE_SECRET'], 'User-Agent': 'SciTran Drone Bootstrapper'} + + +def metadata_encoder(o): + if isinstance(o, datetime.datetime): + if o.tzinfo is None: + o = pytz.timezone('UTC').localize(o) + return o.isoformat() + elif isinstance(o, datetime.tzinfo): + return o.zone + raise TypeError(repr(o) + ' is not JSON serializable') + + +def create_archive(content, arcname, metadata, outdir=None, filenames=None): + path = (os.path.join(outdir, arcname) if outdir else content) + '.zip' + with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zf: + zf.comment = json.dumps(metadata, default=metadata_encoder) + zf.write(content, arcname) + for fn in filenames or os.listdir(content): + zf.write(os.path.join(content, fn), os.path.join(arcname, fn)) + return path def users(args): @@ -35,34 +68,20 @@ def users(args): input_data = json.load(json_dump) log.info('bootstrapping users...') with requests.Session() as rs: - rs.params = {'d': '404'} + rs.verify = not args.insecure + rs.headers = HTTP_HEADERS for u in input_data.get('users', []): log.info(' ' + u['_id']) - u['created'] = now - u['modified'] = now - u.setdefault('email', u['_id']) - u.setdefault('preferences', {}) - gravatar = 'https://gravatar.com/avatar/' + hashlib.md5(u['email']).hexdigest() + '?s=512' - if rs.head(gravatar): - u.setdefault('avatar', gravatar) - u.setdefault('avatars', {}) - u['avatars'].setdefault('gravatar', gravatar) - config.db.users.update_one({'_id': u['_id']}, {'$setOnInsert': u}, upsert=True) - log.info('bootstrapping groups...') - site_id = config.get_item('site', 'id') + rs.post(API_URL + '/users', json=u) + log.info('bootstrapping groups... foo') + site_id = 'local' #config.get_item('site', 'id') for g in input_data.get('groups', []): log.info(' ' + g['_id']) - g['created'] = now - g['modified'] = now - for r in g['roles']: + roles = g.pop('roles') + rs.post(API_URL + '/groups' , json=g) + for r in roles: r.setdefault('site', site_id) - config.db.groups.update_one({'_id': g['_id']}, {'$setOnInsert': g}, upsert=True) - log.info('bootstrapping drones...') - for d in input_data.get('drones', []): - log.info(' ' + d['_id']) - d['created'] = now - d['modified'] = now - config.db.drones.update_one({'_id': d['_id']}, {'$setOnInsert': d}, upsert=True) + rs.post(API_URL + '/groups/' + g['_id'] + '/roles' , json=r) log.info('bootstrapping complete') users_desc = """ @@ -72,52 +91,40 @@ def users(args): def data(args): - log.info('inspecting %s' % args.path) + log.info('Inspecting %s' % args.path) files = [] - for dirpath, dirnames, filenames in os.walk(args.path): - for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]: - if not os.path.islink(filepath): - files.append(filepath) - dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # need to use slice assignment to influence walk behavior - file_cnt = len(files) - log.info('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt) - for i, filepath in enumerate(files): - log.info('Loading %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i+1, file_cnt)) - hash_ = hashlib.sha384() - size = os.path.getsize(filepath) - try: - metadata = json.loads(zipfile.ZipFile(filepath).comment) - except ValueError as e: - log.warning(str(e)) - continue - target, file_ = reaperutil.create_container_hierarchy(metadata) - with open(filepath, 'rb') as fd: - for chunk in iter(lambda: fd.read(2**20), ''): - hash_.update(chunk) - computed_hash = 'v0-sha384-' + hash_.hexdigest() - destpath = os.path.join(config.get_item('persistent', 'data_path'), util.path_from_hash(computed_hash)) - dir_destpath = os.path.dirname(destpath) - filename = os.path.basename(filepath) - if not os.path.exists(dir_destpath): - os.makedirs(dir_destpath) - if args.copy: - shutil.copyfile(filepath, destpath) - else: - shutil.move(filepath, destpath) - created = modified = datetime.datetime.utcnow() - fileinfo = { - 'name': filename, - 'size': size, - 'hash': computed_hash, - 'type': 'dicom', # we are only bootstrapping dicoms at the moment - 'created': created, - 'modified': modified, - 'mimetype': util.guess_mimetype(filename), - } - fileinfo.update(file_) - target.add_file(fileinfo) - rules.create_jobs(config.db, target.container, 'acquisition', fileinfo) - + with requests.Session() as rs: + rs.verify = not args.insecure + rs.headers = HTTP_HEADERS + for dirpath, dirnames, filenames in os.walk(args.path): + dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # use slice assignment to influence walk + if not dirnames and filenames: + for metadata_file in filenames: + if metadata_file.lower() == 'metadata.json': + filenames.remove(metadata_file) + break + else: + metadata_file = None + if not metadata_file: + log.warning('Skipping %s: No metadata found' % dirpath) + continue + with open(os.path.join(dirpath, metadata_file)) as fd: + try: + metadata = json.load(fd) + except ValueError: + log.warning('Skipping %s: Invalid metadata' % dirpath) + continue + # FIXME need schema validation + with tempfile.TemporaryDirectory() as tempdir: + log.info('Packaging %s' % dirpath) + filepath = create_archive(dirpath, os.path.basename(dirpath), metadata, tempdir, filenames) + filename = os.path.basename(filepath) + metadata.get('acquisition', {}).get('files', [{}])[0]['name'] = filename + log.info('Uploading %s' % filename) + with open(filepath, 'rb') as fd: + metadata_json = json.dumps(metadata, default=metadata_encoder) + mpe = requests_toolbelt.multipart.encoder.MultipartEncoder(fields={'metadata': metadata_json, 'file': (filename, fd)}) + rs.post(API_URL + '/uploader', data=mpe, headers={'Content-Type': mpe.content_type}) data_desc = """ example: @@ -128,14 +135,6 @@ def data(args): parser = argparse.ArgumentParser() subparsers = parser.add_subparsers(help='operation to perform') -clean_parser = subparsers.add_parser( - name='clean', - help='reset database to clean state', - description=clean_desc, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) -clean_parser.set_defaults(func=clean) - users_parser = subparsers.add_parser( name='users', help='bootstrap users and groups', @@ -151,9 +150,13 @@ def data(args): description=data_desc, formatter_class=argparse.RawDescriptionHelpFormatter, ) -data_parser.add_argument('-c', '--copy', action='store_true', help='copy data instead of moving it') data_parser.add_argument('path', help='filesystem path to data') data_parser.set_defaults(func=data) +parser.add_argument('-i', '--insecure', action='store_true', help='do not verify SSL connections') args = parser.parse_args() + +if args.insecure: + requests.packages.urllib3.disable_warnings() + args.func(args) diff --git a/bin/run.sh b/bin/run.sh index ba9c2c0c2..cd6e36140 100755 --- a/bin/run.sh +++ b/bin/run.sh @@ -158,34 +158,26 @@ sleep 1 # Boostrap users and groups if [ $BOOTSTRAP_USERS -eq 1 ]; then echo "Bootstrapping users" - bin/bootstrap.py users "$SCITRAN_RUNTIME_BOOTSTRAP" + bin/bootstrap.py -i users "$SCITRAN_RUNTIME_BOOTSTRAP" else echo "Database exists at $SCITRAN_PERSISTENT_PATH/db. Not bootstrapping users." fi # Boostrap test data -TESTDATA_URL="https://github.com/scitran/testdata/archive/master.tar.gz" -TESTDATA_VERSION=$(curl -sLI $TESTDATA_URL | grep ETag | tail -n 1 | cut -f 2 -d '"') +TESTDATA_REPO="https://github.com/scitran/testdata.git" if [ ! -d "$SCITRAN_PERSISTENT_PATH/testdata" ]; then - echo "Downloading testdata to $SCITRAN_PERSISTENT_PATH/testdata" - mkdir "$SCITRAN_PERSISTENT_PATH/testdata" - curl -L $TESTDATA_URL | tar xz -C "$SCITRAN_PERSISTENT_PATH/testdata" --strip-components 1 + echo "Cloning testdata to $SCITRAN_PERSISTENT_PATH/testdata" + git clone --single-branch --branch bootstrap $TESTDATA_REPO $SCITRAN_PERSISTENT_PATH/testdata else - if [ "$TESTDATA_VERSION" != "$(cat $SCITRAN_PERSISTENT_PATH/.testdata_version)" ]; then - echo "Testdata out of date; downloading" - curl -L $TESTDATA_URL | tar xz -C "$SCITRAN_PERSISTENT_PATH/testdata" --strip-components 1 - else - echo "Testdata up to date" - fi + echo "Updating testdata in $SCITRAN_PERSISTENT_PATH/testdata" + git -C $SCITRAN_PERSISTENT_PATH/testdata pull fi -builtin echo "$TESTDATA_VERSION" > "$SCITRAN_PERSISTENT_PATH/.testdata_version" - if [ -f "$SCITRAN_PERSISTENT_DATA_PATH/.bootstrapped" ]; then echo "Persistence store exists at $SCITRAN_PERSISTENT_PATH/data. Not bootstrapping data. Remove to re-bootstrap." else echo "Bootstrapping testdata" - bin/bootstrap.py data --copy $SCITRAN_PERSISTENT_PATH/testdata + bin/bootstrap.py -i data $SCITRAN_PERSISTENT_PATH/testdata echo "Bootstrapped testdata" touch "$SCITRAN_PERSISTENT_DATA_PATH/.bootstrapped" fi diff --git a/requirements.txt b/requirements.txt index f9e4d2332..921bf31e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ pyOpenSSL==0.15.1 python-dateutil==2.4.2 pytz==2015.7 requests==2.9.1 +requests-toolbelt==0.6.0 rfc3987==1.3.4 webapp2==2.5.2 WebOb==1.5.1 From 3fe74fc4260b250ed3c755f57e8c331d88609b40 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Wed, 24 Feb 2016 18:14:09 -0800 Subject: [PATCH 06/11] Add metadata schema validation requires #171 --- api/jobs.py | 2 +- bin/bootstrap.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/api/jobs.py b/api/jobs.py index 2f4bb9edc..117e03306 100644 --- a/api/jobs.py +++ b/api/jobs.py @@ -96,7 +96,7 @@ def create_fileinput_from_reference(container, container_type, file_): # File container information container_id = str(container['_id']) - log.info('File ' + filename + 'is in a ' + container_type + ' with id ' + container_id + ' and hash ' + filehash) + log.info('File ' + filename + ' is in a ' + container_type + ' with id ' + container_id + ' and hash ' + filehash) # Spawn rules currently do not look at container hierarchy, and only care about a single file. # Further, one algorithm is unconditionally triggered for each dirty file. diff --git a/bin/bootstrap.py b/bin/bootstrap.py index 9d791c8f0..11b38b769 100755 --- a/bin/bootstrap.py +++ b/bin/bootstrap.py @@ -14,6 +14,7 @@ import requests import requests_toolbelt +from api import validators from api import tempdir as tempfile logging.basicConfig( @@ -93,6 +94,7 @@ def users(args): def data(args): log.info('Inspecting %s' % args.path) files = [] + schema_validator = validators.payload_from_schema_file(None, 'uploader.json') with requests.Session() as rs: rs.verify = not args.insecure rs.headers = HTTP_HEADERS @@ -112,14 +114,19 @@ def data(args): try: metadata = json.load(fd) except ValueError: - log.warning('Skipping %s: Invalid metadata' % dirpath) + log.warning('Skipping %s: Unparsable metadata' % dirpath) continue - # FIXME need schema validation with tempfile.TemporaryDirectory() as tempdir: log.info('Packaging %s' % dirpath) filepath = create_archive(dirpath, os.path.basename(dirpath), metadata, tempdir, filenames) filename = os.path.basename(filepath) - metadata.get('acquisition', {}).get('files', [{}])[0]['name'] = filename + metadata['acquisition'].setdefault('files', [{}])[0]['name'] = filename + log.info('Validating %s' % filename) + try: + schema_validator(metadata, 'POST') + except Exception: + log.warning('Skipping %s: Invalid metadata' % dirpath) + continue log.info('Uploading %s' % filename) with open(filepath, 'rb') as fd: metadata_json = json.dumps(metadata, default=metadata_encoder) From 079e9f9093b62e329be246711b9ab8d25a23bf8c Mon Sep 17 00:00:00 2001 From: Ryan Sanford Date: Thu, 25 Feb 2016 06:28:09 -0600 Subject: [PATCH 07/11] Docker bootstrap updates Now supports API enabled bootstrapping. --- docker/bootstrap-accounts.sh | 2 +- docker/bootstrap-data.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/bootstrap-accounts.sh b/docker/bootstrap-accounts.sh index d6f57a0fa..65c5f5ab2 100755 --- a/docker/bootstrap-accounts.sh +++ b/docker/bootstrap-accounts.sh @@ -22,7 +22,7 @@ export PYTHONPATH=. # Bootstrap Users -./bin/bootstrap.py users ${bootstrap_user_file} +./bin/bootstrap.py -i users ${bootstrap_user_file} ) diff --git a/docker/bootstrap-data.sh b/docker/bootstrap-data.sh index 19a2ab1c4..9994f2a5a 100755 --- a/docker/bootstrap-data.sh +++ b/docker/bootstrap-data.sh @@ -52,6 +52,6 @@ builtin echo "$TESTDATA_VERSION" > "$TESTDATA_DIR/.testdata_version" ## load the test data in -./bin/bootstrap.py data --copy $TESTDATA_DIR/download +./bin/bootstrap.py -i data $TESTDATA_DIR/download ) From 5f0f5ecca90539b663d988119c83dbcb7eac4f56 Mon Sep 17 00:00:00 2001 From: Ryan Sanford Date: Thu, 25 Feb 2016 08:06:33 -0600 Subject: [PATCH 08/11] Remove branch reference Don't reference branches from within source. This allows core and testdata repo to move without breaking not up-to-date versions of core. Also allows branches to be fully merged. Still a similar branch reference in bin/run.sh --- docker/bootstrap-data.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docker/bootstrap-data.sh b/docker/bootstrap-data.sh index 9994f2a5a..fa66f949b 100755 --- a/docker/bootstrap-data.sh +++ b/docker/bootstrap-data.sh @@ -15,7 +15,13 @@ GET_LATEST_DATA=${1:-N} # Hard code some other vars important for bootstrapping # -bootstrap_data_branch=master + +# Set the commit hash or tag or branch desired for scitran/testdata. +# Branch name should only be used for testing convenience. +# +# When changing scitran/testdata, merge that change to master first, +# then reference that resulting commit hash here. +bootstrap_data_label=45056c104caf85796e6138a5ca163c3937c7b5d9 # Move to API folder for relative path assumptions later on @@ -30,7 +36,7 @@ export PYTHONPATH=. # Bootstrap data # Compare hash of source test data to most recent download. Remove local copy to force re-download if they are different. -TESTDATA_URL="https://github.com/scitran/testdata/archive/${bootstrap_data_branch}.tar.gz" +TESTDATA_URL="https://github.com/scitran/testdata/archive/${bootstrap_data_label}.tar.gz" TESTDATA_VERSION=$(curl -sLI ${TESTDATA_URL} | grep ETag | tail -n 1 | cut -f 2 -d '"') # use hidden From cf13e16de92eb0dd2e4caf844981d7e2f0580869 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Thu, 25 Feb 2016 17:13:27 -0800 Subject: [PATCH 09/11] Finialize metadata validation --- bin/bootstrap.py | 6 +++--- bin/run.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/bootstrap.py b/bin/bootstrap.py index 11b38b769..66dc88655 100755 --- a/bin/bootstrap.py +++ b/bin/bootstrap.py @@ -94,7 +94,7 @@ def users(args): def data(args): log.info('Inspecting %s' % args.path) files = [] - schema_validator = validators.payload_from_schema_file(None, 'uploader.json') + schema_validator = validators.payload_from_schema_file('uploader.json') with requests.Session() as rs: rs.verify = not args.insecure rs.headers = HTTP_HEADERS @@ -120,11 +120,11 @@ def data(args): log.info('Packaging %s' % dirpath) filepath = create_archive(dirpath, os.path.basename(dirpath), metadata, tempdir, filenames) filename = os.path.basename(filepath) - metadata['acquisition'].setdefault('files', [{}])[0]['name'] = filename + metadata.setdefault('acquisition', {}).setdefault('files', [{}])[0]['name'] = filename log.info('Validating %s' % filename) try: schema_validator(metadata, 'POST') - except Exception: + except validators.InputValidationException: log.warning('Skipping %s: Invalid metadata' % dirpath) continue log.info('Uploading %s' % filename) diff --git a/bin/run.sh b/bin/run.sh index cd6e36140..7162b9b92 100755 --- a/bin/run.sh +++ b/bin/run.sh @@ -168,7 +168,7 @@ fi TESTDATA_REPO="https://github.com/scitran/testdata.git" if [ ! -d "$SCITRAN_PERSISTENT_PATH/testdata" ]; then echo "Cloning testdata to $SCITRAN_PERSISTENT_PATH/testdata" - git clone --single-branch --branch bootstrap $TESTDATA_REPO $SCITRAN_PERSISTENT_PATH/testdata + git clone --single-branch $TESTDATA_REPO $SCITRAN_PERSISTENT_PATH/testdata else echo "Updating testdata in $SCITRAN_PERSISTENT_PATH/testdata" git -C $SCITRAN_PERSISTENT_PATH/testdata pull From 22183e1cd2bef578171837cd553c574e06d8f8c4 Mon Sep 17 00:00:00 2001 From: Gunnar Schaefer Date: Thu, 25 Feb 2016 17:59:40 -0800 Subject: [PATCH 10/11] Get site id from api --- bin/bootstrap.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bin/bootstrap.py b/bin/bootstrap.py index 66dc88655..68e662720 100755 --- a/bin/bootstrap.py +++ b/bin/bootstrap.py @@ -67,22 +67,22 @@ def users(args): now = datetime.datetime.utcnow() with open(args.json) as json_dump: input_data = json.load(json_dump) - log.info('bootstrapping users...') with requests.Session() as rs: + log.info('bootstrapping users...') rs.verify = not args.insecure rs.headers = HTTP_HEADERS for u in input_data.get('users', []): log.info(' ' + u['_id']) rs.post(API_URL + '/users', json=u) - log.info('bootstrapping groups... foo') - site_id = 'local' #config.get_item('site', 'id') - for g in input_data.get('groups', []): - log.info(' ' + g['_id']) - roles = g.pop('roles') - rs.post(API_URL + '/groups' , json=g) - for r in roles: - r.setdefault('site', site_id) - rs.post(API_URL + '/groups/' + g['_id'] + '/roles' , json=r) + log.info('bootstrapping groups...') + site_id = rs.get(API_URL + '/config').json()['site']['id'] + for g in input_data.get('groups', []): + log.info(' ' + g['_id']) + roles = g.pop('roles') + rs.post(API_URL + '/groups' , json=g) + for r in roles: + r.setdefault('site', site_id) + rs.post(API_URL + '/groups/' + g['_id'] + '/roles' , json=r) log.info('bootstrapping complete') users_desc = """ From 0964355c5b62a3db5e54142d03028a574fef04b5 Mon Sep 17 00:00:00 2001 From: Ryan Sanford Date: Fri, 26 Feb 2016 06:51:17 -0600 Subject: [PATCH 11/11] Update testdata label (NOT FINAL) --- docker/bootstrap-data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/bootstrap-data.sh b/docker/bootstrap-data.sh index fa66f949b..af1c39b55 100755 --- a/docker/bootstrap-data.sh +++ b/docker/bootstrap-data.sh @@ -21,7 +21,7 @@ GET_LATEST_DATA=${1:-N} # # When changing scitran/testdata, merge that change to master first, # then reference that resulting commit hash here. -bootstrap_data_label=45056c104caf85796e6138a5ca163c3937c7b5d9 +bootstrap_data_label=9362b768d54caf6e5cd35f00498208c3b2bff77d # Move to API folder for relative path assumptions later on