2 changes: 1 addition & 1 deletion api/handlers/grouphandler.py
@@ -75,7 +75,7 @@ def post(self):
payload_validator = validators.payload_from_schema_file('group.json')
payload_validator(payload, 'POST')
payload['created'] = payload['modified'] = datetime.datetime.utcnow()
payload['roles'] = [{'_id': self.uid, 'access': 'admin', 'site': self.user_site}]
payload['roles'] = [{'_id': self.uid, 'access': 'admin', 'site': self.user_site}] if self.uid else []
result = mongo_validator(permchecker(self.storage.exec_op))('POST', payload=payload)
if result.acknowledged:
return {'_id': result.inserted_id}
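For context, a minimal sketch (hypothetical helper, not part of the PR) of what the new conditional does: a user-initiated POST still seeds the group with its creator as admin, while a drone or system request, which carries no uid, now starts the group with an empty roles list instead of a role whose _id is None.

```python
# Sketch only: standalone illustration of the conditional added above.
def initial_roles(uid, user_site):
    # With a uid, the creator becomes the group admin; without one (e.g. a drone
    # request), the group is created with no roles at all.
    return [{'_id': uid, 'access': 'admin', 'site': user_site}] if uid else []

print(initial_roles('admin@example.com', 'local'))  # [{'_id': 'admin@example.com', 'access': 'admin', 'site': 'local'}]
print(initial_roles(None, 'local'))                 # []
```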
4 changes: 2 additions & 2 deletions api/jobs.py
@@ -96,7 +96,7 @@ def create_fileinput_from_reference(container, container_type, file_):
# File container information
container_id = str(container['_id'])

log.info('File ' + filename + 'is in a ' + container_type + ' with id ' + container_id + ' and hash ' + filehash)
log.info('File ' + filename + ' is in a ' + container_type + ' with id ' + container_id + ' and hash ' + filehash)

# Spawn rules currently do not look at container hierarchy, and only care about a single file.
# Further, one algorithm is unconditionally triggered for each dirty file.
@@ -156,7 +156,7 @@ def queue_job(db, algorithm_id, input, tags=[], attempt_n=1, previous_job_id=Non
result = db.jobs.insert_one(job)
_id = result.inserted_id

log.info('Running %s as job %s to process %s %s' % (gear.name, str(_id), input.container_type, input.container_id))
log.info('Enqueuing %s as job %s to process %s %s' % (gear.name, str(_id), input.container_type, input.container_id))

return _id

def retry_job(db, j, force=False):
6 changes: 3 additions & 3 deletions api/schemas/input/group.json
@@ -4,15 +4,15 @@
"type": "object",
"properties": {
"_id": {
"maxLength": 32,
"maxLength": 64,
"minLength": 2,
"pattern": "^[0-9a-z][0-9a-z.@_-]{0,30}[0-9a-z]$",
"title": "ID",
"type": "string"
},
"name": {
"maxLength": 32,
"minLength": 2,
"maxLength": 64,
"minLength": 1,
"pattern": "^[0-9A-Za-z][0-9A-Za-z .@_-]{0,30}[0-9A-Za-z]$",
"title": "Name",
"type": "string"
10 changes: 5 additions & 5 deletions api/schemas/input/user.json
@@ -4,21 +4,21 @@
"type": "object",
"properties": {
"_id": {
"maxLength": 32,
"maxLength": 64,
"minLength": 2,
"pattern": "^[0-9a-z.@_-]*$",
"title": "ID",
"type": "string"
},
"firstname": {
"maxLength": 32,
"minLength": 2,
"maxLength": 64,
"minLength": 1,
"title": "First Name",
"type": "string"
},
"lastname": {
"maxLength": 32,
"minLength": 2,
"maxLength": 64,
"minLength": 1,
"title": "Last Name",
"type": "string"
},
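The practical effect of the relaxed name limits can be checked directly with the jsonschema library the validators already use; a sketch, not part of the PR:

```python
# Sketch: single-character and 64-character names now validate; both failed before.
import jsonschema

firstname_schema = {'title': 'First Name', 'type': 'string', 'minLength': 1, 'maxLength': 64}

jsonschema.validate('J', firstname_schema)       # ok under minLength 1 (old minimum was 2)
jsonschema.validate('A' * 64, firstname_schema)  # ok under maxLength 64 (old maximum was 32)
```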
6 changes: 0 additions & 6 deletions api/schemas/mongo/group.json
@@ -4,18 +4,12 @@
"type": "object",
"properties": {
"_id": {
"maxLength": 32,
"minLength": 2,
"pattern": "^[0-9a-z][0-9a-z.@_-]{0,30}[0-9a-z]$",
"title": "ID",
"type": "string"
},
"created": {},
"modified": {},
"name": {
"maxLength": 32,
"minLength": 2,
"pattern": "^[0-9A-Za-z][0-9A-Za-z .@_-]{0,30}[0-9A-Za-z]$",
"title": "Name",
"type": "string"
},
9 changes: 0 additions & 9 deletions api/schemas/mongo/user.json
@@ -4,34 +4,25 @@
"type": "object",
"properties": {
"_id": {
"maxLength": 32,
"minLength": 2,
"pattern": "^[0-9a-z.@_-]*$",
"title": "ID",
"type": "string"
},
"created": {},
"modified": {},
"firstname": {
"maxLength": 32,
"minLength": 2,
"title": "First Name",
"type": "string"
},
"lastname": {
"maxLength": 32,
"minLength": 2,
"title": "Last Name",
"type": "string"
},
"email": {
"format": "email",
"title": "Email",
"type": "string"
},
"avatars": { "$ref": "avatars.json"},
"avatar": {
"format": "uri",
"title": "Avatar",
"type": "string"
},
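With the length and pattern constraints dropped from the mongo-side schemas, only type checks remain there; the stricter limits are now enforced solely by the input schemas above. A sketch, not part of the PR:

```python
# Sketch: the mongo-side firstname property now accepts any string; the input schema still caps it.
import jsonschema

mongo_firstname = {'title': 'First Name', 'type': 'string'}
input_firstname = {'title': 'First Name', 'type': 'string', 'minLength': 1, 'maxLength': 64}

jsonschema.validate('x' * 200, mongo_firstname)    # passes: no length limit on stored documents
# jsonschema.validate('x' * 200, input_firstname)  # would raise ValidationError at the API boundary
```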
7 changes: 5 additions & 2 deletions api/validators.py
@@ -1,5 +1,6 @@
import os
import copy
import glob
import jsonschema

from . import config
@@ -56,13 +57,15 @@ class DBValidationException(Exception):
mongo_schemas = set()
input_schemas = set()
# validate and cache schemas at start time
for schema_file in os.listdir(schema_path + '/schemas/mongo/'):
for schema_filepath in glob.glob(schema_path + '/schemas/mongo/*.json'):
schema_file = os.path.basename(schema_filepath)
mongo_schemas.add(schema_file)
resolver_mongo.resolve(schema_file)

assert mongo_schemas == expected_mongo_schemas, '{} is different from {}'.format(mongo_schemas, expected_mongo_schemas)

for schema_file in os.listdir(schema_path + '/schemas/input/'):
for schema_filepath in glob.glob(schema_path + '/schemas/input/*.json'):
schema_file = os.path.basename(schema_filepath)
input_schemas.add(schema_file)
resolver_input.resolve(schema_file)

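The switch from os.listdir() to glob.glob() means only *.json entries are picked up, so stray files (editor backups, compiled files, hidden files) in the schema directories are no longer registered and resolved as schemas. Because glob.glob() returns paths rather than bare names, the new os.path.basename() call recovers the file name. A sketch with hypothetical directory contents:

```python
# Sketch: hypothetical directory listing showing what each call would return.
import os
import glob

schema_dir = 'api/schemas/mongo/'

os.listdir(schema_dir)                            # e.g. ['group.json', 'user.json', 'group.json~', '.DS_Store']
glob.glob(schema_dir + '*.json')                  # ['api/schemas/mongo/group.json', 'api/schemas/mongo/user.json']
os.path.basename('api/schemas/mongo/group.json')  # 'group.json'
```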
194 changes: 102 additions & 92 deletions bin/bootstrap.py
@@ -3,66 +3,86 @@
"""This script helps bootstrap users and data"""

import os
import sys
import json
import shutil
import hashlib
import logging
import zipfile
import argparse
import datetime
import requests
import requests_toolbelt

from api import validators
from api import tempdir as tempfile

from api.dao import reaperutil
from api import util
from api import rules
from api import config
logging.basicConfig(
format='%(asctime)s %(name)16.16s %(filename)24.24s %(lineno)5d:%(levelname)4.4s %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.DEBUG,
)
log = logging.getLogger('scitran.bootstrap')

log = config.log
logging.getLogger('requests').setLevel(logging.WARNING) # silence Requests library


def clean(args):
config.db.client.drop_database(config.db)
if 'SCITRAN_CORE_DRONE_SECRET' not in os.environ:
log.error('SCITRAN_CORE_DRONE_SECRET not configured')
sys.exit(1)

clean_desc = """
example:
./bin/bootstrap.py clean
"""
if 'SCITRAN_RUNTIME_HOST' not in os.environ or 'SCITRAN_RUNTIME_PORT' not in os.environ:
log.error('SCITRAN_RUNTIME_HOST or SCITRAN_RUNTIME_PORT not configured')
sys.exit(1)
else:
API_URL = 'https://%s:%s/api' % (os.environ['SCITRAN_RUNTIME_HOST'], os.environ['SCITRAN_RUNTIME_PORT'])

if 'SCITRAN_PERSISTENT_PATH' in os.environ and 'SCITRAN_PERSISTENT_DATA_PATH' not in os.environ:
os.environ['SCITRAN_PERSISTENT_DATA_PATH'] = os.path.join(os.environ['SCITRAN_PERSISTENT_PATH'], 'data')

HTTP_HEADERS = {'X-SciTran-Auth': os.environ['SCITRAN_CORE_DRONE_SECRET'], 'User-Agent': 'SciTran Drone Bootstrapper'}


def metadata_encoder(o):
if isinstance(o, datetime.datetime):
if o.tzinfo is None:
o = pytz.timezone('UTC').localize(o)
return o.isoformat()
elif isinstance(o, datetime.tzinfo):
return o.zone
raise TypeError(repr(o) + ' is not JSON serializable')


def create_archive(content, arcname, metadata, outdir=None, filenames=None):
path = (os.path.join(outdir, arcname) if outdir else content) + '.zip'
with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zf:
zf.comment = json.dumps(metadata, default=metadata_encoder)
zf.write(content, arcname)
for fn in filenames or os.listdir(content):
zf.write(os.path.join(content, fn), os.path.join(arcname, fn))
return path


def users(args):
now = datetime.datetime.utcnow()
with open(args.json) as json_dump:
input_data = json.load(json_dump)
log.info('bootstrapping users...')
with requests.Session() as rs:
rs.params = {'d': '404'}
log.info('bootstrapping users...')
rs.verify = not args.insecure
rs.headers = HTTP_HEADERS
for u in input_data.get('users', []):
log.info(' ' + u['_id'])
u['created'] = now
u['modified'] = now
u.setdefault('email', u['_id'])
u.setdefault('preferences', {})
gravatar = 'https://gravatar.com/avatar/' + hashlib.md5(u['email']).hexdigest() + '?s=512'
if rs.head(gravatar):
u.setdefault('avatar', gravatar)
u.setdefault('avatars', {})
u['avatars'].setdefault('gravatar', gravatar)
config.db.users.update_one({'_id': u['_id']}, {'$setOnInsert': u}, upsert=True)
log.info('bootstrapping groups...')
site_id = config.get_item('site', 'id')
for g in input_data.get('groups', []):
log.info(' ' + g['_id'])
g['created'] = now
g['modified'] = now
for r in g['roles']:
r.setdefault('site', site_id)
config.db.groups.update_one({'_id': g['_id']}, {'$setOnInsert': g}, upsert=True)
log.info('bootstrapping drones...')
for d in input_data.get('drones', []):
log.info(' ' + d['_id'])
d['created'] = now
d['modified'] = now
config.db.drones.update_one({'_id': d['_id']}, {'$setOnInsert': d}, upsert=True)
rs.post(API_URL + '/users', json=u)
log.info('bootstrapping groups...')
site_id = rs.get(API_URL + '/config').json()['site']['id']
for g in input_data.get('groups', []):
log.info(' ' + g['_id'])
roles = g.pop('roles')
rs.post(API_URL + '/groups' , json=g)
for r in roles:
r.setdefault('site', site_id)
rs.post(API_URL + '/groups/' + g['_id'] + '/roles' , json=r)
log.info('bootstrapping complete')

users_desc = """
@@ -72,52 +92,46 @@ def users(args):


def data(args):
log.info('inspecting %s' % args.path)
log.info('Inspecting %s' % args.path)
files = []
for dirpath, dirnames, filenames in os.walk(args.path):
for filepath in [os.path.join(dirpath, fn) for fn in filenames if not fn.startswith('.')]:
if not os.path.islink(filepath):
files.append(filepath)
dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # need to use slice assignment to influence walk behavior
file_cnt = len(files)
log.info('found %d files to sort (ignoring symlinks and dotfiles)' % file_cnt)
for i, filepath in enumerate(files):
log.info('Loading %s [%s] (%d/%d)' % (os.path.basename(filepath), util.hrsize(os.path.getsize(filepath)), i+1, file_cnt))
hash_ = hashlib.sha384()
size = os.path.getsize(filepath)
try:
metadata = json.loads(zipfile.ZipFile(filepath).comment)
except ValueError as e:
log.warning(str(e))
continue
target, file_ = reaperutil.create_container_hierarchy(metadata)
with open(filepath, 'rb') as fd:
for chunk in iter(lambda: fd.read(2**20), ''):
hash_.update(chunk)
computed_hash = 'v0-sha384-' + hash_.hexdigest()
destpath = os.path.join(config.get_item('persistent', 'data_path'), util.path_from_hash(computed_hash))
dir_destpath = os.path.dirname(destpath)
filename = os.path.basename(filepath)
if not os.path.exists(dir_destpath):
os.makedirs(dir_destpath)
if args.copy:
shutil.copyfile(filepath, destpath)
else:
shutil.move(filepath, destpath)
created = modified = datetime.datetime.utcnow()
fileinfo = {
'name': filename,
'size': size,
'hash': computed_hash,
'type': 'dicom', # we are only bootstrapping dicoms at the moment
'created': created,
'modified': modified,
'mimetype': util.guess_mimetype(filename),
}
fileinfo.update(file_)
target.add_file(fileinfo)
rules.create_jobs(config.db, target.container, 'acquisition', fileinfo)

schema_validator = validators.payload_from_schema_file('uploader.json')
with requests.Session() as rs:
rs.verify = not args.insecure
rs.headers = HTTP_HEADERS
for dirpath, dirnames, filenames in os.walk(args.path):
dirnames[:] = [dn for dn in dirnames if not dn.startswith('.')] # use slice assignment to influence walk
if not dirnames and filenames:
for metadata_file in filenames:
if metadata_file.lower() == 'metadata.json':
filenames.remove(metadata_file)
break
else:
metadata_file = None
if not metadata_file:
log.warning('Skipping %s: No metadata found' % dirpath)
continue
with open(os.path.join(dirpath, metadata_file)) as fd:
try:
metadata = json.load(fd)
except ValueError:
log.warning('Skipping %s: Unparsable metadata' % dirpath)
continue
with tempfile.TemporaryDirectory() as tempdir:
log.info('Packaging %s' % dirpath)
filepath = create_archive(dirpath, os.path.basename(dirpath), metadata, tempdir, filenames)
filename = os.path.basename(filepath)
metadata.setdefault('acquisition', {}).setdefault('files', [{}])[0]['name'] = filename
log.info('Validating %s' % filename)
try:
schema_validator(metadata, 'POST')
except validators.InputValidationException:
log.warning('Skipping %s: Invalid metadata' % dirpath)
continue
log.info('Uploading %s' % filename)
with open(filepath, 'rb') as fd:
metadata_json = json.dumps(metadata, default=metadata_encoder)
mpe = requests_toolbelt.multipart.encoder.MultipartEncoder(fields={'metadata': metadata_json, 'file': (filename, fd)})
rs.post(API_URL + '/uploader', data=mpe, headers={'Content-Type': mpe.content_type})

data_desc = """
example:
@@ -128,14 +142,6 @@ def data(args):
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(help='operation to perform')

clean_parser = subparsers.add_parser(
name='clean',
help='reset database to clean state',
description=clean_desc,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
clean_parser.set_defaults(func=clean)

users_parser = subparsers.add_parser(
name='users',
help='bootstrap users and groups',
@@ -151,9 +157,13 @@ def data(args):
description=data_desc,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
data_parser.add_argument('-c', '--copy', action='store_true', help='copy data instead of moving it')
data_parser.add_argument('path', help='filesystem path to data')
data_parser.set_defaults(func=data)

parser.add_argument('-i', '--insecure', action='store_true', help='do not verify SSL connections')
args = parser.parse_args()

if args.insecure:
requests.packages.urllib3.disable_warnings()

args.func(args)
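The bootstrapper no longer writes to MongoDB directly; it authenticates against the running API as a drone and posts users, groups, and roles over HTTP. A minimal sketch of that flow (host, port, secret, and the example documents are placeholders; the endpoints and auth header come from the diff):

```python
# Sketch: the drone-authenticated calls the reworked users() bootstrap path performs.
import requests

API_URL = 'https://localhost:8443/api'  # built from SCITRAN_RUNTIME_HOST / SCITRAN_RUNTIME_PORT
HTTP_HEADERS = {'X-SciTran-Auth': 'example-drone-secret',  # SCITRAN_CORE_DRONE_SECRET
                'User-Agent': 'SciTran Drone Bootstrapper'}

with requests.Session() as rs:
    rs.verify = False  # equivalent to running with --insecure
    rs.headers = HTTP_HEADERS
    rs.post(API_URL + '/users', json={'_id': 'admin@example.com', 'firstname': 'Ad', 'lastname': 'Min'})
    rs.post(API_URL + '/groups', json={'_id': 'unknown', 'name': 'Unknown'})
    rs.post(API_URL + '/groups/unknown/roles', json={'_id': 'admin@example.com', 'access': 'admin'})
```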