Skip to content
Permalink
Browse files

moved to zoltpy: csv_rows_from_json_io_dict(), cdc_csv_rows_from_json_io_dict()
  • Loading branch information...
matthewcornell committed Sep 10, 2019
1 parent 7e85a4b commit 2664f6d4c0bdd6f3e86ac35b39bdde60edf44a95
Showing with 9 additions and 312 deletions.
  1. +1 −142 forecast_app/tests/test_utils.py
  2. +8 −2 forecast_app/views.py
  3. +0 −95 utils/cdc.py
  4. +0 −73 utils/forecast.py
@@ -1,6 +1,4 @@
import csv
import datetime
import json
from pathlib import Path

import pymmwr
@@ -9,8 +7,7 @@
from forecast_app.models import Project, TimeZero
from forecast_app.models.forecast_model import ForecastModel
from utils.cdc import epi_week_filename_components_2016_2017_flu_contest, epi_week_filename_components_ensemble, \
load_cdc_csv_forecast_file, cdc_csv_filename_components, first_model_subdirectory, cdc_csv_rows_from_json_io_dict
from utils.forecast import json_io_dict_from_forecast, csv_rows_from_json_io_dict
load_cdc_csv_forecast_file, cdc_csv_filename_components, first_model_subdirectory
from utils.make_cdc_flu_contests_project import make_cdc_locations_and_targets, season_start_year_for_date


@@ -59,118 +56,6 @@ def setUpTestData(cls):
'forecast_app/tests/model_error/ensemble/EW52-KoTstable-2017-01-09.csv'), time_zero)


def test_csv_rows_from_json_io_dict(self):
    """
    Tests csv_rows_from_json_io_dict(): first the validation errors raised for malformed json_io_dicts,
    then a blue-sky comparison of the expanded 13-column CSV rows produced from the example predictions file.
    """
    # no meta
    with self.assertRaises(RuntimeError) as context:
        csv_rows_from_json_io_dict({})
    self.assertIn('no meta section found in json_io_dict', str(context.exception))

    # no meta > targets
    with self.assertRaises(RuntimeError) as context:
        csv_rows_from_json_io_dict({'meta': {}})
    self.assertIn('no targets section found in json_io_dict meta section', str(context.exception))

    # invalid prediction class
    # NOTE(review): this case and the 'target not found' case below call cdc_csv_rows_from_json_io_dict(),
    # not the csv_rows_from_json_io_dict() this test is named for — looks like a copy-paste from
    # test_cdc_csv_rows_from_json_io_dict(); confirm which function is meant to be under test here
    for invalid_prediction_class in ['InvalidClass']:  # ok: PREDICTION_CLASS_TO_JSON_IO_DICT_CLASS
        with self.assertRaises(RuntimeError) as context:
            json_io_dict = {'meta': {'targets': []},
                            'predictions': [{'class': invalid_prediction_class}]}
            cdc_csv_rows_from_json_io_dict(json_io_dict)
        self.assertIn('invalid prediction_dict class', str(context.exception))

    with open('forecast_app/tests/predictions/predictions-example.json') as fp:
        json_io_dict = json.load(fp)
    with self.assertRaises(RuntimeError) as context:
        # remove arbitrary meta target. doesn't matter b/c all are referenced
        del (json_io_dict['meta']['targets'][0])
        cdc_csv_rows_from_json_io_dict(json_io_dict)
    self.assertIn('prediction_dict target not found in meta targets', str(context.exception))

    # blue sky: the expected rows cover one prediction of each class from the example file
    with open('forecast_app/tests/predictions/predictions-example.json') as fp:
        json_io_dict = json.load(fp)
    # location,target,unit,class,cat,family,lwr,param1,param2,param3,prob,sample,value
    exp_rows = [
        ['location', 'target', 'unit', 'class', 'cat', 'family', 'lwr', 'param1', 'param2', 'param3', 'prob',
         'sample', 'value'],
        ['US National', '1 wk ahead', 'percent', 'BinCat', 'cat1', '', '', '', '', '', 0.0, '', ''],
        ['US National', '1 wk ahead', 'percent', 'BinCat', 'cat2', '', '', '', '', '', 0.1, '', ''],
        ['US National', '1 wk ahead', 'percent', 'BinCat', 'cat3', '', '', '', '', '', 0.9, '', ''],
        ['HHS Region 1', '2 wk ahead', 'percent', 'BinLwr', '', '', 0.0, '', '', '', 0.0, '', ''],
        ['HHS Region 1', '2 wk ahead', 'percent', 'BinLwr', '', '', 0.1, '', '', '', 0.1, '', ''],
        ['HHS Region 1', '2 wk ahead', 'percent', 'BinLwr', '', '', 0.2, '', '', '', 0.9, '', ''],
        ['HHS Region 2', '3 wk ahead', 'percent', 'Binary', '', '', '', '', '', '', 0.5, '', ''],
        ['HHS Region 3', '4 wk ahead', 'percent', 'Named', '', 'gamma', '', 1.1, 2.2, 3.3, '', '', ''],
        ['HHS Region 4', 'Season onset', 'week', 'Point', '', '', '', '', '', '', '', '', '1'],
        ['HHS Region 5', 'Season peak percentage', 'percent', 'Sample', '', '', '', '', '', '', '', 1.1, ''],
        ['HHS Region 5', 'Season peak percentage', 'percent', 'Sample', '', '', '', '', '', '', '', 2.2, ''],
        ['HHS Region 6', 'Season peak week', 'week', 'SampleCat', 'cat1', '', '', '', '', '', '', 'cat1 sample',
         ''],
        ['HHS Region 6', 'Season peak week', 'week', 'SampleCat', 'cat2', '', '', '', '', '', '', 'cat2 sample',
         '']]
    act_rows = csv_rows_from_json_io_dict(json_io_dict)
    self.assertEqual(exp_rows, act_rows)


def test_cdc_csv_rows_from_json_io_dict(self):
    """
    Tests cdc_csv_rows_from_json_io_dict(): validation errors for malformed json_io_dicts, rejection of the
    prediction classes the CDC CSV format does not support, and a round-trip comparison of a small CDC CSV
    forecast file loaded into the database and re-exported.
    """
    # no meta
    with self.assertRaises(RuntimeError) as context:
        cdc_csv_rows_from_json_io_dict({})
    self.assertIn('no meta section found in json_io_dict', str(context.exception))

    # no meta > targets
    with self.assertRaises(RuntimeError) as context:
        cdc_csv_rows_from_json_io_dict({'meta': {}})
    self.assertIn('no targets section found in json_io_dict meta section', str(context.exception))

    # no predictions
    with self.assertRaises(RuntimeError) as context:
        cdc_csv_rows_from_json_io_dict({'meta': {'targets': []}})
    self.assertIn('no predictions section found in json_io_dict', str(context.exception))

    # invalid prediction class
    for invalid_prediction_class in ['Binary', 'Named', 'Sample', 'SampleCat']:  # ok: 'BinCat', 'BinLwr', 'Point'
        with self.assertRaises(RuntimeError) as context:
            json_io_dict = {'meta': {'targets': []},
                            'predictions': [{'class': invalid_prediction_class}]}
            cdc_csv_rows_from_json_io_dict(json_io_dict)
        self.assertIn('invalid prediction_dict class', str(context.exception))

    # prediction dict target not found in meta > targets
    with open('forecast_app/tests/predictions/predictions-example.json') as fp:
        json_io_dict = json.load(fp)

    # remove invalid prediction classes. NB: deleted in descending index order so earlier deletions
    # don't shift the positions of the later ones
    del (json_io_dict['predictions'][6])  # 'SampleCat'
    del (json_io_dict['predictions'][5])  # 'Sample'
    del (json_io_dict['predictions'][3])  # 'Named'
    del (json_io_dict['predictions'][2])  # 'Binary

    with self.assertRaises(RuntimeError) as context:
        # remove arbitrary meta target. doesn't matter b/c all are referenced
        del (json_io_dict['meta']['targets'][0])
        cdc_csv_rows_from_json_io_dict(json_io_dict)
    self.assertIn('prediction_dict target not found in meta targets', str(context.exception))

    # blue sky: small forecast. load a real CDC CSV file, export it back to rows, and compare to the file
    project = Project.objects.create()
    make_cdc_locations_and_targets(project)
    time_zero = TimeZero.objects.create(project=project,
                                        timezero_date=datetime.date(2016, 10, 30),
                                        # 20161030-KoTstable-20161114.cdc.csv {'year': 2016, 'week': 44, 'day': 1}
                                        data_version_date=datetime.date(2016, 10, 29))
    forecast_model = ForecastModel.objects.create(project=project)
    forecast = load_cdc_csv_forecast_file(
        forecast_model, Path('forecast_app/tests/EW1-KoTsarima-2017-01-17-small.csv'), time_zero)
    with open(Path('forecast_app/tests/EW1-KoTsarima-2017-01-17-small.csv')) as csv_fp:
        csv_reader = csv.reader(csv_fp, delimiter=',')
        exp_rows = list(csv_reader)
        exp_rows[0] = list(map(str.lower, exp_rows[0]))  # fix header case difference
        # _xform_cdc_csv_row floats the numeric columns so string cells from csv.reader compare equal
        # to the numeric cells the export produces
        exp_rows = list(map(_xform_cdc_csv_row, sorted(exp_rows)))
        json_io_dict = json_io_dict_from_forecast(forecast)
        act_rows = sorted(cdc_csv_rows_from_json_io_dict(json_io_dict))
        self.assertEqual(exp_rows, act_rows)


def test_epi_week_filename_components_2016_2017_flu_contest(self):
filename_components_tuples = (('EW1-KoTstable-2017-01-17.csv', (1, 'KoTstable', datetime.date(2017, 1, 17))),
('-KoTstable-2017-01-17.csv', None),
@@ -236,29 +121,3 @@ def test_season_start_year_for_date(self):
]
for date, exp_season_start_year in date_exp_season_start_year:
self.assertEqual(exp_season_start_year, season_start_year_for_date(date))


# test_cdc_csv_rows_from_json_io_dict() helper that transforms expected row values to float() as needed to match actual
def _xform_cdc_csv_row(row):
location, target, row_type, unit, bin_start_incl, bin_end_notincl, value = row
if row_type == 'Bin' and unit == 'percent':
try:
bin_start_incl = float(bin_start_incl)
bin_end_notincl = float(bin_end_notincl)
value = float(value)
except ValueError:
pass

if row_type == 'Bin' and unit == 'week':
try:
value = float(value)
except ValueError:
pass

if row_type == 'Point' and unit == 'percent':
try:
value = float(value)
except ValueError:
pass

return [location, target, row_type, unit, bin_start_incl, bin_end_notincl, value]
@@ -982,7 +982,7 @@ def upload_truth(request, project_pk):

def process_upload_file_job__truth(upload_file_job_pk):
"""
An _upload_file() enqueue() function that loads a template file. Called by upload_truth().
An _upload_file() enqueue() function that loads a truth file. Called by upload_truth().
- Expected UploadFileJob.input_json key(s): 'project_pk' - passed to _upload_file()
- Saves UploadFileJob.output_json key(s): None
@@ -992,7 +992,7 @@ def process_upload_file_job__truth(upload_file_job_pk):
with upload_file_job_cloud_file(upload_file_job_pk) as (upload_file_job, cloud_file_fp):
project_pk = upload_file_job.input_json['project_pk']
project = get_object_or_404(Project, pk=project_pk)
project.load_truth_data(cloud_file_fp, upload_file_job.filename)
project.load_truth_data(cloud_file_fp, file_name=upload_file_job.filename)


def download_truth(request, project_pk):
@@ -1073,13 +1073,19 @@ def process_upload_file_job__forecast(upload_file_job_pk):
forecast_model = get_object_or_404(ForecastModel, pk=forecast_model_pk)
timezero_pk = upload_file_job.input_json['timezero_pk']
time_zero = get_object_or_404(TimeZero, pk=timezero_pk)
logger.debug(f"process_upload_file_job__forecast(): upload_file_job={upload_file_job}, "
f"forecast_model={forecast_model}, time_zero={time_zero}")
with transaction.atomic():
logger.debug(f"process_upload_file_job__forecast(): creating Forecast")
new_forecast = Forecast.objects.create(forecast_model=forecast_model, time_zero=time_zero,
source=upload_file_job.filename)
json_io_dict = json.load(cloud_file_fp)
logger.debug(f"process_upload_file_job__forecast(): loading predictions. "
f"#predictions={len(json_io_dict['predictions'])}")
load_predictions_from_json_io_dict(new_forecast, json_io_dict)
upload_file_job.output_json = {'forecast_pk': new_forecast.pk}
upload_file_job.save()
logger.debug(f"process_upload_file_job__forecast(): done")


def delete_forecast(request, forecast_pk):
@@ -220,101 +220,6 @@ def _prediction_dicts_for_csv_rows(rows):
return prediction_dicts


#
# cdc_csv_rows_from_json_io_dict()
#

def cdc_csv_rows_from_json_io_dict(json_io_dict):
    """
    Converts json_io_dict into rows in the 7-column CDC CSV format. Only the 'BinCat', 'BinLwr', and 'Point'
    prediction classes can be represented; anything else raises.

    :param json_io_dict: a "JSON IO dict" to load from. see docs for details. NB: this dict MUST have a valid "meta"
        section b/c we need ['meta']['targets'] for each target's 'unit' so we can figure out bin_end_notincl values.
    :return: a list of CDC CSV rows as documented elsewhere. Does include a column header row. See CDC_CSV_HEADER:
        ['location', 'target', 'type', 'unit', 'bin_start_incl', 'bin_end_notincl', 'value'] .
    :raises RuntimeError: if a required section is missing, a prediction class is unsupported, or a
        prediction's target is absent from meta > targets
    """
    # validate the required sections up front
    if 'meta' not in json_io_dict:
        raise RuntimeError("no meta section found in json_io_dict")
    if 'targets' not in json_io_dict['meta']:
        raise RuntimeError("no targets section found in json_io_dict meta section")
    if 'predictions' not in json_io_dict:
        raise RuntimeError("no predictions section found in json_io_dict")

    name_to_target = {target['name']: target for target in json_io_dict['meta']['targets']}
    rows = [CDC_CSV_HEADER]  # returned value. filled next
    for pred in json_io_dict['predictions']:
        pred_class = pred['class']
        if pred_class not in ('BinCat', 'BinLwr', 'Point'):
            raise RuntimeError(f"invalid prediction_dict class: {pred_class}")

        target_name = pred['target']
        if target_name not in name_to_target:
            raise RuntimeError(f"prediction_dict target not found in meta targets: {target_name}")

        location = pred['location']
        unit = name_to_target[target_name]['unit']
        data = pred['prediction']
        if pred_class == 'Point':
            # a single point row with NA bin bounds
            rows.append([location, target_name, CDC_POINT_ROW_TYPE, unit,
                         CDC_POINT_NA_VALUE, CDC_POINT_NA_VALUE, data['value']])
        elif pred_class == 'BinCat':
            # BinCat targets: unit='week', target='Season onset' or 'Season peak week'. one bin row per category
            for cat, prob in zip(data['cat'], data['prob']):
                rows.append([location, target_name, CDC_BIN_ROW_TYPE, unit,
                             cat, _recode_cat_bin_end_notincl(cat), prob])
        else:  # 'BinLwr'
            # BinLwr targets: unit='percent', target='1 wk ahead' ... '4 wk ahead', or 'Season peak percentage'.
            # the final 13% bin is open-ended, hence the 100 upper bound
            for lwr, prob in zip(data['lwr'], data['prob']):
                upper = 100 if lwr == 13 else lwr + 0.1
                rows.append([location, target_name, CDC_BIN_ROW_TYPE, unit, lwr, upper, prob])
    return rows


def _recode_cat_bin_end_notincl(cat): # from predx: recode_flusight_bin_end_notincl()
return {
'40': '41',
'41': '42',
'42': '43',
'43': '44',
'44': '45',
'45': '46',
'46': '47',
'47': '48',
'48': '49',
'49': '50',
'50': '51',
'51': '52',
'52': '53',
'1': '2',
'2': '3',
'3': '4',
'4': '5',
'5': '6',
'6': '7',
'7': '8',
'8': '9',
'9': '10',
'10': '11',
'11': '12',
'12': '13',
'13': '14',
'14': '15',
'15': '16',
'16': '17',
'17': '18',
'18': '19',
'19': '20',
'20': '21',
'none': 'none'}[cat.lower()]


#
# *.cdc.csv file functions
#
@@ -256,79 +256,6 @@ def _target_dicts_for_project(project, target_names):
for target in project.targets.all() if target.name in target_names]


#
# csv_rows_from_json_io_dict()
#

# column order for csv_rows_from_json_io_dict() output. the first four columns are always filled; the rest
# are class-specific and left empty ('') when unused
CSV_HEADER = ['location', 'target', 'unit', 'class', 'cat', 'family', 'lwr', 'param1', 'param2', 'param3', 'prob',
              'sample', 'value']

# the prediction classes csv_rows_from_json_io_dict() accepts
_VALID_PREDICTION_CLASSES = ('BinCat', 'BinLwr', 'Binary', 'Named', 'Point', 'Sample', 'SampleCat')


def csv_rows_from_json_io_dict(json_io_dict):
    """
    A utility that converts json_io_dict to a list of CSV rows specific to Zoltar. The rows are an 'expanded' version of
    json_io_dict where bin-type classes result in multiple rows: BinCatDistribution, BinLwrDistribution,
    SampleDistribution, and SampleCatDistribution. The 'class' of each row is named according to
    PREDICTION_CLASS_TO_JSON_IO_DICT_CLASS. Column ordering is CSV_HEADER. Note that the csv is 'sparse': not every row
    uses all columns, and unused ones are empty. However, the first four columns are always non-empty, i.e., every
    prediction has them.

    :param json_io_dict: a "JSON IO dict" to load from. see docs for details. NB: this dict MUST have a valid "meta"
        section b/c we need ['meta']['targets'] for each target's 'unit' column
    :return: a list of CSV rows including header
    :raises RuntimeError: if json_io_dict is missing its 'meta', 'meta'>'targets', or 'predictions' sections, if a
        prediction's class is not in _VALID_PREDICTION_CLASSES, or if a prediction's target is not in 'meta'>'targets'
    """
    # todo merge w/cdc_csv_rows_from_json_io_dict()

    # do some initial validation
    if 'meta' not in json_io_dict:
        raise RuntimeError("no meta section found in json_io_dict")
    elif 'targets' not in json_io_dict['meta']:
        raise RuntimeError("no targets section found in json_io_dict meta section")
    elif 'predictions' not in json_io_dict:
        raise RuntimeError("no predictions section found in json_io_dict")

    rows = [CSV_HEADER]  # returned value. filled next
    target_name_to_dict = {target_dict['name']: target_dict for target_dict in json_io_dict['meta']['targets']}
    for prediction_dict in json_io_dict['predictions']:
        prediction_class = prediction_dict['class']
        # fix: an unrecognized class previously fell through to the SampleCat else-branch and produced garbage
        # rows (or a KeyError); raise instead, consistent with cdc_csv_rows_from_json_io_dict()
        if prediction_class not in _VALID_PREDICTION_CLASSES:
            raise RuntimeError(f"invalid prediction_dict class: {prediction_class}")

        target_name = prediction_dict['target']
        if target_name not in target_name_to_dict:
            raise RuntimeError(f"prediction_dict target not found in meta targets: {target_name}")

        location = prediction_dict['location']
        target = prediction_dict['target']
        unit = target_name_to_dict[target_name]['unit']
        prediction = prediction_dict['prediction']
        # class-specific columns all default to empty:
        cat, family, lwr, param1, param2, param3, prob, sample, value = '', '', '', '', '', '', '', '', ''
        if prediction_class == 'BinCat':
            for cat, prob in zip(prediction['cat'], prediction['prob']):
                rows.append([location, target, unit, prediction_class, cat, family, lwr, param1, param2, param3, prob,
                             sample, value])
        elif prediction_class == 'BinLwr':
            for lwr, prob in zip(prediction['lwr'], prediction['prob']):
                rows.append([location, target, unit, prediction_class, cat, family, lwr, param1, param2, param3, prob,
                             sample, value])
        elif prediction_class == 'Binary':
            rows.append([location, target, unit, prediction_class, cat, family, lwr, param1, param2, param3,
                         prediction['prob'], sample, value])
        elif prediction_class == 'Named':
            rows.append([location, target, unit, prediction_class, cat, prediction['family'], lwr, prediction['param1'],
                         prediction['param2'], prediction['param3'], prob, sample, value])
        elif prediction_class == 'Point':
            rows.append([location, target, unit, prediction_class, cat, family, lwr, param1, param2, param3, prob,
                         sample, prediction['value']])
        elif prediction_class == 'Sample':
            for sample in prediction['sample']:
                rows.append([location, target, unit, prediction_class, cat, family, lwr, param1, param2, param3, prob,
                             sample, value])
        else:  # prediction_class == 'SampleCat'
            for cat, sample in zip(prediction['cat'], prediction['sample']):
                rows.append([location, target, unit, prediction_class, cat, family, lwr, param1, param2, param3, prob,
                             sample, value])
    return rows


#
# load_predictions_from_json_io_dict()
#

0 comments on commit 2664f6d

Please sign in to comment.
You can’t perform that action at this time.