Skip to content

Commit

Permalink
read_header (#59)
Browse files Browse the repository at this point in the history
* read in column names from header

* make csv read backwards compatible

* remove typo

* fix read_header

* check pandas version

* conflicts

* work backwards

* fix_issue

* simplification

* requested changes

* requested changes

* requested changes

* add test
  • Loading branch information
Ci Zhang authored and rabernat committed Jun 14, 2017
1 parent 47c6c50 commit f75ff7c
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 41 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
26 changes: 14 additions & 12 deletions floater/test/test_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,31 +189,34 @@ def test_pickling_with_land(fs_with_land, tmpdir):

def test_npart_to_2D_array():
# floatsets
lon = np.linspace(0, 8, 9, dtype=np.float32)
lat = np.linspace(-4, 4, 9, dtype=np.float32)
land_mask = np.zeros(81, dtype=bool)==False
lon = np.arange(0, 9, dtype=np.float32)
lat = np.arange(-4, 5, dtype=np.float32)
land_mask = np.full(81, True, dtype=bool)
land_mask.shape = (len(lat), len(lon))
land_mask[:,0:2] = False
model_grid = {'lon': lon, 'lat': lat, 'land_mask': land_mask}
fs_none = gen.FloatSet(xlim=(0, 9), ylim=(-4, 5), dx=1.0, dy=1.0)
fs_mask = gen.FloatSet(xlim=(0, 9), ylim=(-4, 5), dx=1.0, dy=1.0, model_grid=model_grid)
fs_none = gen.FloatSet(xlim=(0, 9), ylim=(-4, 5))
fs_mask = gen.FloatSet(xlim=(0, 9), ylim=(-4, 5), model_grid=model_grid)
fs_mask.get_rectmesh()
# dataarray/dataset
var_list = ['test_01', 'test_02', 'test_03']
values_list_none = []
values_list_mask = []
data_vars_none = {}
data_vars_mask = {}
len_none = 81
len_mask = list(fs_mask.ocean_bools).count(True)
for var in var_list:
values_none = np.random.random(81)
values_none.shape = (1, 1, 81)
values_mask = np.random.random(69)
values_mask.shape = (1, 1, 69)
values_none = np.random.random(len_none)
values_none.shape = (1, 1, len_none)
values_mask = np.random.random(len_mask)
values_mask.shape = (1, 1, len_mask)
values_list_none.append(values_none)
values_list_mask.append(values_mask)
data_vars_none.update({var: (['date', 'loc', 'npart'], values_none)})
data_vars_mask.update({var: (['date', 'loc', 'npart'], values_mask)})
npart_none = np.linspace(1, 81, 81, dtype=np.int32)
npart_mask = np.linspace(1, 69, 69, dtype=np.int32)
npart_none = np.arange(1, len_none+1, dtype=np.int32)
npart_mask = np.arange(1, len_mask+1, dtype=np.int32)
coords_none = {'date': (['date'], np.array([np.datetime64('2000-01-01')])),
'loc': (['loc'], np.array(['New York'])),
'npart': (['npart'], npart_none)}
Expand All @@ -229,7 +232,6 @@ def test_npart_to_2D_array():
test_mask = (fs_mask, da1d_mask, ds1d_mask, values_list_mask)
test_list = [test_none, test_mask]
for fs, da1d, ds1d, values_list in test_list:
fs.get_rectmesh()
# method test
da2d = fs.npart_to_2D_array(da1d)
ds2d = fs.npart_to_2D_array(ds1d)
Expand Down
86 changes: 62 additions & 24 deletions floater/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
-0.08639287948608398, 0.12957383692264557, -0.12062723934650421,
0.0, 0.0, 2.6598372642183676e-06)

_TESTDATA_FILENAME_CSV = 'sample_mitgcm_float_trajectories_csv.tar.gz'
_TMPDIR_SUBDIR_CSV = 'sample_mitgcm_data_csv'
_TESTDATA_FILENAME_CSV_OLD = 'sample_mitgcm_float_trajectories_csv_old.tar.gz'
_TMPDIR_SUBDIR_CSV_OLD = 'sample_mitgcm_data_csv_old'

_TESTDATA_FILENAME_CSV_NEW = 'sample_mitgcm_float_trajectories_csv_new.tar.gz'
_TMPDIR_SUBDIR_CSV_NEW = 'sample_mitgcm_data_csv_new'

#@pytest.fixture()
#def empty_output_dir(tmpdir):
Expand All @@ -41,12 +43,24 @@ def mitgcm_float_datadir(tmpdir_factory, request):
return target_dir

@pytest.fixture(scope='module')
def mitgcm_float_datadir_csv(tmpdir_factory, request):
def mitgcm_float_datadir_csv_old(tmpdir_factory, request):
filename = request.module.__file__
datafile = os.path.join(os.path.dirname(filename), _TESTDATA_FILENAME_CSV)
datafile = os.path.join(os.path.dirname(filename), _TESTDATA_FILENAME_CSV_OLD)
if not os.path.exists(datafile):
raise IOError('Could not find data file %s' % datafile)
target_dir = str(tmpdir_factory.mktemp(_TMPDIR_SUBDIR_CSV))
target_dir = str(tmpdir_factory.mktemp(_TMPDIR_SUBDIR_CSV_OLD))
tar = tarfile.open(datafile)
tar.extractall(target_dir)
tar.close()
return target_dir

@pytest.fixture(scope='module')
def mitgcm_float_datadir_csv_new(tmpdir_factory, request):
filename = request.module.__file__
datafile = os.path.join(os.path.dirname(filename), _TESTDATA_FILENAME_CSV_NEW)
if not os.path.exists(datafile):
raise IOError('Could not find data file %s' % datafile)
target_dir = str(tmpdir_factory.mktemp(_TMPDIR_SUBDIR_CSV_NEW))
tar = tarfile.open(datafile)
tar.extractall(target_dir)
tar.close()
Expand Down Expand Up @@ -88,47 +102,71 @@ def test_floats_to_bcolz(tmpdir, mitgcm_float_datadir):
for name, val in zip(_NAMES, _TESTVALS_FIRST):
np.testing.assert_almost_equal(bc[0][name], val)

def test_floats_to_netcdf(tmpdir, mitgcm_float_datadir_csv):
def test_floats_to_netcdf(tmpdir,
mitgcm_float_datadir_csv_old,
mitgcm_float_datadir_csv_new):
"""Test that we can convert MITgcm float data into NetCDF format.
"""
import xarray as xr
from floater.generators import FloatSet

input_dir = str(mitgcm_float_datadir_csv)
input_dir_old = str(mitgcm_float_datadir_csv_old)
input_dir_new = str(mitgcm_float_datadir_csv_new)
output_dir = str(tmpdir)
os.chdir(input_dir)
fs = FloatSet(xlim=(-5, 5), ylim=(-2, 2), dx=1.0, dy=1.0)
fs = FloatSet(xlim=(-5, 5), ylim=(-2, 2))

os.chdir(input_dir_old)
fs.to_pickle('./fs.pkl')
# least options
utils.floats_to_netcdf(input_dir='./', output_fname='test_old')
# most options
utils.floats_to_netcdf(input_dir='./', output_fname='test_old',
float_file_prefix='float_trajectories',
ref_time='1993-01-01', pkl_path='./fs.pkl',
output_dir=output_dir, output_prefix='prefix_test')

os.chdir(input_dir_new)
fs.to_pickle('./fs.pkl')
# least options
utils.floats_to_netcdf(input_dir=input_dir, output_fname='test')
utils.floats_to_netcdf(input_dir='./', output_fname='test_new')
# most options
utils.floats_to_netcdf(input_dir=input_dir, output_fname='test',
utils.floats_to_netcdf(input_dir='./', output_fname='test_new',
float_file_prefix='float_trajectories',
ref_time='1993-01-01', pkl_path='./fs.pkl',
output_dir=output_dir, output_prefix='prefix_test')

# filename prefix test
os.chdir(input_dir)
mfdl = xr.open_mfdataset('test_netcdf/float_trajectories.*.nc')
os.chdir(input_dir_old)
mfdol = xr.open_mfdataset('test_old_netcdf/float_trajectories.*.nc')
os.chdir(input_dir_new)
mfdnl = xr.open_mfdataset('test_new_netcdf/float_trajectories.*.nc')
os.chdir(output_dir)
mfdm = xr.open_mfdataset('test_netcdf/prefix_test.*.nc')
mfdom = xr.open_mfdataset('test_old_netcdf/prefix_test.*.nc')
mfdnm = xr.open_mfdataset('test_new_netcdf/prefix_test.*.nc')

# dimensions test
dims = [{'time': 2, 'npart': 40}, {'time': 2, 'y0': 4, 'x0': 10}]
assert mfdl.dims == dims[0]
assert mfdm.dims == dims[1]
assert mfdol.dims == dims[0]
assert mfdom.dims == dims[1]
assert mfdnl.dims == dims[0]
assert mfdnm.dims == dims[1]

# variables and values test
vars_values = [('x', 0.3237109375000000e+03), ('y', -0.7798437500000000e+02),
('z', -0.4999999999999893e+00), ('u', -0.5346306607990328e-02),
('v', -0.2787361934305595e-02), ('vort', 0.9160626946271506e-10)]
vars_values = [('x', 0.1961093750000000E+03), ('y', -0.7848437500000000E+02),
('z', -0.4999999999999893E+00), ('u', 0.3567512409555351E-04),
('v', 0.1028276712547044E-03), ('vort', 0.0000000000000000E+00)]
for var, value in vars_values:
np.testing.assert_almost_equal(mfdol[var].values[0][0], value, 8)
np.testing.assert_almost_equal(mfdom[var].values[0][0][0], value, 8)
vars_values.append(('lavd', 0.0000000000000000E+00))
for var, value in vars_values:
np.testing.assert_almost_equal(mfdl[var].values[0][0], value, 8)
np.testing.assert_almost_equal(mfdm[var].values[0][0][0], value, 8)
np.testing.assert_almost_equal(mfdnl[var].values[0][0], value, 8)
np.testing.assert_almost_equal(mfdnm[var].values[0][0][0], value, 8)

# times test
times = [(0, 0, np.datetime64('1993-01-01', 'ns')), (1, 86400, np.datetime64('1993-01-02', 'ns'))]
times = [(0, 0, np.datetime64('1993-01-01', 'ns')), (1, 2592000, np.datetime64('1993-01-31', 'ns'))]
for i, sec, time in times:
assert mfdl['time'][i].values == sec
assert mfdm['time'][i].values == time
assert mfdol['time'][i].values == sec
assert mfdom['time'][i].values == time
assert mfdnl['time'][i].values == sec
assert mfdnm['time'][i].values == time
13 changes: 8 additions & 5 deletions floater/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ def floats_to_netcdf(input_dir, output_fname,
Prefix of the transcoded NetCDF files
"""
import dask.dataframe as dd
import pandas as pd
import xarray as xr
from floater.generators import FloatSet
from glob import glob
Expand All @@ -251,14 +252,16 @@ def floats_to_netcdf(input_dir, output_fname,

match_pattern = float_file_prefix + '.*.csv'
float_files = glob(os.path.join(input_dir, match_pattern))
float_header = pd.read_csv(float_files[0], nrows=0).columns
float_timesteps = sorted(list({int(float_file[-22:-12]) for float_file in float_files}))
float_columns = ['npart', 'time', 'x', 'y', 'z', 'u', 'v', 'vort']

for float_timestep in tqdm(float_timesteps):
input_path = os.path.join(input_dir, '%s.%010d.*.csv' % (float_file_prefix, float_timestep))
df = dd.read_csv(input_path)
if df.columns.values[0] != 'npart': # check if old format
columns = ['npart', 'time', 'x', 'y', 'z', 'u', 'v', 'vort']
df = dd.read_csv(input_path, names=columns, header=None)
if float_header[0] != 'npart':
df = dd.read_csv(input_path, names=float_columns, header=None)
else:
df = dd.read_csv(input_path)
dfc = df.compute()
dfcs = dfc.sort_values('npart')
del_time = int(dfcs.time.values[0])
Expand All @@ -270,7 +273,7 @@ def floats_to_netcdf(input_dir, output_fname,
time = np.array([np.int32(del_time)])
npart = dfcs.npart.values.astype(np.int32)
var_shape = (1, len(npart))
var_names = dfcs.columns.values[2:]
var_names = dfcs.columns[2:]
data_vars = {var_name: (['time', 'npart'], dfcs[var_name].values.astype(np.float32).reshape(var_shape)) for var_name in var_names}
ds = xr.Dataset(data_vars, coords={'time': time, 'npart': npart})
if pkl_path is not None:
Expand Down

0 comments on commit f75ff7c

Please sign in to comment.