-
Notifications
You must be signed in to change notification settings - Fork 2
/
Sahel_1_save_data_ingrid.m
351 lines (327 loc) · 15 KB
/
Sahel_1_save_data_ingrid.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
%{
Use for all data accessed without jupyter and the cloud, rather through the
IRI library. Data is preprocessed using ingrid (see
http://rainbow.ldeo.columbia.edu/documentation/libraries/ingrid/Ingrid_5.html#SEC39).
Includes CMIP5 and observations. Can also be used for short CMIP5 AMIP simulations.
Log files for seach scenario "<Scenario>_log.txt" detail which simulations
were skipped and why.
%}
clear
%customizable variables
get_observations = false;
get_simulations = true;
scenarios = {'historical','historicalAerosol','historicalNat','historicalGHG','piControl'};%'historicalMisc' is volcanoes only.
%there are additional amip simulations which I downloaded onto a Lamont
%server which can be accessed here, called 'amip', 'PSL-FACTS', and
%'VanillaAMIP'. They are not included here because they were too short and
%redundant to CMIP6 amip-piF and amip-hist runs. Be sure to use the cmip6
%institution file for those.
shortcuts = {'h', 'a', 'n', 'g', 'piC'};
%make sure the shortcuts match the scenarios if they are modified.
variable = 'pr';
start_month = 7;
end_month = 9;
%define area limits
switch variable
case 'pr'
s_lat = 12;
n_lat = 18;
e_lon = -20;
w_lon = 40;
case 'ts'
%NA
s_lat = 10;
n_lat = 40;
e_lon = -75;
w_lon = -15;
%GT
GT_lat = [-20,20];
end
%create data folder
drnm = ['data/', variable];
if(~exist(drnm, 'dir'))
mkdir(drnm)
end
%Create MONTH_NAMES and MONTH_DAYS matrices for use in changing units later
month_names = {'Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'};
month_days = 31*ones(1,12); month_days(2)=28; %should it just be uniformly 30-day months? I think not in CESM
month_days(ismember(month_names,{'Sep','Apr','Jun','Nov'}))=30;
%ingrid preparations
month0 = month_names{start_month};
month1 = month_names{end_month};
%NOTE: this is only true if models don't just use 30-day months....
days_per_month = mean(month_days(start_month:end_month));
seasonal_average = ['T/%28', month0, '-', month1,'%29/seasonalAverage/',...
'T/12/STEP/dods'];
space_range_average = [...
%'/time//T/renameGRID',... rename grid if needed. causes cold to fail
%if not needed, hence it is commented out.
'/T/1/monthlyAverage',... changes units from months to years
'/lon/', num2str(e_lon), '/', num2str(w_lon), '/RANGE',... zonal range
'/lat/', num2str(s_lat), '/', num2str(n_lat), '/RANGE',... meridional range
'%7B/lat/cosd/%7D/',... box size changes with lat
'%5Blon/lat%5D/weighted-average/']; % zonal and meridional average
url_end = [space_range_average, seasonal_average];
if(strcmp(variable, 'ts'))
GT_space_average = [...
'/time//T/renameGRID',... rename grid if needed.
'/T/1/monthlyAverage',... changes units from months to years
'/lat/', num2str(GT_lat(1)), '/', num2str(GT_lat(2)), '/RANGE',... meridional range
'%7B/lat/cosd/%7D/',... box size changes with lat
'%5Blon/lat%5D/weighted-average/']; % zonal and meridional average
land_mask = [... %used for GT in simulations where ts includes land measurements
'SOURCES/.WORLDBATH/.bath/',...
'X/%28lon%29renameGRID/',...
'Y/%28lat%29renameGRID',...
'%5Blon/lat%5DregridLinear/',...
'0/maskgt/0/mul/add/'];
url_end_GT = [land_mask, GT_space_average, seasonal_average];
end
%Observations
if(get_observations)
obs_file_name = make_data_filename(variable, start_month, end_month, 'observations', 0);
fprintf('Writing file %s\n', obs_file_name);
ObsFile = matfile(obs_file_name,'Writable', true);
%matfile(['data/', variable, '/', num2str(start_month), '-', num2str(end_month), '/observations.mat']
switch variable
case 'pr'
source = '.WCRP/.GCOS/.GPCC/.FDP/.version2018/.1p0/.prcp/';
ObsFile.src = 'GPCC version18';
v = 'prcp';
case 'ts'
source = '.NOAA/.NCDC/.ERSST/.version5/.sst/';
ObsFile.src = 'ERSSTv5';
v = 'sst';
end
obs_url_setup = [...
'http://iridl.ldeo.columbia.edu/SOURCES/',...
source];
observations_url_space = [...
'X/', num2str(e_lon), '/', num2str(w_lon), '/RANGE/',...
'Y/',num2str(s_lat), '/', num2str(n_lat), '/RANGE/'];
obs_url_averaging = [...
'%7B/Y/cosd/%7D/',...
'%5BX/Y%5D/weighted-average/', seasonal_average];
observations_url = [obs_url_setup, observations_url_space,obs_url_averaging];
obs_units = ncreadatt(observations_url, 'T', 'units');
var_unstandardized = ncread(observations_url, v);
if(strcmp(variable, 'pr'))
var_unstandardized = rainfall_to_days(var_unstandardized, days_per_month);
end
ObsFile.var = var_unstandardized;
ObsFile.T = adjust_time(ncread(observations_url,'T'), get_ref_year(obs_units))';
if strcmp(variable, 'pr') %add an additional file for CRU observations
observations_url_CRU = [...
'http://kage.ldeo.columbia.edu:81/expert/home/.datasets/.CRU3.25/.cru3p25.nc/.pre/',...
observations_url_space,obs_url_averaging];
CRU_obs = matfile(['data/', variable,'/CRU_data.mat'], 'Writable', true);
CRU_obs.var = rainfall_to_days(ncread(observations_url_CRU, 'pre')', days_per_month);
CRU_obs.T = adjust_time(ncread(observations_url_CRU, 'T'), get_ref_year(ncreadatt(observations_url_CRU, 'T', 'units')));
CRU_obs.src = 'CRU';
else %add additional ocean basins to the observaitons file
obs_url_GT = [obs_url_setup,... %don't need a land mask for SST observations
'Y/',num2str(GT_lat(1)), '/', num2str(GT_lat(2)), '/RANGE/',...
obs_url_averaging];
var_unstandardized = [var_unstandardized; ncread(obs_url_GT, v)];
var_unstandardized(3,:) = var_unstandardized(1,:) - var_unstandardized(2,:);
ObsFile.var = permute(var_unstandardized, [3,2,1]);
ObsFile.indices = permute({'NA'; 'GT'; 'NARI'},[3,2,1]);
end
end
%Historical Simulations
if(get_simulations)
umbrella=load('data/institutions_cmip5.mat');
for j = 1:length(scenarios)
clear model
scenario = scenarios{j};
sc = shortcuts{j};
log = fopen(['data/', variable, '/', sc, '_log.txt'], 'wt');
switch scenario
case {'historical','historicalAerosol','historicalNat','historicalGHG','piControl'} %cmip5
url_setup = ['http://strega.ldeo.columbia.edu:81/expert/CMIP5/.byScenario/.', scenario,'/.atmos/.mon/.', variable, '/.'];
case 'amip' %cmip6, many start at 1950. supplemented with ERA20CM (e) and PSL-FACTS (p)
url_setup = 'http://carney.ldeo.columbia.edu:81/expert/home/.OTHER/.rebecca/.netcdf/.cmip6/.amip/.';
case 'PSL_FACTS'
url_setup = 'http://carney.ldeo.columbia.edu:81/expert/home/.OTHER/.rebecca/.netcdf/PSL-FACTS/.';
case 'VanillaAMIP'
url_setup = 'http://carney.ldeo.columbia.edu:81/expert/home/.OTHER/.rebecca/.netcdf/.cmip6/.amip-piForcing/.';
end
model_file_name = make_data_filename(variable, start_month, end_month, sc,'all');
if(exist(model_file_name,'file')==2)
load(model_file_name);
end
fprintf("Accessing scenario %s\n", scenario);
fprintf("Opening file %s_models.txt\n", scenario);
models = fopen(['data/',variable, '/',scenario, '_models.txt']);
%models = fopen(['data/',scenario, '_models.txt']);
file_name = fgetl(models);
next_line=1;
%for CMIP5, one line corresponds to all runs from one model. For
%data downloaded directly from ESGF, one line corresponds to one
%run.
while ischar(file_name)
elements = strsplit(file_name, {'_', '\t', ' '});
model_name = get_model_name(elements, scenario);
runs_ids = get_runs(elements, scenario);
for rx = 1:length(runs_ids)
r = runs_ids{rx};
tokeep = false;
[mpname, run_name] = make_model_and_p_name({model_name, r}, scenario); %already type char
if(exist('model','var') && any(contains(model(:,2), mpname) & contains(model(:,3), run_name)))
next_line=next_line+1;
fprintf(log, "Already saved run %s\n", file_name);
continue
end
url = [url_setup,model_name, '/.',r, '/.', variable, url_end];
T_units = ncreadatt(url, 'T', 'units');
P_units = ncreadatt(url, variable, 'units');
if(check_units(T_units, P_units, variable, log))
non_standardized_run = ncread(url, variable);
l = length(non_standardized_run);
T=ncread(url, 'T');
T_years = adjust_time(T, get_ref_year(T_units));
if(~check_years(scenario, T_years, l, model_name, run_name, log))
continue
end
if(sum(isnan(non_standardized_run))>0)
fprintf(log, "%s contains NaN in run %s\n",...
model_name, run_name);
else
model(next_line, 2:3) = {mpname, run_name};
if strcmp(variable, 'pr')
non_standardized_run = flux_to_rainfall(non_standardized_run)';
s3=1;
else
url_GT = [url_setup,model_name, '/.',r, '/.', variable, url_end_GT];
non_standardized_run = [non_standardized_run, ncread(url, variable)];
non_standardized_run(:,3) = non_standardized_run(:,1)-non_standardized_run(:,2);
non_standardized_run = permute(non_standardized_run, [3,1,2]);
s3=3;
end
runs(next_line,1:l,1:s3) = non_standardized_run;
time(next_line,1:l) = T_years;
next_line=next_line+1;
tokeep = true;
save(model_file_name,'model','runs', 'time');
end
end
if(tokeep)
fprintf(log, "Saved model %s run %s with %u years.\n", model_name, run_name, l);
else
fprintf(log, "skipping file %s\n", file_name);
end
end
file_name = fgetl(models);
end
fclose(models);
umbrella_model_names = find_umbrella_names(model_name, umbrella);
model(1:next_line-1,1) = umbrella_model_names;
save(model_file_name, 'model', 'runs', 'time');
clear model runs
end
if(strcmp(variable, 'ts'))
load(model_file_name);
indices = permute({'NA'; 'GT'; 'NARI'}, [3,2,1]);
save(model_file_name, 'model', 'runs', 'time', 'indices');
end
end
%functions for legibility
function units_ok = check_units(T_units, P_units, variable, log)
T_units_ok = all([(T_units(1:13) == 'months since '),...
(T_units(18:end)=='-01-01')]);
var_units_ok = strcmp(variable, 'pr') && strcmp(P_units,'kg m-2 s-1') || ...
strcmp(variable, 'ts') && strcmp(P_units, 'Kelvin_scale');
units_ok = T_units_ok && var_units_ok;
if(~units_ok)
fprintf(log, "cannot comprehend units: %s\n",T_units);
end
end
function years_ok = check_years(scenario, T_years, l, model_name, run, log)
ref_T_years = 1901:2003; %year limits for CMIP5.
L = length(ref_T_years);
%For piC, just make sure there are enough years total.
if(strcmp(scenario, "piControl"))
years_ok = (L<=l);
if(~years_ok)
fprintf(log, "incomplete data for model %s, run %s: %u years\n",...
model_name, run, l);
end
else
%for historical simulations, check that the years include all the necessary ones.
years_ok = sum(T_years >= min(ref_T_years) & T_years <= max(ref_T_years)) == L;
if(~years_ok)
fprintf(log, "incomplete data for model %s, run %s: %u:%u\n",...
model_name, run, T_years(1),T_years(length(T_years)));
end
end
end
function ref_year = get_ref_year(T_units)
ref_year = str2num(T_units(14:17));
end
%IS THIS EVEN RIGHT??? OR IS IT A WEIGHTED MEAN???
%mm/month*month/avg#days=mm/day
function [flux] = rainfall_to_days(rainfall, avg_days_per_month)
flux = rainfall/(avg_days_per_month);
end
%data are reported in mid-months. We take the floor so that we get the data
%point centered on the year.
function [T] = adjust_time(Tm, refT)
T = floor(Tm/12+refT);
end
%Extracts name from line with name and runs
function name = get_model_name(line, scenario)
%files downloaded directly from ESGF are listed by name individually
if(strcmp(scenario, 'PSL_FACTS') || strcmp(scenario, 'VanillaAMIP'))
name = line{2};
else %for CMIP5, the runs are listed within one line after the model name.
name = line{1};
end
end
function runs = get_runs(line, scenario)
if(strcmp(scenario, 'PSL_FACTS') || strcmp(scenario, 'VanillaAMIP'))
runs = line(6);
else
runs = line(2:end);
end
end
%Concatenates name with p number so that distinct p numbers will be saved as separate files.
%If we want to collapse all offshoots of one model into one, we would want
%to average over these. But that would happen in an analysis file.
%TODO: changes here to include f can be general. dif between split and
%strsplit seems to just be the dimension of the output array.
function [fname, run] = make_model_and_p_name(line, scenario)
if(strcmp(scenario, 'PSL_FACTS'))
run = line{6};
run_stats = split(run, ["s","."]);
fname = get_model_name(line, scenario);
run = run_stats{2};
else
run = line{2};
run_stats = split(run, 'p');
%useful only for post-CMIP5 simulations
p_and_f = split(run_stats(2), 'f');
p = p_and_f{1};
fname = [get_model_name(line, scenario), ' p', p];
end
end
function[umbrella_names] = find_umbrella_names(used_models, umbrella)
umbrella_names = cell(length(used_models),1);
%case for cmip6
if(~isfield(umbrella, 'abbrev'))
[~,Loc] = ismember(used_models,umbrella.institutions(:,2));
umbrella_names = umbrella.institutions(Loc,1);
%case for cmip5
else
%TODO I missed that some models would have more than one institution
%label, and that this approach takes the order of my list of
%umbrella models!
for k = 1:length(umbrella.models)
umbrella_names(contains(used_models, umbrella.models(k)), 1) = umbrella.abbrev(k);
end
end
end
%mm/month*month/avg#days*day/24hours*hour/60min*min/60s*m/1000mm*1000kg/m^3=kg/s/m^2
%kg/s/m^2*m^3/1000kg*1000mm/m*60s/min*60min/hour*24hours/day = mm/day
function [flux] = flux_to_rainfall(rainfall)
flux = rainfall*24*60*60;
end