In [1]:
import xarray as xr
import os
import netCDF4
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import random
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Combine the per-timestep mli/mlo files into one NetCDF file each

In [2]:
# from os import listdir
# from os.path import isfile, join

# mypath = '/work/sds-lab/Shuochen/climsim/train'
# # allfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# allfiles = []
# for path, subdirs, files in os.walk(mypath):
#     for name in files:
#        allfiles.append(os.path.join(path, name))
# print(len(allfiles))

# mli_files = []
# mlo_files = []

# for i in range(len(allfiles)):
#     if 'mli' in allfiles[i]:
#         mli_files.append(allfiles[i])
#     if 'mlo' in allfiles[i]:
#         mlo_files.append(allfiles[i])

# ds_mli = []
# ds_mlo = []

# for i in range(len(mli_files)):
#     ds_mli.append(xr.open_dataset(mli_files[i]).expand_dims('time'))
# for i in range(len(mlo_files)):
#     ds_mlo.append(xr.open_dataset(mlo_files[i]).expand_dims('time'))

# combined_mli = xr.concat(ds_mli, dim='time').to_netcdf('/work/sds-lab/Shuochen/climsim/' + 'val_mli.nc')
# combined_mlo = xr.concat(ds_mlo, dim='time').to_netcdf('/work/sds-lab/Shuochen/climsim/' + 'val_mlo.nc')

# preprocessing

In [3]:
# Open the combined input (mli) and output (mlo) NetCDF files.
mli, mlo = (
    xr.open_dataset(nc_path)
    for nc_path in (
        '/work/sds-lab/Shuochen/climsim/val_mli.nc',
        '/work/sds-lab/Shuochen/climsim/val_mlo.nc',
    )
)

In [4]:
# Rebuild the time coordinate of both datasets.
# Each `ymd` value is turned into a string and prefixed with '200' so that
# pandas can parse it as a date (presumably `ymd` stores a shortened year
# followed by month and day -- TODO confirm against the data files).
date = ['200' + str(ymd_value) for ymd_value in mli.ymd.values]

# Both datasets use the dates derived from mli; `tod` is added as an offset
# in seconds on top of the parsed date.
parsed_dates = pd.to_datetime(date)
mli['time'] = parsed_dates + pd.to_timedelta(mli.tod.values, unit='s')
mlo['time'] = parsed_dates + pd.to_timedelta(mlo.tod.values, unit='s')

In [5]:
# Downsample: keep every 7th sample along `time` (data frequency is 1200 s).
# NOTE: `mli`/`mlo` are rebound here, so re-running this cell subsamples again.
every_seventh = slice(None, None, 7)
mli = mli.isel(time=every_seventh)
mlo = mlo.isel(time=every_seventh)

In [6]:
# Input / output variable names (6 inputs, 10 outputs).
# The first two entries of each list are handled as 3-D arrays, the remaining
# entries as 2-D arrays.
vars_mli = ['state_t','state_q0001','state_ps','pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX']
vars_mlo = ['ptend_t','ptend_q0001','cam_out_NETSW','cam_out_FLWDS','cam_out_PRECSC','cam_out_PRECC','cam_out_SOLS','cam_out_SOLL','cam_out_SOLSD','cam_out_SOLLD']


def _flatten_3d(data_array):
    """Convert a 3-D DataArray to a 2-D float tensor.

    The last two axes are swapped and the first two axes of the result are
    merged, so the original axis 1 becomes the column axis.
    (Presumably the axes are (time, level, column) -- TODO confirm.)
    """
    swapped = torch.Tensor(data_array.to_numpy()).permute(0, 2, 1)
    n_outer, n_inner, n_features = swapped.shape
    return swapped.reshape(n_outer * n_inner, n_features)


def _flatten_2d(data_array):
    """Convert a 2-D DataArray to a single-column float tensor."""
    values = torch.Tensor(data_array.to_numpy())
    return values.reshape(values.shape[0] * values.shape[1], 1)


# nc -> numpy -> tensor, then reshape to (samples, features).
input_list = [
    _flatten_3d(mli[name]) if position < 2 else _flatten_2d(mli[name])
    for position, name in enumerate(vars_mli)
]

# Heating and moistening tendencies: state difference between mlo and mli
# divided by 1200 (presumably the time step in seconds -- TODO confirm).
ptend_t = _flatten_3d((mlo['state_t'] - mli['state_t']) / 1200)
ptend_q0001 = _flatten_3d((mlo['state_q0001'] - mli['state_q0001']) / 1200)

# The two tendencies come first, followed by the remaining mlo variables
# (entries 2 onwards of vars_mlo), which are read from the file directly.
output_list = [ptend_t, ptend_q0001]
output_list += [_flatten_2d(mlo[name]) for name in vars_mlo[2:]]

In [7]:
# Load the normalization statistics (inputs) and scaling factors (outputs).
norm_mean = xr.open_dataset('/work/sds-lab/Shuochen/climsim/normalizations_git/inputs/input_mean.nc')
norm_max = xr.open_dataset('/work/sds-lab/Shuochen/climsim/normalizations_git/inputs/input_max.nc')
norm_min = xr.open_dataset('/work/sds-lab/Shuochen/climsim/normalizations_git/inputs/input_min.nc')
scale = xr.open_dataset('/work/sds-lab/Shuochen/climsim/normalizations_git/outputs/output_scale.nc')

# One tensor per variable, in the same order as vars_mli / vars_mlo, so the
# lists can be indexed in parallel with input_list / output_list.
norm_mean_list = [torch.Tensor(norm_mean[name].values) for name in vars_mli]
norm_max_list = [torch.Tensor(norm_max[name].values) for name in vars_mli]
norm_min_list = [torch.Tensor(norm_min[name].values) for name in vars_mli]
scale_list = [torch.Tensor(scale[name].values) for name in vars_mlo]

In [8]:
# Normalize inputs as (x - mean) / (max - min) and multiply outputs by their
# scale factor. The list entries are overwritten in place, so running this
# cell a second time would apply the transformation again.
for idx, raw_input in enumerate(input_list):
    value_range = norm_max_list[idx] - norm_min_list[idx]
    input_list[idx] = (raw_input - norm_mean_list[idx]) / value_range

for idx, raw_output in enumerate(output_list):
    output_list[idx] = raw_output * scale_list[idx]

In [9]:
# Concatenate the 6 input tensors and the 10 output tensors along the feature
# axis (dim=1). Note that the name `input` shadows the Python builtin; it is
# kept because later cells refer to it.
input = torch.cat([input_list[k] for k in range(6)], dim=1)
output = torch.cat([output_list[k] for k in range(10)], dim=1)

print(input.shape, output.shape)

torch.Size([1441920, 124]) torch.Size([1441920, 128])


# Compare our input/output data with the reference arrays

In [10]:
# Load the saved input (X) and target (y) arrays used for comparison.
X, y = (
    np.load(npy_path)
    for npy_path in (
        '/work/sds-lab/Shuochen/climsim/val_input.npy',
        '/work/sds-lab/Shuochen/climsim/val_target.npy',
    )
)

In [11]:
# Element-wise difference between the first reference input row and ours.
torch.Tensor(X[0]) - input[0]

tensor([-1.3970e-09,  4.6566e-08,  4.8429e-08,  5.9605e-08, -3.0617e-08,
         4.0978e-08, -2.2352e-08,  9.6858e-08,  2.2352e-08,  5.2154e-08,
         1.4901e-08, -1.0058e-07, -4.4703e-08, -5.2154e-08,  8.5682e-08,
         4.0047e-08, -1.6391e-07,  1.5274e-07, -3.7253e-08, -7.4506e-09,
         1.4901e-08,  1.6391e-07, -7.8231e-08, -1.1828e-07, -6.7055e-08,
        -1.3039e-07, -8.1956e-08, -1.8626e-07, -1.4901e-08,  1.1921e-07,
        -2.2352e-08, -2.1607e-07, -4.4703e-08, -1.4901e-08, -2.1607e-07,
         1.7881e-07,  1.1921e-07, -4.0978e-07,  5.2154e-08,  1.4901e-07,
         2.0862e-07,  1.0431e-07,  2.9802e-08,  3.1292e-07, -1.4901e-08,
         8.9407e-08, -2.9057e-07,  4.4703e-08, -2.9802e-08, -1.3411e-07,
        -8.9407e-08, -2.9802e-07, -5.9605e-08,  4.4703e-08, -1.6391e-07,
        -9.6858e-08, -7.4506e-09,  2.2352e-08, -1.1176e-07, -1.4901e-08,
         1.1921e-07, -8.9407e-08,  8.9407e-08,  5.9605e-08,  5.9605e-08,
         2.9802e-08,  0.0000e+00,  8.9407e-08,  5.9

In [12]:
# Element-wise difference between the first reference target row and ours.
torch.Tensor(y[0]) - output[0]

tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  3.7253e-09,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00, -3.7253e-09,  3.7253e-09,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00, -1.1642e-10,  0.0000e+00, -2.3283e-10,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  1.8626e-09,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  1.8626e-09,  1.8626e-09,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.8626e-09,
         0.0000e+00,  1.8626e-09,  0.0000e+00,  1.8626e-09,  1.8626e-09,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.8626e-09,
         0.0000e+00,  0.0000e+00,  1.8626e-09,  1.8626e-09,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  3.7253e-09,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0