# PCA
PCA steps for the data downscaling project. The dominant PCs are exported to arrays, since they are all we really work with in the main downscaling file (in an effort to reduce memory load).<br>

World data preprocessing is done in this file because I didn't feel that a new netCDF file was necessary for that data, as it already loads rather quickly and doesn't take up much space.

In [1]:
#import required libraries
import pandas as pd
import numpy as np
import xarray as xr
import time
from sklearn.decomposition import PCA

In [2]:
#load in the world (CanRCM4) fields and the HRDPS daily averages
#each world variable is split over two files, so open each pair as a single dataset
P_world = xr.open_mfdataset([
    "psl_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20110101-20151231.nc",
    "psl_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20160101-20201231.nc",
])
U_world = xr.open_mfdataset([
    "uas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20110101-20151231.nc",
    "uas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20160101-20201231.nc",
])
V_world = xr.open_mfdataset([
    "vas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20110101-20151231.nc",
    "vas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20160101-20201231.nc",
])
hrdps = xr.open_dataset("hrdps_day_avgs_postSep2014.nc")

#trim the hrdps dataset to 2015-2020 so it lines up with whole years
#the windows below skip Feb 29 of 2016 and 2020 so the seasonal cycle calcs
#(which assume 365 days per year) will work
_keep_windows = [
    ('2015-01-01T00:00:00.000000000', '2016-02-28T00:00:00.000000000'),
    ('2016-03-01T00:00:00.000000000', '2020-02-28T00:00:00.000000000'),
    ('2020-03-01T00:00:00.000000000', '2020-12-31T00:00:00.000000000'),
]
a, b, c = (
    hrdps.sel(time_counter=slice(first, last)) for first, last in _keep_windows
)
hrdps = xr.concat([a, b, c], dim='time_counter')

In [32]:
#first lets do some quick processing of the hrdps data
#grid coordinates are taken at the first time step and flattened to 1D
hrdps_lat = hrdps.nav_lat.sel(time_counter=hrdps.time_counter[0]).values.flatten()
hrdps_lon = hrdps.nav_lon.sel(time_counter=hrdps.time_counter[0]).values.flatten()
#pull each field out of the dataset as a plain numpy array
p_temp = hrdps.atmpres.values
u_temp = hrdps.u_wind.values
#fix: this used to re-read u_wind, which made every V product downstream a copy of U
#NOTE(review): assumes the V component is stored as `v_wind` (mirroring `u_wind`) - confirm against the file
v_temp = hrdps.v_wind.values

In [35]:
#write the flattened hrdps latitude and longitude out as CSV files
for _path, _coords in (("hrdps_lat.csv", hrdps_lat), ("hrdps_lon.csv", hrdps_lon)):
    np.savetxt(_path, _coords, delimiter=",")

In [5]:
#get each field into a 2D (grid point, time step) shape we can work in
#every time step is flattened row-major into one column of the output

#pressure
_n_steps = np.shape(p_temp)[0]
_n_points = np.shape(p_temp)[1] * np.shape(p_temp)[2]
P_hrdps = np.empty((_n_points, _n_steps))
P_hrdps[...] = np.reshape(p_temp, (_n_steps, _n_points)).T
print("cummulative time -pressure done (s):" + str(time.time()))

#U-wind
_n_steps = np.shape(u_temp)[0]
_n_points = np.shape(u_temp)[1] * np.shape(u_temp)[2]
U_hrdps = np.empty((_n_points, _n_steps))
U_hrdps[...] = np.reshape(u_temp, (_n_steps, _n_points)).T
print("cummulative time -U wind done (s):" + str(time.time()))

#V-wind
_n_steps = np.shape(v_temp)[0]
_n_points = np.shape(v_temp)[1] * np.shape(v_temp)[2]
V_hrdps = np.empty((_n_points, _n_steps))
V_hrdps[...] = np.reshape(v_temp, (_n_steps, _n_points)).T
print("cummulative time -V wind done (s):" + str(time.time()))

cummulative time -pressure done (s):1610140592.8061147
cummulative time -U wind done (s):1610140596.1753159
cummulative time -V wind done (s):1610140600.163743


In [6]:
#fill the missing values: pressure gets 101325, both wind components get 0
P_hrdps = np.where(np.isnan(P_hrdps), 101325, P_hrdps)
U_hrdps = np.where(np.isnan(U_hrdps), 0, U_hrdps)
V_hrdps = np.where(np.isnan(V_hrdps), 0, V_hrdps)

In [8]:
#now de-mean the dataset (if the mean isn't removed it shows up in the mode 1 EOF
#and gives disproportionate weight to the mode 1 percent variance)
#the mean removed here is the seasonal signal of each grid point:
#the average over all years of each of the 365 days of the year

nyears = 6

_anomalies = []
for _field in (P_hrdps, U_hrdps, V_hrdps):
    #put the years on their own axis and average across them -> (grid point, 365)
    _by_year = np.reshape(_field, (np.shape(_field)[0], nyears, 365))
    seasonal = np.asarray(_by_year.mean(axis=1), dtype=np.float64)

    #repeat the seasonal cycle for all years
    seasonal_all = np.tile(seasonal, (1, nyears))

    #remove seasonal cycle from the original data (calculate anomalies)
    _anomalies.append(_field - seasonal_all)

P_hrdps_anom, U_hrdps_anom, V_hrdps_anom = _anomalies

In [9]:
#now for preprocessing of world data
#trim the three world datasets to the same time extent:
#January 1 2014 through December 31 2020
_world_window = slice('2014-01-01 12:00:00', '2020-12-31 12:00:00')
P_world, U_world, V_world = (
    ds.sel(time=_world_window) for ds in (P_world, U_world, V_world)
)

In [16]:
#work out which CanRCM4 grid points to keep, based on the lat and lon of the HRDPS data
#the kept region is the HRDPS bounding box grown on every side by 15% of its latitude span
maxlon, minlon = max(hrdps_lon), min(hrdps_lon)
maxlat, minlat = max(hrdps_lat), min(hrdps_lat)

buffer = (maxlat-minlat)*0.15

lon = P_world.lon.values.flatten()
lat = P_world.lat.values.flatten()

#flag the grid points that sit strictly inside the buffered lon and lat range
_inside = (
    (lon > (minlon-buffer)) & (lon < (maxlon+buffer))
    & (lat > (minlat-buffer)) & (lat < (maxlat+buffer))
)
index = np.flatnonzero(_inside).tolist()

#now make new lat, and lon holding only the selected points
lat_RCM = [lat[k] for k in index]
lon_RCM = [lon[k] for k in index]

In [17]:
#export the RCM lat and lon
# np.savetxt("RCM_lat.csv", lat_RCM, delimiter=",")
# np.savetxt("RCM_lon.csv", lon_RCM, delimiter=",")

In [18]:
#pull each world variable out as a numpy array before trying to process it
P = P_world["psl"].values
U = U_world["uas"].values
V = V_world["vas"].values

In [19]:
#now convert data to 2D: one row per (rlat, rlon) grid point, one column per time step
#every time step is flattened row-major into one column of the output

start = time.time()
_n_points, _n_steps = len(P_world.rlat)*len(P_world.rlon), len(P_world.time)
P2d = np.empty((_n_points, _n_steps))
P2d[...] = np.reshape(P[:_n_steps], (_n_steps, _n_points)).T
print("time -pressure done (s):" + str(time.time()-start))

start = time.time()
_n_points, _n_steps = len(U_world.rlat)*len(U_world.rlon), len(U_world.time)
U2d = np.empty((_n_points, _n_steps))
U2d[...] = np.reshape(U[:_n_steps], (_n_steps, _n_points)).T
print("time -U wind done (s):" + str(time.time()-start))

start = time.time()
_n_points, _n_steps = len(V_world.rlat)*len(V_world.rlon), len(V_world.time)
V2d = np.empty((_n_points, _n_steps))
V2d[...] = np.reshape(V[:_n_steps], (_n_steps, _n_points)).T
print("time -V wind done (s):" + str(time.time()-start))

time -pressure done (s):4.190555572509766
time -U wind done (s):4.60263729095459
time -V wind done (s):4.651688814163208


In [20]:
#use indices found in lat-lon step to trim CanRCM4 extent
#one indexing step per variable picks the selected grid-point rows for every time step
_rows = np.asarray(index, dtype=np.intp)

P_RCM = np.empty((len(index), len(P_world.time)))
P_RCM[...] = P2d[_rows, :len(P_world.time)]

U_RCM = np.empty((len(index), len(U_world.time)))
U_RCM[...] = U2d[_rows, :len(U_world.time)]

V_RCM = np.empty((len(index), len(V_world.time)))
V_RCM[...] = V2d[_rows, :len(V_world.time)]


In [21]:
#remove seasonal cycle from the trimmed world data
#the seasonal cycle of a grid point is the average over all years of each of the 365 days
nyears = 7

_anomalies = []
for _field in (P_RCM, U_RCM, V_RCM):
    #put the years on their own axis and average across them -> (grid point, 365)
    _by_year = np.reshape(_field, (np.shape(_field)[0], nyears, 365))
    seasonal = np.asarray(_by_year.mean(axis=1), dtype=np.float64)

    #repeat the seasonal cycle for all years
    seasonal_all = np.tile(seasonal, (1, nyears))

    #remove seasonal cycle from the original data (calculate anomalies)
    _anomalies.append(_field - seasonal_all)

P_RCM_anom, U_RCM_anom, V_RCM_anom = _anomalies

## PCA
NOTE: before export, find out how to limit the range of the PCs so that you don't need to normalize afterwards.

In [22]:
#PCA on the world data
#the eigenvectors are the dominant spatial patterns and the PCs describe how
#those patterns evolve in time
#PCA wants one row per time step, so each anomaly matrix is transposed first
#all available modes are kept (the smaller of the two matrix dimensions)

_rcm_pca = {}
for _name, _anom in (("P", P_RCM_anom), ("U", U_RCM_anom), ("V", V_RCM_anom)):
    data = _anom.T
    n_modes = np.min(np.shape(data))
    pca = PCA(n_components=n_modes)
    _scores = pca.fit_transform(data)
    _rcm_pca[_name] = (_scores, pca.components_, pca.explained_variance_ratio_)

P_RCM_PCs, P_RCM_eigvecs, P_RCM_fracVar = _rcm_pca["P"]
U_RCM_PCs, U_RCM_eigvecs, U_RCM_fracVar = _rcm_pca["U"]
V_RCM_PCs, V_RCM_eigvecs, V_RCM_fracVar = _rcm_pca["V"]

In [23]:
#write the world PCA results (PCs, eigenvectors, fraction of variance) to CSV
_rcm_exports = {
    "P_RCM_PCs.csv": P_RCM_PCs,
    "P_RCM_eigvecs.csv": P_RCM_eigvecs,
    "P_RCM_fracVar.csv": P_RCM_fracVar,
    "U_RCM_PCs.csv": U_RCM_PCs,
    "U_RCM_eigvecs.csv": U_RCM_eigvecs,
    "U_RCM_fracVar.csv": U_RCM_fracVar,
    "V_RCM_PCs.csv": V_RCM_PCs,
    "V_RCM_eigvecs.csv": V_RCM_eigvecs,
    "V_RCM_fracVar.csv": V_RCM_fracVar,
}
for _path, _array in _rcm_exports.items():
    np.savetxt(_path, _array, delimiter=",")

In [24]:
#PCA on the HRDPS data
#PCA wants one row per time step, so each anomaly matrix is transposed first
#all available modes are kept (the smaller of the two matrix dimensions)
#fix: the V-wind PCA used to be fitted on the pressure anomalies (P_hrdps_anom),
#so the exported V results were a duplicate of the pressure ones

_hrdps_pca = {}
for _name, _anom in (("P", P_hrdps_anom), ("U", U_hrdps_anom), ("V", V_hrdps_anom)):
    data = _anom.T
    n_modes = np.min(np.shape(data))
    pca = PCA(n_components=n_modes)
    _scores = pca.fit_transform(data)
    _hrdps_pca[_name] = (_scores, pca.components_, pca.explained_variance_ratio_)

P_hrdps_PCs, P_hrdps_eigvecs, P_hrdps_fracVar = _hrdps_pca["P"]
U_hrdps_PCs, U_hrdps_eigvecs, U_hrdps_fracVar = _hrdps_pca["U"]
V_hrdps_PCs, V_hrdps_eigvecs, V_hrdps_fracVar = _hrdps_pca["V"]

In [25]:
#write the HRDPS PCA results (PCs, eigenvectors, fraction of variance) to CSV
_hrdps_exports = {
    "P_hrdps_PCs.csv": P_hrdps_PCs,
    "P_hrdps_eigvecs.csv": P_hrdps_eigvecs,
    "P_hrdps_fracVar.csv": P_hrdps_fracVar,
    "U_hrdps_PCs.csv": U_hrdps_PCs,
    "U_hrdps_eigvecs.csv": U_hrdps_eigvecs,
    "U_hrdps_fracVar.csv": U_hrdps_fracVar,
    "V_hrdps_PCs.csv": V_hrdps_PCs,
    "V_hrdps_eigvecs.csv": V_hrdps_eigvecs,
    "V_hrdps_fracVar.csv": V_hrdps_fracVar,
}
for _path, _array in _hrdps_exports.items():
    np.savetxt(_path, _array, delimiter=",")

In [26]:
#save the time axes of the world and hrdps datasets, written as strings
for _path, _axis in (("RCM_dates.csv", P_world.time), ("hrdps_dates.csv", hrdps.time_counter)):
    dates = _axis.values
    np.savetxt(_path, dates, fmt="%s", delimiter=",")

In [None]:
#quick look at the HRDPS pressure PCs
#NOTE(review): `plt` is not imported in the import cell of this file - presumably matplotlib.pyplot; confirm and import it before running this cell
plt.plot(P_hrdps_PCs)