# PCA
PCA steps for the data downscaling project. The dominant PCs are exported to arrays, since those are all we really work with in the main downscaling file (in an effort to reduce memory load).<br>

World data preprocessing is done in this file because I didn't feel a new netCDF file was necessary for that data: it already loads rather quickly and doesn't take up much space.

In [2]:
#import required libraries
import pandas as pd
import numpy as np
import xarray as xr
import time
from sklearn.decomposition import PCA

In [3]:
#load in HRDPS and world data (pressure plus the two surface wind components)
# Each world variable is spread over two files that are opened together as one
# dataset; the file names are grouped here by variable prefix.
_world_files = {
    "psl": [
        "psl_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20110101-20151231.nc",
        "psl_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20160101-20201231.nc",
    ],
    "uas": [
        "uas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20110101-20151231.nc",
        "uas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20160101-20201231.nc",
    ],
    "vas": [
        "vas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20110101-20151231.nc",
        "vas_NAM-22_CCCma-CanESM2_rcp45_r1i1p1_CCCma-CanRCM4_r2_day_20160101-20201231.nc",
    ],
}

P_world = xr.open_mfdataset(_world_files["psl"])
U_world = xr.open_mfdataset(_world_files["uas"])
V_world = xr.open_mfdataset(_world_files["vas"])

# The HRDPS data lives in a single file.
hrdps = xr.open_dataset("hrdps_day_avgs_postSep2014.nc")

In [4]:
#first lets do some quick processing of the hrdps data
# Flatten the coordinate arrays to 1D; they are used further down to bound the
# world-data search box and are exported as CSV.
hrdps_lat = hrdps.nav_lat.values.flatten()
hrdps_lon = hrdps.nav_lon.values.flatten()

# Pull each field out of the dataset as a plain NumPy array.
p_temp = hrdps.atmpres.values
u_temp = hrdps.u_wind.values
# Fixed: this used to read u_wind a second time, which made the "V" field an
# exact copy of U.
# NOTE(review): assumes the second wind component is stored as `v_wind`, in
# parallel with `u_wind` -- confirm against hrdps.data_vars.
v_temp = hrdps.v_wind.values

In [63]:
#export the hrdps lat and lon
# One comma-delimited CSV per flattened coordinate array.
for _csv_name, _coords in (("hrdps_lat.csv", hrdps_lat), ("hrdps_lon.csv", hrdps_lon)):
    np.savetxt(_csv_name, _coords, delimiter=",")

In [6]:
#get each HRDPS field into a 2D shape we can work in
def _space_by_step(field):
    """Return *field* as a float64 array of shape (n_points, n_steps).

    *field* is indexed ``[step, y, x]``. Each ``field[i]`` slice is flattened
    in C order and becomes column ``i`` of the result.
    """
    n_steps, n_y, n_x = np.shape(field)[:3]
    # A single reshape + transpose replaces the Python loop over every step.
    # The explicit float64, C-contiguous copy matches the array the loop used
    # to fill (it was allocated with np.empty, i.e. float64).
    return np.ascontiguousarray(
        np.reshape(field, (n_steps, n_y * n_x)).T, dtype=np.float64
    )


# The progress lines print the current epoch time (time.time()), so the gap
# between two consecutive lines is the time taken by that step.
P_hrdps = _space_by_step(p_temp)
print("cummulative time -pressure done (s):"+str(time.time()))

U_hrdps = _space_by_step(u_temp)
print("cummulative time -U wind done (s):"+str(time.time()))

# Fixed: this step is labelled "V wind" but used to rebuild P_hrdps from p_temp
# a second time, so no V array was produced here. It now builds V_hrdps from
# v_temp.
V_hrdps = _space_by_step(v_temp)
print("cummulative time -V wind done (s):"+str(time.time()))

cummulative time -pressure done (s):1609971450.3316672
cummulative time -U wind done (s):1609971453.7356806
cummulative time -V wind done (s):1609971456.969175


In [7]:
#replace all nan with 101325 (pressure) or 0 (wind components)
P_hrdps = pd.DataFrame(P_hrdps).fillna(101325).to_numpy()

# Fixed: U and V used to be built from P_hrdps here, which overwrote both wind
# arrays with (already filled) pressure data.
U_hrdps = pd.DataFrame(U_hrdps).fillna(0).to_numpy()

# V is flattened straight from v_temp -- same (n_points, n_steps) float64 layout
# as the other two fields -- so this step does not rely on a V_hrdps array
# already existing.
_v_2d = np.reshape(v_temp, (np.shape(v_temp)[0], -1)).T.astype(np.float64)
V_hrdps = pd.DataFrame(_v_2d).fillna(0).to_numpy()

In [8]:
#now for preprocessing of world data
#trim the world datasets to the right time extent:
#days between September 12 2014 and December 31 2020
_time_window = slice('2014-09-12 12:00:00', '2020-12-31 12:00:00')

# The same label-based time selection is applied to all three datasets.
P_world, U_world, V_world = [
    _ds.sel(time=_time_window) for _ds in (P_world, U_world, V_world)
]

In [9]:
#now extract the data you want from the CanRCM4 xarray and trim them according to lat and lon of the HRDPS data
#first decide on the range to work within: the HRDPS bounding box is padded on
#every side by 15% of its latitude extent. The same padding (in degrees) is
#applied to both latitude and longitude.
# NOTE(review): this comment used to say 10% while the code uses 0.15 --
# confirm which value is intended.
buffer = (np.max(hrdps_lat) - np.min(hrdps_lat)) * 0.15

lon = P_world.lon.values.flatten()
lat = P_world.lat.values.flatten()

# Bounds are computed once up front. Previously min()/max() over the whole
# HRDPS coordinate arrays were re-evaluated for every world grid point.
_lon_lo = np.min(hrdps_lon) - buffer
_lon_hi = np.max(hrdps_lon) + buffer
_lat_lo = np.min(hrdps_lat) - buffer
_lat_hi = np.max(hrdps_lat) + buffer

#find the flat indices that fit into the lon range and lat range (strict
#inequalities: points exactly on a bound are excluded)
_inside = (lon > _lon_lo) & (lon < _lon_hi) & (lat > _lat_lo) & (lat < _lat_hi)

# Kept as plain Python lists so later code that uses len()/indexing on them, or
# writes them out with np.savetxt, sees the same types as before.
index = np.flatnonzero(_inside).tolist()

#now make new lat, and lon
lat_RCM = list(lat[_inside])
lon_RCM = list(lon[_inside])

In [64]:
#export the RCM lat and lon
# One comma-delimited CSV per list of selected coordinates.
for _csv_name, _coords in (("RCM_lat.csv", lat_RCM), ("RCM_lon.csv", lon_RCM)):
    np.savetxt(_csv_name, _coords, delimiter=",")

In [28]:
#load the data before trying to process it
# `.values` hands back each variable as a NumPy array, so the work below runs
# on in-memory arrays.
P, U, V = (
    P_world.psl.values,
    U_world.uas.values,
    V_world.vas.values,
)

In [29]:
#now convert data to 2D: one row per (rlat, rlon) grid cell, one column per time step
# Each field is reshaped in one vectorised call instead of a Python loop over
# every time step. Sizes still come from the dataset coordinates, so a field
# whose size does not match them raises instead of being silently reshaped.
# The result is an explicit float64, C-contiguous copy, matching the array the
# loop used to fill (it was allocated with np.empty, i.e. float64).
start = time.time()
_n_time = len(P_world.time)
_n_cells = len(P_world.rlat) * len(P_world.rlon)
P2d = np.ascontiguousarray(np.reshape(P, (_n_time, _n_cells)).T, dtype=np.float64)
print("time -pressure done (s):"+str(time.time()-start))


start = time.time()
_n_time = len(U_world.time)
_n_cells = len(U_world.rlat) * len(U_world.rlon)
U2d = np.ascontiguousarray(np.reshape(U, (_n_time, _n_cells)).T, dtype=np.float64)
print("time -U wind done (s):"+str(time.time()-start))


start = time.time()
_n_time = len(V_world.time)
_n_cells = len(V_world.rlat) * len(V_world.rlon)
V2d = np.ascontiguousarray(np.reshape(V, (_n_time, _n_cells)).T, dtype=np.float64)
print("time -V wind done (s):"+str(time.time()-start))

time -pressure done (s):3.9428982734680176
time -U wind done (s):3.5779924392700195
time -V wind done (s):3.6302521228790283


In [35]:
#use indices found in lat-lon step to trim CanRCM4 extent
# `index` holds the flat grid-cell positions to keep. Selecting those rows with
# an integer index array copies them in one step, replacing the previous
# element-by-element double loop over (time step, kept cell).
# The explicit intp dtype keeps an empty `index` valid as an indexer, giving an
# array with zero rows.
_keep_rows = np.asarray(index, dtype=np.intp)

P_RCM = P2d[_keep_rows, :]
U_RCM = U2d[_keep_rows, :]
V_RCM = V2d[_keep_rows, :]


## PCA
NOTE: before exporting, find out how to limit the range of the PCs so that you don't need to normalize afterwards.

In [44]:
#PCA on the world data
# Each field is transposed before fitting, so rows are time steps and columns
# are the selected grid cells. The components (eigenvectors) are then the
# dominant spatial patterns, and the transformed scores (PCs) show how those
# patterns evolve through time.
_world_pca = {}

for _label, _field in (("P", P_RCM), ("U", U_RCM), ("V", V_RCM)):
    data = _field.T

    # keep every mode: the smaller of the two matrix dimensions
    n_modes = np.min(np.shape(data))
    pca = PCA(n_components = n_modes)
    _scores = pca.fit_transform(data)
    _world_pca[_label] = (_scores, pca.components_, pca.explained_variance_ratio_)

P_RCM_PCs, P_RCM_eigvecs, P_RCM_fracVar = _world_pca["P"]
U_RCM_PCs, U_RCM_eigvecs, U_RCM_fracVar = _world_pca["U"]
V_RCM_PCs, V_RCM_eigvecs, V_RCM_fracVar = _world_pca["V"]

In [67]:
whos

Variable            Type       Data/Info
----------------------------------------
P                   ndarray    2301x260x310: 185460600 elems, type `float32`, 741842400 bytes (707.4760437011719 Mb)
P2d                 ndarray    80600x2301: 185460600 elems, type `float64`, 1483684800 bytes (1414.9520874023438 Mb)
PCA                 ABCMeta    <class 'sklearn.decomposition._pca.PCA'>
P_RCM               ndarray    1085x2301: 2496585 elems, type `float64`, 19972680 bytes (19.04743194580078 Mb)
P_RCM_PCs           ndarray    2301x1085: 2496585 elems, type `float64`, 19972680 bytes (19.04743194580078 Mb)
P_RCM_eigvecs       ndarray    1085x1085: 1177225 elems, type `float64`, 9417800 bytes (8.981513977050781 Mb)
P_RCM_fracVar       ndarray    1085: 1085 elems, type `float64`, 8680 bytes
P_hrdps             ndarray    51338x2303: 118231414 elems, type `float64`, 945851312 bytes (902.0341033935547 Mb)
P_hrdps_PCs         ndarray    2303x2303: 5303809 elems, type `float64`, 42430472 bytes (

In [None]:
#export the relevant arrays to CSV
# For each field: the PCs, the eigenvectors, and the fraction of variance
# explained, each written to its own comma-delimited file.
_rcm_exports = (
    ("P_RCM_PCs.csv", P_RCM_PCs),
    ("P_RCM_eigvecs.csv", P_RCM_eigvecs),
    ("P_RCM_fracVar.csv", P_RCM_fracVar),
    ("U_RCM_PCs.csv", U_RCM_PCs),
    ("U_RCM_eigvecs.csv", U_RCM_eigvecs),
    ("U_RCM_fracVar.csv", U_RCM_fracVar),
    ("V_RCM_PCs.csv", V_RCM_PCs),
    ("V_RCM_eigvecs.csv", V_RCM_eigvecs),
    ("V_RCM_fracVar.csv", V_RCM_fracVar),
)

for _csv_name, _array in _rcm_exports:
    np.savetxt(_csv_name, _array, delimiter=",")

In [48]:
#PCA on the HRDPS data
# Each field is transposed before fitting, so rows are time steps and columns
# are grid points. Every mode is kept (n_components is the smaller of the two
# matrix dimensions).
# Fixed: the V results used to be fitted on P_hrdps.T, so the V_hrdps_* outputs
# were a second copy of the pressure PCA. They are now fitted on V_hrdps.T.
_hrdps_pca = {}

for _label, _field in (("P", P_hrdps), ("U", U_hrdps), ("V", V_hrdps)):
    data = _field.T

    n_modes = np.min(np.shape(data))
    pca = PCA(n_components = n_modes)
    _scores = pca.fit_transform(data)
    _hrdps_pca[_label] = (_scores, pca.components_, pca.explained_variance_ratio_)

P_hrdps_PCs, P_hrdps_eigvecs, P_hrdps_fracVar = _hrdps_pca["P"]
U_hrdps_PCs, U_hrdps_eigvecs, U_hrdps_fracVar = _hrdps_pca["U"]
V_hrdps_PCs, V_hrdps_eigvecs, V_hrdps_fracVar = _hrdps_pca["V"]

In [49]:
#export the relevant arrays to CSV
# For each field: the PCs, the eigenvectors, and the fraction of variance
# explained, each written to its own comma-delimited file.
_hrdps_exports = (
    ("P_hrdps_PCs.csv", P_hrdps_PCs),
    ("P_hrdps_eigvecs.csv", P_hrdps_eigvecs),
    ("P_hrdps_fracVar.csv", P_hrdps_fracVar),
    ("U_hrdps_PCs.csv", U_hrdps_PCs),
    ("U_hrdps_eigvecs.csv", U_hrdps_eigvecs),
    ("U_hrdps_fracVar.csv", U_hrdps_fracVar),
    ("V_hrdps_PCs.csv", V_hrdps_PCs),
    ("V_hrdps_eigvecs.csv", V_hrdps_eigvecs),
    ("V_hrdps_fracVar.csv", V_hrdps_fracVar),
)

for _csv_name, _array in _hrdps_exports:
    np.savetxt(_csv_name, _array, delimiter=",")

In [65]:
#save date arrays that you want to work with
# Each time axis is written as text (fmt="%s"), one value per line. `dates` is
# left holding the HRDPS time axis afterwards.
_date_exports = (
    ("RCM_dates.csv", P_world.time.values),
    ("hrdps_dates.csv", hrdps.time_counter.values),
)

for _csv_name, dates in _date_exports:
    np.savetxt(_csv_name, dates, delimiter=",", fmt="%s")