# PCA
PCA steps for hourly HRDPS data, dominant PCs to be exported in an array (in an effort to reduce memory load)

In [1]:
#import required libraries
import pandas as pd
import numpy as np
import xarray as xr
import time
from sklearn.decomposition import PCA

In [2]:
#load in HRDPS
hrdps = xr.open_dataset("hrdps_hourly_postSep2014.nc")

#lets go ahead and trim the hrdps dataset so it fits more snuggly in a year
#we're also going to remove the leap days here so that seasonal cycle calcs will work
a = hrdps.sel(time_counter=slice('2015-01-01T00:00:00.000000000', '2016-02-28T23:00:00.000000000'))
b = hrdps.sel(time_counter=slice('2016-03-01T00:00:00.000000000', '2020-02-28T23:00:00.000000000'))
c = hrdps.sel(time_counter=slice('2020-03-01T00:00:00.000000000', '2020-12-31T23:00:00.000000000'))
hrdps= xr.concat([a,b,c],dim='time_counter')

In [3]:
#first lets do some quick processing of the hrdps data
#hrdps_lat = hrdps.nav_lat.sel(time_counter=hrdps.time_counter[0]).values.flatten()
#hrdps_lon = hrdps.nav_lon.sel(time_counter=hrdps.time_counter[0]).values.flatten()
#p_temp = hrdps.atmpres.values
u_temp = hrdps.u_wind.values
v_temp = hrdps.v_wind.values

In [4]:
#get pressure into a shape we can work in
# P = np.empty((np.shape(p_temp)[1]*np.shape(p_temp)[2],np.shape(p_temp)[0]))
# for i in range(np.shape(p_temp)[0]):
#     P[:,i] = np.reshape(p_temp[i],(np.shape(p_temp)[1]*np.shape(p_temp)[2],))
# print(f"cummulative time -pressure done (s):"+str(time.time()))
    
U = np.empty((np.shape(u_temp)[1]*np.shape(u_temp)[2],np.shape(u_temp)[0]))
for i in range(np.shape(u_temp)[0]):
    U[:,i] = np.reshape(u_temp[i],(np.shape(u_temp)[1]*np.shape(u_temp)[2],))
print(f"cummulative time -U wind done (s):"+str(time.time()))
    
V = np.empty((np.shape(v_temp)[1]*np.shape(v_temp)[2],np.shape(v_temp)[0]))
for i in range(np.shape(v_temp)[0]):
    V[:,i] = np.reshape(v_temp[i],(np.shape(v_temp)[1]*np.shape(v_temp)[2],))
print(f"cummulative time -V wind done (s):"+str(time.time()))

cummulative time -U wind done (s):1611685311.9968083
cummulative time -V wind done (s):1611685371.8305454


In [5]:
#replace all nan with 101 kPa or 0 m/s
# P = (pd.DataFrame(P).fillna(101000)).to_numpy()
U = (pd.DataFrame(U).fillna(0)).to_numpy()
V = (pd.DataFrame(V).fillna(0)).to_numpy()

In [6]:
#convert U and V wind into wind speed and direction
speed = np.sqrt(np.add(np.square(U),np.square(V)))
direc = np.arctan2(U,V)

In [7]:
#now de-mean the dataset (If the mean isn't removed then it will show up in the mode 1 EOF and give disproportionate weight to the mode 1 percent variance)
#calculate seasonal signal for each grid point

nyears = 6
hours = 365*24

# #pressure
# seasonal = np.empty((np.shape(P)[0],hours))
# for ii in range(len(P)):
#     seasonal[ii,:] = np.mean(np.reshape(P[ii,:],(nyears,hours)),axis=0)

# #repeat the seasonal cycle for all years
# seasonal_all = np.tile(seasonal,(1,nyears))

# #remove seasonal cycle from the original data (calculate anomalies)
# P_anom = P - seasonal_all

# #save seasonal cycle so that you can add mean back to data in reconstruction
# np.savetxt("P_seasonal.csv", seasonal, delimiter=",")

In [None]:
#speed
seasonal = np.empty((np.shape(speed)[0],hours))
for ii in range(len(speed)):
    seasonal[ii,:] = np.mean(np.reshape(speed[ii,:],(nyears,hours)),axis=0)

#repeat the seasonal cycle for all years
seasonal_all = np.tile(seasonal,(1,nyears))

#remove seasonal cycle from the original data (calculate anomalies)
speed_anom = speed - seasonal_all

#save seasonal cycle so that you can add mean back to data in reconstruction
np.savetxt("speed_seasonal.csv", seasonal, delimiter=",")

In [None]:
#direction
seasonal = np.empty((np.shape(direc)[0],hours))
for ii in range(len(direc)):
    seasonal[ii,:] = np.mean(np.reshape(direc[ii,:],(nyears,hours)),axis=0)

#repeat the seasonal cycle for all years
seasonal_all = np.tile(seasonal,(1,nyears))

#remove seasonal cycle from the original data (calculate anomalies)
direc_anom = direc - seasonal_all

#save seasonal cycle so that you can add mean back to data in reconstruction
np.savetxt("direc_seasonal.csv", seasonal, delimiter=",")

## PCA


In [None]:
# #PCA on the HRDPS data 
# data = P_anom.T

# n_modes = np.min(np.shape(data))
# pca = PCA(n_components = n_modes)
# PCs = pca.fit_transform(data)
# eigvecs = pca.components_
# fracVar = pca.explained_variance_ratio_

# #export the relevant arrays to CSV
# np.savetxt("P_hrdps_hourly_PCs.csv", PCs, delimiter=",")
# np.savetxt("P_hrdps_hourly_eigvecs.csv",eigvecs, delimiter=",")
# np.savetxt("P_hrdps_hourly_fracVar.csv",fracVar, delimiter=",")

In [None]:
data = speed_anom.T

n_modes = np.min(np.shape(data))
pca = PCA(n_components = n_modes)
PCs = pca.fit_transform(data)
eigvecs = pca.components_
fracVar = pca.explained_variance_ratio_

np.savetxt("speed_hrdps_hourly_PCs.csv", PCs, delimiter=",")
np.savetxt("speed_hrdps_hourly_eigvecs.csv", eigvecs, delimiter=",")
np.savetxt("speed_hrdps_hourly_fracVar.csv", fracVar, delimiter=",")

In [None]:
data = direc_anom.T

n_modes = np.min(np.shape(data))
pca = PCA(n_components = n_modes)
PCs = pca.fit_transform(data)
eigvecs = pca.components_
fracVar = pca.explained_variance_ratio_

np.savetxt("direc_hrdps_hourly_PCs.csv", PCs, delimiter=",")
np.savetxt("direc_hrdps_hourly_eigvecs.csv", eigvecs, delimiter=",")
np.savetxt("direc_hrdps_hourly_fracVar.csv", fracVar, delimiter=",")

In [None]:
#save date arrays that you want to work with
dates = hrdps.time_counter.values
np.savetxt("hrdps_times.csv",dates,delimiter=",",fmt="%s")