In [None]:
def fn_make_matrix_pr(stid,ipd,nhours,ndays,nyears_row,nyears_col):
    """Build the matrix of daily metric blocks that feeds the S4 model.

    Three CSV files are read for the station; their paths are built by plain
    string concatenation ``ipd + stid + suffix``, so ``ipd`` must already end
    with a path separator:

    * ``_model_data_airtemp.csv`` - value in column 1, date in column 2
    * ``_model_data_relhum.csv``  - values in columns 1-3, date in column 4
    * ``_model_data_precip.csv``  - value in column 1, date in column 2

    The date of the first row of each file (``YYYY-MM-DD``) is used to align
    the three series on a common starting date. Every day then contributes one
    block of ``2*nhours + 3`` values, in this order::

        precipitation (nhours values), rh1, rh2, rh3, air temperature (nhours values)

    Blocks of consecutive days are laid end to end in file order, and row
    ``i`` of the result is the window of that long vector starting at day
    ``i`` and spanning ``nyears_col*ndays`` days.

    Parameters
    ----------
    stid : str
        Station identifier used as the file-name prefix.
    ipd : str
        Directory prefix of the data files (including trailing separator).
    nhours : int
        Number of rows per day in the hourly files (air temperature, precipitation).
    ndays : int
        Number of days treated as one year.
    nyears_row : int
        Number of years of starting days, i.e. ``nyears_row*ndays`` matrix rows.
    nyears_col : int
        Number of years covered by each row.

    Returns
    -------
    numpy.ndarray
        ``float16`` array of shape
        ``(nyears_row*ndays, nyears_col*ndays*(2*nhours + 3))``. Rows for which
        the files do not hold enough complete days are left as zeros.

    Raises
    ------
    ValueError
        If ``nhours`` is smaller than 1.

    Notes
    -----
    * Assumes the hourly files hold exactly ``nhours`` consecutive rows per
      calendar day and the relative-humidity file one row per day, with no
      gaps, starting at the first hour of the first date - TODO confirm
      against the files.
    * NOTE(review): an earlier comment described the values as ordered from
      most recent to most distant; the code has always emitted them in file
      order, and that order is kept here.
    """
    if nhours < 1:
        raise ValueError("nhours must be a positive integer")

    # 1. Read in the data files and the date of the first record of each one.
    # Air temperature (one variable, hourly rows)
    df_at = pd.read_csv(ipd + stid + "_model_data_airtemp.csv")
    v_date_at = datetime.strptime(df_at.iloc[0,2], "%Y-%m-%d").date()
    # Relative humidity (three variables, one row per day)
    df_rh = pd.read_csv(ipd + stid + "_model_data_relhum.csv")
    v_date_rh = datetime.strptime(df_rh.iloc[0,4], "%Y-%m-%d").date()
    # Precipitation (one variable, hourly rows)
    df_pr = pd.read_csv(ipd + stid + "_model_data_precip.csv")
    v_date_pr = datetime.strptime(df_pr.iloc[0,2], "%Y-%m-%d").date()

    # 2. The common starting date is the latest of the three first dates: it is
    #    the earliest day for which every file has a record. Files that start
    #    earlier skip their leading rows (whole days, so hourly files skip
    #    nhours rows per day).
    v_date_common = max(v_date_at, v_date_rh, v_date_pr)
    skip_at = (v_date_common - v_date_at).days * nhours
    skip_rh = (v_date_common - v_date_rh).days
    skip_pr = (v_date_common - v_date_pr).days * nhours

    # 3. Value vectors that all start on the common date.
    vc_at1 = df_at.iloc[skip_at:, 1].to_numpy(dtype=np.float64)
    rh_daily = df_rh.iloc[skip_rh:, 1:4].to_numpy(dtype=np.float64)   # columns rh1, rh2, rh3
    vc_pr1 = df_pr.iloc[skip_pr:, 1].to_numpy(dtype=np.float64)

    # 4. Interleave the vectors into one block of metrics per day.
    ndaily_metrics = 2*nhours + 3*1
    # One extra year beyond nyears_row + nyears_col, so that the last row
    # still has a full window of columns.
    num_days = (nyears_row + nyears_col + 1)*ndays
    # Only complete days are used; otherwise the fixed per-day stride used
    # below would no longer line up with day boundaries.
    n_days = max(0, min(num_days,
                        len(vc_pr1)//nhours,
                        len(rh_daily),
                        len(vc_at1)//nhours))

    daily_blocks = np.hstack((
        vc_pr1[:n_days*nhours].reshape(n_days, nhours),
        rh_daily[:n_days],
        vc_at1[:n_days*nhours].reshape(n_days, nhours),
    ))
    # Row-major flattening lays the day blocks end to end (missing values stay NaN).
    vmt_full_set = daily_blocks.ravel()
    data_limit = len(vmt_full_set)

    # Organize the data vector into the matrix returned to the caller.
    nrows = nyears_row*ndays
    ncols = nyears_col*ndays*ndaily_metrics

    # NOTE(review): float16 keeps this large matrix compact but stores only
    # about three significant decimal digits - confirm that is acceptable.
    template_matrix = np.zeros((nrows, ncols), dtype=np.float16)

    # Fill the matrix: each row starts one day (one block) after the previous one.
    for i in range(nrows):
        vc_start = i*ndaily_metrics
        vc_end = vc_start + ncols
        if vc_end > data_limit:
            # Not enough data for this row, nor for any later one.
            break
        template_matrix[i] = vmt_full_set[vc_start:vc_end]

    return template_matrix



In [None]:
#Load the data for a station and organize
import math
import os
from datetime import datetime

import numpy as np
import pandas as pd

# Path to data files
# NOTE(review): machine-specific absolute path. It must end with a path separator
# because fn_make_matrix_pr builds file names by string concatenation.
ipd = "C:\\Users\\jhugh\\Documents\\Py_S4\\Py_S4_v02_JHH\\NCEI_data\\"

#Organize the data based on the nature of the data so they all conform to the same size matrices
#The metrics included are
#  AT - air temperature in Celsius
#  RH - relative humidity in percent
#  PR - precipitation in mm

#Identify which stations are in the folder and have all three metrics
#List only files (sub-directories are ignored)
files = [entry.name for entry in os.scandir(ipd) if entry.is_file()]

#One row per file: the station id is the first 11 characters of the file name and
#characters 23-28 hold the metric tag ('airtem', 'precip' or 'relhum')
df_files = pd.DataFrame(files,columns=['File_name'])
df_files['stid'] = df_files['File_name'].str[:11]
df_files['mttype'] = df_files['File_name'].str[23:29]
df_files['has_airtem'] = np.where(df_files['mttype'] == 'airtem',1,0)
df_files['has_precip'] = np.where(df_files['mttype'] == 'precip',1,0)
df_files['has_relhum'] = np.where(df_files['mttype'] == 'relhum',1,0)

#A station has all three metrics only when, across ALL of its files, each of the
#three indicators is set at least once. The indicators are therefore combined per
#station (not per file row, where at most one of them can be 1).
metric_flags = ['has_airtem','has_precip','has_relhum']
station_flags = df_files.groupby('stid')[metric_flags].transform('max')
df_files['has_all3'] = (station_flags.sum(axis=1) == len(metric_flags)).astype(int)

#One row per station; has_all3 is 1 when the station has all three metrics needed for the model
vc_stid = df_files[['stid','has_all3']].drop_duplicates()

print(vc_stid)

#Put them all in appropriate matrices and create list of metrics
#Data files are of two types
#    -  24 hours * 365 days * n years  (AT, PR)
#    -  365 days * n years (RH)

#Parameters governing data matrix
nhours = 24
ndays = 365
nyears_row = 6
nyears_col = 3

def get_matrix(in_stid):
    """Build the model matrix for station ``in_stid`` with the notebook-level settings.

    Thin convenience wrapper around ``fn_make_matrix_pr`` that supplies the
    notebook globals ``ipd``, ``nhours``, ndays``, ``nyears_row`` and
    ``nyears_col`` (looked up when the function is called).
    """
    matrix_settings = dict(
        ipd=ipd,
        nhours=nhours,
        ndays=ndays,
        nyears_row=nyears_row,
        nyears_col=nyears_col,
    )
    return fn_make_matrix_pr(in_stid, **matrix_settings)

#lst_matrix = list(map(get_matrix,vc_stid.iloc[1,0]))

#print(lst_matrix)
#print(jj.shape)

# Build the matrix for one station. ('<-' is R assignment; in Python it would be
# parsed as the comparison `jj < -(...)` and fail because jj is undefined.)
jj = fn_make_matrix_pr("72381523161",ipd,nhours,ndays,nyears_row,nyears_col)


In [146]:
def fn_make_matrix(vc_measure,offset_measure,nhours,ndays,nyears_col,nyears_row):
    """Arrange a 1-D series into a matrix of overlapping, one-step-shifted windows.

    Row ``i`` of the result holds ``ncols`` consecutive values of
    ``vc_measure`` starting at position ``offset_measure + i``.

    Parameters
    ----------
    vc_measure : sequence of numbers (list, numpy array, pandas Series, ...)
        The data values; elements are addressed by position.
    offset_measure : int
        Position of the first value to use (must not be negative).
    nhours, ndays : int
        Values per day and days per year.
    nyears_col : int
        Years per row: ``ncols = nhours*ndays*nyears_col``.
        Note that this parameter comes BEFORE ``nyears_row``.
    nyears_row : int
        Years of starting positions: ``nrows = nhours*ndays*nyears_row``.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(nrows, ncols)``.

    Raises
    ------
    ValueError
        If ``offset_measure`` is negative or ``vc_measure`` is too short to
        fill every row. Without this check a negative offset would wrap
        around to the end of the data, and a final window holding a single
        value would be silently repeated across its whole row.
    """
    # Convert once so every row is a cheap positional ndarray slice.
    values = np.asarray(vc_measure, dtype=float)

    nrows = nhours*ndays*nyears_row   # number of windows (matrix rows)
    ncols = nhours*ndays*nyears_col   # number of values in each window (matrix columns)

    if offset_measure < 0:
        raise ValueError("offset_measure must not be negative")
    if nrows > 0 and ncols > 0:
        # The last row ends at position offset + (nrows - 1) + ncols.
        required = offset_measure + nrows + ncols - 1
        if len(values) < required:
            raise ValueError(
                f"vc_measure has {len(values)} values but {required} are needed "
                f"for a {nrows} x {ncols} matrix at offset {offset_measure}"
            )

    # Create an empty matrix to store the results
    matrix = np.zeros((nrows, ncols))

    # Fill the matrix, one window per row
    for i in range(nrows):
        start = offset_measure + i
        matrix[i] = values[start:start + ncols]

    return matrix
    

In [None]:
def find_first_non_nan(vector):
    """Return the position of the first element of ``vector`` that is not NaN.

    Elements are tested with ``math.isnan`` in iteration order. ``None`` is
    returned when every element is NaN or ``vector`` is empty.
    """
    for position, value in enumerate(vector):
        if math.isnan(value):
            continue
        return position
    return None

# NOTE(review): vc_at1, vc_rh1, vc_rh2, vc_rh3 and vc_pr1 are only created as local
# variables inside fn_make_matrix_pr in the visible notebook; they must exist at
# notebook level before this cell can run - confirm where they are meant to come from.

# Position of the first non-missing value in each vector (None if a vector is all NaN)
first_number_at1 = find_first_non_nan(vc_at1)
first_number_rh1 = find_first_non_nan(vc_rh1)
first_number_rh2 = find_first_non_nan(vc_rh2)
first_number_rh3 = find_first_non_nan(vc_rh3)
first_number_pr1 = find_first_non_nan(vc_pr1)

#Number of missing data points
print(f"AT1 Number of missing cells: {np.count_nonzero(np.isnan(vc_at1))}")
print(f"AT1 Vector Length: {len(vc_at1)}")

print(f"RH1 Number of missing cells: {np.count_nonzero(np.isnan(vc_rh1))}")
print(f"RH1 Vector Length: {len(vc_rh1)}")

print(f"PR1 Number of missing cells: {np.count_nonzero(np.isnan(vc_pr1))}")
print(f"PR1 Vector Length: {len(vc_pr1)}")

#Once a day metrics: first position at which every once-a-day vector has had a value.
#If any vector has no valid value at all there is no such position, so the result is
#None (max() would otherwise fail comparing None with an int).
once_a_day_firsts = (first_number_rh1, first_number_rh2, first_number_rh3)
v_data_first_once_day = None if None in once_a_day_firsts else max(once_a_day_firsts)
print(v_data_first_once_day)

#24 times a day metrics: same rule for the hourly vectors
hourly_firsts = (first_number_at1, first_number_pr1)
v_data_first_24h_day = None if None in hourly_firsts else max(hourly_firsts)
print(v_data_first_24h_day)


#These vectors all start with a numeric value so missing values can be filled


#print(first_number_at1)
#print(first_number_rh1)
#print(first_number_rh2)
#print(first_number_rh3)
#print(first_number_pr1)

