In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import interp1d
import os

# Directory the cells below list and read the training CSV files from; the processed
# .npy arrays are later saved into a "Processed_TF_LSTM" folder under the same path.
# The value here is a placeholder -- point it at the real data before running.
TRAIN_DATA_DIRECTORY = 'path/to/your/train/data'

In [None]:
def create_sequences(X, y, time_steps=24):
    """Cut ``X`` into overlapping windows and pair each one with a target from ``y``.

    Window ``k`` is ``X[k:k + time_steps]``; the matching target is
    ``y[k + time_steps - 1]``, i.e. the entry of ``y`` at the window's last position.

    Parameters
    ----------
    X : object exposing ``.shape[0]`` and supporting slice indexing
    y : sliceable sequence of targets
    time_steps : int, length of every window (default 24)

    Returns
    -------
    (list, sliced y)
        The list of windows (empty when ``X`` has fewer than ``time_steps`` rows)
        and ``y[time_steps - 1:]``.
    """
    n_windows = X.shape[0] - time_steps + 1
    windows = []
    for start in np.arange(0, n_windows):
        windows.append(X[start:start + time_steps])
    targets = y[time_steps - 1:]
    return windows, targets

# Keeps a sequence based on its longest run of consecutive NAs in 'observed'
# (run-length based), not on the overall number of NAs.
def filter_out_nas(list_of_dfs, max_consecutive_nas):
    """Return the positions of the sequences whose gaps in 'observed' are acceptable.

    A sequence is kept when both conditions hold:
      * neither its first nor its last 'observed' value is NA, and
      * its longest run of back-to-back NA values is at most ``max_consecutive_nas``.

    Parameters
    ----------
    list_of_dfs : iterable
        Items indexable with ``['observed']`` that yield a pandas Series.
    max_consecutive_nas : int
        Largest tolerated number of consecutive NA values.

    Returns
    -------
    list of int
        Positions (within ``list_of_dfs``) of the sequences that pass the filter.
        Empty sequences are skipped rather than raising.
    """
    index_fil = []
    for i, df in enumerate(list_of_dfs):
        is_na = df['observed'].isna().to_numpy()
        # Nothing to inspect in an empty sequence.
        if is_na.size == 0:
            continue
        # Reject sequences that start or end on a missing observation.
        if is_na[0] or is_na[-1]:
            continue
        # Run-length encode the NA mask: pad with False on both sides so every NA
        # run has a start edge (False->True) and an end edge (True->False).
        padded = np.concatenate(([False], is_na, [False]))
        run_edges = np.flatnonzero(padded[1:] != padded[:-1])
        run_lengths = run_edges[1::2] - run_edges[::2]
        longest_run = int(run_lengths.max()) if run_lengths.size else 0
        if longest_run <= max_consecutive_nas:
            index_fil.append(i)
    return index_fil



In [None]:
# Load every training file into one dataframe for a quick look at the raw data.
train_data_fp = TRAIN_DATA_DIRECTORY
# os.listdir returns entries in arbitrary order and also lists sub-directories
# (the save cell writes a Processed_TF_LSTM folder under this same directory),
# so keep regular files only and sort them for a reproducible order.
td_files = sorted(
    entry for entry in os.listdir(train_data_fp)
    if os.path.isfile(os.path.join(train_data_fp, entry))
)

x_list = []
y_list = []

# Read each file and tag its rows with the name of the file they came from.
for file in td_files:
    i_df = pd.read_csv(os.path.join(train_data_fp, file))
    i_df['file_id'] = os.path.basename(file)
    x_list.append(i_df)

# Concatenate all dataframes into one and show only the first rows.
full_df = pd.concat(x_list, ignore_index=True)
full_df.head()

In [None]:
# Prepare data: fit a scaler on all files, then turn file data into fixed-length
# input sequences (x_list) with matching targets (y_list).
train_data_fp = TRAIN_DATA_DIRECTORY
# os.listdir returns entries in arbitrary order and also lists sub-directories
# (the save cell writes a Processed_TF_LSTM folder under this same directory),
# so keep regular files only and sort them for a reproducible order.
td_files = sorted(
    entry for entry in os.listdir(train_data_fp)
    if os.path.isfile(os.path.join(train_data_fp, entry))
)

x_list = []
y_list = []
i_list = []
# Read in the full data so the scaler sees every file.
for file in td_files:
    i_df = pd.read_csv(os.path.join(train_data_fp, file))
    i_df['file_id'] = os.path.basename(file)
    i_list.append(i_df)  # collect each dataframe

# Concatenate all dataframes into one
full_df = pd.concat(i_list, ignore_index=True)
# Create the scaler.
# NOTE(review): the scaler is fitted here but never applied in this cell, so the
# sequences below hold unscaled values -- confirm whether scaler.transform should be used.
scaler = StandardScaler()
scaler.fit(full_df[['pm25_cf_1', 'temperature', 'humidity']])

# Loop-invariant settings: window length and NA tolerance passed to filter_out_nas.
time_steps = 24
max_consec_nas = 3
# Columns whose gaps are filled by linear interpolation.
interpolated_cols = ['epa_pm25', 'temperature', 'humidity', 'pm25_cf_1']

# NOTE(review): only the first file (after sorting) is processed; this looks like a
# debugging limit -- switch to `td_files` to process every file.
for file in td_files[0:1]:
    df_org = pd.read_csv(os.path.join(train_data_fp, file))
    df_org['file_id'] = os.path.basename(file)
    # Drop extraneous columns
    df = df_org.drop(
        columns=['pm25_cf_1_b', 'date', 'pm25_cf_1_a', 'year', 'month', 'R2', 'calibrated', 'PearsonR', 'calv2'])
    # Convert the 'datetime' column to datetime format
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')

    # Sort by date time
    df = df.sort_values(by='datetime').reset_index(drop=True)
    # Rows read from the file are flagged 1; rows added by the reindex below get NaN here.
    df['observed'] = 1
    # Add rows for hours not observed: reindex once onto an hourly range spanning
    # the first to the last timestamp.
    earliest_date = df['datetime'].min()
    latest_date = df['datetime'].max()
    df = df.set_index('datetime').reindex(pd.date_range(earliest_date, latest_date, freq='h'))
    # Linearly interpolate across gaps in observations
    df[interpolated_cols] = df[interpolated_cols].apply(lambda col: col.interpolate(method='linear'))
    # Reset the file_id column (the inserted rows have no value for it)
    df['file_id'] = file

    # Separate inputs and output
    x = df[['pm25_cf_1', 'temperature', 'humidity', 'observed']]
    y = df[['epa_pm25']]
    x_seq, y_seq = create_sequences(x, np.array(y), time_steps)
    # Filter out sequences with too many NA values in 'observed'
    filtered_index = filter_out_nas(x_seq, max_consec_nas)
    x_fil = [x_seq[i] for i in filtered_index]
    y_fil = y_seq[filtered_index]
    print(file)

    # NOTE(review): the 'observed' column (NaN on inserted rows) is still part of each
    # sequence appended below -- confirm whether it should be dropped first.
    # Append to the overall lists
    x_list.extend(x_fil)
    y_list.extend(y_fil)


In [None]:
# Randomly assign 70% of the sequences to the training set; the rest form the test set.
np.random.seed(42)
samp = 0.7
index = np.random.choice(len(y_list), int(samp * len(y_list)), replace=False)

# Boolean membership mask instead of `i not in index`: the latter scans the whole
# numpy array for every candidate, which makes the split quadratic.
in_train = np.zeros(len(y_list), dtype=bool)
in_train[index] = True
test_index = np.flatnonzero(~in_train)  # ascending positions not drawn for training

x_train = [x_list[i] for i in index]
y_train = [y_list[i][0] for i in index]
x_test = [x_list[i] for i in test_index]
y_test = [y_list[i][0] for i in test_index]

In [None]:
# Convert the train/test lists into numpy arrays.
x_train_array, x_test_array = np.array(x_train), np.array(x_test)
y_train_array, y_test_array = np.array(y_train), np.array(y_test)

In [None]:
# Save to numpy arrays
np.save(f"{TRAIN_DATA_DIRECTORY}/Processed_TF_LSTM/x_train.npy", x_train_array)
np.save(f"{TRAIN_DATA_DIRECTORY}/Processed_TF_LSTM/y_train.npy", y_train_array)
np.save(f"{TRAIN_DATA_DIRECTORY}/Processed_TF_LSTM/x_test.npy", x_test_array)
np.save(f"{TRAIN_DATA_DIRECTORY}/Processed_TF_LSTM/y_test.npy", y_test_array)

In [None]:
# Display a single prepared input sequence for inspection. The position 224 is
# hard-coded; indexing raises IndexError when x_list holds fewer than 225 items.
x_list[224]

In [None]:
# Step-by-step walk-through on `df_org` (the frame left over from the preprocessing loop):
# drop extraneous columns, parse 'datetime', sort chronologically, and flag every
# row read from the file with observed = 1.
df = (
    df_org
    .drop(columns=['pm25_cf_1_b', 'date', 'pm25_cf_1_a', 'year', 'month', 'R2', 'calibrated', 'PearsonR', 'calv2'])
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'], format='%Y-%m-%d %H:%M:%S'))
    .sort_values(by='datetime')
    .reset_index(drop=True)
    .assign(observed=1)
)

In [None]:
# Step 1: put the data on a continuous hourly index running from the first to the
# last timestamp; hours with no row in the data become all-NaN rows.
earliest_date = df['datetime'].min()
latest_date = df['datetime'].max()

# Reindex the whole frame once instead of re-deriving the range for every column.
hourly_index = pd.date_range(earliest_date, latest_date, freq='h')
df = df.set_index('datetime').reindex(hourly_index)

In [None]:
# Linearly interpolate each measurement column across the inserted rows.
for gap_col in ['epa_pm25', 'temperature', 'humidity', 'pm25_cf_1']:
    df[gap_col] = df[gap_col].interpolate(method='linear')
# Reset the file_id column (a fixed placeholder here; the loop version assigns `file`).
df[['file_id']] = 'placeholder'

In [None]:
# Split the frame into model inputs (x) and target (y) and standardise the inputs.
feature_cols = ['pm25_cf_1', 'temperature', 'humidity']
# Take an explicit copy: assigning into a column subset of `df` otherwise triggers
# pandas' SettingWithCopyWarning.
x = df[feature_cols + ['observed']].copy()
# NOTE(review): this rebinds `scaler` to one fitted on this single frame, replacing
# the scaler fitted on all files in the data-preparation cell -- confirm that is intended.
scaler = StandardScaler()
x[feature_cols] = scaler.fit_transform(x[feature_cols])

y = df[['epa_pm25']]

In [None]:
# Build 24-step windows from x with the matching targets from y.
time_steps = 24
y_values = np.array(y)
x_seq, y_seq = create_sequences(x, y_values, time_steps=time_steps)

In [None]:
# Collect the 'observed' column of every window and display the resulting list.
column_values = []
for window in x_seq:
    column_values.append(window['observed'])
column_values

In [None]:
def filter_out_nas(list_of_dfs, max_consecutive_nas):
    """Return the positions of the sequences whose gaps in 'observed' are acceptable.

    Note: this redefines the function of the same name from an earlier cell.

    A sequence is kept when both conditions hold:
      * neither its first nor its last 'observed' value is NA, and
      * its longest run of back-to-back NA values is at most ``max_consecutive_nas``
        (the run length is what is checked, not the overall number of NAs).

    Parameters
    ----------
    list_of_dfs : iterable
        Items indexable with ``['observed']`` that yield a pandas Series.
    max_consecutive_nas : int
        Largest tolerated number of consecutive NA values.

    Returns
    -------
    list of int
        Positions (within ``list_of_dfs``) of the sequences that pass the filter.
        Empty sequences are skipped rather than raising.
    """
    index_fil = []
    for i, df in enumerate(list_of_dfs):
        na_mask = df['observed'].isna().to_numpy()
        # An empty sequence has nothing to check; a sequence that starts or ends
        # on a missing observation is rejected outright.
        if na_mask.size == 0 or na_mask[0] or na_mask[-1]:
            continue
        # Run-length encode the mask: with False padding on both ends, changes in
        # value alternate between the start and the end of an NA run.
        padded = np.concatenate(([False], na_mask, [False]))
        changes = np.flatnonzero(padded[1:] != padded[:-1])
        run_lengths = changes[1::2] - changes[::2]
        longest_run = int(run_lengths.max()) if run_lengths.size else 0
        if longest_run <= max_consecutive_nas:
            index_fil.append(i)
    return index_fil

In [None]:
# Run the NA filter over the windows and peek at the kept positions
# (the slice shows the entries at offsets 1 through 9).
max_consec_nas = 3
filtered_index = filter_out_nas(list_of_dfs=x_seq, max_consecutive_nas=max_consec_nas)
filtered_index[1:10]

In [None]:
# Keep only the windows (and their targets) that passed the filter, then show one of them.
x_fil = []
for kept_pos in filtered_index:
    x_fil.append(x_seq[kept_pos])
y_fil = y_seq[filtered_index]
x_fil[230]

In [None]:
# Randomly assign 70% of the filtered sequences to the training set; the rest form the test set.
np.random.seed(42)
samp = 0.7
index = np.random.choice(len(y_fil), int(samp * len(y_fil)), replace=False)

# Boolean membership mask instead of `i not in index`: the latter scans the whole
# numpy array for every candidate, which makes the split quadratic.
in_train = np.zeros(len(y_fil), dtype=bool)
in_train[index] = True
test_index = np.flatnonzero(~in_train)  # ascending positions not drawn for training

x_train = [x_fil[i] for i in index]
y_train = y_fil[index]
x_test = [x_fil[i] for i in test_index]
y_test = y_fil[test_index]
