# Install and import

In [1]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import pickle
import datetime


In [17]:
#!pip install fastparquet==2023.10.1

Collecting fastparquet==2023.10.1
  Downloading fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: fastparquet
  Attempting uninstall: fastparquet
    Found existing installation: fastparquet 2024.2.0
    Uninstalling fastparquet-2024.2.0:
      Successfully uninstalled fastparquet-2024.2.0
Successfully installed fastparquet-2023.10.1


# Settings

In [2]:
# load dataset of bins
# os.listdir returns entries in arbitrary order; sorting keeps the bin order
# (and therefore the order of the generated windows) stable between runs.
dataset = sorted(os.listdir("../../datasets/Canada_bins_df_final/"))

# SELECT columns, Phi60_Sig1 will be added automatically in function create_windows
_features_cols = ["PC"]

# SELECT bins or use dataset for all bins 
_bins = dataset

# SET window_size
window_size = 45

# SET how many minutes ahead do we want to predict
minutes_ahead = 15

# SET the folder name
folder_name= "PC"

# please don't change this parameter yet so we can compare on the same dataset
test_year = 2019
validation_year = 2015

path_to_omni = "../../datasets/omni4col_INTER_NORMALIZED_hour_2013-2021.parquet.gzip"

# Output sub-folder encodes the feature set, the prediction shift and the window size.
f_name = f"{folder_name}/shift-{minutes_ahead}-windows-{window_size}/"
os.makedirs(f"data/{f_name}/", exist_ok=True)

In [3]:
# Exploratory: load a single bin file so that its columns can be inspected in the next cell.
df = pd.read_parquet("../../datasets/Canada_bins_df_final/12844.parquet.gzip")

In [4]:
# Show the column names available in the loaded bin frame.
df.columns

Index(['SVID', 'Azimuth', 'Elevation', 'AvgSig1_C/N0', 'S4_Sig1',
       'Cor_S4_Sig1', 'Phi60_Sig1', 'AvgCCD_Sig1', 'TEC_TOW', 'Sig1_lock_time',
       'Lock_time_2nd_freq', 'Avg_C/N0_2nd_freq', 'SI_ind_Sig1', 'p_Sig1',
       'Latitude_IPP', 'Longitude_IPP', 'Latitude_bin', 'Longitude_bin',
       'Bin_index', 'ne', 'Ti', 'nO2+', 'NmE', 'N2[cm^-3]', 'O2[cm^-3]',
       'Exospheric_temp[K]', 'B', 'BzGSE', 'FlowPressure', 'E', 'Xbsn', 'AE',
       'SymH', 'AsyD', 'AsyH', 'PC', 'Kp_index', 'Dst_index', 'ap_index',
       'f10.7_index', 'Sun_altitude', 'Sun-Earth_distance', 'Variation_coeff',
       'Binary_scinti_label'],
      dtype='object')

# Functions

In [3]:
# refactored function for aspis
def create_windows(df, window_size, minutes_ahead, features_cols, separate_windows=True, path_to_omni_hour=None):
    """
    Creates windows of the specified size and makes a time-shift.

    The input frame is re-indexed to a gap-free one-minute grid; every window
    that contains a NaN (in a feature or in its label) is dropped. Neither
    `df` nor `features_cols` is modified.

        Parameters:
                df (Pandas DF): Data frame with features and label, indexed by time
                window_size (int): Features window size (in minutes)
                minutes_ahead (int): By how many minutes to make a prediction
                features_cols (list): List of features to be used for prediction;
                                      'Phi60_Sig1' and 'Binary_scinti_label' are always
                                      appended (in that order) as the last minute-resolution channels
                separate_windows (bool): If true, returns windows with and without scintillation separately (training),
                                         if false, returns all windows at once (testing)
                path_to_omni_hour (path): path to Pandas DF with data from OMNIWeb,
                                          required only when an hourly feature is requested

        Returns:
            separate_windows is True (tuple of 8):
                X_min_scinti (numpy array): Minute resolution feature windows with scintillation
                X_hour_scinti (numpy array): Hour resolution feature values with scintillation
                y_scinti (numpy array): Label (regression values) with scintillation
                X_min_no_scinti (numpy array): Minute resolution feature windows without scintillation
                X_hour_no_scinti (numpy array): Hour resolution feature values without scintillation
                y_no_scinti (numpy array): Label (regression values) without scintillation
                index_to_save_scinti (numpy array): Time index of the labels with scintillation
                index_to_save_no_scinti (numpy array): Time index of the labels without scintillation

            separate_windows is False (tuple of 5):
                X_min (numpy array): Minute resolution feature windows
                X_hour (numpy array): Hour resolution feature values
                y (numpy array): Label (regression values)
                scinti_label (numpy array): Label (binary values)
                index_to_save (numpy array): Time index for visualisation purposes

            If the frame has fewer than window_size + minutes_ahead rows, every
            element of the returned tuple is an empty array.

        Raises:
            ValueError: an hourly feature was requested but path_to_omni_hour is None
    """

    # return empty arrays, if data frame is too small
    if len(df) < (window_size + minutes_ahead):
        n_outputs = 8 if separate_windows else 5
        return tuple(np.array([]) for _ in range(n_outputs))

    # Build the channel list on a private copy so the caller's list is never touched.
    # The label columns always come last, in this fixed order.
    label_cols = ["Phi60_Sig1", "Binary_scinti_label"]
    selected_cols = [col for col in features_cols if col not in label_cols] + label_cols

    hourly_features = ["Kp_index", "Dst_index", "ap_index", "f10.7_index"]
    hourly_features_used = [col for col in selected_cols if col in hourly_features]
    minute_features_used = [col for col in selected_cols if col not in hourly_features]

    if hourly_features_used and path_to_omni_hour is None:
        raise ValueError('The path to the OMNI hourly dataset is not specified!')

    # fills missing indexes; reindex returns a new frame, so the edits below stay local
    new_date_range = pd.date_range(start=df.index[0], end=df.index[-1], freq="min")
    df = df.reindex(new_date_range, fill_value=np.nan)

    # cap the regression target at 1
    df['Phi60_Sig1'] = df['Phi60_Sig1'].clip(upper=1)

    if hourly_features_used:
        omni_hour = pd.read_parquet(path_to_omni_hour)
        omni_hour = omni_hour.loc[str(df.index[0]):str(df.index[-1])]
        df[hourly_features_used] = omni_hour[hourly_features_used]

    # calculates the number of windows and fills the array of minute features:
    # window j holds rows j .. j + window_size - 1
    n_of_windows = len(df) - window_size + 1
    minute_values = np.array(df[minute_features_used])
    X_min = np.empty((n_of_windows, len(minute_features_used), window_size))
    for offset in range(window_size):
        X_min[:, :, offset] = minute_values[offset:offset + n_of_windows]

    # hour features are sampled once per window, at the window's last row
    if hourly_features_used:
        X_hour = np.empty((n_of_windows, len(hourly_features_used), 1))
        X_hour[:, :, -1] = np.array(df[hourly_features_used])[window_size - 1:]
    else:
        X_hour = np.array([])

    # labels are taken minutes_ahead rows after the end of each window; the last
    # minutes_ahead windows have no label and are padded with NaN / NaT
    first_target = window_size + minutes_ahead - 1
    tail_padding = np.ones(minutes_ahead) * np.nan
    y = np.append(np.array(df['Phi60_Sig1'])[first_target:], tail_padding)
    scinti_label = np.append(np.array(df['Binary_scinti_label'])[first_target:], tail_padding)
    index_to_save = np.array(df.index)[first_target:]
    for _ in range(minutes_ahead):
        # kept as in the original: appending pd.NaT determines the dtype of the
        # resulting array, which downstream cells rely on
        index_to_save = np.append(index_to_save, pd.NaT)

    # delete windows with NaN values
    has_nan = np.isnan(X_min).any(axis=(1, 2)) | np.isnan(y)
    if hourly_features_used:
        has_nan |= np.isnan(X_hour).any(axis=(1, 2))
    keep = ~has_nan

    X_min = X_min[keep]
    if hourly_features_used:
        X_hour = X_hour[keep]
    y = y[keep]
    scinti_label = scinti_label[keep]
    index_to_save = index_to_save[keep]

    if not separate_windows:
        return X_min, X_hour, y, scinti_label, index_to_save

    is_scinti = scinti_label == 1
    is_quiet = ~is_scinti

    X_min_no_scinti = X_min[is_quiet]
    y_no_scinti = y[is_quiet]
    index_to_save_no_scinti = index_to_save[is_quiet]

    if hourly_features_used:
        X_hour_scinti = X_hour[is_scinti]
        X_hour_no_scinti = X_hour[is_quiet]
    else:
        X_hour_scinti = np.array([])
        X_hour_no_scinti = np.array([])

    if not is_scinti.any():
        return np.array([]), np.array([]), np.array([]), X_min_no_scinti, X_hour_no_scinti, y_no_scinti, np.array([]), index_to_save_no_scinti

    return X_min[is_scinti], X_hour_scinti, y[is_scinti], X_min_no_scinti, X_hour_no_scinti, y_no_scinti, index_to_save[is_scinti], index_to_save_no_scinti

In [5]:
def balance_classes(X_train_min_windows_scinti, X_train_hour_windows_scinti, y_train_windows_scinti,
                    X_train_min_windows_no_scinti, X_train_hour_windows_no_scinti, y_train_windows_no_scinti,
                    index_to_save_scinti, index_to_save_no_scinti, seed=None):
    """
    Undersamples the windows without scintillation and merges them with the scintillation windows.

    At most as many no-scintillation windows as there are scintillation windows are
    drawn at random (without replacement); all scintillation windows are kept. The
    result lists the sampled no-scintillation windows first, then the scintillation ones.

        Parameters:
                X_train_min_windows_scinti (numpy array): Minute resolution windows with scintillation
                X_train_hour_windows_scinti (numpy array): Hour resolution values with scintillation (may be empty)
                y_train_windows_scinti (numpy array): Labels with scintillation
                X_train_min_windows_no_scinti (numpy array): Minute resolution windows without scintillation
                X_train_hour_windows_no_scinti (numpy array): Hour resolution values without scintillation (may be empty)
                y_train_windows_no_scinti (numpy array): Labels without scintillation
                index_to_save_scinti (numpy array): Time index of the windows with scintillation
                index_to_save_no_scinti (numpy array): Time index of the windows without scintillation
                seed (int or None): If given, sampling uses a private generator seeded with this value;
                                    if None, the module-level `random` state is used

        Returns:
                X_train_min_windows (numpy array): Merged minute resolution windows
                X_train_hour_windows (numpy array): Merged hour resolution values (empty if none were given)
                y_train_windows (numpy array): Merged labels
                index_to_save_train (numpy array): Merged time index

            If either class has no windows, all four outputs are empty so that they
            always have matching lengths.
    """
    X_min_scinti = np.asarray(X_train_min_windows_scinti)
    X_min_no_scinti = np.asarray(X_train_min_windows_no_scinti)

    # Nothing to balance when one class is missing. Returning X empty but y / index
    # non-empty (as happened before) would misalign the accumulated dataset.
    if X_min_scinti.size == 0 or X_min_no_scinti.size == 0:
        print("total train windows: ", 0)
        print("total train time indexes to save: ", 0)
        return np.array([]), np.array([]), np.array([]), np.array([])

    sampler = random if seed is None else random.Random(seed)
    indexes = sampler.sample(range(len(X_min_no_scinti)),
                             min(len(X_min_no_scinti), len(X_min_scinti)))

    # The same sampled positions are applied to every no-scintillation array
    # so that windows, labels and timestamps stay aligned.
    X_train_min_windows = np.concatenate((X_min_no_scinti[indexes], X_min_scinti), axis=0)

    if len(X_train_hour_windows_scinti) != 0:
        X_train_hour_windows = np.concatenate(
            (np.asarray(X_train_hour_windows_no_scinti)[indexes], np.asarray(X_train_hour_windows_scinti)),
            axis=0)
    else:
        X_train_hour_windows = np.array([])

    y_train_windows = np.concatenate(
        (np.asarray(y_train_windows_no_scinti)[indexes], y_train_windows_scinti), axis=0)
    index_to_save_train = np.concatenate(
        (np.asarray(index_to_save_no_scinti)[indexes], index_to_save_scinti), axis=0)
    print("total train windows: ", len(y_train_windows))
    print("total train time indexes to save: ", len(index_to_save_train))
    return X_train_min_windows, X_train_hour_windows, y_train_windows, index_to_save_train

# Create windows

In [None]:
# choose selection of bins or keep all bins
#_bins=["13388.parquet.gzip"]

# Accumulators for every split: feature windows, labels, label timestamps and
# the bin file each window came from.
X_train_min_final = []
X_train_hour_final = []
y_train_final = []
index_to_save_train_final = []
bins_to_save_train_final = []

X_test_min_final = []
X_test_hour_final = []
y_test_final = []
index_to_save_test_final = []
bins_to_save_test_final = []

X_val_min_final = []
X_val_hour_final = []
y_val_final = []
index_to_save_val_final = []
bins_to_save_val_final = []

# min-max normalization values
max_values = [-np.inf] * len(_features_cols)
min_values = [np.inf] * len(_features_cols)

# Pass 1: global per-feature extrema, computed on the training years only so that
# no information from the test / validation years enters the scaling.
for bins in _bins:
    print("Calculating normalization values, bin: ", bins)
    df = pd.read_parquet("../../datasets/Canada_bins_df_final/"+bins)
    train = df[(df.index.year != test_year) & (df.index.year != validation_year)]

    for i, col in enumerate(_features_cols):
        # Series.max()/min() skip NaN (the built-in max()/min() do not) and return
        # NaN for an empty slice, which the comparisons below simply ignore.
        max_col_value = train[col].max()
        if max_col_value > max_values[i]:
            max_values[i] = max_col_value

        min_col_value = train[col].min()
        if min_col_value < min_values[i]:
            min_values[i] = min_col_value


def fn(value, x_max, x_min):
    """Min-max scaling of `value` with the given extrema."""
    return (value - x_min) / (x_max - x_min)


# Pass 2: normalize every bin, split it by year and collect the windows.
for bins in _bins:
    # read data
    df = pd.read_parquet("../../datasets/Canada_bins_df_final/"+bins)

    # normalize
    for i, col in enumerate(_features_cols):
        df[col] = fn(df[col], max_values[i], min_values[i])

    # define train, test and valid subset
    train = df[(df.index.year != test_year) & (df.index.year != validation_year)]
    test = df[df.index.year == test_year]
    valid = df[df.index.year == validation_year]

    X_min_scinti, X_hour_scinti, y_scinti, X_min_no_scinti, X_hour_no_scinti, y_no_scinti, time_scinti, time_no_scinti = create_windows(train, window_size, minutes_ahead, _features_cols, separate_windows=True, path_to_omni_hour=path_to_omni)

    # A bin without any scintillation window in the training years is skipped
    # entirely: no train, test or validation windows are collected for it.
    if len(X_min_scinti) == 0 or len(y_scinti) == 0:
        continue

    print("TRAIN set")
    X_train_min_windows, X_train_hour_windows, y_train_windows, index_to_save_train = balance_classes(X_min_scinti, X_hour_scinti, y_scinti, X_min_no_scinti, X_hour_no_scinti, y_no_scinti, time_scinti, time_no_scinti)

    X_test_min_windows, X_test_hour_windows, y_test_windows, scinti_label_test, index_to_save_test = create_windows(test, window_size, minutes_ahead, _features_cols, separate_windows=False, path_to_omni_hour=path_to_omni)
    X_val_min_windows, X_val_hour_windows, y_val_windows, scinti_label_val, index_to_save_val = create_windows(valid, window_size, minutes_ahead, _features_cols, separate_windows=False, path_to_omni_hour=path_to_omni)

    print("Bin: ", bins)
    print("TEST set")

    print("total train")
    X_train_min_final.extend(X_train_min_windows)
    X_train_hour_final.extend(X_train_hour_windows)
    print(len(X_train_min_final))
    y_train_final.extend(y_train_windows)
    index_to_save_train_final.extend(index_to_save_train)
    bins_to_save_train_final.extend([bins]*len(index_to_save_train))

    print("total test")
    X_test_min_final.extend(X_test_min_windows)
    X_test_hour_final.extend(X_test_hour_windows)
    print(len(X_test_min_final))
    y_test_final.extend(y_test_windows)
    index_to_save_test_final.extend(index_to_save_test)
    bins_to_save_test_final.extend([bins]*len(index_to_save_test))

    print("total validation")
    X_val_min_final.extend(X_val_min_windows)
    X_val_hour_final.extend(X_val_hour_windows)
    print(len(X_val_min_final))
    y_val_final.extend(y_val_windows)
    index_to_save_val_final.extend(index_to_save_val)
    bins_to_save_val_final.extend([bins]*len(index_to_save_val))

    print("___________next bin________________")


Calculating normalization values, bin:  13388.parquet.gzip
Calculating normalization values, bin:  13379.parquet.gzip
Som v &create_windows&
A
B
C
D
D1
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
D2
E
E1
E2


# Create dict and save

In [10]:
# Channels that are exported: the selected features plus the Phi60_Sig1 history.
# (The original cell assigned this list to `ame`, so `name` below was undefined
# on a fresh kernel.)
name = _features_cols + ["Phi60_Sig1"]
hour_features = ["Kp_index", "Dst_index", "ap_index", "f10.7_index"]
hour_features_used = [col for col in name if col in hour_features]
minute_features_used = [col for col in name if col not in hour_features]


def _windows_by_feature(windows, feature_names):
    """
    Regroups a list of per-window arrays into one stacked array per feature.

    The k-th distinct name in `feature_names` is paired with channel k of every
    window (`window[k]`); the channels of all windows are stacked row-wise with
    np.vstack. Returns a dict {feature name: stacked array}; it is empty when
    `feature_names` is empty.
    """
    distinct_names = list(dict.fromkeys(feature_names))
    return {
        col: np.vstack([window[position] for window in windows])
        for position, col in enumerate(distinct_names)
    }


train_min = _windows_by_feature(X_train_min_final, minute_features_used)
train_hour = _windows_by_feature(X_train_hour_final, hour_features_used)

test_min = _windows_by_feature(X_test_min_final, minute_features_used)
test_hour = _windows_by_feature(X_test_hour_final, hour_features_used)

val_min = _windows_by_feature(X_val_min_final, minute_features_used)
val_hour = _windows_by_feature(X_val_hour_final, hour_features_used)

In [None]:
def _format_timestamps(timestamps_ns):
    """
    Formats nanosecond epoch timestamps as 'YYYY-MM-DD HH:MM:SS' strings in UTC.

    Uses a timezone-aware conversion instead of datetime.utcfromtimestamp,
    which is deprecated since Python 3.12; the produced strings are the same.
    """
    return [
        datetime.datetime.fromtimestamp(int(timestamp)/1000000000, tz=datetime.timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        for timestamp in timestamps_ns
    ]


index_val_final = _format_timestamps(index_to_save_val_final)
index_test_final = _format_timestamps(index_to_save_test_final)
index_train_final = _format_timestamps(index_to_save_train_final)

# np.vstack turns each list of scalars into a column (one row per window).
y_test = np.vstack(y_test_final)
y_train = np.vstack(y_train_final)
y_val = np.vstack(y_val_final)

index_val = np.vstack(index_val_final)
index_test = np.vstack(index_test_final)
index_train = np.vstack(index_train_final)

bin_val = np.vstack(bins_to_save_val_final)
bin_test = np.vstack(bins_to_save_test_final)
bin_train = np.vstack(bins_to_save_train_final)

In [14]:
# The index files live in a 'v3' sub-folder; create it first, otherwise the
# first index dump fails with FileNotFoundError (only data/{f_name}/ exists so far).
os.makedirs(f'data/{f_name}/v3/', exist_ok=True)

# NOTE: despite the .npy extension these files are written with pickle, not
# np.save, so they have to be read back with pickle.load.
outputs = {
    f'data/{f_name}/y_train.npy': y_train,
    f'data/{f_name}/y_test.npy': y_test,
    f'data/{f_name}/y_val.npy': y_val,

    f'data/{f_name}/X_train_min.npy': train_min,
    f'data/{f_name}/X_test_min.npy': test_min,
    f'data/{f_name}/X_val_min.npy': val_min,

    f'data/{f_name}/X_train_hour.npy': train_hour,
    f'data/{f_name}/X_test_hour.npy': test_hour,
    f'data/{f_name}/X_val_hour.npy': val_hour,

    f'data/{f_name}/v3/test_index_timestamp.npy': index_test,
    f'data/{f_name}/v3/test_index_bin.npy': bin_test,

    f'data/{f_name}/v3/train_index_timestamp.npy': index_train,
    f'data/{f_name}/v3/train_index_bin.npy': bin_train,

    f'data/{f_name}/v3/val_index_timestamp.npy': index_val,
    f'data/{f_name}/v3/val_index_bin.npy': bin_val,
}

for output_path, payload in outputs.items():
    with open(output_path, 'wb') as f:
        pickle.dump(payload, f, protocol=4)