In [1]:
#load libraries
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import datetime as datetime
import matplotlib.colors as mcolors
import seaborn as sns

In [2]:
#load ML specific libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR
import torch.optim as optim
import pickle

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import interpolate

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
hemi = 'NH' #options: 'NH', 'SH'
id = 2 
feature = 'fac' #options: 'pot', 'fac', 'sxx', 'syy', 'sxy'

In [5]:
#ds = xr.open_dataset(f'/run/media/sachin/0fa21ddb-f70c-4238-9cf4-705e0360f1c1/NICT_Data/test/half_day_1deg.nc')
ds = xr.open_dataset(f'/run/media/sachin/0fa21ddb-f70c-4238-9cf4-705e0360f1c1/NICT_Data/{hemi}/multi/{hemi}_2020-2024_{feature}_feats_140_40_2min_rtsw.nc')
#ds = ds.sel(dt=slice('2023-01-01', '2024-12-31'))
ds

In [6]:
#remove bad dates per Nakamizo-san .txt file
#Please note: this is an incomplete list of bad dates
dates_to_remove = [
    "2020-08-05", "2020-09-18", "2020-09-23", "2020-09-25",
    "2021-01-26",
    "2021-03-31", "2021-04-21", "2021-06-02", "2021-06-04", "2021-07-19", "2021-07-23", "2021-07-30",
    "2021-08-04", "2021-08-19", "2021-08-26", "2021-08-31", "2021-09-24", "2021-10-06",
    "2022-01-25", "2022-01-26", "2022-01-27", "2022-01-28", "2022-02-11", "2022-02-12", 
    "2022-02-13", "2022-02-14", "2022-02-15", "2022-02-16", "2022-02-17", "2022-03-04", 
    "2022-06-08", "2022-06-09", "2022-08-02", "2022-08-02", "2022-08-03", "2022-08-04", 
    "2022-08-05", "2022-09-22", "2022-12-11", "2022-12-12", "2022-12-13",
    "2023-01-01", "2023-01-02", "2023-01-11", "2023-03-11", "2023-03-12", "2023-03-13", 
    "2023-03-14", "2023-03-15", "2023-04-05", "2023-04-06", "2023-04-10", "2023-08-10", "2023-08-11"
]

dates_to_remove = pd.to_datetime(dates_to_remove)

# Create a list of timestamps for each date to remove with 2-minute intervals
timestamps_to_remove = []
for date in dates_to_remove:
    timestamps_to_remove.extend(pd.date_range(date, date + pd.Timedelta(days=1) - pd.Timedelta(minutes=1), freq='2min'))

timestamps_to_remove = pd.to_datetime(timestamps_to_remove)
time_mask = np.isin(ds['dt'].values, timestamps_to_remove)
ds = ds.sel(dt=~time_mask)
ds

In [7]:
#check for NaNs in the dataset
#good sanity check, but not necessary
def nan_count(ds, var):
    #count number of NaNs in a data variable
    con_data = ds[var].values
    nan_count = np.count_nonzero(np.isnan(con_data))
    nan_ratio = nan_count / con_data.size
    not_nan = con_data.size - nan_count
    return not_nan, nan_count, np.round(nan_ratio, 2)

#nan_count(ds, 'pot')

In [8]:
# Initialize scalers for the target variable and input variables
input_scaler = StandardScaler() #scale using standard dev where mean is 0 and std is 1

# Extract the target variable and reshape for scaling
target_var = ds[feature].values  # shape (t (n), mlat (50), mlt (24))

# Extract and scale input variables (variables that are dependent only on 'dt')
input_vars = ['By', 'Bz', 'vsw', 'np', 'tilt_angle'] #RTSW
#input_vars = ['BY_GSE', 'BZ_GSE', 'flow_speed', 'proton_density', 'tilt_angle'] #OMNI

input_data = np.array([ds[var].values for var in input_vars]).T  # shape (22320, number_of_vars)
input_data_scaled = input_scaler.fit_transform(input_data)

file_path = f'/home/sachin/Documents/NIPR/Research/Data/ML/SMRAI3/smrai3_scaler_{feature}_{hemi}_id{id}.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(input_scaler, file)

def create_sequences(target_data, input_data, lb):
    X, y = [], []
    for i in range(len(target_data) - lb):
        X.append(input_data[i:i+lb].T)
        y.append(target_data[i+lb])

    return np.array(X), np.array(y)

lookback = 30 # number of time steps to look back. At 1 min set to 60, at 2 min set to 30, and so on
X, y = create_sequences(target_var, input_data_scaled, lookback)

X.shape, y.shape

((986818, 5, 30), (986818, 40, 140))

In [9]:
def simple_time_series_split(X, y):

    #split the data into training, validation, and test sets
    #this is based on analysis of the solar wind speed data across the period
    X_train_1, X_test, X_train_2, X_val, X_train_3 = X[:350000], X[350000:450000], X[450000:750000], X[750000:850000], X[850000:]
    y_train_1, y_test, y_train_2, y_val, y_train_3 = y[:350000], y[350000:450000], y[450000:750000], y[750000:850000], y[850000:]
    X_train = np.concatenate((X_train_1, X_train_2, X_train_3), axis=0)
    y_train = np.concatenate((y_train_1, y_train_2, y_train_3), axis=0)

    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = simple_time_series_split(X, y)
X_train.shape, X_val.shape, X_test.shape

((786818, 5, 30), (100000, 5, 30), (100000, 5, 30))

In [10]:
# Convert data to PyTorch tensors and move to GPU
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = np.array(y_test)

# Create DataLoader
batch_size = 128
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [11]:
mlat_len = np.size(ds['lat'].values)
mlt_len = np.size(ds['lon'].values)

In [12]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, mlat_len * mlt_len)

    def forward(self, x):
        batch_size = x.size(0)
        # Use the same device as the model for hidden states
        device = next(self.parameters()).device  # Ensure hidden states are on the same device as the model
        
        h0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size, device=device)  # Initial hidden state
        c0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size, device=device)  # Initial cell state
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        out = out.view(-1, mlat_len, mlt_len)
        return out

    def reset_states(self):
        # Reset the internal states of the LSTM layer
        self.lstm.reset_parameters()

# Instantiate the model with lookback size
hs = 128   #hidden state
sl = 2     #stacked layers
model = LSTM(lookback, hs, sl).to(device)
model

LSTM(
  (lstm): LSTM(30, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=5600, bias=True)
)

In [13]:
#calculate the area for a given latitude and longitude
def calculate_delta(lat1, lon1, lat2, lon2):
    lat1 = np.abs(lat1)
    lat2 = np.abs(lat2)
    
    radius = 6371.008 * 1000

    area = radius**2 * (np.sin(np.radians(lat2)) - np.sin(np.radians(lat1))) * (np.radians(lon2) - np.radians(lon1))
    
    return area 

#set up the latitude and longitude intervals
mlat = ds['lat'].values
mlt = ds['lon'].values

#calculate the area for each latitude and longitude interval
def calculate_area(mlat, mlt):
    areas = np.zeros((mlat_len-1, mlt_len-1))
    for i in range(len(mlat) - 1):
        for j in range(len(mlt) - 1):
            lat1, lat2 = mlat[i], mlat[i + 1]
            lon1, lon2 = mlt[j], mlt[j + 1]

            area = calculate_delta(lat1, lon1, lat2, lon2)
            areas[i, j] = area

    #interpolate the areas back to shape of mlat, mlt
    #Currently 49x23, interpolate back to 50x24
    x = np.arange(areas.shape[1])
    y = np.arange(areas.shape[0])
    f = interpolate.interp2d(x, y, areas, kind='linear')
    xnew = np.arange(0, areas.shape[1], areas.shape[1]/(areas.shape[1]+1))
    ynew = np.arange(0, areas.shape[0], areas.shape[0]/(areas.shape[0]+1))
    areas = f(xnew, ynew)

    return areas

areas = calculate_area(mlat, mlt)

def weighted_loss_function(y_true, y_pred):

    # Normalize the area
    weights = areas / np.min(areas)

    #create custom 'Mean Area Weighted Loss' (MAWE) function
    loss = torch.mean(torch.abs(y_true - y_pred) * torch.tensor(weights, dtype=torch.float32).to(device))
    
    return loss


For legacy code, nearly bug-for-bug compatible replacements are
`RectBivariateSpline` on regular grids, and `bisplrep`/`bisplev` for
scattered 2D data.

In new code, for regular grids use `RegularGridInterpolator` instead.
For scattered data, prefer `LinearNDInterpolator` or
`CloughTocher2DInterpolator`.

For more details see
`https://scipy.github.io/devdocs/notebooks/interp_transition_guide.html`

  f = interpolate.interp2d(x, y, areas, kind='linear')

For legacy code, nearly bug-for-bug compatible replacements are
`RectBivariateSpline` on regular grids, and `bisplrep`/`bisplev` for
scattered 2D data.

In new code, for regular grids use `RegularGridInterpolator` instead.
For scattered data, prefer `LinearNDInterpolator` or
`CloughTocher2DInterpolator`.

For more details see
`https://scipy.github.io/devdocs/notebooks/interp_transition_guide.html`

  areas = f(xnew, ynew)


In [14]:
#ptimizer = torch.optim.Adam(model.parameters(), lr=7e-4) # Adam optimizer
#scheduler = StepLR(optimizer, step_size=8, gamma=0.7) #decay learning rate by 0.7 every 8 epochs
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) # Adam optimizer
scheduler = StepLR(optimizer, step_size=6, gamma=0.6) #decay learning rate by 0.7 every 8 epochs

In [15]:
# Model settings
num_epochs = 100
train_losses = []
val_losses = []

# Early stopping
best_val_loss = float('inf')
patience = 7
counter = 0

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for X_batch, y_batch in train_loader:
        # Move data to the appropriate device
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        # Forward pass
        train_outputs = model(X_batch)
        # loss = loss_function(train_outputs, y_batch)
        loss = weighted_loss_function(y_batch, train_outputs)

        # Backward pass and optimization
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()  # Backpropagate the loss
        optimizer.step()  # Update model parameters
        epoch_loss += loss.item()  # Accumulate epoch loss

    # Step the scheduler
    scheduler.step()  # Adjust learning rate

    train_loss = epoch_loss / len(train_loader)
    train_losses.append(train_loss)

    # Validation
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            # Move data to the appropriate device
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            val_outputs = model(X_batch)
            # val_loss += loss_function(val_outputs, y_batch).item()
            val_loss += weighted_loss_function(y_batch, val_outputs).item()

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), f'/home/sachin/Documents/NIPR/Research/Data/ML/SMRAI3/smrai3_model_{feature}_{hemi}_id{id}.pt')
    else:
        counter += 1
        if counter == patience:
            print(f'Validation loss did not improve for {patience} epochs. Stopping training.')
            break

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss (MAWE): {train_loss:.4f}, Val Loss (MAWE): {val_loss:.4f}, Best Val Loss: {best_val_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.1e}')


Epoch [1/100], Train Loss (MAWE): 1.3292, Val Loss (MAWE): 1.2160, Best Val Loss: 1.2160, LR: 1.0e-03
Epoch [2/100], Train Loss (MAWE): 1.1721, Val Loss (MAWE): 1.0606, Best Val Loss: 1.0606, LR: 1.0e-03
Epoch [3/100], Train Loss (MAWE): 1.1094, Val Loss (MAWE): 1.0216, Best Val Loss: 1.0216, LR: 1.0e-03
Epoch [4/100], Train Loss (MAWE): 1.0911, Val Loss (MAWE): 1.0147, Best Val Loss: 1.0147, LR: 1.0e-03
Epoch [5/100], Train Loss (MAWE): 1.0800, Val Loss (MAWE): 1.0138, Best Val Loss: 1.0138, LR: 1.0e-03
Epoch [6/100], Train Loss (MAWE): 1.0722, Val Loss (MAWE): 1.0085, Best Val Loss: 1.0085, LR: 6.0e-04
Epoch [7/100], Train Loss (MAWE): 1.0539, Val Loss (MAWE): 0.9829, Best Val Loss: 0.9829, LR: 6.0e-04
Epoch [8/100], Train Loss (MAWE): 1.0475, Val Loss (MAWE): 0.9770, Best Val Loss: 0.9770, LR: 6.0e-04
Epoch [9/100], Train Loss (MAWE): 1.0440, Val Loss (MAWE): 0.9751, Best Val Loss: 0.9751, LR: 6.0e-04
Epoch [10/100], Train Loss (MAWE): 1.0402, Val Loss (MAWE): 0.9720, Best Val Loss:

In [16]:
test_model = LSTM(lookback, hs, sl)
test_model.load_state_dict(torch.load(f'/home/sachin/Documents/NIPR/Research/Data/ML/SMRAI3/smrai3_model_{feature}_{hemi}_id{id}.pt'))
test_model.to('cpu')
test_model.eval()

# List to store predictions
predictions = []

# Perform predictions in batches
with torch.no_grad():
    batch_size = 128
    for i in range(0, len(X_test), batch_size):
        X_batch = X_test[i:i+batch_size]
        
        # Forward pass
        y_pred = test_model(X_batch)
        
        # Append the predictions (already on CPU)
        predictions.append(y_pred.numpy())

# Concatenate all predictions into a single array
predictions = np.concatenate(predictions)

In [17]:
def batch_mae(y_test, predictions, batch_size):
    mae_accum = []
    
    for i in range(0, len(y_test), batch_size):
        y_batch = y_test[i:i+batch_size]
        pred_batch = predictions[i:i+batch_size]
        mae_accum.append(np.mean(np.abs(y_batch - pred_batch)))

    return np.mean(mae_accum)

def batch_nmae_std(y_test, predictions, batch_size):
    overall_std = np.std(y_test)  # Use the overall standard deviation
    nmae_accum = []

    for i in range(0, len(y_test), batch_size):
        y_batch = y_test[i:i+batch_size]
        pred_batch = predictions[i:i+batch_size]
        nmae_accum.append(np.mean(np.abs(y_batch - pred_batch)) / overall_std)

    return np.mean(nmae_accum)

def batch_corrcoef(y_test, predictions, batch_size):
    corr_accum = []
    
    for i in range(0, len(y_test), batch_size):
        y_batch = y_test[i:i+batch_size]
        pred_batch = predictions[i:i+batch_size]
        if len(y_batch) > 1:  # Corrcoef needs at least 2 points to calculate
            corr_accum.append(np.corrcoef(y_batch.flatten(), pred_batch.flatten())[0, 1])
    
    return np.mean(corr_accum)

def skill(m, o):
    skill = 1 - (np.sum((m - o)**2) / np.sum((o - np.mean(o))**2))
    return skill      

# Set the batch size
batch_size = 10000

# Calculate metrics in batches
MAE = batch_mae(y_test, predictions, batch_size)
NMAE_std = batch_nmae_std(y_test, predictions, batch_size)  # Now using overall std of y_test
R = batch_corrcoef(y_test, predictions, batch_size)

# Calculate skill normally (if this isn't memory intensive)
PE = skill(predictions, y_test)

print(f'Test Loss (MAE): {MAE:.3f}, Test Loss (NMAE_std): {NMAE_std:.3f}, R: {R:.2f}, PE: {PE:.4f}')
#print(f'Test Loss (MAE): {MAE:.4f}, Test Loss (NMAE_std): {NMAE_std:.4f}')


Test Loss (MAE): 0.039, Test Loss (NMAE_std): 0.344, R: 0.73, PE: 0.5347
