In [2]:
import numpy as np
import pandas as pd
import xarray as xr

import torch
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset


In [24]:
#path = r'/home/ryuho/Documents/reddy/research/SMRAI/Data/REPPU/200/pbig5min.dat' #Ubuntu
path = r'/home/sachin/research/data/REPPU/pbig5min.dat' #Server

#read the REPPU data
with open (path) as f:
    rectype = np.dtype(np.float32)
    reppu_data = np.fromfile(f, rectype) #size = 109,900,800

reppu_data = reppu_data.reshape(-1,30,80)
reppu_data.shape

(45792, 30, 80)

In [25]:
#Read MHD dates and expand-out the date ranges
mhd_data = pd.read_csv('mhd_dates.csv')
expanded_dt = pd.concat([pd.Series(pd.date_range(start, end)) 
    for start, end in zip(mhd_data['start'], mhd_data['end'])])

In [26]:
data_reshaped = reppu_data.reshape(len(expanded_dt), 288, 30, 80) 

# Define coordinates
time = np.arange(288)
lat = np.linspace(53.1, 89.7, 30) #30 intervals between 53.1° to 89.7°
lon = np.linspace(1.6, 357.6, 80) #80 intervals between 1.6° to 357.6°

# Create 'dt' variable combining dates and five-minute intervals
dt = []
for day in expanded_dt:
    for t in time:
        dt.append(day + pd.Timedelta(minutes=t*5))
dt = np.array(dt) #convert from list to numpy array

# Create xarray Dataset
ds = xr.Dataset({'potential': (['dt', 'lat', 'lon'], data_reshaped.reshape(-1, 30, 80))},
coords={'dt': dt, 'lat': lat, 'lon': lon})

# Add potential and units
ds['potential'].attrs['units'] = 'kV'
ds['potential'] = ds['potential'] * 1e-3 # Convert to kV
ds

In [68]:
omni_df = pd.read_csv('omni_mhd_5min.csv')
#omni_df = pd.read_csv(omni_mhd_path+'omni_mhd_5min.csv')
omni_df.set_index('dt', inplace=True) #set the datetime as the index
omni_df = omni_df.ffill().bfill()
omni_df = omni_df.dropna() #drop any remaining NaNs
omni_df

omni_ds = xr.Dataset(omni_df)
omni_ds['dt'] = pd.to_datetime(omni_ds['dt']) #convert the index to datetime

#merge OMNI with REPPU data
reppu_omni_ds = ds.merge(omni_ds, join='inner')

#select date range
reppu_omni_ds = reppu_omni_ds.sortby('dt')
reppu_omni_ds = reppu_omni_ds.sel(dt=slice('2022-06-12', '2022-07-31'))
reppu_omni_ds

In [69]:
train_slice = slice(0, 11520) #40days * 288 = 11520
test_slice = slice(11520, None) #10 days * 288 = 2880 

# Define the slice ranges for train and test data
#train_slice = slice(0, 36576) #36576 / 24 / 12 = 127 days = 80% of the data
#test_slice = slice(36576, None) # 9216 / 24 / 12 = 32 days = 20% of the data

ds_train = reppu_omni_ds.isel(dt=train_slice)
ds_test = reppu_omni_ds.isel(dt=test_slice)

X_train = ds_train.drop_vars('potential').to_array().values.T
y_train = ds_train['potential'].values
y_train = y_train.reshape(-1, 30*80)
#y_train = np.mean(y_train, axis=1)

X_test = ds_test.drop_vars('potential').to_array().values.T
y_test = ds_test['potential'].values
y_test = y_test.reshape(-1, 30*80)
#y_test = np.mean(y_test, axis=1)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((11520, 5), (11520, 2400), (2880, 5), (2880, 2400))

In [71]:
# Normalizing the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [72]:
X_train_tensor = torch.tensor(X_train_normalized, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_normalized, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

In [73]:
X_train_tensor.shape, y_train_tensor.shape, X_test_tensor.shape, y_test_tensor.shape

(torch.Size([11520, 5]),
 torch.Size([11520, 2400]),
 torch.Size([2880, 5]),
 torch.Size([2880, 2400]))

In [74]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        # Remove extra dimensions from output
        out = self.fc(out[:, -1, :].squeeze())  # Squeeze the output
        return out


In [3]:
def dummy_slider():
    #THis is a handy visualisation tool for testing
    # it has no bearing on the actual training
    x = np.arange(0, 60)
    seq_len = 288
    slider = 4

    for i in range(0, len(x) - seq_len + 1, slider):
        inputs = x[i:i + seq_len]
        targets = x[i + seq_len - 1]
        print(f'inputs: {inputs}, targets: {targets}')

dummy_slider()

In [76]:
seq_len = 12
slider = 2

# Instantiate the LSTM model with updated input_size
input_size = X_train_tensor.shape[1]
hidden_size = 64
num_layers = 2
output_size = y_train_tensor.shape[1]
model = LSTM(input_size, hidden_size, num_layers, output_size)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    for i in range(0, len(X_train_tensor) - seq_len + 1, slider):
        inputs = X_train_tensor[i:i + seq_len]
        targets = y_train_tensor[i + seq_len - 1]
        # Forward pass
        outputs = model(inputs.unsqueeze(0)) # Add batch dimension
        loss = criterion(outputs, targets)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    rmse = np.sqrt(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.2f}, RMSE: {rmse:.2f} V')


Epoch [1/5], Loss: 210.48, RMSE: 14.51 V
Epoch [2/5], Loss: 288.82, RMSE: 16.99 V
Epoch [3/5], Loss: 139.84, RMSE: 11.83 V
Epoch [4/5], Loss: 196.00, RMSE: 14.00 V
Epoch [5/5], Loss: 99.00, RMSE: 9.95 V


In [77]:
def expected_test_set_size(X_test_tensor, seq_len, slider):
    return ((len(X_test_tensor) - seq_len) / slider) + 1

expected_test_set_size(X_test_tensor, seq_len, slider)

1435.0

In [78]:
model.eval()

predicted = []

with torch.no_grad():
    for i in range(0, len(X_test_tensor) - seq_len + 1, slider):
        inputs = X_test_tensor[i:i + seq_len]
        outputs = model(inputs.unsqueeze(0))
        predicted.append(outputs)

predicted = torch.cat(predicted, dim=0)

In [79]:
predicted.reshape(-1, 30, 80).shape

torch.Size([1435, 30, 80])

In [80]:
dt = ds_test['dt'].values
dt = dt[seq_len - 1::slider]

predicted_ds = xr.Dataset({'predicted_pot': (['dt', 'lat', 'lon'], predicted.reshape(-1, 30, 80))},
                            coords={'dt': dt, 'lat': lat, 'lon': lon})

predicted_ds = xr.merge([ds_test, predicted_ds], join='inner')
predicted_ds['RMSE'] = np.sqrt((predicted_ds['predicted_pot'] - predicted_ds['potential'])**2)
predicted_ds['RMSE'].attrs['units'] = 'kV'

predicted_ds

In [81]:
np.mean(predicted_ds['RMSE'].values)

6.9375596