In [19]:
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import xarray as xr

In [8]:
path = r'/home/ryuho/Documents/reddy/research/SMRAI/Data/REPPU/200/pbig5min.dat' #Ubuntu
#path = r'/home/sachin/research/data/pbig5min.dat' #Server

#read the REPPU data
with open (path) as f:
    rectype = np.dtype(np.float32)
    reppu_data = np.fromfile(f, rectype) #size = 109,900,800

reppu_data = reppu_data.reshape(-1,30,80)
reppu_data.shape

(45792, 30, 80)

In [14]:
#Read MHD dates and expand-out the date ranges
mhd_data = pd.read_csv('mhd_dates.csv')
expanded_dt = pd.concat([pd.Series(pd.date_range(start, end)) 
        for start, end in zip(mhd_data['start'], mhd_data['end'])])

In [21]:
data_reshaped = reppu_data.reshape(len(expanded_dt), 288, 30, 80) 

# Define coordinates
time = np.arange(288)
lat = np.linspace(53.1, 89.7, 30) #30 intervals between 53.1° to 89.7°
lon = np.linspace(1.6, 357.6, 80) #80 intervals between 1.6° to 357.6°

# Create 'dt' variable combining dates and five-minute intervals
dt = []
for day in expanded_dt:
    for t in time:
        dt.append(day + pd.Timedelta(minutes=t*5))
dt = np.array(dt) #convert from list to numpy array

# Create xarray Dataset
ds = xr.Dataset({'potential': (['dt', 'lat', 'lon'], data_reshaped.reshape(-1, 30, 80))},
                coords={'dt': dt, 'lat': lat, 'lon': lon})

# Add potential and units
ds['potential'].attrs['units'] = 'kV'
ds['potential'] = ds['potential'] * 1e-3 # Convert to kV
ds


In [22]:
#open the omni data and merge it with the REPPU data
#omni_mhd_path = r'/Users/sr2/My Drive/Career/Employment/Current/JSPS/Research/Analysis/Apr-24/data/omni/'
omni_mhd_path = r'/home/ryuho/Documents/reddy/research/SMRAI/Data/OMNI/'

omni_df = pd.read_csv('omni_mhd_5min.csv')
#omni_df = pd.read_csv(omni_mhd_path+'omni_mhd_5min.csv')
omni_df.set_index('dt', inplace=True) #set the datetime as the index
omni_df = omni_df.dropna()

omni_ds = xr.Dataset(omni_df)
omni_ds['dt'] = pd.to_datetime(omni_ds['dt']) #convert the index to datetime

#merge OMNI with REPPU data
reppu_omni_ds = ds.merge(omni_ds, join='inner')
reppu_omni_ds.sortby('dt')
reppu_omni_ds

In [28]:
class CustomDataset(Dataset):
    def __init__(self, data, target, window_size, prediction_horizon):
        self.data = data  # This should be a numpy array of shape (dt, lat, lon, num_features)
        self.target = target  # This should be a numpy array of shape (dt, lat, lon)
        self.window_size = window_size
        self.prediction_horizon = prediction_horizon

    def __len__(self):
        return len(self.data) - self.window_size - self.prediction_horizon + 1

    def __getitem__(self, idx):
        input_data = self.data[idx : idx + self.window_size]
        target_data = self.target[idx + self.window_size : idx + self.window_size + self.prediction_horizon]
        return {
            'input': torch.tensor(input_data, dtype=torch.float32),
            'target': torch.tensor(target_data, dtype=torch.float32)
        }


In [61]:
import numpy as np

# Extract data from xarray dataset
BY_GSE_data = reppu_omni_ds['BY_GSE'].values
BZ_GSE_data = reppu_omni_ds['BZ_GSE'].values
flow_speed_data = reppu_omni_ds['flow_speed'].values
proton_density_data = reppu_omni_ds['proton_density'].values
tilt_angle_data = reppu_omni_ds['tilt_angle'].values
potential_data = reppu_omni_ds['potential'].values

# Stack the input features into a single array
input_data = np.stack([BY_GSE_data, BZ_GSE_data, flow_speed_data, proton_density_data, tilt_angle_data], axis=-1)

# Reshape potential_data to match the dimensions of input_data
# Assuming potential_data has the same dimensions as lat and lon
potential_data = np.expand_dims(potential_data, axis=-1)
potential_data = potential_data.squeeze()

from sklearn.model_selection import train_test_split

# Split the data into train and test sets, preserving the order
X_train, X_test, y_train, y_test = train_test_split(input_data, potential_data, test_size=0.1, shuffle=False)
y_train = y_train.reshape(-1, 30*80)
y_test = y_test.reshape(-1, 30*80)

# Define window size and prediction horizon
window_size = 12  # Input for each sample
prediction_horizon = 12  # Predicting 12 time steps ahead

# Create CustomDataset instances for train and test sets
train_dataset = CustomDataset(X_train, y_train, window_size, prediction_horizon)
test_dataset = CustomDataset(X_test, y_test, window_size, prediction_horizon)

# Create DataLoader for train and test sets
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)  # No shuffle for train_loader
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # No shuffle for test_loader




In [56]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2, dropout=0.0):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Define model parameters
input_size = 5  # Number of input features
hidden_size = 64  # Number of units in the hidden layer
output_size = 1  # Output size matches the lat-lon grid size in the target tensor
num_layers = 2  # Number of LSTM layers
dropout = 0.2  # Dropout rate for Monte Carlo Dropout (optional)

# Initialize the model
model = LSTMModel(input_size, hidden_size, output_size, num_layers, dropout)


In [68]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        inputs, targets = batch['input'], batch['target']
        print("Input shape:", inputs.shape)
        print("Target shape:", targets.shape)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)  # Use the reshaped targets
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
    train_loss /= len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}")


Input shape: torch.Size([32, 12, 5])
Target shape: torch.Size([32, 12, 2400])


RuntimeError: The size of tensor a (32) must match the size of tensor b (12) at non-singleton dimension 1