In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import xarray as xr

import joblib
import pickle

import pandas as pd
import numpy as np
import datetime as dt
import os
import shutil

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
#path = r'/home/ryuho/Documents/reddy/research/SMRAI/Data/REPPU/200/pbig5min.dat' #Ubuntu
path = r'/home/sachin/Documents/NIPR/Research/Data/REPPU/pbig5min.dat' #Server

#read the REPPU data
with open (path) as f:
    rectype = np.dtype(np.float32)
    reppu_data = np.fromfile(f, rectype) #size = 109,900,800

reppu_data = reppu_data.reshape(-1,30, 80)
reppu_data.shape

(45792, 30, 80)

In [3]:
mhd_data = pd.read_csv('mhd_dates.csv')
expanded_dt = pd.concat([pd.Series(pd.date_range(start, end)) 
    for start, end in zip(mhd_data['start'], mhd_data['end'])])

In [4]:
time_res = 5

if time_res == 1:
    time_step = 1440
else :
    time_step = 288

In [5]:
data_reshaped = reppu_data.reshape(len(expanded_dt), time_step, 30, 80) 

# Define coordinates
time = np.arange(time_step)
lat = np.linspace(50, 90, 30) #30 intervals between 53.1° to 89.7°
lon = np.linspace(1, 360, 80) #80 intervals between 1.6° to 357.6°

# Create 'dt' variable combining dates and five-minute intervals
dt = []
for day in expanded_dt:
    for t in time:
        dt.append(day + pd.Timedelta(minutes=t*time_res))
dt = np.array(dt) #convert from list to numpy array

# Create xarray Dataset
ds = xr.Dataset({'potential': (['dt', 'lat', 'lon'], data_reshaped.reshape(-1, 30, 80))},
        coords={'dt': dt, 'lat': lat, 'lon': lon})

# Add potential and units
ds['potential'].attrs['units'] = 'kV'
ds['potential'] = ds['potential'] * 1e-3 # Convert to kV
ds

In [6]:
omni_df = pd.read_csv('omni_mhd_5min.csv')
#omni_df = pd.read_csv(omni_mhd_path+'omni_mhd_5min.csv')
omni_df.set_index('dt', inplace=True) #set the datetime as the index
omni_df = omni_df.ffill().bfill()
omni_df = omni_df.dropna() #drop any remaining NaNs
omni_df

omni_ds = xr.Dataset(omni_df)
omni_ds['dt'] = pd.to_datetime(omni_ds['dt']) #convert the index to datetime

#merge OMNI with REPPU data
reppu_omni_ds = ds.merge(omni_ds, join='outer')
reppu_omni_ds['mlt'] = np.mod(reppu_omni_ds['lon'] / 15, 24)

#select date range
reppu_omni_ds = reppu_omni_ds.sortby('dt')
reppu_omni_ds

In [7]:
df = reppu_omni_ds.to_dataframe().reset_index()
df

Unnamed: 0,dt,lat,lon,potential,BY_GSE,BZ_GSE,flow_speed,proton_density,tilt_angle,mlt
0,2021-05-10 00:00:00,50.0,1.000000,0.690854,4.72,6.24,344.0,12.11,0.253032,0.066667
1,2021-05-10 00:00:00,50.0,5.544304,0.507001,4.72,6.24,344.0,12.11,0.253032,0.369620
2,2021-05-10 00:00:00,50.0,10.088608,0.314646,4.72,6.24,344.0,12.11,0.253032,0.672574
3,2021-05-10 00:00:00,50.0,14.632911,0.115738,4.72,6.24,344.0,12.11,0.253032,0.975527
4,2021-05-10 00:00:00,50.0,19.177215,-0.087280,4.72,6.24,344.0,12.11,0.253032,1.278481
...,...,...,...,...,...,...,...,...,...,...
109900795,2022-08-19 23:55:00,90.0,341.822785,-15.385611,-1.92,-4.50,658.0,5.34,0.173477,22.788186
109900796,2022-08-19 23:55:00,90.0,346.367089,-15.406006,-1.92,-4.50,658.0,5.34,0.173477,23.091139
109900797,2022-08-19 23:55:00,90.0,350.911392,-15.427092,-1.92,-4.50,658.0,5.34,0.173477,23.394093
109900798,2022-08-19 23:55:00,90.0,355.455696,-15.448745,-1.92,-4.50,658.0,5.34,0.173477,23.697046


In [8]:
df = df.iloc[::100]

In [9]:
df['mlt_sin'] = np.sin(df['mlt']*(2.*np.pi/24))
df['mlt_cos'] = np.cos(df['mlt']*(2.*np.pi/24))

df['lon_sin'] = np.sin((df['lon']-1)*(2.*np.pi/360))
df['lon_cos'] = np.cos((df['lon']-1)*(2.*np.pi/360))

df = df.drop(columns=['dt', 'mlt', 'lon'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mlt_sin'] = np.sin(df['mlt']*(2.*np.pi/24))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mlt_cos'] = np.cos(df['mlt']*(2.*np.pi/24))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lon_sin'] = np.sin((df['lon']-1)*(2.*np.pi/360))
A value is trying to be set on a copy of a slice from a 

In [10]:
X = df.drop(columns=['potential'], axis=1).values
X_cols = df.drop(columns=['potential'], axis=1).columns
y = df['potential'].values

In [11]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [12]:
#train-test-val split is 80-20-10
X_train_full, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42) 
X_val_full, X_test_full, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train = X_train_full[:,:]
X_test = X_test_full[:,:]
X_val = X_val_full[:,:]

In [13]:
X_train_full.shape

(879206, 10)

In [14]:
class MonteCarloDropout(nn.Dropout):
    def forward(self, input):
        return nn.functional.dropout(input, self.p, True, self.inplace)

class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_size, 2056),
            nn.ReLU(),
            #MonteCarloDropout(0.2),
            nn.Linear(2056, 1024),
            nn.ReLU(),
            #MonteCarloDropout(0.2), 
	        nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            MonteCarloDropout(0.2), 
            nn.Linear(128, 1),
        )

    def forward(self, x):
        return self.layers(x)

In [15]:
# Convert the numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)

In [16]:
# Create a dataset and a dataloader for train and validation
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle validation data
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [17]:

# Instantiate the MLP model
input_size = X_train.shape[1]
#print('Input size', input_size)
model = MLP(input_size).to(device)

# Define the optimizer, scheulder and the loss function
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=10, gamma=0.6)
loss_function = nn.L1Loss()

# Track loss and accuracy over epochs
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

# Early stopping
best_val_loss = float('inf')
patience = 5
counter = 0

In [18]:

# Training loop
num_epochs = 150
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        #forward pass
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = loss_function(outputs, batch_y.unsqueeze(1))

        #backward pass
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    #scheduler.step()

    train_loss = epoch_loss / len(train_loader)
    train_losses.append(train_loss)

    # Validation
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0

    with torch.no_grad():
        for val_X, val_y in val_loader:
            val_X, val_y = val_X.to(device), val_y.to(device)
            val_outputs = model(val_X)
            val_loss += loss_function(val_outputs, val_y.unsqueeze(1)).item()

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    # Print the epoch results
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Early stopping based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        counter += 1
        if counter >= patience:
            print("Validation loss did not improve for", patience, "epochs. Stopping training.")
            end_time = dt.datetime.now()
            diff = end_time - start_time
            break

best_model = model
best_model.load_state_dict(torch.load('best_model.pth'))


Epoch [1/150], Train Loss: 4.1623, Val Loss: 3.9516
Epoch [2/150], Train Loss: 3.9200, Val Loss: 3.8655
Epoch [3/150], Train Loss: 3.8175, Val Loss: 3.7889
Epoch [4/150], Train Loss: 3.7348, Val Loss: 3.7428
Epoch [5/150], Train Loss: 3.6576, Val Loss: 3.6114
Epoch [6/150], Train Loss: 3.5873, Val Loss: 3.5902
Epoch [7/150], Train Loss: 3.5262, Val Loss: 3.5125
Epoch [8/150], Train Loss: 3.4727, Val Loss: 3.4619
Epoch [9/150], Train Loss: 3.4198, Val Loss: 3.4100
Epoch [10/150], Train Loss: 3.3725, Val Loss: 3.4496
Epoch [11/150], Train Loss: 3.3253, Val Loss: 3.3983
Epoch [12/150], Train Loss: 3.2784, Val Loss: 3.2777
Epoch [13/150], Train Loss: 3.2302, Val Loss: 3.2157
Epoch [14/150], Train Loss: 3.1879, Val Loss: 3.2252
Epoch [15/150], Train Loss: 3.1470, Val Loss: 3.1327
Epoch [16/150], Train Loss: 3.1110, Val Loss: 3.1325
Epoch [17/150], Train Loss: 3.0738, Val Loss: 3.1141
Epoch [18/150], Train Loss: 3.0401, Val Loss: 3.0794
Epoch [19/150], Train Loss: 3.0123, Val Loss: 3.0576
Ep

AttributeError: 'numpy.ndarray' object has no attribute 'datetime'

In [36]:
model_cpu = best_model.to('cpu')
torch.save(model_cpu.state_dict(), 'best_model.pt')

In [35]:
model.eval()

test_loss = 0.0
predictions = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        loss = loss_function(outputs, batch_y.unsqueeze(1))
        test_loss += loss.item()
        predictions.append(outputs.cpu().numpy())

test_loss /= len(test_loader)
predictions = np.concatenate(predictions)

print(f"Test Loss: {test_loss:.1f}")


Test Loss: 2.4


In [34]:
rmse = np.sqrt(np.mean((predictions.flatten() - y_test) ** 2))
mae = np.mean(np.abs(predictions.flatten() - y_test))
R = np.corrcoef(predictions.flatten(), y_test)[0, 1]

def skill(m, o):
    skill = 1 - (np.sum((m - o)**2) / 
            np.sum((o - np.mean(o))**2))

    return skill          

print(f"RMSE: {rmse:.1f}")
print(f"MAE: {mae:.1f}")
print(f"R: {R:.2f}")
print(f"Skill: {skill(predictions.flatten(), y_test):.2f}")

RMSE: 3.9
MAE: 2.4
R: 0.94
Skill: 0.88
