In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import xarray as xr

import joblib
import pickle

import pandas as pd
import numpy as np
import datetime as datetime
import os
import shutil

In [28]:
mid = "1pct"  # run identifier; prefixes every artefact written by this notebook
path = "/home/sachin/Documents/NIPR/Research/Data/ML/MLP"  # root directory for outputs
# Make sure the per-run output directory exists. An existing directory is left
# untouched, and missing parent directories are created as well.
os.makedirs(f'{path}/{mid}_outputs', exist_ok=True)

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Location of the raw REPPU file. A dedicated name is used so the output
# directory stored in `path` (needed later when saving artefacts) is not overwritten.
#reppu_path = r'/home/ryuho/Documents/reddy/research/SMRAI/Data/REPPU/200/pbig5min.dat' #Ubuntu
reppu_path = r'/home/sachin/Documents/NIPR/Research/Data/REPPU/pbig5min.dat' #Server

#read the REPPU data: a flat stream of float32 values
rectype = np.dtype(np.float32)
# np.fromfile opens the file itself in binary mode (no text-mode handle involved)
reppu_data = np.fromfile(reppu_path, dtype=rectype) #size = 109,900,800

# Regroup the flat stream into individual 30 x 80 maps
reppu_data = reppu_data.reshape(-1, 30, 80)
reppu_data.shape

(45792, 30, 80)

In [5]:
# Table of date intervals; the 'start' and 'end' columns are read below
mhd_data = pd.read_csv('mhd_dates.csv')

# Expand every [start, end] interval into its individual dates and stack them
daily_ranges = [
    pd.Series(pd.date_range(start, end))
    for start, end in zip(mhd_data['start'], mhd_data['end'])
]
expanded_dt = pd.concat(daily_ranges)

In [6]:
time_res = 5  # sampling interval in minutes

# Number of samples per day follows directly from the sampling interval
# (1 min -> 1440, 5 min -> 288); reject intervals that do not tile a day.
MINUTES_PER_DAY = 24 * 60
if time_res <= 0 or MINUTES_PER_DAY % time_res != 0:
    raise ValueError(f"time_res must be a positive divisor of {MINUTES_PER_DAY}, got {time_res}")
time_step = MINUTES_PER_DAY // time_res

In [7]:
# Group the maps by day. This reshape also acts as a consistency check: it raises
# unless the file holds exactly `time_step` maps of 30 x 80 values for every
# date in `expanded_dt`.
data_reshaped = reppu_data.reshape(len(expanded_dt), time_step, 30, 80)

# Define coordinates
time = np.arange(time_step)  # index of each sample within a day
# NOTE(review): the grids below span 50-90 deg (lat) and 1-360 deg (lon), while the
# earlier notes on these lines quoted 53.1-89.7 deg and 1.6-357.6 deg -- confirm
# which grid the REPPU output actually uses.
lat = np.linspace(50, 90, 30)  # 30 latitude points
lon = np.linspace(1, 360, 80)  # 80 longitude points

# Create 'dt': one timestamp per map, i.e. every date expanded into `time_step`
# samples spaced `time_res` minutes apart (day-major order, matching the data)
dt = np.array([
    day + pd.Timedelta(minutes=t * time_res)
    for day in expanded_dt
    for t in time
])

# Create xarray Dataset
ds = xr.Dataset(
    {'potential': (['dt', 'lat', 'lon'], data_reshaped.reshape(-1, 30, 80))},
    coords={'dt': dt, 'lat': lat, 'lon': lon},
)

# Convert to kV (scale by 1e-3) and only then attach the units label:
# xarray arithmetic drops attributes by default, so setting the attribute
# before the multiplication would lose it.
ds['potential'] = (ds['potential'] * 1e-3).assign_attrs(units='kV')
ds

In [8]:
# Load the OMNI table, index it by its 'dt' column, fill gaps forwards then
# backwards, and drop any rows that still contain NaNs
omni_df = (
    pd.read_csv('omni_mhd_5min.csv')
    .set_index('dt')
    .ffill()
    .bfill()
    .dropna()
)

# Wrap the table in an xarray Dataset and turn its 'dt' index into datetimes
omni_ds = xr.Dataset(omni_df)
omni_ds['dt'] = pd.to_datetime(omni_ds['dt'])

# Merge OMNI with the REPPU data (outer join keeps timestamps from both sides)
reppu_omni_ds = ds.merge(omni_ds, join='outer')

# Magnetic local time derived from longitude: 15 degrees per hour, wrapped to [0, 24)
reppu_omni_ds['mlt'] = np.mod(reppu_omni_ds['lon'] / 15, 24)

# Order chronologically
reppu_omni_ds = reppu_omni_ds.sortby('dt')
reppu_omni_ds

In [9]:
# Flatten the merged dataset into one row per (dt, lat, lon) and keep only the
# coordinate, target and driver columns used downstream
selected_columns = ['dt', 'mlt', 'lon', 'lat', 'potential',
                    'BY_GSE', 'BZ_GSE', 'flow_speed', 'proton_density', 'tilt_angle']
df = reppu_omni_ds.to_dataframe().reset_index()[selected_columns]
df

Unnamed: 0,dt,mlt,lon,lat,potential,BY_GSE,BZ_GSE,flow_speed,proton_density,tilt_angle
0,2021-05-10 00:00:00,0.066667,1.000000,50.0,0.690854,4.72,6.24,344.0,12.11,0.253032
1,2021-05-10 00:00:00,0.369620,5.544304,50.0,0.507001,4.72,6.24,344.0,12.11,0.253032
2,2021-05-10 00:00:00,0.672574,10.088608,50.0,0.314646,4.72,6.24,344.0,12.11,0.253032
3,2021-05-10 00:00:00,0.975527,14.632911,50.0,0.115738,4.72,6.24,344.0,12.11,0.253032
4,2021-05-10 00:00:00,1.278481,19.177215,50.0,-0.087280,4.72,6.24,344.0,12.11,0.253032
...,...,...,...,...,...,...,...,...,...,...
109900795,2022-08-19 23:55:00,22.788186,341.822785,90.0,-15.385611,-1.92,-4.50,658.0,5.34,0.173477
109900796,2022-08-19 23:55:00,23.091139,346.367089,90.0,-15.406006,-1.92,-4.50,658.0,5.34,0.173477
109900797,2022-08-19 23:55:00,23.394093,350.911392,90.0,-15.427092,-1.92,-4.50,658.0,5.34,0.173477
109900798,2022-08-19 23:55:00,23.697046,355.455696,90.0,-15.448745,-1.92,-4.50,658.0,5.34,0.173477


In [10]:
#df = df.iloc[::1000]

In [11]:
# Keep a reproducible random 1% of the rows and renumber them from zero
subsample = df.sample(frac=0.01, random_state=42)
df = subsample.reset_index(drop=True)
df

Unnamed: 0,dt,mlt,lon,lat,potential,BY_GSE,BZ_GSE,flow_speed,proton_density,tilt_angle
0,2022-02-01 20:45:00,4.610970,69.164557,54.137931,0.044543,2.26,1.21,345.0,3.52,-0.200169
1,2021-12-16 09:05:00,22.788186,341.822785,90.000000,-11.681410,0.68,-3.16,535.0,4.87,-0.471622
2,2021-11-25 23:10:00,3.096203,46.443038,59.655172,-2.363475,0.23,-1.26,386.0,2.50,-0.382916
3,2021-12-19 03:45:00,17.637975,264.569620,88.620690,1.686218,0.31,2.53,336.0,8.76,-0.555447
4,2021-11-05 22:45:00,12.487764,187.316456,72.068966,1.045904,1.72,2.18,534.0,10.54,-0.282381
...,...,...,...,...,...,...,...,...,...,...
1099003,2022-02-03 13:30:00,15.820253,237.303797,90.000000,-45.016697,-5.48,-0.75,525.0,12.67,-0.188079
1099004,2022-06-14 20:55:00,13.699578,205.493671,63.793103,0.889530,5.20,-0.01,487.0,3.85,0.480349
1099005,2021-10-13 10:35:00,21.879325,328.189873,65.172414,7.066214,2.34,0.96,427.0,2.38,-0.137980
1099006,2022-07-23 03:00:00,21.273418,319.101266,81.724138,-8.612472,1.24,-5.15,582.0,5.96,0.208338


In [12]:
# Encode the periodic coordinates as sine/cosine pairs
mlt_phase = df['mlt']*(2.*np.pi/24)        # 24 h of MLT mapped onto one full turn
lon_phase = (df['lon']-1)*(2.*np.pi/360)   # longitude shifted by 1 degree, then mapped onto one full turn

df = df.assign(
    mlt_sin=np.sin(mlt_phase),
    mlt_cos=np.cos(mlt_phase),
    lon_sin=np.sin(lon_phase),
    lon_cos=np.cos(lon_phase),
).drop(columns=['dt'])  # the timestamp column is not carried forward
df

Unnamed: 0,mlt,lon,lat,potential,BY_GSE,BZ_GSE,flow_speed,proton_density,tilt_angle,mlt_sin,mlt_cos,lon_sin,lon_cos
0,4.610970,69.164557,54.137931,0.044543,2.26,1.21,345.0,3.52,-0.200169,0.934606,0.355685,0.928256,0.371942
1,22.788186,341.822785,90.000000,-11.681410,0.68,-3.16,535.0,4.87,-0.471622,-0.311957,0.950096,-0.328491,0.944507
2,3.096203,46.443038,59.655172,-2.363475,0.23,-1.26,386.0,2.50,-0.382916,0.724690,0.689075,0.712553,0.701618
3,17.637975,264.569620,88.620690,1.686218,0.31,2.53,336.0,8.76,-0.555447,-0.995512,-0.094636,-0.993709,-0.111996
4,12.487764,187.316456,72.068966,1.045904,1.72,2.18,534.0,10.54,-0.282381,-0.127349,-0.991858,-0.110020,-0.993929
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099003,15.820253,237.303797,90.000000,-45.016697,-5.48,-0.75,525.0,12.67,-0.188079,-0.841547,-0.540185,-0.831991,-0.554789
1099004,13.699578,205.493671,63.793103,0.889530,5.20,-0.01,487.0,3.85,0.480349,-0.430411,-0.902633,-0.414593,-0.910007
1099005,21.879325,328.189873,65.172414,7.066214,2.34,0.96,427.0,2.38,-0.137980,-0.527106,0.849800,-0.541857,0.840471
1099006,21.273418,319.101266,81.724138,-8.612472,1.24,-5.15,582.0,5.96,0.208338,-0.654724,0.755868,-0.667816,0.744326


In [13]:
# Separate the predictors from the target column 'potential'
feature_frame = df.drop(columns=['potential'])
X = feature_frame.values
X_cols = feature_frame.columns  # kept so the feature names can be restored later
y = df['potential'].values

In [14]:
# Fit a min-max scaler on every feature column, then rescale X with it
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)
# NOTE(review): the scaler is fitted on all rows before the train/val/test split,
# so its min/max also reflect validation and test samples -- confirm this is intended.

# Persist the fitted scaler with pickle
file_path = f'{path}/{mid}_scaler.pkl'
with open(file_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [15]:
# Train/validation/test split is 80/10/10: hold out 20% first, then halve that hold-out
X_train_full, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val_full, X_test_full, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Model inputs skip the first two feature columns; the *_full arrays keep all of them
X_train, X_test, X_val = (
    split[:, 2:] for split in (X_train_full, X_test_full, X_val_full)
)

In [16]:
class MonteCarloDropout(nn.Dropout):
    """Dropout layer that stays stochastic in both train and eval mode.

    The functional dropout is always invoked with ``training=True``, so putting
    the parent module in eval mode does not switch this layer off.
    """

    def forward(self, input):
        keep_active = True  # ignore self.training on purpose
        return nn.functional.dropout(
            input, p=self.p, training=keep_active, inplace=self.inplace
        )

class MLP(nn.Module):
    """Fully connected regressor mapping `input_size` features to one output.

    Five Linear+ReLU stages (2056, 1024, 512, 256, 128 units) are followed by an
    always-on MonteCarloDropout(0.2) and a final Linear layer with a single output.
    """

    def __init__(self, input_size):
        super().__init__()

        hidden_widths = (2056, 1024, 512, 256, 128)

        # Stack Linear -> ReLU for each hidden width, in order
        stack = []
        fan_in = input_size
        for width in hidden_widths:
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU())
            fan_in = width

        # Dropout sits directly before the output layer
        stack.append(MonteCarloDropout(0.2))
        stack.append(nn.Linear(fan_in, 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run `x` through the layer stack and return the result."""
        return self.layers(x)

In [17]:
def _to_device_tensor(array):
    """Copy a NumPy array into a float32 tensor placed on `device`."""
    return torch.tensor(array, dtype=torch.float32).to(device)

# Features and targets of each split as float32 tensors on the selected device
X_train_tensor = _to_device_tensor(X_train)
y_train_tensor = _to_device_tensor(y_train)
X_test_tensor = _to_device_tensor(X_test)
y_test_tensor = _to_device_tensor(y_test)
X_val_tensor = _to_device_tensor(X_val)
y_val_tensor = _to_device_tensor(y_val)

# Display all shapes as a quick sanity check
(
    X_train_tensor.shape, y_train_tensor.shape,
    X_test_tensor.shape, y_test_tensor.shape,
    X_val_tensor.shape, y_val_tensor.shape,
)

(torch.Size([879206, 10]),
 torch.Size([879206]),
 torch.Size([109901, 10]),
 torch.Size([109901]),
 torch.Size([109901, 10]),
 torch.Size([109901]))

In [18]:
batch_size = 512

# Pair features with targets for each split
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled; validation and test keep their order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_dataset)
)

In [19]:
#check batch size: peek at the first test batch only.
# Dedicated names are used so the global label array `y` is not overwritten.
sample_X, sample_y = next(iter(test_loader))
print(sample_X.shape, sample_y.shape)

torch.Size([512, 10]) torch.Size([512])


In [20]:
# Seed PyTorch so weight initialisation, batch shuffling and dropout masks can be
# repeated between runs (some CUDA kernels may still be non-deterministic).
torch.manual_seed(42)

#Instantiate the MLP model
input_size = X_train.shape[1]  # number of feature columns fed to the network
model = MLP(input_size).to(device)

# Define the optimizer, scheduler and the loss function
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=10, gamma=0.6)  # multiply the LR by 0.6 every 10 scheduler steps
loss_function = nn.L1Loss()  # mean absolute error

# Track loss and accuracy over epochs
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

# Early stopping
best_val_loss = float('inf')
patience = 4  # epochs without validation improvement tolerated before stopping
counter = 0   # consecutive epochs without improvement

In [21]:
start_time = datetime.datetime.now()

# Where the best weights are checkpointed. The folder is created up front so the
# first torch.save cannot fail on a missing directory.
checkpoint_path = f'./{mid}_outputs/{mid}_model.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        #forward pass
        optimizer.zero_grad()
        outputs = model(batch_X)
        # targets are 1-D; add a trailing axis so they line up with the (batch, 1) outputs
        loss = loss_function(outputs, batch_y.unsqueeze(1))

        #backward pass
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    # Mean of the per-batch training losses
    train_loss = epoch_loss / len(train_loader)
    train_losses.append(train_loss)

    # Validation
    # NOTE: MonteCarloDropout always applies dropout, so it stays active in eval
    # mode and the validation loss is itself stochastic.
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0

    with torch.no_grad():
        for val_X, val_y in val_loader:
            val_X, val_y = val_X.to(device), val_y.to(device)
            val_outputs = model(val_X)
            val_loss += loss_function(val_outputs, val_y.unsqueeze(1)).item()

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    # Print the epoch results
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Early stopping based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), checkpoint_path) #save the best model
    else:
        counter += 1
        if counter >= patience:
            print("Validation loss did not improve for", patience, "epochs. Stopping training.")
            break

# Record the elapsed time whether training stopped early or used every epoch
end_time = datetime.datetime.now()
diff = end_time - start_time

# Restore the best checkpoint into the model; `best_model` is the same object as
# `model`. map_location keeps the load working on a machine without the device
# the weights were saved from.
best_model = model
best_model.load_state_dict(torch.load(checkpoint_path, map_location=device))

Epoch [1/100], Train Loss: 4.4827, Val Loss: 4.2706
Epoch [2/100], Train Loss: 4.1708, Val Loss: 4.2221
Epoch [3/100], Train Loss: 4.0788, Val Loss: 4.0297
Epoch [4/100], Train Loss: 3.9981, Val Loss: 3.9276
Epoch [5/100], Train Loss: 3.9279, Val Loss: 3.8834
Epoch [6/100], Train Loss: 3.8587, Val Loss: 3.8538
Epoch [7/100], Train Loss: 3.7833, Val Loss: 3.7222
Epoch [8/100], Train Loss: 3.7042, Val Loss: 3.6970
Epoch [9/100], Train Loss: 3.6385, Val Loss: 3.6141
Epoch [10/100], Train Loss: 3.5691, Val Loss: 3.5429
Epoch [11/100], Train Loss: 3.4389, Val Loss: 3.4351
Epoch [12/100], Train Loss: 3.3788, Val Loss: 3.4142
Epoch [13/100], Train Loss: 3.3230, Val Loss: 3.3425
Epoch [14/100], Train Loss: 3.2724, Val Loss: 3.3322
Epoch [15/100], Train Loss: 3.2185, Val Loss: 3.2579
Epoch [16/100], Train Loss: 3.1727, Val Loss: 3.2193
Epoch [17/100], Train Loss: 3.1284, Val Loss: 3.1815
Epoch [18/100], Train Loss: 3.0834, Val Loss: 3.1813
Epoch [19/100], Train Loss: 3.0487, Val Loss: 3.1118
Ep

<All keys matched successfully>

In [22]:
# Evaluate the restored model on the held-out test batches
best_model.eval()
predictions = []
test_loss = 0.0

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        batch_pred = best_model(batch_X)
        batch_loss = loss_function(batch_pred, batch_y.unsqueeze(1))
        test_loss += batch_loss.item()
        predictions.append(batch_pred)

# Join the per-batch outputs into a single tensor and move it to the CPU
predictions = torch.cat(predictions).cpu()

# Average of the per-batch losses
test_loss = test_loss / len(test_loader)
print(f"Test Loss: {test_loss:.4f}")

Test Loss: 2.5770


In [23]:
# Flatten predictions and targets into 1-D NumPy arrays for the metrics below
y_pred, y_test = (
    np.asarray(values).flatten() for values in (predictions, y_test)
)

In [24]:
# Error metrics between predictions and targets
residual = y_pred - y_test
rmse = np.sqrt(np.mean(residual**2))    # root-mean-square error
mae = np.mean(np.abs(residual))         # mean absolute error
R = np.corrcoef(y_pred, y_test)[0, 1]   # Pearson correlation coefficient

def skill(m, o):
    """Return the skill score of predictions `m` against observations `o`.

    Computed as ``1 - sum((m - o)**2) / sum((o - mean(o))**2)``: 1 for a perfect
    match and 0 when `m` does no better than predicting the mean of `o`.

    Parameters
    ----------
    m : array-like
        Model predictions.
    o : array-like
        Observed values, same shape as `m`.

    Returns
    -------
    float
        The skill score. Not finite when `o` has zero variance (division by zero).
    """
    # Accept plain lists/tuples as well as NumPy arrays
    m = np.asarray(m)
    o = np.asarray(o)
    residual_ss = np.sum((m - o)**2)
    total_ss = np.sum((o - np.mean(o))**2)
    return 1 - (residual_ss / total_ss)

# Report the metrics, one per line
print(
    f"RMSE: {rmse:.1f} [kV]",
    f"MAE: {mae:.1f} [kV]",
    f"R: {R:.2f}/1",
    f"Skill: {skill(y_pred, y_test):.2f}/1",
    sep="\n",
)

RMSE: 4.2 [kV]
MAE: 2.6 [kV]
R: 0.93/1
Skill: 0.87/1


In [25]:
# Export the test rows (all feature columns, as stored in X_test_full) together
# with the true and predicted target values
y_df = pd.DataFrame(X_test_full, columns=X_cols).assign(
    y_test=y_test,
    y_pred=y_pred,
)
y_df.to_csv(f'{path}/{mid}_y_df.csv', index=False)