In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import xarray as xr

import joblib
import pickle

import pandas as pd
import numpy as np
import datetime as datetime
import os
import shutil

In [2]:
# Name of the REPPU field the network is trained to predict; it is used as the
# xarray variable name and as the dataframe column name throughout the notebook.
target = "conductivity"

In [3]:
# Identifier of this run; it is embedded in every artefact file name.
mid = "y-cond-3day"

# Directory that receives the scaler, checkpoint, loss curves and metrics.
path = f"/home/sachin/Documents/NIPR/Research/Data/ML/MLP/{mid}_outputs"

# Create the directory (and any missing parents) if it is absent. An existing
# directory is reused as-is: its contents are NOT cleared, so files from an
# earlier run with the same `mid` are overwritten one by one as they are saved.
os.makedirs(path, exist_ok=True)

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
#path = r'/home/ryuho/Documents/reddy/research/SMRAI/Data/REPPU/200/pbig5min.dat' #Ubuntu
path_reppu = r'/home/sachin/Documents/NIPR/Research/Data/REPPU/ybig5min.dat' #Server

# Read the REPPU output as a flat stream of float32 values. The file is raw
# binary, so the path is handed straight to numpy instead of going through a
# text-mode file object (which is not a valid way to read binary data).
rectype = np.dtype(np.float32)
reppu_data = np.fromfile(path_reppu, dtype=rectype) #size = 109,900,800

# One 30 x 80 grid per record; the number of records is inferred from the file.
reppu_data = reppu_data.reshape(-1, 30, 80)
reppu_data.shape

(45792, 30, 80)

In [6]:
# Each row of mhd_dates.csv gives the 'start' and 'end' of one simulated
# interval. Expand every interval into its individual dates (pd.date_range
# defaults to daily frequency) and stack them into a single Series.
mhd_data = pd.read_csv('mhd_dates.csv')
interval_dates = [
    pd.Series(pd.date_range(start, end))
    for start, end in zip(mhd_data['start'], mhd_data['end'])
]
expanded_dt = pd.concat(interval_dates)

In [7]:
# Temporal resolution of the data, in minutes.
time_res = 5

# Number of samples per day. Deriving it from the resolution keeps the two
# known cases (1 min -> 1440, 5 min -> 288) and stays correct for any other
# resolution instead of silently falling back to 288.
minutes_per_day = 24 * 60
if time_res <= 0 or minutes_per_day % time_res != 0:
    raise ValueError(f"time_res must be a positive divisor of {minutes_per_day}, got {time_res}")
time_step = minutes_per_day // time_res

In [8]:
# View the flat record stack as (day, sample-of-day, lat, lon). The reshape
# raises if the file does not hold exactly len(expanded_dt) * time_step grids,
# which doubles as a consistency check between the data file and the date list.
data_reshaped = reppu_data.reshape(len(expanded_dt), time_step, 30, 80)

# Coordinate axes
time = np.arange(time_step)      # sample index within one day
lat = np.linspace(50, 90, 30)    # 30 evenly spaced values from 50 to 90 (inclusive)
lon = np.linspace(1, 360, 80)    # 80 evenly spaced values from 1 to 360 (inclusive)
# NOTE(review): an earlier annotation gave the grid extent as 53.1-89.7 deg
# latitude and 1.6-357.6 deg longitude, which does not match the linspace
# bounds above - confirm which values describe the real REPPU grid.

# Absolute timestamp of every record: each date plus the offset of each
# sample within that day (sample index * resolution in minutes).
dt = np.array([
    day + pd.Timedelta(minutes=t*time_res)
    for day in expanded_dt
    for t in time
])

# Dataset with the target field on a (dt, lat, lon) grid.
ds = xr.Dataset(
    data_vars={target: (['dt', 'lat', 'lon'], data_reshaped.reshape(-1, 30, 80))},
    coords={'dt': dt, 'lat': lat, 'lon': lon},
)

# Unit handling for the 'potential' variant of this notebook (disabled here):
#ds['potential'].attrs['units'] = 'kV'
#ds['potential'] = ds['potential'] * 1e-3 # Convert to kV
ds

In [9]:
# Load the OMNI solar-wind table, index it by timestamp and fill gaps:
# forward-fill first, back-fill what is left at the start, then drop any row
# that still contains NaN.
#omni_df = pd.read_csv(omni_mhd_path+'omni_mhd_5min.csv')
omni_df = (
    pd.read_csv('omni_mhd_5min.csv')
    .set_index('dt')
    .ffill()
    .bfill()
    .dropna()
)

# Wrap the table in an xarray Dataset and turn the string index into datetimes
# so that it aligns with the 'dt' coordinate of the REPPU dataset.
omni_ds = xr.Dataset(omni_df)
omni_ds['dt'] = pd.to_datetime(omni_ds['dt'])

# Outer-join OMNI and REPPU on their shared coordinates.
reppu_omni_ds = ds.merge(omni_ds, join='outer')

# Express longitude in hours (15 degrees per hour), wrapped to [0, 24).
reppu_omni_ds['mlt'] = np.mod(reppu_omni_ds['lon'] / 15, 24)

# Put the records in chronological order.
reppu_omni_ds = reppu_omni_ds.sortby('dt')
reppu_omni_ds

In [10]:
# Days whose records are excluded from the dataset.
# NOTE(review): the reason for excluding these three days is not recorded in
# the notebook - document it here once confirmed.
excluded_days = ['2021-05-10', '2021-10-11', '2022-01-30']

# Every 5-minute timestamp (00:00 through 23:55) of every excluded day.
dates_to_remove = np.concatenate([
    pd.date_range(start=f'{day} 00:00:00', end=f'{day} 23:55:00', freq='5min').values
    for day in excluded_days
])

# Drop all of them in a single pass. One combined mask gives the same result
# as filtering day by day, without materialising an intermediate copy of the
# full dataset for each excluded day.
mask = ~reppu_omni_ds['dt'].isin(dates_to_remove)
reppu_omni_ds = reppu_omni_ds.where(mask, drop=True)
reppu_omni_ds

In [11]:
# Flatten the gridded dataset into one row per (dt, lat, lon) point and keep
# only the coordinates, the target and the OMNI drivers used as features.
selected_columns = [
    'dt', 'mlt', 'lon', 'lat', target,
    'BY_GSE', 'BZ_GSE', 'flow_speed', 'proton_density', 'tilt_angle',
]
df = reppu_omni_ds.to_dataframe().reset_index()[selected_columns]
df

Unnamed: 0,dt,mlt,lon,lat,conductivity,BY_GSE,BZ_GSE,flow_speed,proton_density,tilt_angle
0,2021-05-11 00:00:00,0.066667,1.000000,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476
1,2021-05-11 00:00:00,0.369620,5.544304,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476
2,2021-05-11 00:00:00,0.672574,10.088608,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476
3,2021-05-11 00:00:00,0.975527,14.632911,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476
4,2021-05-11 00:00:00,1.278481,19.177215,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476
...,...,...,...,...,...,...,...,...,...,...
107827195,2022-08-19 23:55:00,22.788186,341.822785,90.0,4.135466,-1.92,-4.50,658.0,5.34,0.173477
107827196,2022-08-19 23:55:00,23.091139,346.367089,90.0,4.135110,-1.92,-4.50,658.0,5.34,0.173477
107827197,2022-08-19 23:55:00,23.394093,350.911392,90.0,4.134816,-1.92,-4.50,658.0,5.34,0.173477
107827198,2022-08-19 23:55:00,23.697046,355.455696,90.0,4.134587,-1.92,-4.50,658.0,5.34,0.173477


In [12]:
#df = df.iloc[::1000]

In [13]:
#df = df.sample(frac=0.001, random_state=42).reset_index(drop=True)
#df

In [14]:
# Encode the two periodic coordinates as sine/cosine pairs so that values on
# either side of the wrap-around point end up close together.
mlt_angle = df['mlt']*(2.*np.pi/24)        # 24 h   -> one full turn
lon_angle = (df['lon']-1)*(2.*np.pi/360)   # 360 deg -> one full turn, origin at lon = 1

df['mlt_sin'] = np.sin(mlt_angle)
df['mlt_cos'] = np.cos(mlt_angle)

df['lon_sin'] = np.sin(lon_angle)
df['lon_cos'] = np.cos(lon_angle)

# The timestamp itself is not a model input.
# NOTE: this cell is not re-runnable as-is - a second run fails because 'dt'
# has already been removed.
df = df.drop(columns=['dt'])
df

Unnamed: 0,mlt,lon,lat,conductivity,BY_GSE,BZ_GSE,flow_speed,proton_density,tilt_angle,mlt_sin,mlt_cos,lon_sin,lon_cos
0,0.066667,1.000000,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476,0.017452,0.999848,0.000000,1.000000
1,0.369620,5.544304,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476,0.096615,0.995322,0.079230,0.996856
2,0.672574,10.088608,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476,0.175171,0.984538,0.157962,0.987445
3,0.975527,14.632911,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476,0.252625,0.967564,0.235700,0.971826
4,1.278481,19.177215,50.0,1.623050,4.20,-0.59,340.0,8.97,0.257476,0.328491,0.944507,0.311957,0.950096
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107827195,22.788186,341.822785,90.0,4.135466,-1.92,-4.50,658.0,5.34,0.173477,-0.311957,0.950096,-0.328491,0.944507
107827196,23.091139,346.367089,90.0,4.135110,-1.92,-4.50,658.0,5.34,0.173477,-0.235700,0.971826,-0.252625,0.967564
107827197,23.394093,350.911392,90.0,4.134816,-1.92,-4.50,658.0,5.34,0.173477,-0.157962,0.987445,-0.175171,0.984538
107827198,23.697046,355.455696,90.0,4.134587,-1.92,-4.50,658.0,5.34,0.173477,-0.079230,0.996856,-0.096615,0.995322


In [15]:
# Feature column labels: every column except the target, in dataframe order.
# Taking them from the column index avoids copying the whole frame a second
# time just to read its labels.
X_cols = df.columns.drop(target)

# Feature matrix and target vector as plain numpy arrays.
X = df[X_cols].values
y = df[target].values

In [16]:
# Rescale every feature column to [0, 1].
# NOTE(review): the scaler is fitted on ALL rows, i.e. before the
# train/val/test split, so the held-out rows contribute to the min/max -
# consider fitting on the training rows only.
# NOTE: the scaler covers every column of X, including the two leading columns
# that are sliced off before the data reaches the network, so anything that
# reuses the pickled scaler must supply the full column set.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

# Persist the fitted scaler next to the other run artefacts.
file_path = f'{path}/scaler_{mid}.pkl'
with open(file_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [17]:
# Random 80 / 10 / 10 split: 20 % of the rows are held out first, then that
# hold-out is halved into validation and test sets.
# NOTE(review): rows are shuffled individually, so grid points belonging to
# the same timestamp can land in different splits - confirm this is intended.
X_train_full, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_val_full, X_test_full, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

# Model inputs skip the first two columns (raw 'mlt' and 'lon'); the *_full
# arrays keep them so they can be written out alongside the predictions.
X_train, X_test, X_val = (
    full[:, 2:] for full in (X_train_full, X_test_full, X_val_full)
)

In [18]:
class MonteCarloDropout(nn.Dropout):
    """Dropout that is applied on every forward pass.

    Unlike ``nn.Dropout``, the layer ignores the module's train/eval state:
    elements are still dropped (and the rest rescaled) after ``model.eval()``,
    so repeated forward passes on the same input give different outputs.
    """

    def forward(self, input):
        """Apply dropout with probability ``self.p`` regardless of ``self.training``."""
        return nn.functional.dropout(
            input,
            p=self.p,
            training=True,  # forced on: this is what keeps dropout active in eval mode
            inplace=self.inplace,
        )

class MLP(nn.Module):
    """Fully connected regressor: input -> 2056 -> 1024 -> 512 -> 256 -> 128 -> 1.

    Every hidden layer is followed by a ReLU. A single ``MonteCarloDropout(0.2)``
    sits between the last hidden activation and the output layer.

    The modules are registered in one ``nn.Sequential`` named ``layers``; their
    order determines the ``state_dict`` keys, so changing it invalidates saved
    checkpoints.
    """

    def __init__(self, input_size):
        """Build the network for inputs with ``input_size`` features."""
        super().__init__()

        # NOTE(review): 2056 may have been meant as 2048 - changing it would
        # make existing checkpoints unloadable.
        hidden_widths = (2056, 1024, 512, 256, 128)

        modules = []
        in_features = input_size
        for width in hidden_widths:
            modules.append(nn.Linear(in_features, width))
            modules.append(nn.ReLU())
            in_features = width

        # Dropout only before the output layer.
        modules.append(MonteCarloDropout(0.2))
        modules.append(nn.Linear(in_features, 1))

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Return the prediction for ``x``; the output has one value per input row."""
        return self.layers(x)

In [19]:
def _as_device_tensor(array):
    """Copy a numpy array into a float32 tensor that lives on `device`."""
    return torch.tensor(array, dtype=torch.float32).to(device)


# The complete train / test / validation arrays are moved to the device up front.
X_train_tensor, y_train_tensor = _as_device_tensor(X_train), _as_device_tensor(y_train)
X_test_tensor, y_test_tensor = _as_device_tensor(X_test), _as_device_tensor(y_test)
X_val_tensor, y_val_tensor = _as_device_tensor(X_val), _as_device_tensor(y_val)

# Shapes, in train / test / val order (features first, then targets).
(X_train_tensor.shape, y_train_tensor.shape,
 X_test_tensor.shape, y_test_tensor.shape,
 X_val_tensor.shape, y_val_tensor.shape)

(torch.Size([86261760, 10]),
 torch.Size([86261760]),
 torch.Size([10782720, 10]),
 torch.Size([10782720]),
 torch.Size([10782720, 10]),
 torch.Size([10782720]))

In [20]:
# Mini-batch size shared by all three loaders.
batch_size = 1024

# Training batches are reshuffled on every pass over the data.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation and test batches keep a fixed order.
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [21]:
# Sanity check: print the shape of the first test batch.
# The loop variables are deliberately NOT called `x` / `y`: `y` is the
# notebook-level target array, and overwriting it with a single batch would
# break the train/test split cell if it were re-run afterwards.
for sample_X, sample_y in test_loader:
    print(sample_X.shape, sample_y.shape)
    break

torch.Size([1024, 10]) torch.Size([1024])


In [22]:
# Seed torch's global RNG so that weight initialisation, training-batch
# shuffling and dropout masks start from the same state on every run
# (42 matches the random_state used for the data split). GPU kernels may
# still introduce small run-to-run differences.
torch.manual_seed(42)

# Instantiate the MLP model; its input width is the number of feature columns.
input_size = X_train.shape[1]
#print('Input size', input_size)
model = MLP(input_size).to(device)

# Define the optimizer, scheduler and the loss function:
#  - Adam with an initial learning rate of 1e-3,
#  - learning rate multiplied by 0.6 every 10 epochs,
#  - mean absolute error as the training objective.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=10, gamma=0.6)
loss_function = nn.L1Loss()

# Track loss and accuracy over epochs
# (the *_accuracies lists are not filled by the training loop in this notebook)
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

# Early stopping: give up after `patience` consecutive epochs without a new
# best validation loss.
best_val_loss = float('inf')
patience = 4
counter = 0

In [23]:
start_time = datetime.datetime.now()

# The weights with the lowest validation loss seen so far are kept here.
checkpoint_file = f'{path}/model_{mid}.pt'

# Training loop
num_epochs = 120
for epoch in range(num_epochs):

    # ---- one optimisation pass over the training set ----
    model.train()
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        # Targets are reshaped from (batch,) to (batch, 1) to match the model output.
        loss = loss_function(model(batch_X), batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    # Mean of the per-batch losses.
    train_loss = epoch_loss / len(train_loader)
    train_losses.append(train_loss)

    # ---- validation pass (no gradients) ----
    # eval() does not switch off MonteCarloDropout, so this loss is stochastic.
    model.eval()
    with torch.no_grad():
        val_batch_losses = [
            loss_function(model(val_X.to(device)), val_y.to(device).unsqueeze(1)).item()
            for val_X, val_y in val_loader
        ]

    val_loss = sum(val_batch_losses) / len(val_loader)
    val_losses.append(val_loss)

    # Print the epoch results
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # ---- early stopping / checkpointing ----
    if not (val_loss < best_val_loss):
        # No improvement: count the stale epoch and stop once patience runs out.
        counter += 1
        if counter >= patience:
            print("Validation loss did not improve for", patience, "epochs. Stopping training.")
            break
    else:
        # New best: remember it, reset the stale counter and save the weights.
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), checkpoint_file)

end_time = datetime.datetime.now()
diff = end_time - start_time

# `best_model` is the same object as `model`; loading the checkpoint rolls its
# weights back to the best-validation epoch.
best_model = model
best_model.load_state_dict(torch.load(checkpoint_file))

Epoch [1/120], Train Loss: 0.2543, Val Loss: 0.2153
Epoch [2/120], Train Loss: 0.2071, Val Loss: 0.2025
Epoch [3/120], Train Loss: 0.1945, Val Loss: 0.1938
Epoch [4/120], Train Loss: 0.1874, Val Loss: 0.1851
Epoch [5/120], Train Loss: 0.1836, Val Loss: 0.1835
Epoch [6/120], Train Loss: 0.1813, Val Loss: 0.1792
Epoch [7/120], Train Loss: 0.1795, Val Loss: 0.1779
Epoch [8/120], Train Loss: 0.1781, Val Loss: 0.1778
Epoch [9/120], Train Loss: 0.1769, Val Loss: 0.1755
Epoch [10/120], Train Loss: 0.1755, Val Loss: 0.1744
Epoch [11/120], Train Loss: 0.1695, Val Loss: 0.1689
Epoch [12/120], Train Loss: 0.1687, Val Loss: 0.1681
Epoch [13/120], Train Loss: 0.1681, Val Loss: 0.1678
Epoch [14/120], Train Loss: 0.1676, Val Loss: 0.1672
Epoch [15/120], Train Loss: 0.1672, Val Loss: 0.1673
Epoch [16/120], Train Loss: 0.1668, Val Loss: 0.1659
Epoch [17/120], Train Loss: 0.1665, Val Loss: 0.1660
Epoch [18/120], Train Loss: 0.1661, Val Loss: 0.1659
Epoch [19/120], Train Loss: 0.1657, Val Loss: 0.1646
Ep

<All keys matched successfully>

In [24]:
# Evaluate the restored best model on the held-out test set.
# eval() does not disable MonteCarloDropout, so these predictions are a single
# stochastic forward pass.
best_model.eval()

test_batch_losses = []
test_batch_outputs = []

with torch.no_grad():
    for test_X, test_y in test_loader:
        test_X = test_X.to(device)
        test_y = test_y.to(device)

        test_outputs = best_model(test_X)
        test_batch_losses.append(loss_function(test_outputs, test_y.unsqueeze(1)).item())
        test_batch_outputs.append(test_outputs)

# Join the per-batch outputs into one tensor and bring it back to the CPU.
predictions = torch.cat(test_batch_outputs).cpu()

# Mean of the per-batch losses.
test_loss = sum(test_batch_losses) / len(test_loader)
print(f"Test Loss: {test_loss:.4f}")

Test Loss: 0.1506


In [25]:
# Flatten predictions (shape (N, 1) tensor on the CPU) and targets into 1-D
# numpy arrays of equal length so they can be compared element-wise.
y_pred = predictions.numpy().flatten()
y_test = np.array(y_test).ravel()

In [26]:
# Test-set error metrics.
residual = y_pred - y_test

rmse = np.sqrt(np.mean(residual**2))       # root-mean-square error
mae = np.mean(np.abs(residual))            # mean absolute error
R = np.corrcoef(y_pred, y_test)[0, 1]      # Pearson correlation coefficient

def skill(m, o):
    """Return the skill score of model values `m` against observations `o`.

    Computed as ``1 - sum((m - o)**2) / sum((o - mean(o))**2)``: 1 for a
    perfect match, 0 when the model is no better than always predicting the
    mean of `o`, negative when it is worse.
    """
    squared_error = np.sum((m - o)**2)
    spread_about_mean = np.sum((o - np.mean(o))**2)
    return 1 - (squared_error / spread_about_mean)

# Report the test metrics. RMSE and MAE carry the units of the target field,
# so the label names the target instead of hard-coding "kV" (kV applies to the
# potential field, not to conductivity).
# NOTE(review): replace the label with the actual physical unit once confirmed.
unit_label = f"{target} units"

print(f"RMSE: {rmse:.1f} [{unit_label}]")
print(f"MAE: {mae:.1f} [{unit_label}]")
print(f"R: {R:.2f}/1")
print(f"Skill: {skill(y_pred, y_test):.2f}/1")

RMSE: 0.3 [kV]
MAE: 0.2 [kV]
R: 0.99/1
Skill: 0.98/1


In [27]:
# Save the per-epoch training and validation losses (one row per epoch).
curves_file = f'{path}/train-val-curves_{mid}.csv'
loss_df = pd.DataFrame({
    'train_losses': train_losses,
    'val_losses': val_losses,
})
loss_df.to_csv(curves_file)

In [28]:
# Save the test rows together with the true and predicted target values.
# NOTE: the feature columns are written as stored in X_test_full, i.e. in
# their min-max-scaled form; apply the saved scaler's inverse transform to
# recover physical values.
y_df = pd.DataFrame(X_test_full, columns=X_cols).assign(
    y_test=y_test,
    y_pred=y_pred,
)
y_df.to_csv(f'{path}/y_df_{mid}.csv', index=False)

In [29]:
# Write a small "unit,value" summary of the run.
# Train/Val Loss are the values of the LAST epoch that ran, which is not
# necessarily the epoch whose weights were checkpointed.
run_info_lines = [
    'unit,value\n',
    f'Epochs,{epoch+1}\n',
    f'Total Time,{diff}\n',
    f'Train Loss,{train_loss:.2f}\n',
    f'Val Loss,{val_loss:.2f}\n',
    f'Test Loss,{test_loss:.2f}\n',
    f'RMSE,{rmse:.2f}\n',
    f'MAE,{mae:.2f}\n',
    f'R,{R:.2f}\n',
    #f'MC Test Loss,{(mc_loss.item()):.2f}\n',
]
with open(f'{path}/run-info_{mid}.txt', 'w') as info_file:
    info_file.writelines(run_info_lines)