# For Kaggle

In [None]:
# !pip install gdown
# !gdown 1zDCi0nnxjP3so8wPwc5JGIj55lWimFw4

# !gdown 1p4cBOvRvSsUYdRdkjo4CPFiBqHyjw_87

# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle as pk
from FS.pso import jfs

# Seeds

In [None]:
seed_value = 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): presumably this only takes effect for Python processes started
# after this point, not for the already-running kernel — confirm.
import os

os.environ["PYTHONHASHSEED"] = str(seed_value)

# 2. Set the built-in `random` pseudo-random generator at a fixed value
import random

random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generators at a fixed value
np.random.seed(seed_value)  # legacy global generator
# A Generator only helps if it is kept and used; previously the result was discarded.
rng = np.random.default_rng(seed=seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)

# Remove limit on dataframe columns
pd.set_option("display.max_columns", None)

# Utility

In [None]:
# Helper functions

# Input a series containing X and y to create a windowed dataset
def create_multi_features_windowed_dataset(series, window_size, horizon, batch_size, shuffle_buffer):
    """Turn a 2-D series of features plus target into shuffled (X, y) training batches.

    Args:
        series: 2-D data whose last column is the target; every other column is
            used as a model input.
        window_size: Number of consecutive rows that form one input window.
        horizon: Number of rows following the window that form the target.
        batch_size: Number of samples per batch.
        shuffle_buffer: Buffer size handed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` of ``(X, y)`` batches, where each ``X`` holds the
        first ``window_size`` rows of all columns but the last and each ``y``
        holds the last ``horizon`` rows of the last column.
    """
    sample_length = window_size + horizon
    ds = tf.data.Dataset.from_tensor_slices(series)
    # Slide one row at a time; incomplete trailing windows are dropped.
    ds = ds.window(sample_length, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(sample_length))
    # Use the caller's buffer size (this used to read a notebook-level global).
    ds = ds.shuffle(shuffle_buffer)
    ds = ds.map(lambda w: (w[:-horizon, :-1], w[-horizon:, -1]))
    ds = ds.batch(batch_size).prefetch(1)
    return ds

# Input a series containing X only to get Kp for the next timestep
def forecast_next_timestep(model, series, window_size, batch_size=64):
    """Run the model over every sliding window of ``series``.

    Args:
        model: Object exposing ``predict``; called once on the batched windows.
        series: 2-D input data (model inputs only, no separate target column).
        window_size: Number of consecutive rows per input window.
        batch_size: Number of windows per prediction batch.

    Returns:
        The output of ``model.predict`` with singleton dimensions squeezed out,
        one entry per complete window.
    """
    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        # Slide one row at a time; incomplete trailing windows are dropped.
        .window(window_size, shift=1, drop_remainder=True)
        .flat_map(lambda window: window.batch(window_size))
    )
    batches = windows.batch(batch_size).prefetch(1)
    return model.predict(batches).squeeze()

# Preprocessing

In [None]:
# Read the whitespace-separated raw file, which has no header row.
# The separator is a raw string so that `\s` is not treated as a (invalid) string escape.
df = pd.read_table("./omni2_all_years.dat", sep=r"\s+", header=None)

In [None]:
# Export the raw table to CSV (without the row index).
# NOTE(review): re-running this cell overwrites the CSV — presumably including any
# column names added to it by hand afterwards; confirm before a full re-run.
df.to_csv("omni2_all_years.csv", index=False)

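Manually add a header row with the column names to `omni2_all_years.csv` before running the next cell; the cells below refer to columns by name (e.g. `Year`, `Day`, `Hour`, `Kp*10`).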

In [None]:
# Reload the table from the CSV; the first row of the file is used as the column names.
df = pd.read_csv("omni2_all_years.csv")

In [None]:
# Spacecraft ID columns and the indices other than Kp are not used
id_and_index_columns = [
    "ID for IMF SC",
    "ID for SW Plasma SC",
    "DST Index",
    "AE-index",
    "Ap-index",
    "f10.7_index",
    "AL-index",
    "AU-index",
]
df.drop(columns=id_and_index_columns, inplace=True)

In [None]:
# Build the timestamp index from the year, day-of-year and hour columns
year_and_day = df["Year"] * 1000 + df["Day"]
time = pd.to_datetime(year_and_day, format="%Y%j") + pd.to_timedelta(df["Hour"], unit="h")
df.index = time


# Collapse Kp*10 onto whole Kp levels (reduces the 28 distinct values to 10)
def _to_kp_level(kp_times_ten):
    return round(kp_times_ten / 10)


df["Kp"] = df["Kp*10"].map(_to_kp_level)

# The date parts and the original Kp*10 column are no longer needed
df = df.drop(columns=["Year", "Day", "Hour", "Kp*10"])

In [None]:
# Chosen time range: 1976 through 2023 inclusive
years = df.index.year
df = df[(years >= 1976) & (years <= 2023)]

In [None]:
# Placeholder values that mark missing measurements in the data.
# NOTE(review): these are matched in every column — confirm none of them can
# occur as a genuine measurement in a column that uses a different placeholder.
replacement = [
    9.999,
    99.9,
    99.99,
    999,
    999.9,
    999.99,
    9999.0,
    99999.99,
    999999.99,
    9999999.0,
]
# Replace missing values with NaN
df.replace(to_replace=replacement, value=np.nan, inplace=True)

In [None]:
# Fill the NaN gaps by interpolating along the time index, in both directions
df = df.interpolate(method="time", limit_direction="both")

In [None]:
# Remove features considered redundant
redundant_columns = ["By,GSM", "Bz,GSM", "Field Magnitude Avg"]
df = df.drop(columns=redundant_columns)

In [None]:
# pso_features = [
#     "Magnitude of Average Field vector",
#     "Bx,GSE",
#     "Sigma-B",
#     "Sigma-Bx",
#     "Sigma-By",
#     "Sigma-Bz",
#     "Na/Np",
#     "Sigma-phi-V",
#     "Sigma-theta-V",
#     "PROT Flux  >1 MeV",
#     "PROT Flux  >2 MeV",
#     "PROT Flux  >30 MeV",
#     "PROT Flux  >60 MeV",
#     "PC(N)",
#     "Kp",
# ]

In [None]:
# Feature indices that were selected by PSO
# These are 0-based column positions in `df` as it stands at this point, so the
# cell must run exactly once after the column drops above.
# NOTE(review): presumably these positions match the names listed in the
# commented-out cell above (ending with "Kp") — verify against df.columns.
pso_features = [3, 6, 10, 11, 12, 13, 19, 24, 25, 31, 32, 35, 36, 38, 40]
df = df.iloc[:, pso_features]

In [None]:
# No splitting for production model: train and test both cover the full data.
# Independent copies are taken so that in-place edits to one frame (such as the
# feature scaling below) cannot leak into the other frame or into `df`.
train = df.copy()
test = df.copy()

In [None]:
# Number of columns fed to the model (the selected features, Kp included)
num_features = len(df.columns)

In [None]:
# Target series: the Kp column of each frame
train_y, test_y = train["Kp"], test["Kp"]

In [None]:
# Scale the data except for Kp (the last column)
scaler = StandardScaler()
# Compute both scaled arrays from the unscaled values BEFORE writing anything
# back. `train` and `test` may refer to the same DataFrame; assigning the scaled
# training values first and transforming `test` afterwards would then
# standardise the data twice, unlike the single scaling applied at inference.
train_scaled = scaler.fit_transform(train[train.columns[:-1]])
test_scaled = scaler.transform(test[test.columns[:-1]])
train[train.columns[:-1]] = train_scaled
test[test.columns[:-1]] = test_scaled

In [None]:
# Append the Kp target as an extra last column for the windowed dataset; Kp then
# appears twice, once among the inputs and once as the label column
train_xy, test_xy = (
    pd.concat([features, target], axis=1)
    for features, target in ((train, train_y), (test, test_y))
)

In [None]:
# Hyperparameters for the dataset and model
window_size = 24 # number of past hourly rows given to the model per sample
horizon = 24 # number of future hours predicted per sample (also the model's output width)
batch_size = 64 # samples per training batch
shuffle_buffer_size = 1000 # shuffle buffer size used when building the windowed datasets

In [None]:
# Build the windowed datasets with identical settings for both frames
dataset_options = {"batch_size": batch_size, "shuffle_buffer": shuffle_buffer_size}
ds = create_multi_features_windowed_dataset(train_xy, window_size, horizon, **dataset_options)
test_ds = create_multi_features_windowed_dataset(test_xy, window_size, horizon, **dataset_options)

# Model

In [None]:
# Model: two stacked GRU layers followed by a small dense head that outputs one
# value per forecast hour.
model = tf.keras.Sequential(
    [
        # Declare the input shape once, up front. Only the first layer's shape is
        # used, so the copy that used to sit on the second GRU was redundant.
        tf.keras.Input(shape=(window_size, num_features)),
        # return_sequences=True so the second GRU receives the full sequence.
        tf.keras.layers.GRU(100, return_sequences=True),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.GRU(100),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(horizon),
    ]
)
optimizer = tf.keras.optimizers.Adam()
model.compile(loss="mse", optimizer=optimizer)
# test_ds is built from the same full data as ds (no split for the production
# model), so val_loss is not an out-of-sample estimate.
history = model.fit(ds, epochs=20, validation_data=test_ds, verbose=1)

In [None]:
# Loss per epoch for the training and validation datasets
fig, ax = plt.subplots()
for history_key, curve_label in (("loss", "train"), ("val_loss", "test")):
    ax.plot(history.history[history_key], label=curve_label)
ax.legend()
plt.show()

# Evaluation

In [None]:
# Run the model over every window of the test data
pred = forecast_next_timestep(model, test, window_size)
# Round to whole Kp levels and keep them on the 0-9 scale. The final window is
# dropped because it does not have a corresponding true value.
pred_discrete = np.clip(pred.round(), 0, 9)[:-1]

In [None]:
# The first `window_size` true values have no corresponding prediction
real = test_y.iloc[window_size:]

# Binary labels for storm classification: 1 when Kp is below 5, otherwise 0
real_binary = real.lt(5).astype(int)
pred_binary = np.less(pred_discrete, 5).astype(int)

In [None]:
# True vs. predicted Kp for the first step of the horizon
x = test_y.index[window_size:]
true_interval, pred_interval = real, pred_discrete[:, 0]

fig, ax = plt.subplots()
ax.plot(x, true_interval, label="True", color="blue")
ax.plot(x, pred_interval, label="Prediction", color="red")
ax.legend()

In [None]:
# Metrics for every step of the horizon
r2_scores, f1_scores, rmses = [], [], []
n_predictions = len(pred_discrete)
for h in range(horizon):
    # Step h of a prediction row lies h rows later in the true series, so the
    # true values are shifted by h and the last h prediction rows are left out.
    # The stop index is written as n_predictions - h so that h = 0 keeps all rows.
    true_values = real[h:]
    predicted_values = pred_discrete[: n_predictions - h, h]
    rmses.append(root_mean_squared_error(true_values, predicted_values))
    r2_scores.append(r2_score(true_values, predicted_values))
    # F1 of class 0 in the binary labels, i.e. Kp of 5 or higher
    report = classification_report(
        real_binary[h:], pred_binary[: n_predictions - h, h], output_dict=True
    )
    f1_scores.append(report["0"]["f1-score"])

In [None]:
# Per-step values first, in the order RMSE, R2, F1
for per_step_scores in (rmses, r2_scores, f1_scores):
    print(per_step_scores)
# Average metrics for all horizon values
print(f"RMSE: {np.mean(rmses)}")
print(f"R2 Score: {np.mean(r2_scores)}")
print(f"F1 Score: {np.mean(f1_scores)}")

In [None]:
# plt.plot(r2_scores)

In [None]:
# plt.plot(f1_scores)

In [None]:
# Confusion matrix and per-class report for the first step of the horizon
first_step_pred = pred_discrete[:, 0]
cm = confusion_matrix(real, first_step_pred)
sns.heatmap(cm, annot=True, fmt="d")

# Classification report

print(classification_report(real, first_step_pred))

In [None]:
# Confusion matrix for the binary labels of the first horizon step
# (class 0 is shown as "Storm", class 1 as "Safe")
class_names = ["Storm", "Safe"]
first_step_binary = pred_binary[:, 0]
cm = confusion_matrix(real_binary, first_step_binary)
heatmap_ax = sns.heatmap(
    cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names
)
heatmap_ax.set(xlabel="Predicted", ylabel="Real")

print(classification_report(real_binary, first_step_binary))

# For future model usage

## Saving the model

In [None]:
# Create the export directory if it is missing so that saving does not fail on
# a fresh checkout (`os` is imported in the seeds cell).
os.makedirs("./exports", exist_ok=True)
# Save the trained model together with the scaler fitted on the training
# features; both are needed to run forecasts later.
model.save("./exports/model.keras")
with open("./exports/standard_scaler.pkl", "wb") as pickle_file:
    pk.dump(scaler, pickle_file)

## Running the model

In [1]:
from enum import Enum
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import pickle as pk

2024-04-25 15:24:09.057249: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-25 15:24:09.092531: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 15:24:09.092560: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 15:24:09.093535: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-25 15:24:09.098996: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-25 15:24:09.099377: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
class Feature(Enum):
    """Model input features and their column positions in the downloaded data.

    Each value is the feature's column position in the downloaded file once the
    leading date columns have been dropped and the rest renumbered from 0.
    Members are declared in the order the trained model expects, so iterating
    the enum gives the column reordering to apply. Do not reorder the members.
    """
    MAG_AVG_FIELD = 0
    BX_GSE = 1
    SIGMA_B = 2
    SIGMA_BX = 3
    SIGMA_BY = 4
    SIGMA_BZ = 5
    NA_NP = 6
    SIGMA_PHI_V = 7
    SIGMA_THETA_V = 8
    # From here on the values are not ascending: in the download, Kp (9) and
    # PC(N) (10) come before the proton fluxes (11-14), but the model takes them last.
    PROT_FLUX_1 = 11
    PROT_FLUX_2 = 12
    PROT_FLUX_30 = 13
    PROT_FLUX_60 = 14
    PC_N = 10
    KP = 9

# Order of features in the trained model: the download's column positions
# listed in enum declaration order, with Kp last
features_order = [f.value for f in Feature]

In [10]:
# Read the newly downloaded data for the past 24 hours: one whitespace-separated row per line
with open('./omni2_cl5f6i8QEF.lst', 'r') as f:
    data = [line.split() for line in f]
df = pd.DataFrame(np.array(data, dtype=float))
# Discard the three leading date columns
df.drop(columns=[0, 1, 2], inplace=True)
# Renumber the remaining columns from 0
df.columns = range(df.shape[1])
# Put the columns into the order the model expects
df = df[features_order]

In [8]:
# Load model and scaler
# NOTE: unpickling can execute arbitrary code — only load scaler files that were
# produced by this notebook or come from a trusted source.
# The `with` block closes the file handle, which a bare open() call left open.
with open('./exports/standard_scaler.pkl', 'rb') as scaler_file:
    scaler = pk.load(scaler_file)
model = tf.keras.models.load_model('./exports/model.keras')

In [13]:
def forecast(scaler, model, data):
    """Forecast Kp from one window of recent observations.

    Args:
        scaler: Fitted scaler exposing ``transform``; applied to every column
            except the last one.
        model: Trained model exposing ``predict``.
        data: DataFrame (or anything ``pd.DataFrame`` accepts) holding one input
            window, with the columns in the model's feature order and Kp*10 as
            the last column. It is not modified.

    Returns:
        Tuple ``(pred, storm)``: ``pred`` is the array of predicted Kp values,
        rounded and clipped to 0-9; ``storm`` is True when any predicted value
        is 5 or higher.
    """
    # Work on a float copy of the argument. Previously the notebook-level `df`
    # was read and overwritten instead, so `data` was ignored and a second call
    # scaled already-scaled values.
    frame = pd.DataFrame(data).astype(float)
    kp_column = frame.columns[-1]
    feature_columns = frame.columns[:-1]
    # Convert Kp*10 to whole Kp levels, matching the rounding applied to the
    # Kp column before training.
    frame[kp_column] = (frame[kp_column] / 10).round()
    # Scale the data except for Kp
    frame[feature_columns] = scaler.transform(frame[feature_columns])
    # Add a leading batch axis to match the model input
    batch = np.expand_dims(frame.to_numpy(), axis=0)
    pred = model.predict(batch).squeeze()
    pred = np.clip(pred.round(), 0, 9)
    # Check if any of the predictions are stormy
    storm = (pred >= 5).any()
    return (pred, storm)