In [4]:
import os
import pandas as pd
import numpy as np

# Root folder under which every generated training set is written.
TRAIN_DIR = "solar_flare_training_data"
# Create it eagerly; an already-existing directory is not an error.
os.makedirs(name=TRAIN_DIR, exist_ok=True)


def classify_flare(x):
    """Map a flux value to a flare class letter.

    Returns 'X' for values of 1e-4 and above, 'M' for values in
    [1e-5, 1e-4), and None for everything else (smaller values and NaN).
    """
    # Test the thresholds from highest to lowest so each branch only needs
    # a single lower-bound comparison.
    if x >= 1e-4:
        return 'X'
    if x >= 1e-5:
        return 'M'
    return None
    
def create_folder(folder_name: str):
    """Create ``folder_name`` beneath TRAIN_DIR; does nothing if it already exists."""
    target_dir = f"{TRAIN_DIR}/{folder_name}"
    os.makedirs(target_dir, exist_ok=True)

def split_data_in_train_and_test(df, col_name, window, flare_threshold=1E-5,
                                 non_flare_threshold=1E-4, non_flare_ratio=0.85,
                                 lead_in=200):
    """Build a windowed training set from one flux column and write it to CSV.

    Despite the name, no train/test split is performed: the function collects
    (history window -> next value) samples and saves them as ``X_Train.csv``
    and ``Y_Train.csv`` under ``TRAIN_DIR/<col_name>_<window>_minutes``.

    Sampling happens in two passes over the column:

    1. Flare pass - every row whose value is ``>= flare_threshold`` and that
       has ``window`` preceding rows becomes a target; the ``window`` values
       immediately before it become its features.
    2. Non-flare pass - starting after row ``window + lead_in``, rows whose
       value is ``<= non_flare_threshold`` are taken in order until
       ``int(n_flare_samples * non_flare_ratio)`` of them have been collected
       (or the data runs out).

    Flare class boundaries used for the default thresholds:
    C: 1e-6 to 1e-5, M: 1e-5 to 1e-4, X: above 1e-4.

    NOTE(review): with the default thresholds, values in [1e-5, 1e-4] satisfy
    both passes, so the same row can be emitted twice. The defaults reproduce
    the original behaviour; pass a lower ``non_flare_threshold`` if the two
    sets are meant to be disjoint - confirm the intent.

    Args:
        df: DataFrame holding the series, in time order.
        col_name: Name of the column to sample.
        window: Number of preceding rows used as features for each target.
        flare_threshold: Inclusive lower bound for a flare-pass target.
        non_flare_threshold: Inclusive upper bound for a non-flare-pass target.
        non_flare_ratio: Non-flare sample count as a fraction of the flare
            sample count.
        lead_in: Extra rows (beyond ``window``) skipped at the start of the
            non-flare pass.

    Returns:
        Tuple ``(X_train_df, Y_train_df)`` - the same frames that were
        written to disk.
    """
    # Work on a plain positional array. The original `df.loc[i, ...]` lookups
    # treated loop positions as index *labels*, which breaks on any frame
    # whose index is not 0..n-1 (e.g. a filtered slice) and is very slow on
    # multi-million-row data.
    values = df[col_name].to_numpy()

    # A target needs `window` predecessors, so position `window` is the first
    # usable one.
    first_target = max(window, 0)

    # Pass 1: flare-level targets.
    flare_pos = first_target + np.flatnonzero(values[first_target:] >= flare_threshold)
    main_len = len(flare_pos)
    non_flare_quota = int(main_len * non_flare_ratio)
    print("Flare class done for ", col_name, main_len, non_flare_quota)

    # Pass 2: the first `non_flare_quota` qualifying rows after the lead-in.
    # Slicing enforces the quota exactly; the original `len(...) == target`
    # check ran only after appending, so a quota of zero was overshot and the
    # loop then never stopped.
    first_non_flare = max(window + lead_in + 1, first_target)
    non_flare_pos = first_non_flare + np.flatnonzero(
        values[first_non_flare:] <= non_flare_threshold
    )
    non_flare_pos = non_flare_pos[:non_flare_quota]

    # Flare samples first, then non-flare samples, matching the original order.
    sample_pos = np.concatenate([flare_pos, non_flare_pos])
    X_train_list = [values[pos - window:pos] for pos in sample_pos]
    Y_train_list = values[sample_pos]

    # Create a new DataFrame for X_train and Y_train
    X_train_df = pd.DataFrame(X_train_list, columns=['X_train_{}'.format(i) for i in range(window)])
    Y_train_df = pd.DataFrame({'Y_train': Y_train_list})

    # One output folder per (column, window) combination.
    folder_name = f"{col_name}_{str(window)}_minutes"
    create_folder(folder_name)

    X_train_df.to_csv(os.path.join(TRAIN_DIR, folder_name, "X_Train.csv"), index = False)
    Y_train_df.to_csv(os.path.join(TRAIN_DIR, folder_name, "Y_Train.csv"), index = False)

    return X_train_df, Y_train_df

if __name__ == "__main__":
    # Single source of truth for the input file so the reminder message and
    # the actual read can never drift apart.
    DATA_FILE = "sol_22_23_24_data_no_missing_scaling_applied_v4.0.csv"

    print(f"Before running it keep it same as data source like {DATA_FILE}")
    print("Enter column name: (this should be xs or xl)")
    # Strip stray whitespace so "xl " does not turn into a missing-column error.
    col_name = input().strip()
    print(col_name)
    print("Enter window size:")
    window_size = int(input())
    print(window_size)
    # A non-positive window yields samples with no feature columns; fail
    # before the (large) CSV is loaded.
    if window_size <= 0:
        raise ValueError(f"window size must be a positive integer, got {window_size}")

    df = pd.read_csv(DATA_FILE)
    # Report a bad column name up front instead of from inside the sampler.
    if col_name not in df.columns:
        raise KeyError(f"column {col_name!r} not found in {DATA_FILE}; available: {list(df.columns)}")

    split_data_in_train_and_test(df, col_name, window_size)

Before running it keep it same as data source like sol_22_23_24_data_no_missing_scaling_applied_v4.0.csv
Enter column name: (this should be xs or xl)
xl
Enter window size:
60
Flare class done for  xl 82788 70369


## Multistep time series

In [31]:
import pandas as pd
# Load the flux table from CSV; the bare `df` below makes it the cell's displayed output.
df = pd.read_csv("min_filled_and_scaling_applied.csv")
df

Unnamed: 0,time_tag,xs,xl
0,1986-01-01 00:00:00,7.282353e-11,1.471429e-09
1,1986-01-01 00:01:00,7.282353e-11,1.471429e-09
2,1986-01-01 00:02:00,7.282353e-11,1.471429e-09
3,1986-01-01 00:03:00,7.282353e-11,1.471429e-09
4,1986-01-01 00:04:00,7.282353e-11,1.471429e-09
...,...,...,...
19982875,2023-12-31 23:55:00,3.666610e-07,6.414194e-06
19982876,2023-12-31 23:56:00,3.539363e-07,6.254857e-06
19982877,2023-12-31 23:57:00,3.453191e-07,6.114120e-06
19982878,2023-12-31 23:58:00,3.319112e-07,5.965794e-06


In [18]:
def parse_datetime(dt_str):
    """Parse a timestamp that may or may not carry fractional seconds.

    The fractional-seconds layout is tried first; if that raises ValueError
    the whole-seconds layout is used instead. A ValueError from the second
    attempt propagates to the caller.
    """
    with_fraction = "%Y-%m-%d %H:%M:%S.%f"
    whole_seconds = "%Y-%m-%d %H:%M:%S"
    try:
        parsed = pd.to_datetime(dt_str, format=with_fraction)
    except ValueError:
        parsed = pd.to_datetime(dt_str, format=whole_seconds)
    return parsed

# Parse the whole column in vectorized passes instead of one Python-level
# parse_datetime call per row (very slow on a table of this size). The result
# is the same: each value is read with whichever of the two layouts fits.
raw_time_tag = df['time_tag']
time_tag = pd.to_datetime(raw_time_tag, format="%Y-%m-%d %H:%M:%S", errors="coerce")
# Rows that were present but did not parse get a second, strict attempt with
# the fractional-seconds layout; anything still unparseable raises ValueError.
unparsed = time_tag.isna() & raw_time_tag.notna()
if unparsed.any():
    time_tag.loc[unparsed] = pd.to_datetime(
        raw_time_tag[unparsed], format="%Y-%m-%d %H:%M:%S.%f"
    )
df['time_tag'] = time_tag

# Chronological split by calendar year: train <= 2008, validation 2009-2019,
# test >= 2020. The year is extracted once and reused for all three masks.
year = df['time_tag'].dt.year
df_train = df[year <= 2008]
df_val = df[(year > 2008) & (year <= 2019)]
df_test = df[year > 2019]

In [19]:
# Display the training partition (rows whose time_tag year is 2008 or earlier).
df_train

Unnamed: 0,time_tag,xs,xl
0,1986-01-01 00:00:00,7.282353e-11,1.471429e-09
1,1986-01-01 00:01:00,7.282353e-11,1.471429e-09
2,1986-01-01 00:02:00,7.282353e-11,1.471429e-09
3,1986-01-01 00:03:00,7.282353e-11,1.471429e-09
4,1986-01-01 00:04:00,7.282353e-11,1.471429e-09
...,...,...,...
12097435,2008-12-31 23:55:00,4.141176e-09,5.314286e-09
12097436,2008-12-31 23:56:00,4.141176e-09,5.314286e-09
12097437,2008-12-31 23:57:00,4.200000e-09,5.314286e-09
12097438,2008-12-31 23:58:00,4.164706e-09,5.314286e-09


In [20]:
# Display the validation partition (rows whose time_tag year is 2009 through 2019).
df_val

Unnamed: 0,time_tag,xs,xl
12097440,2009-01-01 00:00:00,4.200000e-09,5.314286e-09
12097441,2009-01-01 00:01:00,4.164706e-09,5.314286e-09
12097442,2009-01-01 00:02:00,4.141176e-09,5.314286e-09
12097443,2009-01-01 00:03:00,4.188235e-09,5.314286e-09
12097444,2009-01-01 00:04:00,4.164706e-09,5.314286e-09
...,...,...,...
17879035,2019-12-31 23:55:00,5.911707e-09,3.994008e-09
17879036,2019-12-31 23:56:00,5.987893e-09,4.039449e-09
17879037,2019-12-31 23:57:00,5.943698e-09,9.275667e-09
17879038,2019-12-31 23:58:00,7.769330e-08,9.619668e-07


In [21]:
# Display the test partition (rows whose time_tag year is after 2019).
df_test

Unnamed: 0,time_tag,xs,xl
17879040,2020-01-01 00:00:00,6.181761e-09,2.959024e-09
17879041,2020-01-01 00:01:00,5.765058e-09,2.460549e-09
17879042,2020-01-01 00:02:00,6.462557e-09,1.894576e-09
17879043,2020-01-01 00:03:00,5.618807e-09,4.517273e-09
17879044,2020-01-01 00:04:00,6.058952e-09,3.845082e-09
...,...,...,...
19982875,2023-12-31 23:55:00,3.666610e-07,6.414194e-06
19982876,2023-12-31 23:56:00,3.539363e-07,6.254857e-06
19982877,2023-12-31 23:57:00,3.453191e-07,6.114120e-06
19982878,2023-12-31 23:58:00,3.319112e-07,5.965794e-06


In [22]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing (mirrors the TRAIN_DIR setup above).
OUTPUT_DIR = "advance_work/mean"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# index=False: the row index is not written as an extra column.
df_train.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
df_val.to_csv(os.path.join(OUTPUT_DIR, "val.csv"), index=False)
df_test.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)