# 

## Missing Value Imputation

In [270]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/1_Raw/"
label_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/2_Labels/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
labels = []

num_partitions = 5

# Load raw data
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + ".pkl", 'rb') as f:
        raw_data.append(pickle.load(f))

# Load labels
for i in range(num_partitions):
    labels.append(pd.read_csv(label_dir + "Partition" + str(i+1) + "_labels.csv"))

# Processing data and labels
for i in range(num_partitions):
    # Transpose data to (num_samples, num_timestamps, num_features) and remove the first feature
    data = np.transpose(raw_data[i], (2, 0, 1))[:, :, 2:6]  # shape (num_samples, num_timestamps, 24 features)
    
    # Process FLARE_CLASS labels
    class_mapping = {'X': 5, 'M': 4, 'B': 3, 'C': 2, 'FQ': 1}
    flare_class_labels = labels[i]['FLARE_CLASS'].map(class_mapping).values

    # Process FLARE_TYPE labels
    type_mapping = {'FQ': 1, 'C': 10, 'B': 100, 'M': 1000, 'X': 10000}
    
    def calculate_flare_type(flare_type):
        if flare_type == 'FQ':
            return 1
        else:
            return type_mapping[flare_type[0]] * float(flare_type[1:])
    
    flare_type_labels = labels[i]['FLARE_TYPE'].apply(calculate_flare_type).values
    
    # Mean imputation and removal of invalid samples
    valid_samples = []
    for sample_idx in range(data.shape[0]):
        valid_sample = True
        for feature_idx in range(data.shape[2]):
            feature_data = data[sample_idx, :, feature_idx]
            n = len(feature_data)
            valid_values = feature_data[(feature_data != 0) & (~np.isnan(feature_data))]
            if len(valid_values) > 0:
                for t in range(n):
                    next_value_found = False
                    if feature_data[t] == 0 or np.isnan(feature_data[t]):
                        # Try to find the next available value
                        for j in range(t + 1, n):
                            if feature_data[j] != 0 and not np.isnan(feature_data[j]):
                                feature_data[t] = feature_data[j]
                                next_value_found = True
                                break
                    # If no next value is found, use the previous value
                    if not next_value_found:
                        for j in range(t - 1, -1, -1):
                            if feature_data[j] != 0 and not np.isnan(feature_data[j]):
                                feature_data[t] = feature_data[j]
                                break
    
            else:
                valid_sample = False
                break  # Exit the loop if the sample is invalid
            data[sample_idx, :, feature_idx] = feature_data
        if valid_sample:
            valid_samples.append(sample_idx)
    
    data = data[valid_samples]
    flare_class_labels = flare_class_labels[valid_samples]
    flare_type_labels = flare_type_labels[valid_samples]
    print(data.shape)
    
    unique, counts = np.unique(flare_class_labels, return_counts=True)
    class_counts = dict(zip(unique, counts))
    print(f"Partition {i+1} flare_class_labels count: {class_counts}")
    
    # Check for any NaN values before saving
    has_nan = np.isnan(data).any()
    print(f"Partition {i+1} has NaN values: {has_nan}")
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_data.pkl", 'wb') as f:
        pickle.dump(data, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_flare_class_labels.pkl", 'wb') as f:
        pickle.dump(flare_class_labels, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_flare_type_labels.pkl", 'wb') as f:
        pickle.dump(flare_type_labels, f)

(73492, 60, 6)
Partition 1 flare_class_labels count: {1: 60130, 2: 6416, 3: 5692, 4: 1089, 5: 165}
Partition 1 has NaN values: False
(87684, 60, 6)
Partition 2 flare_class_labels count: {1: 72498, 2: 8809, 3: 4976, 4: 1329, 5: 72}
Partition 2 has NaN values: False
(42482, 60, 6)
Partition 3 flare_class_labels count: {1: 34734, 2: 5639, 3: 685, 4: 1288, 5: 136}
Partition 3 has NaN values: False
(51219, 60, 6)
Partition 4 flare_class_labels count: {1: 43252, 2: 5956, 3: 846, 4: 1012, 5: 153}
Partition 4 has NaN values: False
(75292, 60, 6)
Partition 5 flare_class_labels count: {1: 62615, 2: 5763, 3: 5924, 4: 971, 5: 19}
Partition 5 has NaN values: False


In [16]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os

data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/1_Raw/"
label_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/2_Labels/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
labels = []

num_partitions = 5

# Load raw data
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + ".pkl", 'rb') as f:
        raw_data.append(pickle.load(f))

# Load labels
for i in range(num_partitions):
    labels.append(pd.read_csv(label_dir + "Partition" + str(i+1) + "_labels.csv"))

# Processing data and labels
for i in range(num_partitions):
    # Transpose data to (num_samples, num_timestamps, num_features) and remove the first feature
    data = np.transpose(raw_data[i], (2, 0, 1))[:, :, 1:]  # shape (num_samples, num_timestamps, 24 features)
    
    # Process FLARE_CLASS labels
    class_mapping = {'X': 5, 'M': 4, 'B': 3, 'C': 2, 'FQ': 1}
    flare_class_labels = labels[i]['FLARE_CLASS'].map(class_mapping).values

    # Process FLARE_TYPE labels
    type_mapping = {'FQ': 1, 'C': 10, 'B': 100, 'M': 1000, 'X': 10000}
    
    def calculate_flare_type(flare_type):
        if flare_type == 'FQ':
            return 1
        else:
            return type_mapping[flare_type[0]] * float(flare_type[1:])
    
    flare_type_labels = labels[i]['FLARE_TYPE'].apply(calculate_flare_type).values
    
    # Mean imputation and removal of invalid samples
    valid_samples = []
    for sample_idx in range(data.shape[0]):
        valid_sample = True
        for feature_idx in range(data.shape[2]):
            feature_data = data[sample_idx, :, feature_idx]
            n = len(feature_data)
            valid_values = feature_data[(feature_data != 0) & (~np.isnan(feature_data))]
            if len(valid_values) > 0:
                for t in range(n):
                    next_value_found = False
                    if feature_data[t] == 0 or np.isnan(feature_data[t]):
                        # Try to find the next available value
                        for j in range(t + 1, n):
                            if feature_data[j] != 0 and not np.isnan(feature_data[j]):
                                feature_data[t] = feature_data[j]
                                next_value_found = True
                                break
                    # If no next value is found, use the previous value
                    if not next_value_found:
                        for j in range(t - 1, -1, -1):
                            if feature_data[j] != 0 and not np.isnan(feature_data[j]):
                                feature_data[t] = feature_data[j]
                                break
    
            else:
                valid_sample = False
                break  # Exit the loop if the sample is invalid
            data[sample_idx, :, feature_idx] = feature_data
        if valid_sample:
            valid_samples.append(sample_idx)
    
    data = data[valid_samples]
    flare_class_labels = flare_class_labels[valid_samples]
    flare_type_labels = flare_type_labels[valid_samples]
    print(data.shape)
    
    unique, counts = np.unique(flare_class_labels, return_counts=True)
    class_counts = dict(zip(unique, counts))
    print(f"Partition {i+1} flare_class_labels count: {class_counts}")
    
    # Check for any NaN values before saving
    has_nan = np.isnan(data).any()
    print(f"Partition {i+1} has NaN values: {has_nan}")
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_WFS_data.pkl", 'wb') as f:
        pickle.dump(data, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_WFS_flare_class_labels.pkl", 'wb') as f:
        pickle.dump(flare_class_labels, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_WFS_flare_type_labels.pkl", 'wb') as f:
        pickle.dump(flare_type_labels, f)

(47002, 60, 24)
Partition 1 flare_class_labels count: {1: 33747, 2: 6375, 3: 5626, 4: 1089, 5: 165}
Partition 1 has NaN values: False
(53511, 60, 24)
Partition 2 flare_class_labels count: {1: 38428, 2: 8775, 3: 4909, 4: 1327, 5: 72}
Partition 2 has NaN values: False
(25531, 60, 24)
Partition 3 flare_class_labels count: {1: 17849, 2: 5583, 3: 675, 4: 1288, 5: 136}
Partition 3 has NaN values: False
(30167, 60, 24)
Partition 4 flare_class_labels count: {1: 22234, 2: 5938, 3: 830, 4: 1012, 5: 153}
Partition 4 has NaN values: False
(42215, 60, 24)
Partition 5 flare_class_labels count: {1: 29618, 2: 5738, 3: 5869, 4: 971, 5: 19}
Partition 5 has NaN values: False


## Normalization

In [271]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os
from scipy.stats import skew, zscore
from sklearn.preprocessing import MinMaxScaler

data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
raw_labels = []

num_partitions = 5

# Load processed data and labels
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_data.pkl", 'rb') as f:
        raw_data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_flare_class_labels.pkl", 'rb') as f:
        raw_labels.append(pickle.load(f))

# Function to apply normalization based on skewness
def normalize_feature(feature_data):
    feature_data_flat = feature_data.flatten()
    feature_data_normalized = zscore(feature_data_flat)
    
    return feature_data_normalized.reshape(feature_data.shape)

# Normalize data and convert labels to binary
for i in range(num_partitions):
    data = raw_data[i]
    labels = raw_labels[i]
    
    num_samples, num_timestamps, num_features = data.shape

    normalized_data = np.empty_like(data)

    for feature_idx in range(num_features):
        feature_data = data[:, :, feature_idx]
        normalized_feature_data = normalize_feature(feature_data)
        normalized_data[:, :, feature_idx] = normalized_feature_data

    # Check for any NaN values before saving
    has_nan = np.isnan(normalized_data).any()
    print(f"Partition {i+1} has NaN values: {has_nan}")

    # Convert labels to binary
    binary_labels = np.where(np.isin(labels, [4, 5]), 1, 0)

    # Save normalized data
    with open(processed_data_dir + "Partition" + str(i+1) + "_normalized_data.pkl", 'wb') as f:
        pickle.dump(normalized_data, f)
    
    # Save binary labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_binary_labels.pkl", 'wb') as f:
        pickle.dump(binary_labels, f)

    print(f"Partition {i+1} normalized data shape: {normalized_data.shape}")
    print(f"Partition {i+1} binary labels distribution: {np.bincount(binary_labels)}")

Partition 1 has NaN values: False
Partition 1 normalized data shape: (73492, 60, 6)
Partition 1 binary labels distribution: [72238  1254]
Partition 2 has NaN values: False
Partition 2 normalized data shape: (87684, 60, 6)
Partition 2 binary labels distribution: [86283  1401]
Partition 3 has NaN values: False
Partition 3 normalized data shape: (42482, 60, 6)
Partition 3 binary labels distribution: [41058  1424]
Partition 4 has NaN values: False
Partition 4 normalized data shape: (51219, 60, 6)
Partition 4 binary labels distribution: [50054  1165]
Partition 5 has NaN values: False
Partition 5 normalized data shape: (75292, 60, 6)
Partition 5 binary labels distribution: [74302   990]


In [17]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os
from scipy.stats import skew, zscore
from sklearn.preprocessing import MinMaxScaler

data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
raw_labels = []

num_partitions = 5

# Load processed data and labels
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_WFS_data.pkl", 'rb') as f:
        raw_data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_WFS_flare_class_labels.pkl", 'rb') as f:
        raw_labels.append(pickle.load(f))

# Function to apply normalization based on skewness
def normalize_feature(feature_data):
    feature_data_flat = feature_data.flatten()
    feature_data_normalized = zscore(feature_data_flat)
    
    return feature_data_normalized.reshape(feature_data.shape)

# Normalize data and convert labels to binary
for i in range(num_partitions):
    data = raw_data[i]
    labels = raw_labels[i]
    
    num_samples, num_timestamps, num_features = data.shape

    normalized_data = np.empty_like(data)

    for feature_idx in range(num_features):
        feature_data = data[:, :, feature_idx]
        normalized_feature_data = normalize_feature(feature_data)
        normalized_data[:, :, feature_idx] = normalized_feature_data

    # Check for any NaN values before saving
    has_nan = np.isnan(normalized_data).any()
    print(f"Partition {i+1} has NaN values: {has_nan}")

    # Convert labels to binary
    binary_labels = np.where(np.isin(labels, [4, 5]), 1, 0)

    # Save normalized data
    with open(processed_data_dir + "Partition" + str(i+1) + "_WFS_normalized_data.pkl", 'wb') as f:
        pickle.dump(normalized_data, f)
    
    # Save binary labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_WFS_binary_labels.pkl", 'wb') as f:
        pickle.dump(binary_labels, f)

    print(f"Partition {i+1} normalized data shape: {normalized_data.shape}")
    print(f"Partition {i+1} binary labels distribution: {np.bincount(binary_labels)}")

Partition 1 has NaN values: False
Partition 1 normalized data shape: (47002, 60, 24)
Partition 1 binary labels distribution: [45748  1254]
Partition 2 has NaN values: False
Partition 2 normalized data shape: (53511, 60, 24)
Partition 2 binary labels distribution: [52112  1399]
Partition 3 has NaN values: False
Partition 3 normalized data shape: (25531, 60, 24)
Partition 3 binary labels distribution: [24107  1424]
Partition 4 has NaN values: False
Partition 4 normalized data shape: (30167, 60, 24)
Partition 4 binary labels distribution: [29002  1165]
Partition 5 has NaN values: False
Partition 5 normalized data shape: (42215, 60, 24)
Partition 5 binary labels distribution: [41225   990]


## SMOTE Over-sampling

In [28]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os
from imblearn.over_sampling import SMOTE

data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
labels = []

num_partitions = 5

# Load normalized data and labels
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_WFS_normalized_data.pkl", 'rb') as f:
        raw_data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_WFS_flare_class_labels.pkl", 'rb') as f:
        labels.append(pickle.load(f))

# Convert classes for binary classification and apply SMOTE
for i in range(num_partitions):
    data = raw_data[i]
    flare_class_labels = labels[i]
    
    # Convert classes
    binary_labels = np.where(flare_class_labels >= 4, 1, 0)
    
    # Reshape data to (num_samples, num_timestamps * num_features) for SMOTE
    num_samples, num_timestamps, num_features = data.shape
    reshaped_data = data.reshape((num_samples, num_timestamps * num_features))
    
    # Apply SMOTE
    smote = SMOTE()
    reshaped_data_smote, binary_labels_smote = smote.fit_resample(reshaped_data, binary_labels)
    
    # Reshape data back to (num_samples, num_timestamps, num_features)
    new_data = reshaped_data_smote.reshape((-1, num_timestamps, num_features))
    
    # Save new data and labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_smote_data.pkl", 'wb') as f:
        pickle.dump(new_data, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_smote_labels.pkl", 'wb') as f:
        pickle.dump(binary_labels_smote, f)
    
    print(f"Partition {i+1} new data shape: {new_data.shape}")
    print(f"Partition {i+1} new label distribution: {np.bincount(binary_labels_smote)}")

Partition 1 new data shape: (91496, 60, 24)
Partition 1 new label distribution: [45748 45748]
Partition 2 new data shape: (104224, 60, 24)
Partition 2 new label distribution: [52112 52112]
Partition 3 new data shape: (48214, 60, 24)
Partition 3 new label distribution: [24107 24107]
Partition 4 new data shape: (58004, 60, 24)
Partition 4 new label distribution: [29002 29002]
Partition 5 new data shape: (82450, 60, 24)
Partition 5 new label distribution: [41225 41225]


## Balanced Sampling

In [18]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os
from scipy.stats import skew, zscore
from sklearn.preprocessing import MinMaxScaler

# Define the paths
data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
labels = []
flare_type_labels_list = []

num_partitions = 5

# Load processed data
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_WFS_normalized_data.pkl", 'rb') as f:
        raw_data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_WFS_flare_class_labels.pkl", 'rb') as f:
        labels.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_WFS_flare_type_labels.pkl", 'rb') as f:
        flare_type_labels_list.append(pickle.load(f))

# Gaussian Noise Injection
def gaussian_noise_injection(data, num_samples, noise_proportion=0.1):
    std_dev = np.std(data, axis=0)
    noise_level = std_dev * noise_proportion

    new_samples = []
    for _ in range(num_samples):
        sample_index = np.random.choice(len(data))
        sample = data[sample_index]
        noise = np.random.normal(0, noise_level, sample.shape)
        new_sample = sample + noise
        new_samples.append(new_sample)

    return np.array(new_samples)


# Process each partition
for i in range(num_partitions):
    data = raw_data[i]
    flare_class_labels = labels[i]
    flare_type_labels = flare_type_labels_list[i]

    # Oversampling
    augmented_data = []
    augmented_class_labels = []
    augmented_type_labels = []

    for class_label, factor in [(5, 10), (4, 1.5)]:
        class_indices = np.where(flare_class_labels == class_label)[0]
        class_data = data[class_indices]
        class_type_labels = flare_type_labels[class_indices]
        num_samples = int(len(class_indices) * factor)

        # Gaussian Noise Injection
        gni_data = gaussian_noise_injection(class_data, num_samples)
        gni_labels = np.full(num_samples, class_label)
        gni_type_labels = np.random.choice(class_type_labels, num_samples, replace=True)
        augmented_data.append(gni_data)
        augmented_class_labels.append(gni_labels)
        augmented_type_labels.append(gni_type_labels)

    # Combine original and augmented data
    augmented_data = np.concatenate(augmented_data, axis=0)
    augmented_class_labels = np.concatenate(augmented_class_labels, axis=0)
    augmented_type_labels = np.concatenate(augmented_type_labels, axis=0)
    data = np.concatenate((data, augmented_data), axis=0)
    flare_class_labels = np.concatenate((flare_class_labels, augmented_class_labels), axis=0)
    flare_type_labels = np.concatenate((flare_type_labels, augmented_type_labels), axis=0)

    # Calculate target number of samples for minority classes
    total_majority_class_samples = len(flare_class_labels[flare_class_labels == 5]) + len(flare_class_labels[flare_class_labels == 4])
    keep_1 = int(total_majority_class_samples * 1.2)
    keep_2_3 = int(total_majority_class_samples // 2)

    # Undersample minority classes to match the target number of samples
    minority_class_1_indices = np.where(flare_class_labels == 1)[0]
    minority_class_2_indices = np.where(flare_class_labels == 2)[0]
    minority_class_3_indices = np.where(flare_class_labels == 3)[0]

    minority_class_1_samples_to_keep = np.random.choice(minority_class_1_indices, min(keep_1, len(minority_class_1_indices)), replace=False)
    minority_class_2_samples_to_keep = np.random.choice(minority_class_2_indices, min(keep_2_3, len(minority_class_2_indices)), replace=False)
    minority_class_3_samples_to_keep = np.random.choice(minority_class_3_indices, min(keep_2_3, len(minority_class_3_indices)), replace=False)

    valid_indices = np.concatenate((minority_class_1_samples_to_keep, minority_class_2_samples_to_keep, minority_class_3_samples_to_keep, np.where(np.isin(flare_class_labels, [4, 5]))[0]))

    data = data[valid_indices]
    flare_class_labels = flare_class_labels[valid_indices]
    flare_type_labels = flare_type_labels[valid_indices]

    # Normalize data
    binary_labels = np.where(flare_class_labels >= 4, 1, 0)
    
    
    # Save normalized data and binary labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_OUS_normalized_data.pkl", 'wb') as f:
        pickle.dump(data, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_OUS_binary_labels.pkl", 'wb') as f:
        pickle.dump(binary_labels, f)

    # Save flare_type_labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_OUS_flare_type_labels.pkl", 'wb') as f:
        pickle.dump(flare_type_labels, f)

    print(f"Partition {i+1} normalized data shape: {data.shape}")
    print(f"Partition {i+1} binary labels distribution: {np.bincount(binary_labels)}")

Partition 1 normalized data shape: (14517, 60, 24)
Partition 1 binary labels distribution: [9980 4537]
Partition 2 normalized data shape: (13147, 60, 24)
Partition 2 binary labels distribution: [9038 4109]
Partition 3 normalized data shape: (13408, 60, 24)
Partition 3 binary labels distribution: [8692 4716]
Partition 4 normalized data shape: (12204, 60, 24)
Partition 4 binary labels distribution: [7991 4213]
Partition 5 normalized data shape: (8435, 60, 24)
Partition 5 binary labels distribution: [5799 2636]


##  Near Decision Boundary Sample Removal

In [19]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os
from scipy.stats import skew, zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

# Define the paths
data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
labels = []
flare_type_labels_list = []

num_partitions = 5

# Load processed data
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_WFS_normalized_data.pkl", 'rb') as f:
        raw_data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_WFS_flare_class_labels.pkl", 'rb') as f:
        labels.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_WFS_flare_type_labels.pkl", 'rb') as f:
        flare_type_labels_list.append(pickle.load(f))

# Gaussian Noise Injection
def smote_synthetic_samples(data, num_samples, k_neighbors=5):
    n_samples, n_timestamps, n_features = data.shape
    
    # Reshape the data for Nearest Neighbors to work on a 2D array
    reshaped_data = data.reshape(n_samples, -1)
    
    # Nearest Neighbors to determine the points for interpolation
    nn = NearestNeighbors(n_neighbors=k_neighbors+1)
    nn.fit(reshaped_data)
    
    synthetic_samples = []
    for _ in range(num_samples):
        # Randomly pick an index
        sample_index = np.random.randint(0, n_samples)
        sample = data[sample_index]
        
        # Find k-nearest neighbors
        neighbors = nn.kneighbors([reshaped_data[sample_index]], return_distance=False)[0]
        # Exclude the sample itself
        neighbors = neighbors[neighbors != sample_index]
        
        # Randomly select one of the neighbors
        neighbor_index = np.random.choice(neighbors)
        neighbor = data[neighbor_index]
        
        # Generate a synthetic sample
        diff = neighbor - sample
        gap = np.random.rand()
        synthetic_sample = sample + gap * diff
        synthetic_samples.append(synthetic_sample)
    
    return np.array(synthetic_samples)

# Process each partition
for i in range(num_partitions):
    data = raw_data[i]
    flare_class_labels = labels[i]
    flare_type_labels = flare_type_labels_list[i]

    # Oversampling
    augmented_data = []
    augmented_class_labels = []
    augmented_type_labels = []

    for class_label, factor in [(5, 10), (4, 1.5)]:
        class_indices = np.where(flare_class_labels == class_label)[0]
        class_data = data[class_indices]
        class_type_labels = flare_type_labels[class_indices]
        num_samples = int(len(class_indices) * factor)

        # Gaussian Noise Injection
        gni_data = smote_synthetic_samples(class_data, num_samples)
        gni_labels = np.full(num_samples, class_label)
        gni_type_labels = np.random.choice(class_type_labels, num_samples, replace=True)
        augmented_data.append(gni_data)
        augmented_class_labels.append(gni_labels)
        augmented_type_labels.append(gni_type_labels)

    # Combine original and augmented data
    augmented_data = np.concatenate(augmented_data, axis=0)
    augmented_class_labels = np.concatenate(augmented_class_labels, axis=0)
    augmented_type_labels = np.concatenate(augmented_type_labels, axis=0)
    data = np.concatenate((data, augmented_data), axis=0)
    flare_class_labels = np.concatenate((flare_class_labels, augmented_class_labels), axis=0)
    flare_type_labels = np.concatenate((flare_type_labels, augmented_type_labels), axis=0)

    # Remove class 2 and 3 samples
    valid_indices = np.where((flare_class_labels != 2) & (flare_class_labels != 3))[0]
    data = data[valid_indices]
    flare_class_labels = flare_class_labels[valid_indices]
    flare_type_labels = flare_type_labels[valid_indices]

    # Calculate target number of samples for minority class 1
    total_majority_class_samples = len(flare_class_labels[flare_class_labels == 5]) + len(flare_class_labels[flare_class_labels == 4])
    keep_1 = int(total_majority_class_samples * 1.2)

    # Undersample minority class 1 to match the target number of samples
    minority_class_1_indices = np.where(flare_class_labels == 1)[0]
    minority_class_1_samples_to_keep = np.random.choice(minority_class_1_indices, min(keep_1, len(minority_class_1_indices)), replace=False)

    valid_indices = np.concatenate((minority_class_1_samples_to_keep, np.where(np.isin(flare_class_labels, [4, 5]))[0]))

    data = data[valid_indices]
    flare_class_labels = flare_class_labels[valid_indices]
    flare_type_labels = flare_type_labels[valid_indices]

    # Update binary labels: classes 5 and 4 as 1, class 1 as 0
    binary_labels = np.where(np.isin(flare_class_labels, [4, 5]), 1, 0)

    # Shuffle the data and labels
    indices = np.random.permutation(len(data))
    data = data[indices]
    binary_labels = binary_labels[indices]
    flare_type_labels = flare_type_labels[indices]

    # Save normalized data and binary labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_CCBR_OUS_normalized_data.pkl", 'wb') as f:
        pickle.dump(data, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_CCBR_OUS_binary_labels.pkl", 'wb') as f:
        pickle.dump(binary_labels, f)

    # Save flare_type_labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_CCBR_OUS_flare_type_labels.pkl", 'wb') as f:
        pickle.dump(flare_type_labels, f)

    print(f"Partition {i+1} normalized data shape: {data.shape}")
    print(f"Partition {i+1} binary labels distribution: {np.bincount(binary_labels)}")

Partition 1 normalized data shape: (9981, 60, 24)
Partition 1 binary labels distribution: [5444 4537]
Partition 2 normalized data shape: (9039, 60, 24)
Partition 2 binary labels distribution: [4930 4109]
Partition 3 normalized data shape: (10375, 60, 24)
Partition 3 binary labels distribution: [5659 4716]
Partition 4 normalized data shape: (9268, 60, 24)
Partition 4 binary labels distribution: [5055 4213]
Partition 5 normalized data shape: (5799, 60, 24)
Partition 5 binary labels distribution: [3163 2636]


## Feature Selection

In [20]:
import warnings
warnings.filterwarnings("ignore")

import pickle
import numpy as np
import pandas as pd
import os
from scipy.stats import skew, zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

# Define the paths
data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
raw_data = []
labels = []
flare_type_labels_list = []

num_partitions = 5

# Load processed data
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_normalized_data.pkl", 'rb') as f:
        raw_data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_flare_class_labels.pkl", 'rb') as f:
        labels.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_flare_type_labels.pkl", 'rb') as f:
        flare_type_labels_list.append(pickle.load(f))

# Gaussian Noise Injection
def smote_synthetic_samples(data, num_samples, k_neighbors=5):
    n_samples, n_timestamps, n_features = data.shape
    
    # Reshape the data for Nearest Neighbors to work on a 2D array
    reshaped_data = data.reshape(n_samples, -1)
    
    # Nearest Neighbors to determine the points for interpolation
    nn = NearestNeighbors(n_neighbors=k_neighbors+1)
    nn.fit(reshaped_data)
    
    synthetic_samples = []
    for _ in range(num_samples):
        # Randomly pick an index
        sample_index = np.random.randint(0, n_samples)
        sample = data[sample_index]
        
        # Find k-nearest neighbors
        neighbors = nn.kneighbors([reshaped_data[sample_index]], return_distance=False)[0]
        # Exclude the sample itself
        neighbors = neighbors[neighbors != sample_index]
        
        # Randomly select one of the neighbors
        neighbor_index = np.random.choice(neighbors)
        neighbor = data[neighbor_index]
        
        # Generate a synthetic sample
        diff = neighbor - sample
        gap = np.random.rand()
        synthetic_sample = sample + gap * diff
        synthetic_samples.append(synthetic_sample)
    
    return np.array(synthetic_samples)

# Process each partition
for i in range(num_partitions):
    data = raw_data[i]
    flare_class_labels = labels[i]
    flare_type_labels = flare_type_labels_list[i]

    # Oversampling
    augmented_data = []
    augmented_class_labels = []
    augmented_type_labels = []

    for class_label, factor in [(5, 10), (4, 1.5)]:
        class_indices = np.where(flare_class_labels == class_label)[0]
        class_data = data[class_indices]
        class_type_labels = flare_type_labels[class_indices]
        num_samples = int(len(class_indices) * factor)

        # Gaussian Noise Injection
        gni_data = smote_synthetic_samples(class_data, num_samples)
        gni_labels = np.full(num_samples, class_label)
        gni_type_labels = np.random.choice(class_type_labels, num_samples, replace=True)
        augmented_data.append(gni_data)
        augmented_class_labels.append(gni_labels)
        augmented_type_labels.append(gni_type_labels)

    # Combine original and augmented data
    augmented_data = np.concatenate(augmented_data, axis=0)
    augmented_class_labels = np.concatenate(augmented_class_labels, axis=0)
    augmented_type_labels = np.concatenate(augmented_type_labels, axis=0)
    data = np.concatenate((data, augmented_data), axis=0)
    flare_class_labels = np.concatenate((flare_class_labels, augmented_class_labels), axis=0)
    flare_type_labels = np.concatenate((flare_type_labels, augmented_type_labels), axis=0)

    # Remove class 2 and 3 samples
    valid_indices = np.where((flare_class_labels != 2) & (flare_class_labels != 3))[0]
    data = data[valid_indices]
    flare_class_labels = flare_class_labels[valid_indices]
    flare_type_labels = flare_type_labels[valid_indices]

    # Calculate target number of samples for minority class 1
    total_majority_class_samples = len(flare_class_labels[flare_class_labels == 5]) + len(flare_class_labels[flare_class_labels == 4])
    keep_1 = int(total_majority_class_samples * 1.2)

    # Undersample minority class 1 to match the target number of samples
    minority_class_1_indices = np.where(flare_class_labels == 1)[0]
    minority_class_1_samples_to_keep = np.random.choice(minority_class_1_indices, min(keep_1, len(minority_class_1_indices)), replace=False)

    valid_indices = np.concatenate((minority_class_1_samples_to_keep, np.where(np.isin(flare_class_labels, [4, 5]))[0]))

    data = data[valid_indices]
    flare_class_labels = flare_class_labels[valid_indices]
    flare_type_labels = flare_type_labels[valid_indices]

    # Update binary labels: classes 5 and 4 as 1, class 1 as 0
    binary_labels = np.where(np.isin(flare_class_labels, [4, 5]), 1, 0)

    # Shuffle the data and labels
    indices = np.random.permutation(len(data))
    data = data[indices]
    binary_labels = binary_labels[indices]
    flare_type_labels = flare_type_labels[indices]

    # Save normalized data and binary labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_FS_CCBR_OUS_normalized_data.pkl", 'wb') as f:
        pickle.dump(data, f)
    
    with open(processed_data_dir + "Partition" + str(i+1) + "_FS_CCBR_OUS_binary_labels.pkl", 'wb') as f:
        pickle.dump(binary_labels, f)

    # Save flare_type_labels
    with open(processed_data_dir + "Partition" + str(i+1) + "_FS_CCBR_OUS_flare_type_labels.pkl", 'wb') as f:
        pickle.dump(flare_type_labels, f)

    print(f"Partition {i+1} normalized data shape: {data.shape}")
    print(f"Partition {i+1} binary labels distribution: {np.bincount(binary_labels)}")

Partition 1 normalized data shape: (9981, 60, 6)
Partition 1 binary labels distribution: [5444 4537]
Partition 2 normalized data shape: (9050, 60, 6)
Partition 2 binary labels distribution: [4936 4114]
Partition 3 normalized data shape: (10375, 60, 6)
Partition 3 binary labels distribution: [5659 4716]
Partition 4 normalized data shape: (9268, 60, 6)
Partition 4 binary labels distribution: [5055 4213]
Partition 5 normalized data shape: (5799, 60, 6)
Partition 5 binary labels distribution: [3163 2636]


# Classification

In [89]:
import warnings
warnings.filterwarnings('ignore')

def TSS(TP,TN,FP,FN):
    TSS_value = (TP / (TP + FN)) - (FP / (FP + TN))
    return TSS_value

def HSS1(TP,TN,FP,FN):
    HSS1_value = (2 * (TP * TN - FP * FN)) / ((TP + FN) * (FN + TN) + (TP + FP) * (FP + TN))
    return HSS1_value
    
def HSS2(TP,TN,FP,FN):
    HSS2_value = (2 * (TP * TN - FP * FN)) / ((TP + FP) * (FN + TN) + (TP + FN) * (FP + TN))
    return HSS2_value

def GSS(TP,TN,FP,FN):
    GSS_value = (TP - (TP + FP) * (TP + FN) / (TP + FP + FN + TN))
    return GSS_value

def Recall(TP,TN,FP,FN):
    Recall_value = (TP) / (TP + FN)
    return Recall_value

def FPR(TP,TN,FP,FN):
    fpr_value = (FP) / (FP + TN)
    return fpr_value

def Accuracy(TP,TN,FP,FN):
    accuracy_value = (TP + TN) / (TP + TN + FP + FN)
    return accuracy_value

def Precision(TP,TN,FP,FN):
    precision_value = (TP) / (TP + FP)
    return precision_value

In [148]:
def kfold_training(name, X_train, Y_train, X_test, Y_test, training_func, num):
    kfold = np.array([[1,2],[2,3],[3,4],[4,5]])

    metrics = []
    metrics_values = np.array([])
    
    for i in range(0, num):
        train_index = kfold[i,0]
        test_index = kfold[i,1]
        metrics_values = training_func(X_train[train_index-1], Y_train[train_index-1], X_test[test_index-1], Y_test[test_index-1])
        while (metrics_values[4] < 0.01):
            metrics_values = training_func(X_train[train_index-1], Y_train[train_index-1], X_test[test_index-1], Y_test[test_index-1])
        metrics.append(np.append(np.append(train_index, test_index), metrics_values))
    return metrics

In [185]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import GRU, Dense, Dropout
from sklearn.metrics import confusion_matrix

def gru_model(X_train, Y_train, X_test, Y_test):
    
    data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/models/"
    
    verbose, epochs, batch_size = 0, 10, 64
    n_timesteps, n_features = 60, 6
    
    model = Sequential()
    model.add(GRU(units=8, activation='tanh', input_shape=(n_timesteps,n_features)))
    model.add(Dropout(0.3))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(2, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=keras.metrics.SpecificityAtSensitivity(sensitivity=0.98))

    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    best_threshold = 0.0
    best_tss = 0.0
    y_pred = model.predict(X_test)
    # evaluate model
    for i in range(1, 10):

        threshold = i / 10 # Adjust the threshold as needed
        y_pred_binary = (y_pred > threshold).astype(int)
        confusion = confusion_matrix(Y_test, y_pred_binary)
        tn, fp, fn, tp = confusion.ravel()
        tss = TSS(tp,tn,fp,fn)
        if tss > best_tss:
            best_tss = tss
            best_threshold = i / 10
        
    
    print(str(X_train.shape)+': GRU Classifier is Done! \n')

    
    threshold = best_threshold # Adjust the threshold as needed
    y_pred_binary = (y_pred > threshold).astype(int)
    confusion = confusion_matrix(Y_test, y_pred_binary)
    tn, fp, fn, tp = confusion.ravel()

    tss = TSS(tp,tn,fp,fn)
    hss1 = HSS1(tp,tn,fp,fn)
    hss2 = HSS2(tp,tn,fp,fn)
    gss = GSS(tp,tn,fp,fn)
    recall = Recall(tp,tn,fp,fn)
    precision = Precision(tp,tn,fp,fn)
    
    output_values = np.array([tp, fn, fp, tn, tss, hss1, hss2, gss, recall, precision])


    #joblib.dump(classifier, data_dir + "mlp_model.pkl")

    #loaded_mlp_model = joblib.load(data_dir + "mlp_model.pkl")
    
    return output_values

In [186]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Dropout
from sklearn.metrics import confusion_matrix

def rnn_model(X_train, Y_train, X_test, Y_test):
    
    data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/models/"
    
    verbose, epochs, batch_size = 0, 10, 64
    n_timesteps, n_features = 60, 6
    
    model = Sequential()
    model.add(SimpleRNN(units=8, activation='relu', input_shape=(n_timesteps,n_features)))
    model.add(Dropout(0.3))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(2, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=keras.metrics.SpecificityAtSensitivity(sensitivity=0.98))
    
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)


    best_threshold = 0.0
    best_tss = 0.0
    y_pred = model.predict(X_test)
    # evaluate model
    for i in range(1, 10):

        threshold = i / 10 # Adjust the threshold as needed
        y_pred_binary = (y_pred > threshold).astype(int)
        confusion = confusion_matrix(Y_test, y_pred_binary)
        tn, fp, fn, tp = confusion.ravel()
        tss = TSS(tp,tn,fp,fn)
        if tss > best_tss:
            best_tss = tss
            best_threshold = i / 10
        
    
    print(str(X_train.shape)+': RNN Classifier is Done! \n')


    threshold = best_threshold # Adjust the threshold as needed
    y_pred_binary = (y_pred > threshold).astype(int)
    confusion = confusion_matrix(Y_test, y_pred_binary)
    tn, fp, fn, tp = confusion.ravel()

    tss = TSS(tp,tn,fp,fn)
    hss1 = HSS1(tp,tn,fp,fn)
    hss2 = HSS2(tp,tn,fp,fn)
    gss = GSS(tp,tn,fp,fn)
    recall = Recall(tp,tn,fp,fn)
    precision = Precision(tp,tn,fp,fn)
    
    output_values = np.array([tp, fn, fp, tn, tss, hss1, hss2, gss, recall, precision])


    #joblib.dump(classifier, data_dir + "mlp_model.pkl")

    #loaded_mlp_model = joblib.load(data_dir + "mlp_model.pkl")
    
    return output_values

In [187]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.metrics import confusion_matrix

def cnn_model(X_train, Y_train, X_test, Y_test):
    
    data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/models/"
    
    verbose, epochs, batch_size = 0, 10, 64
    n_timesteps, n_features = 60, 6
    
    model = Sequential()
    model.add(Conv1D(filters=16, kernel_size=5, activation='relu', input_shape=(n_timesteps,n_features)))
    model.add(Dropout(0.2))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
    model.add(Dropout(0.3))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(2, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=keras.metrics.SpecificityAtSensitivity(sensitivity=0.98))
    
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    best_threshold = 0.0
    best_tss = 0.0
    y_pred = model.predict(X_test)
    # evaluate model
    for i in range(1, 10):

        threshold = i / 10 # Adjust the threshold as needed
        y_pred_binary = (y_pred > threshold).astype(int)
        confusion = confusion_matrix(Y_test, y_pred_binary)
        tn, fp, fn, tp = confusion.ravel()
        tss = TSS(tp,tn,fp,fn)
        if tss > best_tss:
            best_tss = tss
            best_threshold = i / 10
        
    
    print(str(X_train.shape)+': CNN Classifier is Done! \n')


    threshold = best_threshold # Adjust the threshold as needed
    y_pred_binary = (y_pred > threshold).astype(int)
    confusion = confusion_matrix(Y_test, y_pred_binary)
    tn, fp, fn, tp = confusion.ravel()

    tss = TSS(tp,tn,fp,fn)
    hss1 = HSS1(tp,tn,fp,fn)
    hss2 = HSS2(tp,tn,fp,fn)
    gss = GSS(tp,tn,fp,fn)
    recall = Recall(tp,tn,fp,fn)
    precision = Precision(tp,tn,fp,fn)
    
    output_values = np.array([tp, fn, fp, tn, tss, hss1, hss2, gss, recall, precision])


    #joblib.dump(classifier, data_dir + "mlp_model.pkl")

    #loaded_mlp_model = joblib.load(data_dir + "mlp_model.pkl")
    
    return output_values

In [155]:
# Import necessary libraries
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

def svm_model(X_train, Y_train, X_test, Y_test):
    
    data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/models/"


    # Create an SVM classifier (you can choose different kernels like 'linear', 'rbf', etc.)
    svm_classifier = SVC(kernel='rbf', C=1.0)
    svm_classifier.fit(X_train[:,1,:], Y_train)
    y_pred = svm_classifier.predict(X_test[:,1,:])
    
    
    print(str(X_train.shape)+': SVM Classifier is Done! \n')
    

    confusion = confusion_matrix(Y_test, y_pred)
    tn, fp, fn, tp = confusion.ravel()

    tss = TSS(tp,tn,fp,fn)
    hss1 = HSS1(tp,tn,fp,fn)
    hss2 = HSS2(tp,tn,fp,fn)
    gss = GSS(tp,tn,fp,fn)
    recall = Recall(tp,tn,fp,fn)
    precision = Precision(tp,tn,fp,fn)
    
    output_values = np.array([tp, fn, fp, tn, tss, hss1, hss2, gss, recall, precision])


    #joblib.dump(classifier, data_dir + "svm_model.pkl")

    #loaded_svm_model = joblib.load(data_dir + "svm_model.pkl")
    
    return output_values

In [188]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import confusion_matrix

def lstm_model(X_train, Y_train, X_test, Y_test):
    
    data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/models/"
    
    verbose, epochs, batch_size = 0, 10, 64
    n_timesteps, n_features = 60, 6

    model = Sequential()
    model.add(LSTM(8, activation='tanh', input_shape=(n_timesteps,n_features)))
    model.add(Dropout(0.3))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=keras.metrics.SpecificityAtSensitivity(sensitivity=0.98))
    
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    best_threshold = 0.0
    best_tss = 0.0
    y_pred = model.predict(X_test)
    # evaluate model
    for i in range(1, 10):

        threshold = i / 10 # Adjust the threshold as needed
        y_pred_binary = (y_pred > threshold).astype(int)
        confusion = confusion_matrix(Y_test, y_pred_binary)
        tn, fp, fn, tp = confusion.ravel()
        tss = TSS(tp,tn,fp,fn)
        if tss > best_tss:
            best_tss = tss
            best_threshold = i / 10
        
    
    print(str(X_train.shape)+': LSTM Classifier is Done! \n')


    threshold = best_threshold # Adjust the threshold as needed
    y_pred_binary = (y_pred > threshold).astype(int)
    confusion = confusion_matrix(Y_test, y_pred_binary)
    tn, fp, fn, tp = confusion.ravel()

    tss = TSS(tp,tn,fp,fn)
    hss1 = HSS1(tp,tn,fp,fn)
    hss2 = HSS2(tp,tn,fp,fn)
    gss = GSS(tp,tn,fp,fn)
    recall = Recall(tp,tn,fp,fn)
    precision = Precision(tp,tn,fp,fn)
    
    output_values = np.array([tp, fn, fp, tn, tss, hss1, hss2, gss, recall, precision])


    #joblib.dump(classifier, data_dir + "mlp_model.pkl")

    #loaded_mlp_model = joblib.load(data_dir + "mlp_model.pkl")
    
    return output_values

In [157]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import confusion_matrix

def lstm_WFS_model(X_train, Y_train, X_test, Y_test):
    
    data_dir = "/Users/samskanderi/Documents/Research_Project/SWANSF/code/models/"
    
    verbose, epochs, batch_size = 0, 10, 64
    n_timesteps, n_features = 60, 24

    model = Sequential()
    model.add(LSTM(24, activation='tanh', input_shape=(n_timesteps,n_features)))
    model.add(Dropout(0.3))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=keras.metrics.SpecificityAtSensitivity(sensitivity=0.98))
    
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    best_threshold = 0.0
    best_tss = 0.0
    y_pred = model.predict(X_test)
    # evaluate model
    for i in range(1, 1000):

        threshold = i / 1000 # Adjust the threshold as needed
        y_pred_binary = (y_pred > threshold).astype(int)
        confusion = confusion_matrix(Y_test, y_pred_binary)
        tn, fp, fn, tp = confusion.ravel()
        tss = TSS(tp,tn,fp,fn)
        if tss > best_tss:
            best_tss = tss
            best_threshold = i / 1000
        lstm_WFS_model
    
    print(str(X_train.shape)+': LSTM Classifier is Done! \n')


    threshold = best_threshold # Adjust the threshold as needed
    y_pred_binary = (y_pred > threshold).astype(int)
    confusion = confusion_matrix(Y_test, y_pred_binary)
    tn, fp, fn, tp = confusion.ravel()

    tss = TSS(tp,tn,fp,fn)
    hss1 = HSS1(tp,tn,fp,fn)
    hss2 = HSS2(tp,tn,fp,fn)
    gss = GSS(tp,tn,fp,fn)
    recall = Recall(tp,tn,fp,fn)
    precision = Precision(tp,tn,fp,fn)
    
    output_values = np.array([tp, fn, fp, tn, tss, hss1, hss2, gss, recall, precision])


    #joblib.dump(classifier, data_dir + "mlp_model.pkl")

    #loaded_mlp_model = joblib.load(data_dir + "mlp_model.pkl")
    
    return output_values

In [189]:
def save_results(result, name):
    data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/results/"

    with open(data_dir + name + ".pkl", 'wb') as f:
        pickle.dump(result, f)
    for i in range(4):
        print("TSS: " + str(result[i][6]) + "    Recall: " + str(result[i][10]))

In [190]:
import os
import pickle
import numpy as np
import pandas as pd
data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
processed_data_dir = "/Users/samskanderi/MLP-ContrastiveLR-SWANSF/I_Data/"
os.makedirs(processed_data_dir, exist_ok=True)
data = []
labels = []
flare_type_labels_list = []

num_partitions = 5

# Load processed data
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_FS_CCBR_OUS_normalized_data.pkl", 'rb') as f:
        data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_FS_CCBR_OUS_binary_labels.pkl", 'rb') as f:
        labels.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_FS_CCBR_OUS_flare_type_labels.pkl", 'rb') as f:
        flare_type_labels_list.append(pickle.load(f))

test_data = []
test_labels = []
test_flare_type_labels_list = []

# Load processed data
for i in range(num_partitions):
    with open(data_dir + "Partition" + str(i+1) + "_normalized_data.pkl", 'rb') as f:
        test_data.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_binary_labels.pkl", 'rb') as f:
        test_labels.append(pickle.load(f))
    with open(data_dir + "Partition" + str(i+1) + "_flare_type_labels.pkl", 'rb') as f:
        test_flare_type_labels_list.append(pickle.load(f))

In [22]:
lstm_data = kfold_training('LSTM', data, labels, test_data, test_labels, lstm_WFS_model, 4)

(47002, 60, 24): LSTM Classifier is Done! 

(53511, 60, 24): LSTM Classifier is Done! 

(25531, 60, 24): LSTM Classifier is Done! 

(30167, 60, 24): LSTM Classifier is Done! 



In [23]:
save_results(lstm_data, "lstm_data")

TSS: 0.09508594289574356    Recall: 0.17512508934953538
TSS: 0.1479381717185041    Recall: 0.28370786516853935
TSS: 0.22318869232934357    Recall: 0.9356223175965666
TSS: 0.22674642115515373    Recall: 0.41515151515151516


In [59]:
lstm_normalized = kfold_training('LSTM', data, labels, test_data, test_labels, lstm_WFS_model, 4)

(47002, 60, 24): LSTM Classifier is Done! 

(53511, 60, 24): LSTM Classifier is Done! 

(25531, 60, 24): LSTM Classifier is Done! 

(30167, 60, 24): LSTM Classifier is Done! 



In [60]:
save_results(lstm_normalized, "lstm_normalized")

TSS: 0.6000649917053344    Recall: 0.8842030021443888
TSS: 0.5971610127227721    Recall: 0.764747191011236
TSS: 0.6985251276144046    Recall: 0.8729613733905579
TSS: 0.7259134706678673    Recall: 0.902020202020202


In [54]:
lstm_smote_normalized = kfold_training('LSTM', data, labels, test_data, test_labels, lstm_WFS_model, 4)

(91496, 60, 24): LSTM Classifier is Done! 

(104224, 60, 24): LSTM Classifier is Done! 

(48214, 60, 24): LSTM Classifier is Done! 

(58004, 60, 24): LSTM Classifier is Done! 



In [55]:
save_results(lstm_smote_normalized, "lstm_smote_normalized")

TSS: 0.5624997393857579    Recall: 0.6847748391708363
TSS: 0.5879832388186936    Recall: 0.8167134831460674
TSS: 0.7153310131342133    Recall: 0.9184549356223176
TSS: 0.5796492517656859    Recall: 0.7717171717171717


In [39]:
lstm_OUS_normalized = kfold_training('LSTM', data, labels, test_data, test_labels, lstm_WFS_model, 4)

(14517, 60, 24): LSTM Classifier is Done! 

(13147, 60, 24): LSTM Classifier is Done! 

(13408, 60, 24): LSTM Classifier is Done! 

(12204, 60, 24): LSTM Classifier is Done! 



In [40]:
save_results(lstm_OUS_normalized, "lstm_OUS_normalized")

TSS: 0.6463976226055587    Recall: 0.9027877055039314
TSS: 0.5352514864674022    Recall: 0.6306179775280899
TSS: 0.6369930977085196    Recall: 0.9424892703862661
TSS: 0.7385911265474638    Recall: 0.9565656565656566


In [44]:
lstm_CCBR_OUS_normalized = kfold_training('LSTM', data, labels, test_data, test_labels, lstm_WFS_model, 4)

(9981, 60, 24): LSTM Classifier is Done! 

(9039, 60, 24): LSTM Classifier is Done! 

(10375, 60, 24): LSTM Classifier is Done! 

(9268, 60, 24): LSTM Classifier is Done! 



In [45]:
save_results(lstm_CCBR_OUS_normalized, "lstm_CCBR_OUS_normalized")

TSS: 0.6456352024989119    Recall: 0.8777698355968548
TSS: 0.6365094897607716    Recall: 0.8679775280898876
TSS: 0.7346839184984431    Recall: 0.8918454935622318
TSS: 0.7447808589227631    Recall: 0.9363636363636364


In [191]:
lstm_FS_CCBR_OUS_normalized = kfold_training('LSTM', data, labels, test_data, test_labels, lstm_model, 4)

(9981, 60, 6): LSTM Classifier is Done! 

(9050, 60, 6): LSTM Classifier is Done! 

(10375, 60, 6): LSTM Classifier is Done! 

(9268, 60, 6): LSTM Classifier is Done! 



In [192]:
save_results(lstm_FS_CCBR_OUS_normalized, "lstm_FS_CCBR_OUS_normalized")

TSS: 0.7859160536932386    Recall: 0.9357601713062098
TSS: 0.7616592737267806    Recall: 0.9403089887640449
TSS: 0.8248079198928677    Recall: 0.9527896995708155
TSS: 0.8297956823218593    Recall: 0.9515151515151515


In [193]:
gru_FS_CCBR_OUS_normalized = kfold_training('GRU', data, labels, test_data, test_labels, gru_model, 4)

(9981, 60, 6): GRU Classifier is Done! 

(9050, 60, 6): GRU Classifier is Done! 

(10375, 60, 6): GRU Classifier is Done! 

(9268, 60, 6): GRU Classifier is Done! 



In [194]:
save_results(gru_FS_CCBR_OUS_normalized, "gru_FS_CCBR_OUS_normalized")

TSS: 0.7967563340008494    Recall: 0.9179157744468237
TSS: 0.7554087982415667    Recall: 0.8883426966292135
TSS: 0.8321155126712079    Recall: 0.9339055793991416
TSS: 0.8358704538861197    Recall: 0.9585858585858585


In [195]:
cnn_FS_CCBR_OUS_normalized = kfold_training('CNN', data, labels, test_data, test_labels, cnn_model, 4)

(9981, 60, 6): CNN Classifier is Done! 

(9050, 60, 6): CNN Classifier is Done! 

(10375, 60, 6): CNN Classifier is Done! 

(9268, 60, 6): CNN Classifier is Done! 



In [196]:
save_results(cnn_FS_CCBR_OUS_normalized, "cnn_FS_CCBR_OUS_normalized")

TSS: 0.7223450563966328    Recall: 0.8979300499643112
TSS: 0.6988357385359488    Recall: 0.8033707865168539
TSS: 0.8238489555743317    Recall: 0.9527896995708155
TSS: 0.8028013439011797    Recall: 0.9181818181818182


In [197]:
rnn_FS_CCBR_OUS_normalized = kfold_training('RNN', data, labels, test_data, test_labels, rnn_model, 4)

(9981, 60, 6): RNN Classifier is Done! 

(9050, 60, 6): RNN Classifier is Done! 

(10375, 60, 6): RNN Classifier is Done! 

(9268, 60, 6): RNN Classifier is Done! 



In [198]:
save_results(rnn_FS_CCBR_OUS_normalized, "rnn_FS_CCBR_OUS_normalized")

TSS: 0.7990566672923156    Recall: 0.913633119200571
TSS: 0.7776166601261794    Recall: 0.922752808988764
TSS: 0.8272890514296063    Recall: 0.9725321888412017
TSS: 0.8241373928784765    Recall: 0.9626262626262626


In [199]:
svm_FS_CCBR_OUS_normalized = kfold_training('SVM', data, labels, test_data, test_labels, svm_model, 4)

(9981, 60, 6): SVM Classifier is Done! 

(9050, 60, 6): SVM Classifier is Done! 

(10375, 60, 6): SVM Classifier is Done! 

(9268, 60, 6): SVM Classifier is Done! 



In [200]:
save_results(svm_FS_CCBR_OUS_normalized, "svm_FS_CCBR_OUS_normalized")

TSS: 0.798096131099491    Recall: 0.939329050678087
TSS: 0.7179384767287273    Recall: 0.8230337078651685
TSS: 0.7828193105094566    Recall: 0.9210300429184549
TSS: 0.8301845675402242    Recall: 0.9494949494949495
