In [None]:
# Imports
import os
import warnings

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import KMeansSMOTE, RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler

# Seed shared by every stochastic step in this notebook
RANDOM_STATE = 42
# Keep library warnings out of the notebook output
warnings.filterwarnings("ignore")
# Seed Keras
keras.utils.set_random_seed(RANDOM_STATE)
# Seed NumPy's global generator
np.random.seed(RANDOM_STATE)

In [None]:
# Functions

def split_train_test(df, size, time_column=' Time'):
    """Split a labelled DataFrame into train and test sets, class by class.

    For every distinct value of the 'class' column, the last
    ``round(n_rows_of_class * size)`` rows (in the current row order of
    ``df``) go to the test set and the remaining rows form the training
    (and validation) set. Both results are sorted by ``time_column``,
    re-indexed from 0 and returned without ``time_column``.

    The input ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a 'class' column and ``time_column``.
    size : float
        Fraction of each class placed in the test set.
    time_column : str, default ' Time'
        Column used to order the rows; dropped from both outputs.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.
    """
    # Collect the per-class test slices and concatenate once, instead of
    # growing an empty placeholder frame inside the loop (repeated copies,
    # and the result dtypes then depend on how pandas treats the empty frame)
    test_parts = []
    for label in np.sort(pd.unique(df['class'])): # For each class
        class_rows = df[df['class'] == label] # Rows of this class only
        # Take a percentage of the class from the end of the DataFrame
        test_parts.append(class_rows.tail(round(len(class_rows) * size)))

    # With no classes at all, fall back to an empty frame with df's schema
    test_df = pd.concat(test_parts) if test_parts else df.iloc[0:0]
    # Build the training frame as a new object so the caller's df is untouched
    train_df = df.drop(index=test_df.index)

    # Sort by time column and reset index
    train_df = train_df.sort_values(by=[time_column]).reset_index(drop=True)
    test_df = test_df.sort_values(by=[time_column]).reset_index(drop=True)
    # Return both frames excluding the time column
    return train_df.drop(columns=[time_column]), test_df.drop(columns=[time_column])



def split_x_y(df):
    """Separate a labelled DataFrame into features and labels.

    Returns a tuple ``(X, y)`` where ``X`` holds every column of ``df``
    except 'class' and ``y`` is the 'class' column.
    """
    # Boolean mask selecting every column that is not the label
    is_feature = df.columns != 'class'
    features = df.loc[:, is_feature]
    labels = df['class']
    return features, labels



def normalize_data(df):
    """Min-max scale every feature column of ``df``.

    The 'class' column is left as is; all other columns are passed through
    a freshly fitted ``MinMaxScaler``. The scaler is fitted on exactly the
    rows given here.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a 'class' column plus numeric feature columns.

    Returns
    -------
    pandas.DataFrame
        New frame with 'class' as the first column followed by the scaled
        features, carrying the same index as ``df``.
    """
    # Only the features are scaled; the labels are taken from df below
    X, _ = split_x_y(df)
    # Apply normalization
    X_norm = MinMaxScaler().fit_transform(X)
    # Reuse the original index: join() aligns on index labels, so a default
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1
    norm_df = pd.DataFrame(X_norm, columns=X.columns, index=X.index)
    # Labels first, then the normalized features
    return df[['class']].join(norm_df)



def generate_gans(X_minor_train, n_features, coding_size, batch_size=32, max_iter=10):
    """Build and train a dense GAN on the samples of one minority class.

    Parameters
    ----------
    X_minor_train : numpy.ndarray
        2-D feature matrix (rows are samples) of the class to learn.
    n_features : int
        Width of the generator output and of the discriminator's first layer.
    coding_size : int
        Dimension of the noise vector fed to the generator.
    batch_size : int, default 32
        Number of real samples per mini-batch.
    max_iter : int, default 10
        Number of training epochs.

    Returns
    -------
    keras.models.Sequential
        The trained GAN; ``gan.layers[0]`` is the generator and
        ``gan.layers[1]`` the discriminator.
    """
    # Build the generator (sigmoid output: one value in [0, 1] per feature)
    generator = keras.models.Sequential([
        keras.layers.Dense(100, activation='selu', input_shape=[coding_size]),
        keras.layers.Dense(200, activation='selu'),
        keras.layers.Dense(300, activation='selu'),
        keras.layers.Dense(400, activation='selu'),
        keras.layers.Dense(500, activation='selu'),
        keras.layers.Dense(n_features, activation='sigmoid')
    ])

    # Build the discriminator
    discriminator = keras.models.Sequential([
        keras.layers.Dense(n_features),
        keras.layers.Dense(500, activation='selu'),
        keras.layers.Dense(400, activation='selu'),
        keras.layers.Dense(300, activation='selu'),
        keras.layers.Dense(200, activation='selu'),
        keras.layers.Dense(100, activation='selu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    # Build a GAN
    gan = keras.models.Sequential([generator, discriminator])
    # Compile the discriminator
    discriminator.compile(loss='binary_crossentropy',
                        optimizer=keras.optimizers.Adam(learning_rate=10 ** -4))
    # Freeze the discriminator
    discriminator.trainable = False
    # Compile the generator
    gan.compile(loss='binary_crossentropy',
                optimizer=keras.optimizers.Adam(learning_rate=10 ** -4))

    # Indices of the training data of the minority class
    idxs_minor_train = np.arange(X_minor_train.shape[0])
    n_samples = len(idxs_minor_train)
    # Number of mini-batches, rounded up so the last (possibly smaller) batch
    # is used too; floor division would skip it and would give zero training
    # steps for a class with fewer than batch_size samples
    n_batch = (n_samples + batch_size - 1) // batch_size
    # One generator for all epochs: re-creating it with the same seed inside
    # the loop would apply the identical permutation every epoch
    shuffler = np.random.RandomState(seed=RANDOM_STATE)

    for _ in range(max_iter): # For each epoch
        # Shuffle the data
        shuffler.shuffle(idxs_minor_train)

        # For each mini-batch
        for i in range(n_batch):
            # Get the first and last index (exclusive) of the mini-batch
            first_idx = i * batch_size
            last_idx = min((i + 1) * batch_size, n_samples)
            # Get the mini-batch
            mb = idxs_minor_train[first_idx : last_idx]
            # Get the real feature matrix
            real_features = X_minor_train[mb, :]
            # Get the noise
            noise = tf.random.normal(shape=[len(mb), coding_size], seed=RANDOM_STATE)
            # Get the generated feature matrix
            gen_features = generator(noise)
            # Combine the generated and real feature matrix
            gen_real_features = tf.concat([gen_features, real_features], axis=0)
            # Target vector: 0 for generated rows, 1 for real rows
            y = tf.constant([[0.]] * len(mb) + [[1.]] * len(mb))
            # Unfreeze the discriminator
            discriminator.trainable = True
            # Train the discriminator
            discriminator.train_on_batch(gen_real_features, y)
            # Get fresh noise for the generator step
            noise = tf.random.normal(shape=[len(mb), coding_size], seed=RANDOM_STATE)
            # Target: the generator is trained to make the discriminator output 1
            y = tf.constant([[1.]] * len(mb))
            # Freeze the discriminator
            discriminator.trainable = False
            # Train the generator
            gan.train_on_batch(noise, y)

    # Return GANs after training
    return gan



def _oversample_with_gans(X_resampled, y_resampled, over_classes, target_count):
    """Grow every class in ``over_classes`` to ``target_count`` rows with GAN samples.

    One GAN is trained per class on that class's current rows; the generated
    rows are appended after the existing data. Returns the augmented feature
    matrix and label vector as NumPy arrays.
    """
    # Set the number of features
    n_features = X_resampled.shape[1]
    # Set the coding size, which is the dimension of the noise used as input for the generator
    coding_size = n_features // 2
    # Convert to NumPy arrays
    y_aug = y_resampled.values
    X_aug = X_resampled.to_numpy()

    for minor_class in over_classes: # For each class that will be oversampled
        # Training feature matrix of the minority class
        X_minor_train = X_aug[y_aug == minor_class]
        # Number of samples to generate
        n_class_diff = target_count - X_minor_train.shape[0]
        # Train the GAN and take its generator (first layer of the GAN)
        gan = generate_gans(X_minor_train, n_features, coding_size)
        generator = gan.layers[0]
        # Generate all missing rows in a single forward pass
        noise = tf.random.normal(shape=[n_class_diff, coding_size], seed=RANDOM_STATE)
        gen_features = np.asarray(generator(noise))
        # Augment the training feature matrix
        X_aug = np.vstack((X_aug, gen_features))
        # Augment the training target vector
        y_aug = np.concatenate((y_aug, np.full(n_class_diff, minor_class)))

    return X_aug, y_aug


def resampling_dataset(X, y, over_tech, verbose=True):
    """Balance a training set by undersampling and, optionally, oversampling.

    With ``over_tech='none'`` only Random Undersampling with its default
    strategy is applied. Otherwise the target size of every class is the
    mean number of samples over all classes except the most frequent one:
    larger classes are randomly undersampled to that size and smaller
    classes are oversampled up to it with the chosen technique.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Class labels.
    over_tech : {'none', 'ros', 'sm', 'k-sm', 'gans'}
        Oversampling technique: none, Random Oversampling, SMOTE,
        K-Means SMOTE or Generative Adversarial Networks.
    verbose : bool, default True
        Print the techniques as they are applied.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` after resampling.

    Raises
    ------
    ValueError
        If ``over_tech`` is not one of the supported values.
    """
    if over_tech == 'none': # Apply only Random Undersampling (RUS)
        if verbose: print('Undersampling: Random Undersampling')
        return RandomUnderSampler(random_state=RANDOM_STATE).fit_resample(X, y)

    # Fail before doing any work instead of silently returning data that was
    # only undersampled
    if over_tech not in ('ros', 'sm', 'k-sm', 'gans'):
        raise ValueError(f"Unknown oversampling technique: {over_tech!r}")

    # Combine the Undersampling and Oversampling methods
    # Number of samples for each class, least frequent first
    value_counts = pd.Series(y).value_counts(ascending=True)
    # Mean number of samples over all classes but the most frequent one
    mean_minority_classes = int(np.mean(value_counts.iloc[:-1].values))

    # Undersampling targets, keyed by the actual class labels: classes above
    # the mean are cut down to it, the others keep their size
    under_strategy = {label: int(min(count, mean_minority_classes))
                      for label, count in value_counts.items()}
    # Classes below the mean will be oversampled (kept in label order)
    over_classes = [label for label, count in value_counts.sort_index().items()
                    if count < mean_minority_classes]
    # After undersampling, every class is brought to the mean
    over_strategy = dict.fromkeys(value_counts.index, mean_minority_classes)

    # Undersampling
    rus = RandomUnderSampler(sampling_strategy=under_strategy,
                             random_state=RANDOM_STATE)
    X_resampled, y_resampled = rus.fit_resample(X, y)
    if verbose: print('Undersampling: Random Undersampling')

    # Oversampling
    if over_tech == 'ros':
        # Random Oversampling (ROS)
        ros = RandomOverSampler(sampling_strategy=over_strategy,
                                random_state=RANDOM_STATE)
        X_resampled, y_resampled = ros.fit_resample(X_resampled, y_resampled)
        if verbose: print('Oversampling: Random Oversampling')
    elif over_tech == 'sm':
        # SMOTE (SM)
        # NOTE(review): n_jobs on SMOTE appears to be deprecated in recent
        # imbalanced-learn releases - confirm against the installed version
        sm = SMOTE(sampling_strategy=over_strategy,
                   random_state=RANDOM_STATE, n_jobs=-1)
        X_resampled, y_resampled = sm.fit_resample(X_resampled, y_resampled)
        if verbose: print('Oversampling: SMOTE')
    elif over_tech == 'k-sm':
        # K-Means SMOTE (K-SM)
        k_sm = KMeansSMOTE(cluster_balance_threshold='auto', sampling_strategy=over_strategy,
                           random_state=RANDOM_STATE, n_jobs=-1)
        X_resampled, y_resampled = k_sm.fit_resample(X_resampled, y_resampled)
        if verbose: print('Oversampling: K-Means SMOTE')
    else:
        # Generative Adversarial Networks (GANs)
        X_aug, y_aug = _oversample_with_gans(X_resampled, y_resampled,
                                             over_classes, mean_minority_classes)
        # Convert back to Pandas DataFrame and Series
        X_resampled = pd.DataFrame(X_aug, columns=X.columns)
        y_resampled = pd.Series(y_aug).astype(int)
        if verbose: print('Oversampling: Generative Adversarial Networks')

    # X and y after resampling the data
    return X_resampled, y_resampled

In [None]:
# Main

# Dataset path
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere
dataset_path = '/home/leandro/remy-project/centralized/datasets/WSN-DS/'

# Load dataset
wsn_df = pd.read_csv(f'{dataset_path}data.csv')
# Sort DataFrame by 'Time' column
wsn_df = wsn_df.sort_values(by=[' Time']).reset_index(drop=True)
# Drop 'id' column
wsn_df = wsn_df.drop(columns=' id')
# Rename class column
wsn_df = wsn_df.rename(columns={"Attack type": "class"})
# Remove TDMA schedule attack
wsn_df = wsn_df[wsn_df['class'] != 'TDMA'].reset_index(drop=True)

# Convert classes to numeric
wsn_df["class"] = wsn_df["class"].map({
    "Normal": 0,
    "Grayhole": 1,
    "Blackhole": 2,
    "Flooding": 3
}.get)

# Normalize the data
# NOTE(review): the scaler is fitted on the full dataset before the split
# below, so the test rows contribute to the min/max used for the training rows
wsn_df = normalize_data(wsn_df)
# Split the data into 60% for training and 40% for test
train_df, test_df = split_train_test(wsn_df, 0.4)
# Save test dataset in .csv
test_df.to_csv(f'{dataset_path}test_data.csv', index=False)
# Split train_df into X and y
X_train, y_train = split_x_y(train_df)

# Folder for the balanced training sets; create it so to_csv cannot fail on
# a missing directory
balanced_path = f'{dataset_path}balanced/'
os.makedirs(balanced_path, exist_ok=True)

# Balance training data with various techniques
for oversampling_tech in ['none', 'ros', 'sm', 'k-sm', 'gans']:
    # Balance data
    X_res, y_res = resampling_dataset(X_train, y_train, oversampling_tech, verbose=False)
    X_res['class'] = y_res.values # Add class column in X_res DataFrame
    # Save balanced dataset in .csv
    X_res.to_csv(f'{balanced_path}data_{oversampling_tech}.csv', index=False)