In [None]:
# Imports
import numpy as np
import pandas as pd

In [None]:
# Functions

def split_train_test(df, size, time_column):
    """Split ``df`` into a training and a test DataFrame, class by class.

    For every distinct value of the ``'class'`` column, the last
    ``round(rows_in_class * size)`` rows (in the current row order of ``df``)
    are assigned to the test set; every other row stays in the training set.
    Both results are sorted by ``time_column`` and re-indexed from 0.
    The time column is kept in both outputs.

    The input ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'class'`` column and ``time_column``. Test rows
        are removed from the training set by index label, so the index of
        ``df`` is expected to be unique.
    size : float
        Fraction of each class to place in the test set (e.g. ``0.2``).
    time_column : str
        Name of the column used to sort both outputs.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``.
    """
    # Collect the per-class test slices and concatenate them once: this keeps
    # the original column dtypes and avoids concatenating onto an empty frame.
    test_parts = []
    for class_label in np.sort(pd.unique(df['class'])):  # For each class
        class_rows = df[df['class'] == class_label]  # Rows of this class only
        # Take the requested share of rows from the end of the class
        test_parts.append(class_rows.tail(round(len(class_rows) * size)))

    if test_parts:
        test_df = pd.concat(test_parts)
    else:
        # No classes at all (empty input): empty test set with the same columns
        test_df = df.iloc[0:0]

    # Build the training set as a new object instead of dropping in place,
    # so the caller's DataFrame is left untouched.
    train_df = df.drop(index=test_df.index)

    # Sort by time column and reset index
    train_df = train_df.sort_values(by=[time_column]).reset_index(drop=True)
    test_df = test_df.sort_values(by=[time_column]).reset_index(drop=True)
    return train_df, test_df



def split_clients_dataset(df, time_column, n_clients=4):
    """Partition ``df`` into ``n_clients`` DataFrames, class by class.

    For every distinct value of the ``'class'`` column, the rows of that
    class are cut (in their current order) into ``n_clients`` consecutive
    chunks of ``round(rows_in_class / n_clients)`` rows; the last client
    receives whatever remains after the other chunks. Each client's result
    is the concatenation of its chunk from every class, in sorted class order.

    Class labels may be any sortable values; they do not have to be the
    integers ``0..k-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'class'`` column.
    time_column : str
        Not used by this function; kept so existing callers keep working.
    n_clients : int, default 4
        Number of client DataFrames to produce.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per client, containing rows from all classes.
    """
    # One list of per-class chunks for every client
    chunks_per_client = [[] for _ in range(n_clients)]

    for class_label in np.sort(pd.unique(df['class'])):  # For each class
        class_df = df[df['class'] == class_label]  # Rows of this class only
        # Approximate number of rows of this class per client
        approx_size = round(len(class_df) / n_clients)

        for client_num in range(n_clients):  # For each client
            first_position = client_num * approx_size
            if client_num != n_clients - 1:  # Not last client
                chunk = class_df.iloc[first_position:first_position + approx_size, :]
            else:  # Last client takes all remaining rows
                chunk = class_df.iloc[first_position:, :]
            # Stored by client position, never looked up by class label
            chunks_per_client[client_num].append(chunk)

    # With no classes at all there is nothing to concatenate; return an empty
    # frame with the same columns for each client instead of raising.
    return [
        pd.concat(chunks) if chunks else df.iloc[0:0].copy()
        for chunks in chunks_per_client
    ]



def save_client_datasets(train_df, test_df, time_column, n_clients=4, save_path=None):
    """Write one CSV per client containing that client's train and test rows.

    ``train_df`` and ``test_df`` are each partitioned with
    ``split_clients_dataset``. For every client, its training and test parts
    are concatenated, sorted by ``time_column``, re-indexed from 0 and written
    to ``f'{save_path}uav_data_{client_num + 1}.csv'`` without the index.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data to distribute across clients.
    test_df : pandas.DataFrame
        Test data to distribute across clients.
    time_column : str
        Name of the column used to sort each client's data.
    n_clients : int, default 4
        Number of client files to write.
    save_path : str, optional
        Prefix prepended verbatim to the file name (include the trailing
        path separator). When omitted, the notebook-level ``save_datapath``
        variable is used, as before.
    """
    if save_path is None:
        # Backward-compatible default: fall back to the notebook global
        save_path = save_datapath

    # The partition does not depend on the client, so compute it once per
    # DataFrame instead of once per client per DataFrame.
    client_parts = []
    if n_clients > 0:
        train_parts = split_clients_dataset(train_df, time_column, n_clients=n_clients)
        test_parts = split_clients_dataset(test_df, time_column, n_clients=n_clients)
        client_parts = list(zip(train_parts, test_parts))

    for client_num, (train_part, test_part) in enumerate(client_parts):  # For each client
        # Concatenate the two parts directly; seeding with an empty DataFrame
        # is unnecessary and can affect the resulting dtypes.
        final_df = pd.concat([train_part, test_part])
        final_df = final_df.sort_values(by=[time_column]).reset_index(drop=True)  # Sort by time column
        final_df.to_csv(f'{save_path}uav_data_{client_num + 1}.csv', index=False)  # Save client dataset

    print('Clients datasets saved!')  # Show that datasets have been saved

In [None]:
# Directory holding the input dataset. It is joined to the file name by plain
# string concatenation below, so it must end with a path separator.
load_datapath = '/home/leandro/remy-project/centralized/datasets/UAVGPSAttacks/'
# Directory where the per-client CSV files are written. save_client_datasets
# reads this notebook-level variable; it must also end with a path separator.
save_datapath = '/home/leandro/remy-project/data/'

# Load the dataset, then hold out the last 20% of each class as the test set
# (the remaining 80% is the training set)
uav_df = pd.read_csv(load_datapath + 'data_norm.csv')
train_df, test_df = split_train_test(uav_df, 0.2, 'timestamp')
# Distribute the train and test data across 4 clients and write one CSV per
# client (uav_data_1.csv ... uav_data_4.csv) under save_datapath
save_client_datasets(train_df, test_df, 'timestamp', 4)