In [None]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import warnings
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): this silences every warning category for the rest of the
# kernel session, including deprecation warnings raised by the libraries
# imported above. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore") # Ignore warnings

In [None]:
# Functions

def load_data(path):
    """Load the selected uORB topic logs found in a directory.

    Every file in ``path`` whose name ends with one of the known uORB topic
    suffixes is read with ``pandas.read_csv`` and reduced to the feature
    columns selected for that topic. Columns holding a single distinct value
    and rows containing NaN are then removed, and the index is reset.

    Parameters
    ----------
    path : str
        Directory holding the exported ``.csv`` logs. A trailing path
        separator is optional.

    Returns
    -------
    list of pandas.DataFrame
        One cleaned DataFrame per matching file, in sorted file-name order.
        Files that are empty after cleaning are skipped.

    Raises
    ------
    KeyError
        If a matching file lacks one of the selected columns.
    """
    # uORB topic file suffix -> feature columns kept from that topic
    topic_columns = {
        'vehicle_gps_position_0.csv': [
            'timestamp', 'lat', 'lon', 'alt', 'eph', 'epv', 'hdop', 'vdop',
            'noise_per_ms', 'jamming_indicator', 'vel_m_s', 'cog_rad'],
        'vehicle_local_position_0.csv': [
            'timestamp', 'x', 'y', 'z', 'vx', 'vy', 'vz', 'z_deriv',
            'ax', 'ay', 'az', 'heading', 'evh'],
        'vehicle_attitude_setpoint_0.csv': [
            'timestamp', 'roll_body', 'pitch_body', 'yaw_body',
            'yaw_sp_move_rate', 'thrust_body[2]'],
        'rate_ctrl_status_0.csv': [
            'timestamp', 'rollspeed_integ', 'pitchspeed_integ', 'yawspeed_integ'],
        'vehicle_attitude_0.csv': [
            'timestamp', 'q[0]', 'q[1]', 'q[2]', 'q[3]'],
        'vehicle_angular_velocity_0.csv': [
            'timestamp', 'xyz[0]', 'xyz[1]', 'xyz[2]'],
        'vehicle_magnetometer_0.csv': [
            'timestamp', 'magnetometer_ga[0]', 'magnetometer_ga[1]',
            'magnetometer_ga[2]'],
        'vehicle_air_data_0.csv': [
            'timestamp', 'baro_pressure_pa', 'baro_alt_meter'],
        'battery_status_0.csv': [
            'timestamp', 'temperature'],
    }
    topic_suffixes = tuple(topic_columns)

    dfs = []
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(topic_suffixes):
            continue  # Not one of the topics used

        # os.path.join works whether or not `path` ends with a separator
        df = pd.read_csv(os.path.join(path, filename))

        # Keep only the features selected for this topic
        for suffix, selected in topic_columns.items():
            if filename.endswith(suffix):
                df = df[selected]
                break

        # Drop constant columns (exactly one distinct non-NaN value)
        df = df.loc[:, df.nunique() != 1]
        # Drop rows containing NaN and renumber the remaining rows from 0
        df = df.dropna().reset_index(drop=True)

        if not df.empty:
            dfs.append(df)

    return dfs



def find_min_max_timestamp(dfs):
    """Return the smallest and largest ``timestamp`` across several DataFrames.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        DataFrames that each contain a ``timestamp`` column.

    Returns
    -------
    tuple
        ``(minimum, maximum)`` timestamp over all DataFrames.

    Raises
    ------
    ValueError
        If ``dfs`` is empty.
    KeyError
        If a DataFrame has no ``timestamp`` column.
    """
    if not dfs:
        raise ValueError('find_min_max_timestamp() needs at least one DataFrame')

    # Reduce the column itself instead of indexing a one-element Series with
    # [0]: integer keys on a label-indexed Series are a deprecated positional
    # fallback in recent pandas.
    lowest = [df['timestamp'].min() for df in dfs]
    highest = [df['timestamp'].max() for df in dfs]

    # Local names avoid shadowing the builtins min() and max()
    return min(lowest), max(highest)



def calculate_ideal_time_window(dfs_list):
    """Estimate a common sampling period from the logged timestamps.

    For every DataFrame the mean gap between consecutive timestamps is
    computed; those means are averaged within each list, the per-list averages
    are averaged again, and the result is rounded to the nearest multiple of
    100000 (timestamp units).

    Parameters
    ----------
    dfs_list : list of list of pandas.DataFrame
        One list of DataFrames per class; every DataFrame has a ``timestamp``
        column whose rows are in logging order.

    Returns
    -------
    int
        Mean sampling interval rounded to a multiple of 100000.

    Raises
    ------
    ValueError
        If no DataFrame contains at least two timestamps.
    """
    group_means = []
    for dfs in dfs_list:
        frame_means = []
        for df in dfs:
            stamps = df['timestamp'].to_numpy()  # Positional, index-agnostic
            if len(stamps) < 2:
                continue  # A single sample defines no interval
            # The consecutive differences telescope to last - first, and
            # n samples define n - 1 intervals (not n).
            frame_means.append(float(stamps[-1] - stamps[0]) / (len(stamps) - 1))
        if frame_means:
            group_means.append(sum(frame_means) / len(frame_means))

    if not group_means:
        raise ValueError('no DataFrame with at least two timestamps was given')

    overall_mean = sum(group_means) / len(group_means)
    # Round to the nearest multiple of 100000
    return round(overall_mean / 100000) * 100000



def merge_dfs(dfs, time_window=200000): # Ideal time window already inserted
    """Resample several topic DataFrames onto one shared time grid.

    The grid starts at the smallest timestamp found in ``dfs``, advances in
    steps of ``time_window`` and ends with the largest timestamp. For every
    DataFrame, the rows falling in ``[grid[i], grid[i + 1])`` are averaged
    (the last bucket takes everything at or after the final grid point) and
    written to the corresponding row of the merged frame. Remaining gaps are
    filled with nearest-neighbour interpolation and rows that still contain
    NaN are dropped.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Topic DataFrames. The first column of each is expected to be
        ``timestamp``; all remaining columns are treated as features.
    time_window : int or float, default 200000
        Width of one bucket, in timestamp units. Must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per retained bucket with ``timestamp`` followed by the
        feature columns of every input, index reset to 0..n-1.

    Raises
    ------
    ValueError
        If ``time_window`` is not positive (the grid would never reach the
        maximum timestamp) or ``dfs`` is empty.
    """
    if time_window <= 0:
        raise ValueError('time_window must be positive')

    # Minimum and maximum timestamp over all topics
    min_timestamp, max_timestamp = find_min_max_timestamp(dfs)

    # Bucket start times: min, min + w, ... (all < max), then max itself
    timestamps = []
    timestamp = min_timestamp
    while timestamp < max_timestamp:
        timestamps.append(timestamp)
        timestamp += time_window
    timestamps.append(max_timestamp)
    edges = np.asarray(timestamps)

    # Column name -> per-bucket values; dict order gives the column order
    merged = {'timestamp': timestamps}

    for df in dfs:
        value_columns = list(df.columns)[1:]  # Everything after the timestamp
        if not value_columns:
            continue

        # Index of the bucket each row belongs to: side='right' makes every
        # bucket closed on the left and open on the right, and anything at or
        # beyond the last edge lands in the last bucket.
        bucket = np.searchsorted(edges, df['timestamp'].to_numpy(), side='right') - 1

        # Mean per bucket; buckets without samples become NaN rows
        bucket_means = (
            df[value_columns]
            .groupby(bucket)
            .mean()
            .reindex(range(len(edges)))
        )
        for column in value_columns:
            merged[column] = bucket_means[column].to_numpy()

    merge_df = pd.DataFrame(merged)

    # Fill interior gaps with the nearest valid value (pandas delegates the
    # 'nearest' method to SciPy)
    merge_df = merge_df.interpolate(method='nearest')
    # Drop the rows that still contain NaN
    merge_df = merge_df.dropna()
    return merge_df.reset_index(drop=True)



def merge_single_df(benign_df, jamming_df, spoofing_df):
    """Stack the three per-class DataFrames into one labelled DataFrame.

    A ``class`` column is added (0 = benign, 1 = GPS jamming, 2 = GPS
    spoofing), the frames are concatenated with a fresh 0..n-1 index, and
    every column that contains a NaN after concatenation (for example a
    feature missing from one of the classes) is removed.

    The input DataFrames are not modified.

    Parameters
    ----------
    benign_df, jamming_df, spoofing_df : pandas.DataFrame
        Merged data of the respective class.

    Returns
    -------
    pandas.DataFrame
        Concatenated data with the ``class`` label column.
    """
    class_labels = (
        (benign_df, 0),    # Benign
        (jamming_df, 1),   # GPS Jamming
        (spoofing_df, 2),  # GPS Spoofing
    )
    # assign() returns labelled copies, leaving the caller's frames untouched;
    # 'class' is a Python keyword, hence the dict unpacking.
    labelled = [frame.assign(**{'class': label}) for frame, label in class_labels]

    # Concatenate the three DataFrames
    df = pd.concat(labelled).reset_index(drop=True)
    # Drop every column that contains NaN
    return df.dropna(axis=1)



def split_x_y(df, columns_to_drop):
    """Separate a labelled DataFrame into features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``class`` column.
    columns_to_drop : list
        Columns removed before the split.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every remaining column except ``class``
        and ``y`` is the ``class`` column.
    """
    remaining = df.drop(columns=columns_to_drop)
    # Labels
    labels = remaining['class']
    # Features: every column that is not the label
    is_feature = [name != 'class' for name in remaining.columns]
    features = remaining.loc[:, is_feature]
    return features, labels



def normalize_data(df):
    """Min-max scale every feature column of a labelled DataFrame.

    The ``class`` and ``timestamp`` columns are carried over unchanged; all
    other columns are scaled with ``MinMaxScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``class`` and ``timestamp`` columns plus features.

    Returns
    -------
    pandas.DataFrame
        ``class``, ``timestamp`` and then the scaled feature columns.
    """
    # Features only: the timestamp is not normalized and the labels are not needed
    X, _ = split_x_y(df, ['timestamp'])
    X_norm = MinMaxScaler().fit_transform(X)
    # Reuse the original index so the join below lines rows up correctly even
    # when `df` does not carry a default 0..n-1 index.
    norm_df = pd.DataFrame(X_norm, columns=X.columns, index=X.index)
    return df[['class', 'timestamp']].join(norm_df)



def drop_highly_correlated_features(X, threshold=0.95):
    """Remove features that are highly correlated with an earlier feature.

    For each pair of columns whose absolute correlation exceeds ``threshold``,
    the column that appears later in ``X`` is dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is modified in place.
    threshold : float, default 0.95
        Absolute correlation above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object, after the columns were dropped.
    """
    # Absolute correlation matrix
    corr_matrix = X.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once.
    # The builtin bool is used because the np.bool alias was removed in
    # NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # Columns correlated above the threshold with any earlier column
    to_drop = [column for column in upper.columns
               if (upper[column] > threshold).any()]
    # In-place drop is kept so callers that ignore the return value still work
    X.drop(columns=to_drop, inplace=True)
    return X

In [None]:
# Main

# Dataset path: defaults to the original location and can be redirected with
# the UAV_DATASET_PATH environment variable. Joining with '' guarantees the
# trailing separator that the string concatenations on this path rely on.
dataset_path = os.path.join(
    os.environ.get('UAV_DATASET_PATH',
                   '/home/leandro/remy-project/centralized/datasets/UAVGPSAttacks/'),
    ''
)

# Path of each category
benign_path = dataset_path + 'Benign Flight/'
jamming_path = dataset_path + 'GPS Jamming/'
spoofing_path = dataset_path + 'GPS Spoofing/'

# Load the per-topic DataFrames of each class
benign_dfs = load_data(benign_path)
jamming_dfs = load_data(jamming_path)
spoofing_dfs = load_data(spoofing_path)

# Display ideal time window
ideal_time_window = calculate_ideal_time_window([benign_dfs, jamming_dfs, spoofing_dfs])
print(f'Ideal Time Window: {ideal_time_window}')

In [None]:
# Merge DataFrames
# Resample the topic frames of each class onto one shared time grid
benign_df, jamming_df, spoofing_df = map(
    merge_dfs, (benign_dfs, jamming_dfs, spoofing_dfs)
)

# The per-topic lists are not used past this point
del benign_dfs, jamming_dfs, spoofing_dfs

# Label the three classes and stack them into a single DataFrame
uav_df = merge_single_df(benign_df, jamming_df, spoofing_df)
# Keep an un-normalized copy on disk (for analysis)
uav_df.to_csv(f'{dataset_path}data.csv', index=False)
# Scale the features
uav_df = normalize_data(uav_df)

# Separate features and labels, keeping every column
X, y = split_x_y(uav_df, [])
# Remove features that are highly correlated with another one
X = drop_highly_correlated_features(X)

# Put the labels back next to the retained features
X['class'] = y.values
# Write the final dataset to .csv
X.to_csv(f'{dataset_path}data_norm.csv', index=False)
# The next step is to apply the data to the unsupervised algorithms