## Imports

In [None]:
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

pd.set_option('mode.chained_assignment', None) # Disable pandas' chained-assignment (SettingWithCopy) warning
drive.mount('/content/drive') # Mount Google Drive at /content/drive

## Functions

In [None]:
def txt_to_list(file_path):
    """Read a text file and return its content as a list of lines.

    The content is split on the newline character only, so a file ending
    with a trailing newline yields a final empty string, and an empty file
    yields ``['']``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of strings, one per line, without the newline characters.
    """
    # The context manager closes the file even if reading raises
    with open(file_path, "r") as txt_file:
        data = txt_file.read()
    # Split the text every time a newline ('\n') is seen
    return data.split("\n")



def convert_class_to_numeric(x):
    """Map a class label to a binary target.

    Args:
        x: Class label to convert.

    Returns:
        0 when the label equals 'normal', 1 for any other label.
    """
    # Binary encoding: 'normal' traffic vs. everything else
    return 0 if x == 'normal' else 1



def _to_numeric_if_possible(series):
    """Return ``series`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        # Non-numeric content: keep the column as it is (object type)
        return series


def pre_processing_dataset(df_1, df_2, labels_names):
    """Merge, clean, rescale and split two datasets into training and test sets.

    Steps: keep only the rows whose 'class' is in ``labels_names``,
    concatenate both datasets, drop rows containing '?' or missing values,
    convert columns to numeric where possible, drop constant columns,
    min-max scale a fixed set of columns and, for each label, put the first
    75% of its rows in the training set and the last 25% in the test set
    (no shuffling). Finally the 'class' column is converted to 0 ('normal')
    or 1 (any other label).

    Args:
        df_1: First DataFrame; must contain a 'class' column.
        df_2: Second DataFrame with the same columns as ``df_1``.
        labels_names: Class labels to keep. Labels with no rows left after
            cleaning are skipped.

    Returns:
        Tuple ``(df_train, df_test)`` of processed DataFrames.
    """
    # Select only normal and flooding attacks (also known as Denial of Service (DoS)) labels
    df_1 = df_1[df_1['class'].isin(labels_names)]
    df_2 = df_2[df_2['class'].isin(labels_names)]
    # Concatenate df_1 and df_2
    df = pd.concat([df_1, df_2])
    # Replace '?' by None value and drop all the rows with missing values
    df = df.replace({"?": None}).dropna()

    # Try to convert each column to numeric; columns that fail stay object type
    for column in df.columns:
        df[column] = _to_numeric_if_possible(df[column])

    # Drop constant columns and reset index
    df = df.loc[:, df.nunique() != 1].reset_index(drop=True)
    # Columns to rescale
    columns_to_scale = ['frame.len', 'radiotap.dbm_antsignal', 'wlan.duration']
    # Apply MinMaxScaler to rescale data
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling of the training rows — confirm this is intended
    df[columns_to_scale] = MinMaxScaler().fit_transform(df[columns_to_scale])

    # Per-label training and test pieces, concatenated once at the end
    # (DataFrame.append was removed in pandas 2.0)
    train_parts = []
    test_parts = []

    for label_name in labels_names: # Select each of the labels
        label_rows = df[df['class'] == label_name]
        if label_rows.empty:
            # train_test_split raises on an empty input; nothing to split
            continue
        # Split dataset into training and test based on label
        tmp_train, tmp_test = train_test_split(
            label_rows,
            test_size=0.25, # 25% of data for test
            shuffle=False) # Without shuffle
        train_parts.append(tmp_train)
        test_parts.append(tmp_test)

    if train_parts:
        df_train = pd.concat(train_parts, ignore_index=True)
        df_test = pd.concat(test_parts, ignore_index=True)
    else:
        # No label had any row: return empty datasets with the same columns
        df_train = pd.DataFrame(columns=list(df.columns))
        df_test = pd.DataFrame(columns=list(df.columns))

    # Convert classes to numeric values
    df_train['class'] = df_train['class'].apply(convert_class_to_numeric)
    df_test['class'] = df_test['class'].apply(convert_class_to_numeric)

    # Return processed datasets
    return df_train, df_test

## Main

In [None]:
# Google Drive folder holding the AWID CSV files and the column/feature lists
dataset_path = '/content/drive/MyDrive/DoS_Detection/datasets/'

# Names of all the columns of the CSV files, read from columns.txt
columns = txt_to_list(dataset_path + 'columns.txt')

# Subset of columns (features) actually loaded, read from features.txt
# Selection based on: Pick Quality Over Quantity: Expert Feature Selection and Data Preprocessing for 802.11 Intrusion Detection Systems
# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9797689
features = txt_to_list(dataset_path + 'features.txt')

# Class labels kept for the experiments ('normal' is listed last)
labels_names = [
    'amok',
    'authentication_request',
    'beacon',
    'cts',
    'deauthentication',
    'disassociation',
    'power_saving',
    'probe_request',
    'probe_response',
    'rts',
    'normal',
]

# Load the AWID-ATK-R-Trn dataset, keeping only the selected features
df_awid_trn = pd.read_csv(
    dataset_path + 'AWID-ATK-R-Trn.csv',
    names=columns,
    usecols=features,
)
# Load the AWID-ATK-R-Tst dataset, keeping only the selected features
df_awid_tst = pd.read_csv(
    dataset_path + 'AWID-ATK-R-Tst.csv',
    names=columns,
    usecols=features,
)

In [None]:
# Build the processed training and test datasets from the two loaded AWID files
df_train, df_test = pre_processing_dataset(df_awid_trn, df_awid_tst, labels_names)

In [None]:
# Saving the processed datasets as CSV files in the Google Drive dataset folder
# NOTE(review): to_csv also writes the row index as a first unnamed column by
# default; pass index=False if readers of these files do not expect it — confirm
df_train.to_csv(f'{dataset_path}train_dataset.csv')
df_test.to_csv(f'{dataset_path}test_dataset.csv')