In [34]:
import numpy as np
import pandas as pd

DATASET_PATH = 'wireless_attack/dataset.csv'

In [35]:
def load_dataset(filename):
    '''Reads some columns of the the CSV dataset, already transforming them to categories,
    That is, mimicking the factors function from R. 
    Also skips data from the TELLO API Exploit type of attack.

    Arguments:
        filename: Location of the dataset in csv format in the filesystem.

    Returns:
        The dataset in pandas.DataFrame format.
    '''
    cols_to_read = ['Length', 'Info', 'Type', 'Attack.Scenario']
    dtypes = {'Length': np.int32, 'Info': 'category', 'Type': 'category'}
    dataset = []
    dataset = pd.read_csv(filename, usecols=cols_to_read, dtype=dtypes, skipfooter=209, engine='python')

    return dataset

dataset = load_dataset(DATASET_PATH)
print(dataset.Type.cat.codes)
print(dataset.Info)

0        1
1        1
2        1
3        1
4        1
        ..
54278    1
54279    1
54280    1
54281    1
54282    1
Length: 54283, dtype: int8
0                          Acknowledgement, Flags=........
1                          Acknowledgement, Flags=........
2                          Acknowledgement, Flags=........
3                          Acknowledgement, Flags=........
4        QoS Null function (No data), SN=549, FN=0, Fla...
                               ...                        
54278                      Acknowledgement, Flags=........
54279              QoS Data, SN=1066, FN=1, Flags=.pm..MF.
54280                      Acknowledgement, Flags=........
54281              QoS Data, SN=1066, FN=2, Flags=.pm...F.
54282                      Acknowledgement, Flags=........
Name: Info, Length: 54283, dtype: category
Categories (14101, object): ['Acknowledgement, Flags=........', 'Association Response, SN=430, FN=0, Flags=......, 'Authentication, SN=429, FN=0, Flags=........

In [38]:
deauth_dataset = dataset[dataset['Attack.Scenario'] == 'WPA2-PSK WIFI Cracking Attack']
legit_deauth = deauth_dataset[deauth_dataset.Type == "Normal"]
false_deauth = deauth_dataset[deauth_dataset.Type == "Attack"]
print(f'Legitimate Deauthentication: {legit_deauth.count()}')
print(f'False deathentication: {false_deauth.count()}')
print(f'''Dataset balance:
\tTotal: {len(deauth_dataset)}
\tLegitimate: {(len(legit_deauth) / len(deauth_dataset) * 100):.3f}%
\tFalse: {(len(false_deauth) / len(deauth_dataset) * 100):.3f}%''')

Legitimate Deauthentication: Length             23263
Info               23263
Type               23263
Attack.Scenario    23263
dtype: int64
False deathentication: Length             17263
Info               17263
Type               17263
Attack.Scenario    17263
dtype: int64
Dataset balance:
	Total: 40526
	Legitimate: 57.403%
	False: 42.597%
