## Importing libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Importing functions

In [None]:
from pre_processing_functions import (
    create_nslkdd_groups, remove_attributes, handle_categorical_values, handle_categoricals, normalize
)

## Selecting the dataset

In [None]:
# Dataset keys known to this notebook; the selected key is interpolated
# into the input and output CSV file names further down.
datasets = [
    'cic_ids_2017',
    'nsl_kdd',
    'unsw_nb15',
]

# Pick the dataset to process by name rather than by position in the list.
dataset = datasets[datasets.index('nsl_kdd')]

## Loading the data

In [None]:
# Read the whole untreated dataset from a single CSV named after `dataset`.
raw_csv_path = f"../untreated-datasets/{dataset}.csv"
df = pd.read_csv(raw_csv_path)

# Alternative input: pre-split train/test files (skip the splitting step if used).
# train = pd.read_csv(f"../untreated-datasets/{dataset}_train.csv")
# test = pd.read_csv(f"../untreated-datasets/{dataset}_test.csv")

## Splitting into training and testing

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes
# the split repeatable across runs.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Replace the index of each split with a fresh 0..n-1 index, discarding
# the row labels inherited from the original frame.
train, test = (split.reset_index(drop=True) for split in (train, test))

## Pre-processing

In [None]:
# Run the same project helper over both splits, train first, then test.
# NOTE(review): presumably adds an attack-group column derived from the
# NSL-KDD labels — confirm in pre_processing_functions.
train, test = map(create_nslkdd_groups, (train, test))

In [None]:
# Column names handed to remove_attributes for both splits.
# NOTE(review): presumably these columns are dropped — confirm in
# pre_processing_functions.
attributes = ['difficulty', 'num_outbound_cmds']

train, test = (remove_attributes(split, attributes) for split in (train, test))

In [None]:
# Column names handed to handle_categorical_values for both splits.
# NOTE(review): the exact treatment of 'service' values is defined in
# pre_processing_functions — confirm there.
attributes = ['service']

train, test = [
    handle_categorical_values(split, attributes) for split in (train, test)
]

In [None]:
# Categorical columns passed to handle_categoricals.
categorical_columns = ['protocol_type', 'service', 'flag']

# Remember where the training rows end so the combined frame can be cut
# back into the two splits afterwards.
n_train_rows = len(train)

# Both splits go through handle_categoricals as one frame.
# NOTE(review): presumably so train and test end up with the same encoded
# columns — confirm in pre_processing_functions.
df = handle_categoricals(
    pd.concat([train, test], ignore_index=True),
    categorical_columns,
)

# Training rows come first in the combined frame, test rows follow.
train = df[:n_train_rows].reset_index(drop=True)
test = df[n_train_rows:].reset_index(drop=True)

## Transforming the datasets for binary classification (optional)

In [None]:
# binary_train = train.copy()
# binary_train['label'] = binary_train['grupo'].map(lambda a: 'normal' if a == 'normal' else 'attack')
# binary_train.drop(['grupo'],axis=1,inplace=True)

# binary_test = test.copy()
# binary_test['label'] = binary_test['grupo'].map(lambda a: 'normal' if a == 'normal' else 'attack')
# binary_test.drop(['grupo'],axis=1,inplace=True)

## Generating a file for the datasets without normalization

In [None]:
# Write the pre-normalization splits to disk, without the index column.
train_processed_path = f"../processed-datasets/{dataset}_train_processed.csv"
test_processed_path = f"../processed-datasets/{dataset}_test_processed.csv"

train.to_csv(train_processed_path, index=False)
test.to_csv(test_processed_path, index=False)

# Optional: binary-classification variants (require the binary_* frames).
# binary_train.to_csv(f"../processed-datasets/binary_{dataset}_train_processed.csv", index=False)
# binary_test.to_csv(f"../processed-datasets/binary_{dataset}_test_processed.csv", index=False)

## Normalization

In [None]:
# normalize takes both splits and returns the pair that replaces them.
# NOTE(review): whether scaling statistics come from train only is decided
# inside pre_processing_functions.normalize — confirm there.
normalized_splits = normalize(train, test)
train, test = normalized_splits

# Optional: same step for the binary-classification frames.
# binary_train, binary_test = normalize(binary_train, binary_test)

## Generating a file for the normalized datasets

In [None]:
# Write the normalized splits to disk, without the index column.
train_normalized_path = f"../processed-datasets/{dataset}_train_normalized.csv"
test_normalized_path = f"../processed-datasets/{dataset}_test_normalized.csv"

train.to_csv(train_normalized_path, index=False)
test.to_csv(test_normalized_path, index=False)

# Optional: binary-classification variants (require the binary_* frames).
# binary_train.to_csv(f"../processed-datasets/binary_{dataset}_train_normalized.csv", index=False)
# binary_test.to_csv(f"../processed-datasets/binary_{dataset}_test_normalized.csv", index=False)