## Importing libraries

In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

## Importing functions

In [None]:
from pre_processing_functions import (
    remove_attributes, convert_to_numeric, handle_categorical_values, handle_categoricals, normalize
)

## Selecting the dataset

In [None]:
# Identifiers of the datasets this notebook can be pointed at.
datasets = [
    'cic_ids_2017',
    'nsl_kdd',
    'unsw_nb15',
]

# Index 2 selects 'unsw_nb15'; change the index to work on another dataset.
dataset = datasets[2]

## Loading the data

In [None]:
# Read the raw, unsplit dataset from a single CSV file named after `dataset`.
df = pd.read_csv(filepath_or_buffer=f"../untreated-datasets/{dataset}.csv")

# Alternative for datasets that ship as separate train/test files:
# train = pd.read_csv(f"../untreated-datasets/{dataset}_train.csv")
# test = pd.read_csv(f"../untreated-datasets/{dataset}_test.csv")

## Splitting into training and testing

In [None]:
# Remove rows that are exact duplicates (all columns equal), keeping the first occurrence.
df = df.drop_duplicates()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Discard the row labels inherited from `df` so each split gets a fresh 0..n-1 index.
train, test = (part.reset_index(drop=True) for part in (train, test))

## Pre-processing

In [None]:
# Columns handed to remove_attributes for both splits
# (presumably timestamps, endpoint addresses/ports and the binary label — confirm
# against the dataset description).
atributos = [
    'Stime', 'Ltime',
    'srcip', 'dstip',
    'sport', 'dsport',
    'Label',
]

# Apply the same column removal to train and test so their schemas stay aligned.
train, test = remove_attributes(train, atributos), remove_attributes(test, atributos)

In [None]:
# Run convert_to_numeric on the 'ct_ftp_cmd' column of each split.
train, test = (
    convert_to_numeric(train, 'ct_ftp_cmd'),
    convert_to_numeric(test, 'ct_ftp_cmd'),
)

In [None]:
# Rows of normal traffic have no attack category (NaN), so label them 'Normal';
# str.strip() then removes leading/trailing whitespace from every category name.
train['attack_cat'] = train['attack_cat'].fillna(value='Normal').str.strip()
test['attack_cat'] = test['attack_cat'].fillna(value='Normal').str.strip()

In [None]:
# Names of the columns treated as categorical by the steps that follow.
categorical_columns = [
    'proto',
    'service',
    'state',
]

In [None]:
# Run handle_categorical_values over the categorical columns of each split independently.
train, test = (
    handle_categorical_values(part, categorical_columns) for part in (train, test)
)

In [None]:
# Stack train on top of test and process the categorical columns in one pass
# (presumably so both splits end up with the same encoded columns — confirm in
# pre_processing_functions.handle_categoricals).
df = handle_categoricals(
    pd.concat([train, test], ignore_index=True),
    categorical_columns,
)

# Split the combined frame back at the training-row boundary. The right-hand side
# is evaluated before either name is rebound, so both slices use the same count.
train, test = df[:len(train)], df[len(train):]

# Give each split its own 0..n-1 index again.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Remaining missing values (expected in 'ct_flw_http_mthd' and 'is_ftp_login')
# are replaced with 0 in both splits.
train, test = train.fillna(value=0), test.fillna(value=0)

## Transforming the datasets for binary classification (optional)

In [None]:
# binary_train = train.copy()
# binary_train['attack_cat'] = binary_train['attack_cat'].map(lambda a: 'normal' if a == 'Normal' else 'attack')

# binary_test = test.copy()
# binary_test['attack_cat'] = binary_test['attack_cat'].map(lambda a: 'normal' if a == 'Normal' else 'attack')

## Generating a file for the datasets without normalization

In [None]:
# DataFrame.to_csv does not create missing parent directories and raises OSError
# when the target folder is absent, so make sure the output folder exists first.
os.makedirs("../processed-datasets", exist_ok=True)

# Save the pre-processed (not yet normalized) splits, without the index column.
train.to_csv(f"../processed-datasets/{dataset}_train_processed.csv", index=False)
test.to_csv(f"../processed-datasets/{dataset}_test_processed.csv", index=False)

# binary_train.to_csv(f"../processed-datasets/binary_{dataset}_train_processed.csv", index=False)
# binary_test.to_csv(f"../processed-datasets/binary_{dataset}_test_processed.csv", index=False)

## Normalization

In [None]:
# Normalize both splits with the project helper; it receives train and test together
# (presumably so the scaling is derived from train and reused on test — confirm in
# pre_processing_functions.normalize).
train, test = normalize(
    train,
    test,
)

# binary_train, binary_test = normalize(binary_train, binary_test)

## Generating a file for the normalized datasets

In [None]:
# Save the normalized splits next to the non-normalized ones, without the index column.
train.to_csv(
    path_or_buf=f"../processed-datasets/{dataset}_train_normalized.csv",
    index=False,
)
test.to_csv(
    path_or_buf=f"../processed-datasets/{dataset}_test_normalized.csv",
    index=False,
)

# binary_train.to_csv(f"../processed-datasets/binary_{dataset}_train_normalized.csv", index=False)
# binary_test.to_csv(f"../processed-datasets/binary_{dataset}_test_normalized.csv", index=False)