# KDDCup99 Preprocessing
#### This notebook implements data preprocessing on KDDCup datasets. The final outputs of this notebook are `2 different preprocessed datasets.`
#### They are:
* `KDDCup99-preprocessed-sub-features` data, which is a preprocessed dataset derived from the main KDDCup99 dataset that contains only the 16 selected features for binary classification. The selected features are chosen with the Recursive Feature Elimination (RFE) algorithm.
* `KDDCup99-preprocessed-full-features` data, which is a preprocessed dataset derived from the main KDDCup99 dataset that contains all 41 original features for binary classification.

#### Recursive Feature Elimination (RFE) on KDDCup99, keeping n = 16 features

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Step 1: Download the KDDCUP99 Dataset (10% subset)
# Standard-library helpers are imported here so this cell stays self-contained.
import os
import urllib.request

url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
# Store the archive where the preprocessing cells below read it from, so the
# notebook survives "Restart Kernel -> Run All".
dataset_path = "../../data/kddcup99_dataset.gz"

# Download only when the archive is not already cached locally.
if not os.path.exists(dataset_path):
    os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
    # Download to a temporary name first so that an interrupted transfer never
    # leaves a truncated file that would later be mistaken for a valid cache.
    partial_path = dataset_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, dataset_path)

# Step 2: Load the dataset into a Pandas DataFrame
# The raw file has no header row: 41 feature columns followed by the label.
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "target"
]

df = pd.read_csv(dataset_path, names=column_names)

# Step 3: Data preprocessing (convert non-numeric features to numeric)
# Each string column is replaced by its integer category code.
for column in ["protocol_type", "service", "flag", "target"]:
    df[column] = pd.Categorical(df[column]).codes

# Step 4: Perform Recursive Feature Elimination (RFE) with Random Forest
X = df.drop('target', axis=1)
y = df['target']

# Create a Random Forest classifier.
# A fixed seed makes the feature ranking (and therefore the selected subset)
# reproducible between runs.
rf = RandomForestClassifier(random_state=42)

# Create the RFE object and specify the number of features to select.
# step=1 removes one feature per iteration until 16 remain.
rfe = RFE(estimator=rf, n_features_to_select=16, step=1)

# Fit the RFE model to the data
rfe.fit(X, y)

# Get the selected features (rfe.support_ is a boolean mask over X's columns)
selected_features = X.columns[rfe.support_]

print("Selected Features:")
print(selected_features)

Selected Features:
Index(['protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'hot',
       'num_compromised', 'count', 'srv_count', 'same_srv_rate',
       'diff_srv_rate', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate'],
      dtype='object')


#### Preprocessing for all features of KDDCup99 dataset.

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Step 1.1: Load the dataset into a Pandas DataFrame
# The raw file has no header row: 41 feature columns followed by the label.
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "target"
]

df = pd.read_csv("../../data/kddcup99_dataset.gz", names=column_names)

# Step 1.2: Separate categorical and numerical columns
categorical_columns = ['protocol_type', 'service', 'flag']
numerical_columns = [col for col in df.columns if col not in categorical_columns and col != 'target']

# Step 1.3: Perform One-Hot Encoding for categorical columns
# drop='first' removes one indicator column per categorical feature.
# scikit-learn renamed `sparse` to `sparse_output` (the old name is rejected by
# recent releases), so try the new keyword first and fall back for old versions.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
encoded_categorical = one_hot_encoder.fit_transform(df[categorical_columns])

# Create a DataFrame for the encoded categorical columns
df_encoded_categorical = pd.DataFrame(
    encoded_categorical,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
)

# Step 1.4: Perform Min-Max Normalization for numerical columns
scaler = MinMaxScaler()
normalized_numerical = scaler.fit_transform(df[numerical_columns])

# Create a DataFrame for the normalized numerical columns
df_normalized_numerical = pd.DataFrame(normalized_numerical, columns=numerical_columns)

# Combine the encoded categorical and normalized numerical DataFrames
df_preprocessed = pd.concat([df_encoded_categorical, df_normalized_numerical], axis=1)
# Binary label: 0 for "normal." traffic, 1 for every other (attack) label.
df_preprocessed['target'] = (df["target"] != "normal.").astype("int64")

# Save the result; create the output directory first so to_csv cannot fail on
# a fresh checkout where it does not exist yet.
import os
output_path = "../../data/preprocessed/KDDCup99-preprocessed-full-features.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_preprocessed.to_csv(output_path, index=False)



In [2]:
len(df_preprocessed.columns)

116

#### Preprocessing for selected features of KDDCup99 dataset.

In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Load the raw dataset. The file has no header row: 41 feature columns
# followed by the label.
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "target"
]

df = pd.read_csv("../../data/kddcup99_dataset.gz", names=column_names)

# Selected Features based on RFE on KDDCup99
selected_features = ['protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'hot',
                     'num_compromised', 'count', 'srv_count', 'same_srv_rate',
                     'diff_srv_rate', 'dst_host_srv_count', 'dst_host_same_srv_rate',
                     'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                     'dst_host_srv_diff_host_rate']

df_selected = df[selected_features]

categorical_features = ['protocol_type', 'service', 'flag']

# Every selected feature that is not categorical is numerical. Deriving the
# list keeps it in sync with `selected_features` (same names, same order as
# the previously hand-written list).
numerical_features = [name for name in selected_features if name not in categorical_features]

# Perform preprocessing on selected features
# NOTE: `scaler` and `encoder` are not used by the rest of this cell (the
# encoding below relies on pd.get_dummies and min_max_normalize); they are
# kept only so the names remain defined.
scaler = MinMaxScaler()
# scikit-learn renamed `sparse` to `sparse_output` (the old name is rejected by
# recent releases), so try the new keyword first and fall back for old versions.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

x = pd.DataFrame() # a new empty dataframe. We will add preprocessed data into x dataframe.

def min_max_normalize(column):
    """Rescale a numeric column linearly to the [0, 1] range.

    Parameters
    ----------
    column : pandas.Series (or any array-like supporting ``min``/``max`` and
        element-wise arithmetic)
        The values to normalize.

    Returns
    -------
    Same type as ``column``
        ``(column - min) / (max - min)``. When every value is identical the
        range is zero; instead of dividing by zero (which would turn the whole
        column into NaN) the function returns zeros, which is also what
        scikit-learn's ``MinMaxScaler`` produces for a constant feature.
    """
    min_val = column.min()
    max_val = column.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: column - min is all zeros; cast to float so the
        # dtype matches the normal (division) code path.
        return (column - min_val).astype(float)
    return (column - min_val) / value_range

# Walk through the selected features in order: one-hot encode the categorical
# ones and min-max normalize the numerical ones. The pieces are collected in a
# list and concatenated once at the end, instead of re-concatenating the
# growing frame on every iteration.
encoded_parts = []
for feature in selected_features:
    if feature in categorical_features:
        encoded_parts.append(pd.get_dummies(df_selected[feature], prefix=feature, dtype='int8'))
    elif feature in numerical_features:
        # Apply min-max normalization to the specified column
        encoded_parts.append(min_max_normalize(df_selected[feature]))

# Binary label: 0 for "normal." traffic, 1 for every other (attack) label.
y = (df["target"] != "normal.").astype("int64")

# Final frame: encoded features in selection order, label as the last column.
x = pd.concat(encoded_parts + [y], axis=1)

# Step 4: Save the preprocessed selected features as a CSV file
# Create the output directory first so to_csv cannot fail on a fresh checkout.
import os
output_path = "../../data/preprocessed/KDDCup99-preprocessed-sub-features.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
x.to_csv(output_path, index=False)

In [4]:
len(x.columns)

94