In [1]:
import pandas as pd
import numpy as np

# Names for the 43 columns of the header-less data file: 41 feature columns
# followed by "label" and "difficulty".
column_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

# The file has no header row: read it positionally, then attach the names.
# Assigning `.columns` raises if the file's column count differs from the list.
train_df = pd.read_csv("../data/KDDTrain+.txt", header=None)
train_df.columns = column_names


In [2]:
# Remove the "difficulty" column; it is not used as a feature or target.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
train_df = train_df.drop(columns=["difficulty"], errors="ignore")

In [3]:
# Binary target: 0 where the label is exactly 'normal', 1 for any other label.
# Vectorised comparison instead of a per-row Python lambda.
train_df['binary_label'] = train_df['label'].ne('normal').astype('int64')

In [4]:
# Target vector: the 0/1 column derived from the label.
y = train_df["binary_label"]

# Feature frame: everything except the two label columns. `drop` returns a
# new DataFrame, so later edits to X leave train_df untouched.
X = train_df.drop(["label", "binary_label"], axis=1)

In [5]:
# Names of the int64/float64 feature columns (string columns are excluded).
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Per-column skewness, largest first; show the ten most right-skewed features.
skew_values = X.loc[:, numerical_cols].skew().sort_values(ascending=False)
skew_values.iloc[:10]


is_host_login         354.926753
dst_bytes             290.052911
num_compromised       250.107883
num_root              236.913724
src_bytes             190.669347
urgent                149.914509
land                   70.965063
num_shells             59.592151
num_file_creations     55.665341
num_failed_logins      53.764424
dtype: float64

In [6]:
# Log-transform every numeric feature whose skewness magnitude exceeds 1.
#
# Skewness is computed once for all numeric columns rather than once per loop
# iteration, and the selected column names are kept in `skewed_cols` so the
# exact same set can be transformed in any other split of the data.
#
# NOTE: this cell overwrites X in place, so it must run exactly once per fresh
# X; re-running it would apply log1p a second time.
# NOTE(review): abs() also selects left-skewed columns (skew < -1), for which
# log1p is not expected to reduce the skew -- confirm this is intended.
feature_skew = X[numerical_cols].skew()
skewed_cols = feature_skew[feature_skew.abs() > 1].index.tolist()

# log1p(x) is NaN for x < -1 and -inf at x == -1; fail loudly instead of
# letting such values slip silently into the feature matrix.
assert not (X[skewed_cols] <= -1).any().any(), (
    "log1p requires values > -1 in the skewed columns"
)

for col in skewed_cols:
    X[col] = np.log1p(X[col])


In [7]:
# log1p(x) = log(1 + x) is used instead of a plain log because many features contain zeros, and log(0) is undefined. The log transform compresses very large values while leaving small values almost unchanged.

In [8]:
# String-valued features that are one-hot encoded.
categorical_cols = [
    "protocol_type",
    "service",
    "flag",
]

# Replace each categorical column with one indicator column per category
# (named "<column>_<category>"); all other columns are passed through.
X = pd.get_dummies(data=X, columns=categorical_cols)


In [9]:
# Inspect the feature frame after one-hot encoding. As the last expression in
# the cell it is rendered by the notebook (rows and columns are truncated).
X

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.000000,6.198479,0.000000,0.0,0.0,0.0,0.0,0.0,0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,0.000000,4.990433,0.000000,0.0,0.0,0.0,0.0,0.0,0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0,0.0,...,False,False,False,False,True,False,False,False,False,False
3,0.000000,5.451038,9.006264,0.0,0.0,0.0,0.0,0.0,1,0.0,...,False,False,False,False,False,False,False,False,True,False
4,0.000000,5.298317,6.042633,0.0,0.0,0.0,0.0,0.0,1,0.0,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0,0.0,...,False,False,False,False,True,False,False,False,False,False
125969,2.197225,4.663439,4.983607,0.0,0.0,0.0,0.0,0.0,0,0.0,...,False,False,False,False,False,False,False,False,True,False
125970,0.000000,7.710653,5.953243,0.0,0.0,0.0,0.0,0.0,1,0.0,...,False,False,False,False,False,False,False,False,True,False
125971,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0,0.0,...,False,False,False,False,True,False,False,False,False,False


In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the full feature frame, then standardise that same data.
# The fitted `scaler` is kept so it can be reused on other data later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [11]:
# Keep only the scaled rows whose binary label is 0 (the 'normal' records).
# The label Series is turned into a plain boolean array to index X_scaled.
X_normal = X_scaled[y.eq(0).to_numpy()]


In [12]:
# Sanity check: (rows, columns) of the normal-only subset.
X_normal.shape

(67343, 122)

In [13]:
# X_normal (normal traffic only) is used here because the Isolation Forest is fit on normal records first, so that it learns what normal traffic looks like; anything that deviates from that profile is then flagged as an anomaly.