### Učitavanje i podjela podataka

In [2]:
from fastparquet import ParquetFile
import pandas as pd

pd.set_option('display.max_columns', None)  # show every column when a frame is displayed

# Paths of the UNSW_NB15 training / testing partitions (parquet files).
train_set = '../data/UNSW_NB15_training-set.parquet'
test_set = '../data/UNSW_NB15_testing-set.parquet'
pf_train_set = ParquetFile(train_set)
pf_test_set = ParquetFile(test_set)

df_train = pf_train_set.to_pandas()
df_test = pf_test_set.to_pandas()

# Split each partition into features (the first 35 columns) and the target ('label').
# .copy() detaches the feature frames from df_train / df_test: later cells assign
# scaled and encoded values back into X_train / X_test, and doing that on a bare
# iloc slice can trigger pandas' SettingWithCopyWarning.
# NOTE(review): the 35th feature column is 'attack_cat' (visible in X_train.head()).
# It presumably names the attack category, which would leak the target 'label' -
# confirm whether downstream code drops it before training.

# Train set
X_train = df_train.iloc[:, :35].copy()
y_train = df_train['label']  # target variable, 36th column

# Test set
X_test = df_test.iloc[:, :35].copy()
y_test = df_test['label']

In [3]:
X_train.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat
0,0.121478,tcp,-,FIN,6,4,258,172,74.087486,14158.942383,8495.365234,0,0,24.295601,8.375,30.177547,11.830604,255,621772692,2202533631,255,0.0,0.0,0.0,43,43,0,0,1,1,0,0,0,0,Normal
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473373,8395.112305,503571.3125,2,17,49.915001,15.432865,61.426933,1387.77832,255,1417884146,3077387971,255,0.0,0.0,0.0,52,1106,0,0,1,1,0,0,0,0,Normal
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,1572.271851,60929.230469,1,6,231.875565,102.737206,17179.585938,11420.925781,255,2116150707,2963114973,255,0.111897,0.061458,0.050439,46,824,0,0,1,1,0,0,0,0,Normal
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,2740.178955,3358.62207,1,3,152.876541,90.235725,259.08017,4991.784668,255,1107119177,1047442890,255,0.0,0.0,0.0,52,64,0,0,1,1,1,1,0,0,Normal
4,0.449454,tcp,-,FIN,10,6,534,268,33.373825,8561.499023,3987.059814,2,1,47.750332,75.659599,2415.837646,115.806999,255,2436137549,1977154190,255,0.128381,0.071147,0.057234,53,45,0,0,2,1,0,0,0,0,Normal


In [4]:
X_test.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,0.0,0,0,0.011,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,248,0,0,0,1,1,0,0,0,0,Normal
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,0.0,0,0,0.008,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,881,0,0,0,1,1,0,0,0,0,Normal
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,0.0,0,0,0.005,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,534,0,0,0,1,1,0,0,0,0,Normal
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,0.0,0,0,0.006,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,450,0,0,0,2,1,0,0,0,0,Normal
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,0.0,0,0,0.01,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,1063,0,0,0,2,1,0,0,0,0,Normal


### Normalizacija i enkodiranje vrijednosti

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalisation of the continuous (numeric) features
scaler = MinMaxScaler()

# Train set: select the numeric columns by dtype, fit the scaler on them
# (training data only) and rescale those same columns.
train_set_continuous_data = X_train.select_dtypes(
    include=['float32', 'int16', 'int32', 'int64', 'int8']
)
train_set_scaled = scaler.fit(train_set_continuous_data).transform(train_set_continuous_data)
X_train[train_set_continuous_data.columns] = train_set_scaled  # write the scaled values back

X_train.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat
0,0.002025,tcp,-,FIN,0.00052,0.000364,1.8e-05,1.2e-05,7.4e-05,2.364553e-06,0.000379,0.0,0.0,0.000288,0.000148,2.1e-05,4.1e-05,1.0,0.144768,0.512828,1.0,0.0,0.0,0.0,0.010163,0.029492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
1,0.010832,tcp,-,FIN,0.001352,0.003463,5.4e-05,0.002867,7.8e-05,1.401989e-06,0.022458,0.000416,0.0031,0.000592,0.000272,4.2e-05,0.004796,1.0,0.330128,0.716524,1.0,0.0,0.0,0.0,0.01626,0.758573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
2,0.027052,tcp,-,FIN,0.000728,0.001458,2.6e-05,0.0009,1.4e-05,2.625704e-07,0.002717,0.000208,0.001094,0.002748,0.001811,0.011763,0.039466,1.0,0.492706,0.689918,1.0,0.044423,0.029261,0.033164,0.012195,0.565158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
3,0.028027,tcp,ftp,FIN,0.001144,0.001093,4.6e-05,5.3e-05,1.4e-05,4.576117e-07,0.00015,0.000208,0.000547,0.001812,0.001591,0.000177,0.017249,1.0,0.257772,0.243882,1.0,0.0,0.0,0.0,0.01626,0.043896,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,Normal
4,0.007491,tcp,-,FIN,0.000936,0.000547,3.9e-05,1.8e-05,3.3e-05,1.429776e-06,0.000178,0.000416,0.000182,0.000566,0.001334,0.001654,0.0004,1.0,0.567209,0.460351,1.0,0.050967,0.033874,0.037632,0.016938,0.030864,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,Normal


In [6]:
# Test set
# Use exactly the columns (and column order) that were selected for fitting the
# scaler on the train set, instead of re-deriving the list from the test frame's
# dtypes; a dtype difference between the two files would otherwise hand the
# scaler a different set of features than it was fitted on.
test_set_continuous_data = X_test[train_set_continuous_data.columns]
test_set_scaled = scaler.transform(test_set_continuous_data)  # transform only, no re-fit on test data
X_test[test_set_continuous_data.columns] = test_set_scaled  # write the scaled values back

X_test.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat
0,1.833334e-07,udp,-,INT,0.000104,0.0,3.6e-05,0.0,0.090909,0.030121,0.0,0.0,0.0,1.303758e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
1,1.333334e-07,udp,-,INT,0.000104,0.0,0.000134,0.0,0.125,0.147128,0.0,0.0,0.0,9.481876e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
2,8.333335e-08,udp,-,INT,0.000104,0.0,8e-05,0.0,0.2,0.142685,0.0,0.0,0.0,5.926172e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
3,1e-07,udp,-,INT,0.000104,0.0,6.7e-05,0.0,0.166667,0.1002,0.0,0.0,0.0,7.111407e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285908,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,Normal
4,1.666667e-07,udp,-,INT,0.000104,0.0,0.000162,0.0,0.1,0.142017,0.0,0.0,0.0,1.185234e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.70122,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,Normal


In [7]:
# Encoding of the categorical features - Ordinal Encoder
from sklearn.preprocessing import OrdinalEncoder

# Values that were not seen while fitting are encoded as -1 at transform time.
ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Train set
train_set_discrete_data = X_train.select_dtypes(include=['category'])
cols = list(train_set_discrete_data)  # names of the categorical columns

# Fit on the training categories and replace the columns with their integer codes.
train_set_encoded = ord_encoder.fit_transform(train_set_discrete_data)
X_train[cols] = train_set_encoded  # write the encoded values back

X_train.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat
0,0.002025,113.0,0.0,2.0,0.00052,0.000364,1.8e-05,1.2e-05,7.4e-05,2.364553e-06,0.000379,0.0,0.0,0.000288,0.000148,2.1e-05,4.1e-05,1.0,0.144768,0.512828,1.0,0.0,0.0,0.0,0.010163,0.029492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1,0.010832,113.0,0.0,2.0,0.001352,0.003463,5.4e-05,0.002867,7.8e-05,1.401989e-06,0.022458,0.000416,0.0031,0.000592,0.000272,4.2e-05,0.004796,1.0,0.330128,0.716524,1.0,0.0,0.0,0.0,0.01626,0.758573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
2,0.027052,113.0,0.0,2.0,0.000728,0.001458,2.6e-05,0.0009,1.4e-05,2.625704e-07,0.002717,0.000208,0.001094,0.002748,0.001811,0.011763,0.039466,1.0,0.492706,0.689918,1.0,0.044423,0.029261,0.033164,0.012195,0.565158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,0.028027,113.0,3.0,2.0,0.001144,0.001093,4.6e-05,5.3e-05,1.4e-05,4.576117e-07,0.00015,0.000208,0.000547,0.001812,0.001591,0.000177,0.017249,1.0,0.257772,0.243882,1.0,0.0,0.0,0.0,0.01626,0.043896,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,6.0
4,0.007491,113.0,0.0,2.0,0.000936,0.000547,3.9e-05,1.8e-05,3.3e-05,1.429776e-06,0.000178,0.000416,0.000182,0.000566,0.001334,0.001654,0.0004,1.0,0.567209,0.460351,1.0,0.050967,0.033874,0.037632,0.016938,0.030864,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,6.0


In [8]:
# Test set
# Encode exactly the columns (and column order) the encoder was fitted on, taken
# from the train-set selection, rather than re-detecting 'category' columns in
# the test frame; this keeps the transform input aligned with the fit input even
# if a column's dtype differs between the two files.
test_set_discrete_data = X_test[train_set_discrete_data.columns]
cols = list(test_set_discrete_data)  # names of the categorical columns

test_set_encoded = ord_encoder.transform(test_set_discrete_data[cols])  # transform only, no re-fit
X_test[test_set_discrete_data.columns] = test_set_encoded  # write the encoded values back

X_test.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat
0,1.833334e-07,119.0,0.0,3.0,0.000104,0.0,3.6e-05,0.0,0.090909,0.030121,0.0,0.0,0.0,1.303758e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1,1.333334e-07,119.0,0.0,3.0,0.000104,0.0,0.000134,0.0,0.125,0.147128,0.0,0.0,0.0,9.481876e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
2,8.333335e-08,119.0,0.0,3.0,0.000104,0.0,8e-05,0.0,0.2,0.142685,0.0,0.0,0.0,5.926172e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
3,1e-07,119.0,0.0,3.0,0.000104,0.0,6.7e-05,0.0,0.166667,0.1002,0.0,0.0,0.0,7.111407e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285908,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,6.0
4,1.666667e-07,119.0,0.0,3.0,0.000104,0.0,0.000162,0.0,0.1,0.142017,0.0,0.0,0.0,1.185234e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.70122,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,6.0


### Skladištenje podataka

In [10]:
import joblib

from pathlib import Path

# joblib.dump opens the target path for writing and fails if the parent
# directory is missing, so create the output directory first.
Path('../data/processed').mkdir(parents=True, exist_ok=True)

# Persist the preprocessed feature frames and the targets for later use.
joblib.dump(X_train, '../data/processed/X_train.joblib')
joblib.dump(y_train, '../data/processed/y_train.joblib')
joblib.dump(X_test, '../data/processed/X_test.joblib')
joblib.dump(y_test, '../data/processed/y_test.joblib')

['../data/processed/y_test.joblib']