# UNSW-NB15 Pre-processing

In [2]:
cd C:\\Users\\Sharuka Thirimanne\\Desktop\\FYP-ML-IDS

C:\Users\Sharuka Thirimanne\Desktop\FYP-ML-IDS


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [4]:
def train_preprocess(path='Datasets/UNSW_NB15_training-set.csv'):
    """Load and clean the UNSW-NB15 training CSV.

    Steps, in order:

    1. read the CSV and drop the ``id`` column;
    2. drop ``attack_cat`` and the de-selected feature columns;
    3. drop exact duplicate rows (first occurrence kept);
    4. in ``proto`` / ``state`` / ``service`` keep only the 2 / 3 / 6 most
       frequent values and relabel everything else as ``other_proto`` /
       ``other_state`` / ``other_service``;
    5. drop rows whose feature values repeat an earlier row, regardless of
       their ``label`` (first occurrence kept).

    Progress counts are printed along the way.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the relative path this notebook has
        always used, so calling with no argument behaves as before.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. ``proto``, ``service`` and ``state`` are
        categorical; the row index is NOT reset.
    """
    dropped_columns = ['attack_cat', 'sbytes', 'sjit', 'sloss', 'dinpkt', 'is_ftp_login', 'ct_ftp_cmd',
                       'ct_src_ltm', 'response_body_len', 'spkts', 'djit', 'dtcpb',
                       'stcpb', 'ct_dst_ltm', 'dbytes']
    # (column, how many of its most frequent values survive, label for the rest)
    category_plan = [('proto', 2, 'other_proto'),
                     ('state', 3, 'other_state'),
                     ('service', 6, 'other_service')]

    df_train = pd.read_csv(path).drop(columns=['id'])

    # "all columns minus one": at this point the count still includes
    # 'attack_cat' next to the real features.
    num_features = len(df_train.columns) - 1
    print(df_train.columns)
    print('Number of features before feature selection :', num_features)
    print('Original dataset duplicates :', df_train.duplicated().sum())

    df_train = df_train.drop(columns=dropped_columns)
    num_features = len(df_train.columns) - 1
    print('Number of features after feature selection :', num_features)
    print('Duplicates after feature selection :', df_train.duplicated().sum())
    df_train = df_train.drop_duplicates(keep='first')

    for column, n_keep, other_label in category_plan:
        values = df_train[column].astype('category')
        # value_counts() is sorted by frequency, so everything past the
        # first n_keep entries is "rare".
        rare = list(values.value_counts().index[n_keep:])
        # The new label must be a known category before it can be assigned.
        values = values.cat.add_categories([other_label])
        # mask() instead of replace(): replace() on categorical data is
        # deprecated in pandas 2.2.
        values = values.mask(values.isin(rare), other_label)
        df_train[column] = values.cat.remove_unused_categories()

    print('Duplicates after category reduction :', df_train.duplicated().sum())

    # Compare on every feature but ignore the target, so rows that differ
    # only in 'label' count as duplicates too.
    feature_columns = [name for name in df_train.columns if name != 'label']
    df_train = df_train.drop_duplicates(subset=feature_columns)

    print('Duplicates after duplicate drop :', df_train.duplicated().sum())

    return df_train
# Build the cleaned training frame (the call also prints its progress counts).
df_train = train_preprocess()

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')
Number of features before feature selection : 43
Original dataset duplicates : 67601
Number of features after feature selection : 28
Duplicates after feature selection : 76463
Duplicates after category reduction : 524
Duplicates after duplicate drop : 0


In [6]:
def test_preprocess():
    df_test = pd.read_csv('Datasets/UNSW_NB15_testing-set.csv')
    df_test.drop(['id'], axis = 'columns' , inplace = True)
    print('Original dataset duplicates :',df_test.duplicated().sum())
    df_test.drop(['attack_cat','sbytes', 'sjit', 'sloss', 'dinpkt', 'is_ftp_login', 'ct_ftp_cmd', 
         'ct_src_ltm', 'response_body_len', 'spkts', 'djit','dtcpb',
         'stcpb','ct_dst_ltm','dbytes'], axis = 'columns' , inplace = True)
    
    print('Duplicates after feature selection :',df_test.duplicated().sum())
    df_test.drop_duplicates(keep='first',inplace=True)
    
    for col in ['proto','service','state']:
        df_test[col] = df_test[col].astype('category')
    
    label1 = 'other_proto'
    others4 = df_test['proto'].value_counts().index[2:]
    # apply new category label
    df_test['proto'] = df_test['proto'].cat.add_categories([label1])
    df_test['proto'] = df_test['proto'].replace(others4, label1)
    
    label2 = 'other_state'
    others5 = df_test['state'].value_counts().index[3:]
    # apply new category label
    df_test['state'] = df_test['state'].cat.add_categories([label2])
    df_test['state'] = df_test['state'].replace(others5, label2)
    
    label3 = 'other_service'
    others_service1 = df_test['service'].value_counts().index[6:]
    # apply new category label
    df_test['service'] = df_test['service'].cat.add_categories([label3])
    df_test['service'] = df_test['service'].replace(others_service1, label3)
    
    print('Duplicates after category reduction :',df_test.duplicated().sum())
    
    df_test.drop_duplicates(subset=['dur', 'proto', 'service', 'state', 'dpkts', 'rate', 'sttl', 'dttl',
                                    'sload', 'dload', 'dloss', 'sinpkt', 'swin', 'dwin', 'tcprtt', 'synack',
                                    'ackdat', 'smean', 'dmean', 'trans_depth', 'ct_srv_src', 'ct_state_ttl',
                                    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
                                    'ct_flw_http_mthd', 'ct_srv_dst', 'is_sm_ips_ports'],inplace=True)
    
    print('Duplicates after duplicate drop :',df_test.duplicated().sum())
    
    return df_test
# Build the cleaned test frame (the call also prints its progress counts).
df_test = test_preprocess()

Original dataset duplicates : 26387
Duplicates after feature selection : 29844
Duplicates after category reduction : 117
Duplicates after duplicate drop : 0


In [7]:
# Report (rows, columns) for both cleaned frames.
for caption, frame in (('Train set shape:', df_train), ('Test set shape:', df_test)):
    print(caption, frame.shape)

Train set shape: (98066, 29)
Test set shape: (52347, 29)


In [8]:
# Number of distinct values left in each categorical column, train vs. test,
# with a ruler line between columns.
ruler = '======================================='
for position, column in enumerate(['proto', 'service', 'state']):
    if position > 0:
        print(ruler)
    print('Train', len(df_train[column].unique()))
    print('Test', len(df_test[column].unique()))

Train 3
Test 3
Train 7
Test 7
Train 4
Test 4


In [9]:
# Split each frame into features (every column except 'label') and target
# (the 'label' column). Columns are picked by name so the split stays correct
# even if 'label' is not the last column.
limit1 = df_train.shape[1]-1  # number of feature columns in the train frame

X_train = df_train.drop(columns=['label'])  # train set features
Y_train = df_train['label']                 # train set target

limit2 = df_test.shape[1]-1   # number of feature columns in the test frame

X_test = df_test.drop(columns=['label'])    # test set features
Y_test = df_test['label']                   # test set target

In [10]:
def one_hot_encode(X_train,X_test):
    """One-hot encode ``proto``, ``service`` and ``state`` in both frames.

    The encoder is fitted on ``X_train`` only and then applied to both
    frames; it is created with ``handle_unknown='ignore'``. In each returned
    frame the encoded columns come first (with default integer column
    labels), followed by the remaining original columns. The shape of each
    result is printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the three categorical columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` with the categorical columns replaced by their
        one-hot expansion.
    """
    nominal = ['proto','service','state']

    encoder = OneHotEncoder(handle_unknown = 'ignore')
    encoder.fit(X_train[nominal])

    def _expand(frame):
        # Dense one-hot block, re-attached to the frame's own row index so
        # the column-wise concat lines rows up correctly.
        dense = encoder.transform(frame[nominal]).toarray()
        encoded = pd.DataFrame(dense, index=frame.index)
        remainder = frame.drop(columns=nominal)
        return pd.concat([encoded, remainder], axis=1)

    X_train = _expand(X_train)
    print('X_train shape :',X_train.shape)

    X_test = _expand(X_test)
    print('X_test shape :',X_test.shape)

    return X_train, X_test

# Replace the categorical columns in both feature frames with one-hot columns.
X_train, X_test = one_hot_encode(X_train,X_test)

X_train shape : (98066, 39)
X_test shape : (52347, 39)


In [11]:
def scaling(X_train,X_test):
    """Min-max scale both feature sets using statistics from the train set.

    The scaler is fitted on ``X_train`` only and then applied to ``X_test``.

    Parameters
    ----------
    X_train, X_test : array-like or pandas.DataFrame
        Numeric feature matrices with the same number of columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, X_test)`` after scaling.
    """
    scaler = MinMaxScaler()
    # Hand plain arrays to the scaler: the frames built by one_hot_encode mix
    # integer and string column labels, and scikit-learn >= 1.2 raises
    # TypeError when fitted on a DataFrame with mixed-type column names.
    # The scaled values are unaffected.
    X_train = scaler.fit_transform(np.asarray(X_train))
    X_test = scaler.transform(np.asarray(X_test))

    return X_train, X_test

# Scale both feature sets; from here on X_train / X_test are whatever scaling() returns.
X_train, X_test = scaling(X_train,X_test)

In [12]:
# Wrap the scaled matrices in DataFrames again; they get a fresh 0..n-1 index.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

# Renumber the targets the same way so a later column-wise concat pairs each
# feature row with its own label.
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [13]:
# Shapes of features and targets, train first, then test.
for part in (X_train, Y_train, X_test, Y_test):
    print(part.shape)

(98066, 39)
(98066,)
(52347, 39)
(52347,)


In [14]:
# Put features and target back side by side (aligned on the row index).
df_train = pd.concat((X_train, Y_train), axis='columns')
df_test = pd.concat((X_test, Y_test), axis='columns')

In [16]:
# Balance the two classes by randomly under-sampling the larger one.
UNDERSAMPLE_SEED = 42  # fixed so a fresh "Restart & Run All" draws the same rows

# Class count, looked up by label value. value_counts() is ordered by
# frequency, so unpacking it positionally would only line up with
# (class 0, class 1) when class 0 happens to be the larger class.
class_counts = df_train['label'].value_counts()
if not {0, 1}.issubset(class_counts.index):
    raise ValueError("df_train['label'] must contain both classes 0 and 1")
count_class_0 = int(class_counts.loc[0])
count_class_1 = int(class_counts.loc[1])

# Divide by class
df_class_0 = df_train[df_train['label'] == 0]
df_class_1 = df_train[df_train['label'] == 1]

# Only the larger class is sampled down; the smaller one is kept whole and
# in its existing row order.
target_size = min(count_class_0, count_class_1)
df_class_0_under = (df_class_0.sample(target_size, random_state=UNDERSAMPLE_SEED)
                    if count_class_0 > target_size else df_class_0)
df_class_1_under = (df_class_1.sample(target_size, random_state=UNDERSAMPLE_SEED)
                    if count_class_1 > target_size else df_class_1)
df_train = pd.concat([df_class_0_under, df_class_1_under], axis=0)

In [18]:
# Rows per label value in the training frame (both counts should match after balancing).
df_train.groupby('label')['label'].count()

label
0    46942
1    46942
Name: label, dtype: int64

In [19]:
# Renumber rows 0..n-1 in both frames, discarding the old index.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [22]:
# Write both processed frames to the working directory. to_csv() is left at
# its defaults, so the row index is written as the first, unnamed column.
for frame, destination in ((df_train, 'USNW_train-set.csv'), (df_test, 'USNW_test-set.csv')):
    frame.to_csv(destination)