# NSL Pre-processing

In [1]:
cd C:\\Users\\Sharuka Thirimanne\\Desktop\\FYP-ML-IDS

C:\Users\Sharuka Thirimanne\Desktop\FYP-ML-IDS


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [3]:
def train_preprocess(csv_path='Datasets/csv_result-KDDTrain+.csv', top_n_services=30,
                     extra_other_services=('time', 'urp_i', 'csnet_ns', 'supdup',
                                           'discard', 'http_443')):
    """Load the training CSV and return a cleaned, de-duplicated DataFrame.

    Steps, in order: drop the ``id`` column, cast ``protocol_type``, ``flag``
    and ``service`` to ``category``, collapse infrequent ``service`` values
    into ``'Other_service'``, map ``class`` to 0 (``'normal'``) /
    1 (``'anomaly'``) and drop duplicate rows. Feature and duplicate counts
    are printed along the way.

    Parameters
    ----------
    csv_path : str
        Location of the training CSV. Defaults to the path the notebook used.
    top_n_services : int
        Number of most frequent ``service`` values to keep as they are; every
        less frequent value becomes ``'Other_service'``.
    extra_other_services : iterable of str
        Additional ``service`` values that are always folded into
        ``'Other_service'``, regardless of their frequency.

    Returns
    -------
    pandas.DataFrame
        The processed training set; ``class`` is the last column.
    """
    df_train = pd.read_csv(csv_path).drop(columns=['id'])

    # No feature selection happens in this function, so the "before" and
    # "after" figures are identical by construction. Both pairs of lines are
    # still printed so the report keeps its established shape, but the
    # (expensive) duplicate scan is only done once.
    num_features = len(df_train.columns) - 1
    n_duplicates = df_train.duplicated().sum()
    print('Number of features before feature selection :',num_features)
    print('Original dataset duplicates :',n_duplicates)
    print('Number of features after feature selection :',num_features)
    print('Duplicates after feature selection :',n_duplicates)

    for col in ['protocol_type','flag','service']:
        df_train[col] = df_train[col].astype('category')

    label1 = 'Other_service'
    service = df_train['service']
    # value_counts() is ordered by frequency, so everything past the first
    # `top_n_services` entries is a rare service.
    to_collapse = list(service.value_counts().index[top_n_services:])
    to_collapse.extend(extra_other_services or ())
    # Build the reduced column on plain objects and assign it back explicitly.
    # The previous chained `df['service'].replace(..., inplace=True)` acted on
    # a temporary selection, which pandas does not guarantee to write through
    # to the frame (it is a no-op under Copy-on-Write).
    reduced = service.astype(object).where(~service.isin(to_collapse), label1)
    df_train['service'] = reduced.astype('category')

    print('Duplicates after category reduction :',df_train.duplicated().sum())
    # Labels other than 'normal' / 'anomaly' would become NaN here.
    df_train['class'] = df_train['class'].map( {'normal':0, 'anomaly':1} )
    df_train = df_train.drop_duplicates(keep='first')
    print('Duplicates after duplicate drop :',df_train.duplicated().sum())

    return df_train
df_train = train_preprocess()

Number of features before feature selection : 41
Original dataset duplicates : 9
Number of features after feature selection : 41
Duplicates after feature selection : 9
Duplicates after category reduction : 1468
Duplicates after duplicate drop : 0


In [4]:
df_train.groupby('class')['class'].count()

class
0    67343
1    57162
Name: class, dtype: int64

In [5]:
def test_preprocess():
    df_test = pd.read_csv('Datasets/csv_result-KDDTest+.csv')
    df_test.drop(['id'], axis = 'columns' , inplace = True)
    print('Original dataset duplicates :',df_test.duplicated().sum())
    
    print('Duplicates after feature selection :',df_test.duplicated().sum())
    for col in ['protocol_type','flag','service']:
        df_test[col] = df_test[col].astype('category')
    
    label1 = 'Other_service'
    others1 = df_test['service'].value_counts().index[30:]
    # apply new category label
    df_test['service'] = df_test['service'].cat.add_categories([label1])
    df_test['service'] = df_test['service'].replace(others1, label1)

    print('Duplicates after category reduction :',df_test.duplicated().sum())
    df_test['service'].replace({'pop_3':'Other_service','sunrpc':'Other_service',
                                'link':'Other_service','name':'Other_service','echo':'Other_service',
                                'netbios_ns':'Other_service'}, inplace = True)
    
    df_test['class'] = df_test['class'].map( {'normal':0, 'anomaly':1} )
    df_test.drop_duplicates(keep='first',inplace=True)
    print('Duplicates after duplicate drop :',df_test.duplicated().sum())
    
    return df_test
df_test = test_preprocess()

Original dataset duplicates : 3
Duplicates after feature selection : 3
Duplicates after category reduction : 22
Duplicates after duplicate drop : 0


In [6]:
print('Train set shape:',df_train.shape)
print('Test set shape:',df_test.shape)

Train set shape: (124505, 42)
Test set shape: (22514, 42)


In [7]:
print('Train',len(df_train['service'].unique()))
print('Test',len(df_test['service'].unique()))
print('=======================================')
print('Train',len(df_train['flag'].unique()))
print('Test',len(df_test['flag'].unique()))
print('=======================================')
print('Train',len(df_train['protocol_type'].unique()))
print('Test',len(df_test['protocol_type'].unique()))

Train 25
Test 25
Train 11
Test 11
Train 3
Test 3


In [8]:
# Position of the label column (the last one) in each frame.
limit1 = len(df_train.columns) - 1
limit2 = len(df_test.columns) - 1

# Every column to the left of the label column is a feature.
X_train = df_train.iloc[:, :limit1]  # train set features
X_test = df_test.iloc[:, :limit2]    # test set features

# The label column itself.
Y_train = df_train.iloc[:, limit1]   # train set labels
Y_test = df_test.iloc[:, limit2]     # test set labels

In [9]:
def one_hot_encode(X_train,X_test):
    """One-hot encode the categorical columns of both feature frames.

    The encoder is fitted on ``X_train`` only and then applied to both
    frames; categories that appear only in ``X_test`` are encoded as all
    zeros (``handle_unknown='ignore'``). In each returned frame the encoded
    columns come first, followed by the remaining original columns. The shape
    of each result is printed.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing ``protocol_type``, ``flag`` and ``service``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` with the categorical columns replaced by their
        one-hot encoding. The inputs are not modified.
    """
    categorical_cols = ['protocol_type','flag','service']

    ohe = OneHotEncoder(handle_unknown = 'ignore')
    ohe.fit(X_train[categorical_cols])

    # Give the encoded columns string names. Leaving them as the default
    # integers 0..k-1 produces a frame whose column labels mix ints and
    # strings, which recent scikit-learn versions refuse to accept as input.
    # `get_feature_names_out` only exists from scikit-learn 1.0 on, so fall
    # back to the older accessor when it is missing.
    if hasattr(ohe, 'get_feature_names_out'):
        encoded_names = list(ohe.get_feature_names_out(categorical_cols))
    else:
        encoded_names = list(ohe.get_feature_names(categorical_cols))

    def _encode(frame):
        # Dense one-hot block, aligned on the frame's own index so the
        # column-wise concat lines rows up correctly.
        dense = ohe.transform(frame[categorical_cols]).toarray()
        encoded = pd.DataFrame(dense, index=frame.index, columns=encoded_names)
        return pd.concat([encoded, frame.drop(columns=categorical_cols)], axis=1)

    # Training dataset one hot encoding
    X_train = _encode(X_train)
    print('X_train shape :',X_train.shape)

    # Test dataset one hot encoding
    X_test = _encode(X_test)
    print('X_test shape :',X_test.shape)

    return X_train, X_test

X_train, X_test = one_hot_encode(X_train,X_test)

X_train shape : (124505, 77)
X_test shape : (22514, 77)


In [10]:
def scaling(X_train,X_test):
    """Standardise both feature sets with statistics learned from the training set.

    A ``StandardScaler`` is fitted on ``X_train`` only; the same fitted scaler
    is then used to transform ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)`` as returned by the scaler's
        ``transform``.
    """
    standardizer = StandardScaler()
    standardizer.fit(X_train)

    train_scaled = standardizer.transform(X_train)
    test_scaled = standardizer.transform(X_test)
    return train_scaled, test_scaled

X_train, X_test = scaling(X_train,X_test)

In [11]:
# Wrap the feature sets in fresh DataFrames.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

# Renumber the label Series from 0, discarding their previous index.
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [12]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(124505, 77)
(124505,)
(22514, 77)
(22514,)


In [13]:
df_train = pd.concat([X_train,Y_train],axis=1)
df_test = pd.concat([X_test,Y_test],axis=1)

In [14]:
print('Train set shape:',df_train.shape)
print('Test set shape:',df_test.shape)

Train set shape: (124505, 78)
Test set shape: (22514, 78)


In [14]:
# Balance the training set by random undersampling: every class is cut down to
# the size of the smaller class.

# Class count, looked up by label. value_counts() is ordered by frequency, not
# by label, so unpacking it positionally would only be right when class 0
# happens to be the larger class.
class_counts = df_train['class'].value_counts()
count_class_0 = int(class_counts.get(0, 0))
count_class_1 = int(class_counts.get(1, 0))
n_per_class = min(count_class_0, count_class_1)

# Divide by class
df_class_0 = df_train[df_train['class'] == 0]
df_class_1 = df_train[df_train['class'] == 1]

# Only the larger class is sampled; the smaller one is kept as it is, in its
# existing row order. The fixed random_state makes the selection (and hence
# the saved training set) reproducible across runs; 42 is an arbitrary seed.
if count_class_0 > n_per_class:
    df_class_0_under = df_class_0.sample(n_per_class, random_state=42)
else:
    df_class_0_under = df_class_0
if count_class_1 > n_per_class:
    df_class_1_under = df_class_1.sample(n_per_class, random_state=42)
else:
    df_class_1_under = df_class_1

df_train = pd.concat([df_class_0_under, df_class_1_under], axis=0)

In [15]:
df_train.groupby('class')['class'].count()

class
0    57162
1    57162
Name: class, dtype: int64

In [16]:
df_train.reset_index(drop=True,inplace=True)
df_test.reset_index(drop=True,inplace=True)

In [17]:
# Write the processed sets to the current working directory. to_csv is called
# with its defaults, so the row index is written too, as an extra first column
# with an empty header.
df_train.to_csv('NSL_train-set.csv')
df_test.to_csv('NSL_test-set.csv')

In [18]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,76,class
0,-0.266945,-2.085498,2.70263,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,-0.608618,-0.054224,...,0.921938,0.742849,-0.384842,-0.482677,-0.290946,-0.629677,-0.614882,-0.38698,-0.375558,0
1,-0.266945,0.479502,-0.37001,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,-0.608618,-0.054224,...,1.246925,1.054935,-0.437850,-0.418083,-0.114220,-0.584381,-0.614882,-0.38698,-0.375558,0
2,-0.266945,-2.085498,2.70263,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,-0.608618,-0.054224,...,-1.046042,-1.174249,2.212533,1.713510,-0.290946,-0.629677,-0.614882,-0.38698,-0.375558,0
3,-0.266945,0.479502,-0.37001,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,-0.608618,18.442060,...,1.246925,1.054935,-0.437850,-0.482677,-0.290946,-0.629677,-0.614882,-0.38698,-0.375558,0
4,-0.266945,-2.085498,2.70263,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,-0.608618,-0.054224,...,-1.027987,-1.151957,0.675311,0.744604,-0.290946,-0.629677,-0.614882,-0.38698,-0.375558,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114319,-0.266945,0.479502,-0.37001,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,1.643067,-0.054224,...,-1.018959,-1.129665,-0.172812,-0.482677,-0.290946,1.635128,1.645768,-0.38698,-0.375558,1
114320,-0.266945,0.479502,-0.37001,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,1.643067,-0.054224,...,-0.946740,-1.062789,-0.119804,-0.482677,-0.290946,1.635128,1.645768,-0.38698,-0.375558,1
114321,-0.266945,0.479502,-0.37001,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,1.643067,-0.054224,...,-0.937712,-1.062789,-0.066796,-0.482677,-0.290946,1.635128,1.645768,-0.38698,-0.375558,1
114322,-0.266945,0.479502,-0.37001,-0.019225,-0.312206,-0.111431,-0.028774,-0.139869,1.643067,-0.054224,...,-0.829383,-0.951330,-0.119804,-0.482677,-0.290946,1.635128,1.645768,-0.38698,-0.375558,1
