# Intrusion Detection Evaluation Dataset (CIC-IDS2017)
By AliK604 

Intrusion Detection Systems (IDSs) and Intrusion Prevention Systems (IPSs) are the most important defense tools against sophisticated and ever-growing network attacks. Due to the lack of reliable test and validation datasets, anomaly-based intrusion detection approaches struggle to obtain consistent and accurate performance evaluations.

In [None]:
# %config IPCompleter.greedy=True
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib as matplot
import matplotlib.pyplot as plt
# %matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings, os 
# warnings.filterwarnings("ignore")

# from keras import Sequential
# from keras.models import Model, load_model
# from keras.layers import *
# from keras.callbacks import ModelCheckpoint
# from keras import regularizers

from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, TruncatedSVD, PCA
from sklearn.svm import LinearSVC

import xgboost, lightgbm
from mlxtend.classifier import EnsembleVoteClassifier 

In [None]:
# Load every CSV file in the working directory into its own DataFrame.
# `ls` collects the per-file frames for the cells that follow.
ls = []
for filename in os.listdir(r'./'):
    # Match on the extension only: a substring test would also accept names
    # such as "data.csv.bak", which are not CSV inputs.
    if not filename.endswith('.csv'):
        continue
    print(filename)
    df = pd.read_csv(filename)
    ls.append(df)
    # The header is looked up as " Label" (with a leading space) because the
    # raw column names have not been stripped at this point.
    print(f'Shape: {df.shape}. Attack Type {df[" Label"].unique()}')

In [None]:
# Normalise the header of every loaded frame by trimming the whitespace
# around each column name.
for frame in ls:
    frame.columns = [name.strip() for name in frame.columns]

In [None]:
# Stack the per-file frames into one dataset.
# ignore_index=True gives the result a fresh 0..n-1 row index; without it each
# file keeps its own 0-based row labels, so labels repeat across files and
# label-based selection or alignment becomes ambiguous.
df = pd.concat(ls, ignore_index=True)
df.head(3)
# Sanity check on the column count before going further.
assert df.shape[1] == 79, f'expected 79 columns, got {df.shape[1]}'

In [None]:
# mix of ints and floats. Label is a object (words)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    # df.dtypes


In [None]:
from collections import Counter

# Tally how many rows carry each value of the "Label" column.
Counter(df["Label"])

In [None]:
# Rebalance the classes: keep every non-BENIGN row and only a reproducible
# 10% random sample of the BENIGN rows.
# (Original author's note: about 20% of the total is malicious -- not verified here.)
print(f'df.shape {df.shape} before sampling out most of benign data')
is_benign = df['Label'] == 'BENIGN'
benign_sample = df[is_benign].sample(frac=.1, random_state=42)
df = pd.concat([df[~is_benign], benign_sample])
print(f'df.shape {df.shape} after sampling out most of benign data')

In [None]:
# Replace the textual labels with integer codes. The fitted encoder is kept in
# `le` so the codes can be mapped back to names through le.classes_.
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])
df.head(3)
# Class names in code order (position i is the name encoded as i).
le.classes_

In [None]:
# Names of the 20 columns with the smallest standard deviation.
column_std = df.std()
lowSTD = column_std.nsmallest(20).index.tolist()
df[lowSTD].head(3)

In [None]:
# Names of the 20 columns whose absolute correlation with "Label" is smallest.
abs_corr = df.corr().abs()
label_corr = abs_corr.sort_values('Label')['Label']
lowCORR = label_corr.nsmallest(20).index.tolist()
df[lowCORR].head(3)

In [None]:
# Compare the two candidate lists: which columns appear in both, and how many
# distinct columns they cover together.
low_std_names = set(lowSTD)
low_corr_names = set(lowCORR)
print(f'Intersection: {low_std_names & low_corr_names}')
print(f'Union:        {len(low_std_names | low_corr_names)}')

In [None]:
import gc 
# Trigger a garbage collection pass now.
gc.collect()

In [None]:
# Remove every row that holds a missing or non-finite value, then split the
# cleaned data into train and test partitions.
df = df.dropna()

# Rows containing +/-inf. `axis=1` is passed by keyword because pandas 2.0 no
# longer accepts it positionally.
has_infinite = df.isin([np.inf, -np.inf]).any(axis=1)

# Rows that still carry one of the textual placeholders. Only non-numeric
# columns can hold these strings, so only those are scanned; this replaces the
# former loop that re-filtered the whole frame six times per column (and whose
# `!= np.nan` test was always true, i.e. a no-op).
text_columns = df.select_dtypes(exclude='number')
has_placeholder = text_columns.isin(["Infinity", ",,", ", ,"]).any(axis=1)

indices_to_keep = ~(has_infinite | has_placeholder)
df = df[indices_to_keep]

# Sanity checks: expect False (no NaN left) and True (every value finite).
# The second check uses all(); any() would report True as soon as a single
# finite value exists and therefore verified nothing.
print(np.isnan(df).to_numpy().any())
print(np.isfinite(df).to_numpy().all())

# 80/20 split with a fixed seed; "Label" is the target, all other columns are features.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Label']), df['Label'], test_size=.20, random_state=42
)
X_train.head(2)
y_train.head(2)

In [None]:
def benchmark(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """Fit a 50-estimator LightGBM classifier and print its score on the test data.

    Any argument that is left out falls back to the notebook-level variable of
    the same name. Those defaults are captured once, when this ``def``
    executes, so re-binding the globals afterwards does not change what a bare
    ``benchmark()`` call uses.

    Returns:
        list: the fitted classifiers (a single ``LGBMClassifier``).
    """
    # NOTE(review): objective='binary' is hard-coded although the labels are
    # produced by a LabelEncoder and may have more than two classes -- confirm
    # this is the intended objective.
    models = [lightgbm.LGBMClassifier(objective='binary', n_estimators= 50)]
    for model in models:
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        # Bare class name, e.g. "LGBMClassifier".
        model_name = type(model).__name__
        print("Acc: %0.5f for the %s" % (accuracy, model_name))
    return models

In [None]:
print('Baseline with all features')
# No arguments: benchmark() runs on its default train/test data.
clfs = benchmark()

In [None]:
print('Solely with features identified as useless')

# First classifier returned by the most recent benchmark() run.
GBM = clfs[0]
# Column names to which that model assigned an importance of exactly zero.
zero_importance = GBM.feature_importances_ == 0
remove = X_train.columns.to_numpy()[zero_importance]
clfs = benchmark(X_train[remove], X_test[remove])
print(remove)

In [None]:
print('Solely with features identified as useful')
# Drop the zero-importance columns by keyword: the positional axis argument
# (`drop(remove, 1)`) is rejected by pandas 2.0 and later.
clfs = benchmark(X_train.drop(columns=remove), X_test.drop(columns=remove))

In [None]:
# Give both feature frames a fresh positional row index (old labels discarded).
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

print('PCA 5')
# Fit the projection on the training features, then apply that same fitted
# projection to the test features.
pca = PCA(n_components=5)
train_projected = pca.fit_transform(X_train)
test_projected = pca.transform(X_test)
_ = benchmark(train_projected, test_projected)

In [None]:
print('PCA 15')
# Same experiment with 15 components: fit on train, reuse the fit for test.
pca = PCA(n_components=15)
train_projected = pca.fit_transform(X_train)
test_projected = pca.transform(X_test)
_ = benchmark(train_projected, test_projected)

In [None]:
print('PCA 25')
# Same experiment with 25 components: fit on train, reuse the fit for test.
pca = PCA(n_components=25)
train_projected = pca.fit_transform(X_train)
test_projected = pca.transform(X_test)
_ = benchmark(train_projected, test_projected)

In [None]:
# Rank feature positions by the model's importance scores, highest first.
tmp = np.argsort(GBM.feature_importances_)[::-1]
# Positions of the 20 highest-ranked features, and of all the others.
top, rest = tmp[:20], tmp[20:]
print(GBM.feature_importances_[top]) # check

In [None]:
print('Top 20 features (per `LGBMClassifier`) + PCA(15) of remaining') # 2nd highest 
pca = PCA(n_components=15)

# Compress the lower-ranked columns with PCA: fitted on the training rows,
# then the same fitted projection is applied to the test rows.
rest_train = pd.DataFrame(pca.fit_transform(X_train.iloc[:, rest]))
rest_test = pd.DataFrame(pca.transform(X_test.iloc[:, rest]))

# Place the untouched top columns next to the PCA components. concat aligns
# on the row index, so this relies on X_train / X_test having the 0..n-1
# index produced by the earlier reset_index calls; ignore_index=True only
# renumbers the resulting columns.
a = pd.concat([X_train.iloc[:, top], rest_train], axis=1, ignore_index=True)
b = pd.concat([X_test.iloc[:, top], rest_test], axis=1, ignore_index=True)

_ = benchmark(a, b)