# Problem statement
We have a clean dataset generated by an IDS (Intrusion Detection System); each record is labeled '0' for No-Attack or '1' for Attack.
In this notebook, we will walk through a few well-known pre-processing steps, keeping each one easy to follow.
The data is clean, yet we still need to check that every feature matters to the model, using the 'Feature Importance' reported by the trained models.

Source: http://www.secrepo.com/


In [None]:
import math, time, random, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Newer Matplotlib releases renamed the bundled seaborn styles to
# 'seaborn-v0_8*' and no longer accept the old names, so fall back when the
# plain 'seaborn' style is unavailable.
try:
    plt.style.use('seaborn')
except (OSError, ValueError):
    plt.style.use('seaborn-v0_8')
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import missingno
import pickle

#### Read the training and test set.

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# Load the UNSW-NB15 training and testing CSV files into DataFrames.
train = pd.read_csv('/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv')
test = pd.read_csv('/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv')
# Last expression of the cell: preview the first rows of the test set.
test.head()



In [None]:
# Preview the first rows of the training set.
train.head()

We could split features and labels separately for each set, as shown below, but instead we concatenate the training set and the test set so that the preprocessing is done only once.

In [None]:
#X_train = train.drop(['label'], axis=1)
#Y_train = train['label']
#X_test = test.drop(['label'], axis=1)
#Y_test = test['label']
#

In [None]:
# Stack the training and test rows into one frame with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)
# Column-name indexes captured before any preprocessing changes the dtypes.
cols_cat = data.select_dtypes(include='object').columns  # object-typed columns; to be explained later
cols_numeric = data._get_numeric_data().columns  # numeric columns (pandas-internal helper); to be explained later

In [None]:
# Preview the first rows of the combined frame.
data.head()

In [None]:
# Summary statistics of the combined frame.
data.describe()

# Missing Values:

   <ul>
        <li>
        Check for missing values.
        </li>
         <li>
        Replace those missing values.
        </li>
    </ul>

In [None]:
# Count the null entries in every column.
print(data.isnull().sum())

In [None]:
# Visualise where (if anywhere) values are missing across the frame.
missingno.matrix(data)

Data is clean and there are no missing values. 

# Insights and steps: 
   <ul>
        <li>
        Data is clean.
        </li>
         <li>
        Data still needs further processing: one-hot encoding of the categorical features.
                     E.g.: 'service' takes different values such as ftp, http, and '-' (denoting not available or None). We will treat '-' as a missing value and change it from '-' to 'None' instead of dropping the whole column.
        </li>
            <li>
        Removing unnecessary features like 'id'.
        </li>
    </ul>

In [None]:
# List the distinct attack categories present in the data.
data['attack_cat'].unique()

In [None]:
# List the distinct 'proto' values; this is definitely a categorical feature.
data['proto'].unique()

In [None]:
# Here we deal with the service type '-': relabel it as the explicit
# category 'None' so the column can be kept.
data['service'].unique()
data.loc[data['service'] == '-', 'service'] = 'None'
print(data['service'].unique())

In [None]:
# List the distinct 'state' values; the column is kept as a feature.
data['state'].unique()

##### Now, let's try to automate this process.

In [None]:
def Remove_dump_values(data, cols):
    """Replace the placeholder '-' with the string 'None' in the given columns.

    The frame is modified in place and also returned.

    Numeric columns are skipped: they cannot contain the string '-', and
    running ``np.where`` with a string replacement over them would silently
    turn every number into a string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are cleaned (mutated in place).
    cols : iterable
        Column labels to inspect.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after replacement.
    """
    for col in cols:
        if pd.api.types.is_numeric_dtype(data[col]):
            # Nothing to replace, and leaving the column alone keeps its dtype.
            continue
        data[col] = np.where(data[col] == '-', 'None', data[col])
    return data

In [None]:
# Run the '-' -> 'None' replacement over every column of the combined frame.
cols = data.columns
data_bin = Remove_dump_values(data, cols)

**Removing unnecessary features:**

In [None]:
# Remove unnecessary features: 'id' is dropped from the working frame.
data_bin = data_bin.drop(columns=['id'])

In [None]:
# Re-check the distinct attack categories after the '-' replacement pass.
data['attack_cat'].unique()

In [None]:
# cols_cat is an Index of column names, so it cannot be indexed by a label.
# Check membership instead to confirm that 'attack_cat' is currently one of
# the categorical columns.
'attack_cat' in cols_cat

##### Categorical Features:
One Hot Encoding using cols_cat

In [None]:
# Drop the 'attack_cat' column from the working frame, in place.
data_bin.drop(columns=['attack_cat'], inplace=True)

In [None]:
# Keep the list of categorical column names in sync: remove 'attack_cat'.
cols_cat = cols_cat.drop('attack_cat')

In [None]:
 # Display the remaining categorical column names.
 cols_cat.unique()

**Do one-hot encoding**

In [None]:
# One-hot encode only the columns named in cols_cat; other columns pass through.
data_bin_hot = pd.get_dummies(data=data_bin, columns=cols_cat)

In [None]:
# (rows, columns) of the frame after one-hot encoding.
data_bin_hot.shape

##### Normalization:
Normalize all the values in the dataset.


In [None]:
# Keep only the feature columns to scale: 'label' is the target and 'id' has
# been dropped from the working frame. Filtering (rather than list.remove)
# keeps this cell safe to run more than once.
cols_numeric = [col for col in cols_numeric if col not in ('label', 'id')]

In [None]:
# Cast the numeric feature columns to float ahead of scaling.
data_bin_hot[cols_numeric] = data_bin_hot[cols_numeric].astype(float)

In [None]:
# Scale every numeric feature column-wise as (x - min) / std.
# The reductions use an explicit axis=0: np.min / np.std on a DataFrame
# dispatch to DataFrame.min / DataFrame.std with axis=None, which newer pandas
# versions reduce over the whole frame instead of per column.
# ddof=0 matches the np.std default used before. A zero std (constant column)
# is replaced by 1 so that column becomes 0 instead of NaN.
# NOTE(review): this is neither plain min-max scaling nor z-scoring - confirm
# the formula is the intended one.
data_bin_hot[cols_numeric] = (
    data_bin_hot[cols_numeric] - data_bin_hot[cols_numeric].min(axis=0)
) / data_bin_hot[cols_numeric].std(axis=0, ddof=0).replace(0, 1)

In [None]:
# 'attack_cat' was dropped from the working frame before one-hot encoding, so
# data_bin_hot has no such column; inspect the categories on the combined
# frame `data`, which still carries it.
data['attack_cat'].unique()

# Train and Test set splitting:


We won't split the data into a single train/test hold-out; each model is scored with k-fold cross-validation instead. We do care about precision in our case!

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 42)

In [None]:
from sklearn import metrics
from sklearn import model_selection

# Y is the 'label' column (the target); X is every other column.
Y = data_bin_hot['label']
X = data_bin_hot.drop(columns='label')

In [None]:
# NOTE: a `global` statement at module (cell) level has no effect; X is
# already a module-level name that functions can read.
global X #To include in upcoming functions.

# Machine Learning Models:
<ul>
    <li>
        This is a Classification problem where we want to detect whether there is an attack or not.
    </li>
    <li>
        We will use simple Logistic Regression.
    </li>
        <li>
            K-Nearest Neighbour (Lazy Algorithm)
    </li>
        <li>
        Decision Trees
    </li>
        <li>
        Random Forest (gini)
    </li>
        <li>
        Random Forest (Entropy or Information-gain)
    </li>
    </ul>

In [None]:
def fit_algo(algo, x, y, cv):
    """Fit ``algo`` on ``(x, y)`` and report training and cross-validated accuracy.

    Parameters
    ----------
    algo : estimator
        Object exposing ``fit``; the fitted result must expose ``score``.
    x : feature matrix passed to ``fit``, ``score`` and ``cross_val_predict``.
    y : target labels aligned with ``x``.
    cv : cross-validation setting forwarded unchanged to ``cross_val_predict``.

    Returns
    -------
    tuple
        ``(y_pred, acc, acc_cv, model)`` where ``y_pred`` holds the
        cross-validated predictions, ``acc`` is the accuracy (in percent,
        2 decimals) on the same data the model was fitted on, ``acc_cv`` is
        the accuracy (in percent, 2 decimals) of ``y_pred`` against ``y``,
        and ``model`` is the value returned by ``algo.fit``.
    """
    # Fit on all of the data; the score below is therefore a training accuracy.
    model = algo.fit(x, y)
    acc = round(model.score(x, y) * 100, 2)

    # Predictions produced through cross-validation, using all available cores.
    y_pred = model_selection.cross_val_predict(algo, x, y, cv=cv, n_jobs=-1)

    # Score against the labels passed in (``y``), not the notebook-level ``Y``,
    # so the function is correct for any dataset it is called with.
    acc_cv = round(metrics.accuracy_score(y, y_pred) * 100, 2)

    return y_pred, acc, acc_cv, model

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (C=0.1), timed over the fit plus cross-validation with cv=10.
start_time = time.time()
pred_now, acc_lr, acc_cv_lr, lr = fit_algo(
    LogisticRegression(C=0.1),
    X,
    Y,
    10,
)
lr_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_lr)
print("Accuracy of CV: %s" % acc_cv_lr)
print("Execution time: %s" % lr_time)

# Feature Importance

In [None]:
def feature_plot(imp):
    """Plot rounded importances against the columns of the notebook-level ``X``.

    ``imp`` must hold one value per column of ``X``. Values are rounded with
    ``np.round`` (to whole numbers), sorted in descending order and drawn as a
    line plot on a new 10x10 figure with the x tick labels rotated 90 degrees.
    """
    ranked = pd.DataFrame(
        {'Feature': X.columns, 'Importance': np.round(imp)}
    ).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 10))
    plt.plot(ranked['Feature'], ranked['Importance'])
    plt.xticks(rotation=90)

In [None]:
# Plot the logistic-regression coefficients (first row of coef_) as importances.
feature_plot(lr.coef_[0])

In [None]:
# Table of logistic-regression coefficients, rounded to whole numbers and
# ordered from the largest to the smallest.
fimp_lr = (
    pd.DataFrame({'Feature': X.columns, 'Importance': np.round(lr.coef_[0])})
    .sort_values(by='Importance', ascending=False)
)
fimp_lr

In [None]:
#from sklearn.neighbors import KNeighborsClassifier
#start_time = time.time()
#pred_now, acc_knn, acc_cv_knn, knn = fit_algo(KNeighborsClassifier(n_neighbors = 3)
 #                                       , X, Y, 10)
#knn_time = (time.time() - start_time)

#print("Accuracy: %s" % acc_knn)
#print("Accuracy of CV: %s" % acc_cv_knn)
#print("Execution time: %s" % knn_time)


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, timed over the fit plus cross-validation with cv=10.
start_time = time.time()
pred_now, acc_dt, acc_cv_dt, dt = fit_algo(
    DecisionTreeClassifier(random_state=1),
    X,
    Y,
    10,
)
dt_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_dt)
print("Accuracy of CV: %s" % acc_cv_dt)
print("Execution time: %s" % dt_time)

# Random Forest (Gini)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees (default split criterion), timed over the fit
# plus cross-validation with cv=10.
start_time = time.time()
pred_now, acc_rf, acc_cv_rf, rf = fit_algo(
    RandomForestClassifier(n_estimators=100),
    X,
    Y,
    10,
)
rf_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_rf)
print("Accuracy of CV: %s" % acc_cv_rf)
print("Execution time: %s" % rf_time)


# Random Forest (Information Gain)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees split on entropy, timed over the fit plus
# cross-validation with cv=10.
start_time = time.time()
pred_now, acc_rf2, acc_cv_rf2, rf2 = fit_algo(
    RandomForestClassifier(n_estimators=100, criterion='entropy'),
    X,
    Y,
    10,
)
rf2_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_rf2)
print("Accuracy of CV: %s" % acc_cv_rf2)
print("Execution time: %s" % rf2_time)

# Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

# MLP with a single hidden layer of 20 ReLU units trained with adam, timed
# over the fit plus cross-validation with cv=5.
start_time = time.time()
pred_now, acc_nn, acc_cv_nn, nn = fit_algo(
    MLPClassifier(hidden_layer_sizes=(20,), activation='relu', solver='adam'),
    X,
    Y,
    5,
)
nn_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_nn)
print("Accuracy of CV: %s" % acc_cv_nn)
print("Execution time: %s" % nn_time)

# Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, timed over the fit plus
# cross-validation with cv=5.
start_time = time.time()
pred_now, acc_gnb, acc_cv_gnb, gnb = fit_algo(
    GaussianNB(),
    X,
    Y,
    5,
)
gnb_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_gnb)
print("Accuracy of CV: %s" % acc_cv_gnb)
print("Execution time: %s" % gnb_time)

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default settings, timed over the fit plus
# cross-validation with cv=10.
start_time = time.time()
pred_now, acc_gbt, acc_cv_gbt, gbt = fit_algo(
    GradientBoostingClassifier(),
    X,
    Y,
    10,
)
gbt_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_gbt)
print("Accuracy of CV: %s" % acc_cv_gbt)
print("Execution time: %s" % gbt_time)

# SVM

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with default settings, timed over the fit plus
# cross-validation with cv=10.
start_time = time.time()
pred_now, acc_svc, acc_cv_svc, svc = fit_algo(
    LinearSVC(),
    X,
    Y,
    10,
)
svc_time = time.time() - start_time

# Training accuracy, cross-validated accuracy and elapsed seconds.
print("Accuracy: %s" % acc_svc)
print("Accuracy of CV: %s" % acc_cv_svc)
print("Execution time: %s" % svc_time)

# Cross-Validation Accuracy Comparison:

In [None]:
# Display names, listed in the same order as the accuracy values below.
algo_name = [
    'Log. Reg.',
    'Decision Tree',
    'RandomForest Gini',
    'RandomForest IG',
    'Neural Network',
    'Gaussian NB',
    'GBC',
    'SVM',
]
# Cross-validated accuracies ranked from best to worst, re-indexed from 0.
acc_df = (
    pd.DataFrame({
        'Algorithm': algo_name,
        'Accuracy %': [
            acc_cv_lr,
            acc_cv_dt,
            acc_cv_rf,
            acc_cv_rf2,
            acc_cv_nn,
            acc_cv_gnb,
            acc_cv_gbt,
            acc_cv_svc,
        ],
    })
    .sort_values(by='Accuracy %', ascending=False)
    .reset_index(drop=True)
)
acc_df

In [None]:
# Table of the random forest (rf) feature importances, largest first.
fimp_rf = (
    pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf.feature_importances_.astype(float),
    })
    .sort_values(by='Importance', ascending=False)
)
fimp_rf

In [None]:
# Plot the random forest importances multiplied by 100, so the rounding done
# inside feature_plot does not collapse them all to zero.
feature_plot(rf.feature_importances_*100)

# Save the best accuracy model

In [None]:
filename = 'RandomForest_IG_IDS.sav'
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes.
with open(filename, 'wb') as model_file:
    pickle.dump(rf2, model_file)

# Loading the model (for future use)

In [None]:
# load the model from disk
# NOTE: unpickling can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Score the reloaded model on X and Y as a check that it round-tripped.
result = loaded_model.score(X, Y)
print(result)