In [None]:
# import relevant modules
%matplotlib inline
import sys
import time

import imblearn
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

# Silence every warning for the rest of the session.
# NOTE(review): the blanket 'ignore' filter also hides deprecation warnings
# raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Settings
# Show every DataFrame column instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Print arrays in full (no "..." summarisation) with 3 decimal places.
# sys.maxsize is the documented way to disable summarisation; the previous
# threshold=np.nan is rejected by current NumPy with a ValueError.
np.set_printoptions(threshold=sys.maxsize, precision=3)
# Plot styling: seaborn dark grid, plus larger axis-label and tick-label fonts.
sns.set(style="darkgrid")
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# LOAD DATA

In [None]:
# Read the training CSV into `train` and the test CSV into `trainB`.
train, trainB = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/Train_data.csv", "../input/Test_data.csv")
)

In [None]:
# Report the size of the training frame, first as a tuple and then as a sentence.
n_train_rows, n_train_cols = train.shape
print(train.shape)

print("Training data has {} rows & {} columns".format(n_train_rows, n_train_cols))

In [None]:
# Report the size of the test frame, first as a tuple and then as a sentence.
n_test_rows, n_test_cols = trainB.shape
print(trainB.shape)

print("Testing data has {} rows & {} columns".format(n_test_rows, n_test_cols))

# EXPLORATORY ANALYSIS

In [None]:
# Descriptive statistics of the training frame, as produced by
# DataFrame.describe(); displayed as this cell's output.
train.describe()


In [None]:
# Same descriptive statistics for the test frame, for side-by-side comparison.
trainB.describe()

In [None]:
# Show how often each value of 'num_outbound_cmds' occurs, training frame
# first and test frame second.
for frame in (train, trainB):
    print(frame['num_outbound_cmds'].value_counts())

In [None]:
# Columns removed from both the training and test frames before modelling.
# 'num_outbound_cmds' is removed as redundant (see its value counts above).
# NOTE(review): the rationale for dropping the other six columns is not
# recorded anywhere in this notebook -- please document it.
DROPPED_COLUMNS = [
    'num_outbound_cmds',
    'dst_host_srv_count',
    'src_bytes',
    'flag',
    'dst_bytes',
    'same_srv_rate',
    'dst_host_same_srv_rate',
]

# errors='ignore' keeps the cell idempotent: running it again after the
# columns are already gone is a no-op instead of a KeyError.
train = train.drop(columns=DROPPED_COLUMNS, errors='ignore')
trainB = trainB.drop(columns=DROPPED_COLUMNS, errors='ignore')

In [None]:
# Attack class distribution: number of training rows per value of the
# 'class' target column.
train['class'].value_counts()

# SCALING NUMERICAL ATTRIBUTES

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Numeric columns are chosen from the training frame and reused for the test
# frame so both scaled frames have the same columns in the same order.
cols = train.select_dtypes(include=['float64','int64']).columns

# Learn mean / variance on the training data only ...
sc_train = scaler.fit_transform(train[cols])
# ... and apply those same statistics to the test data. Re-fitting on the test
# frame (the previous fit_transform) would scale the two sets differently and
# overwrite the training statistics held by `scaler`.
sc_trainB = scaler.transform(trainB[cols])

# turn the result back to a dataframe, keeping each source frame's index so
# the later column-wise concat with the encoded categoricals aligns row by row
sc_traindf = pd.DataFrame(sc_train, columns=cols, index=train.index)
sc_trainBdf = pd.DataFrame(sc_trainB, columns=cols, index=trainB.index)

# ENCODING CATEGORICAL ATTRIBUTES

In [None]:
from sklearn.preprocessing import LabelEncoder

# extract categorical attributes from both training and test sets
cattrain = train.select_dtypes(include=['object']).copy()
cattrainB = trainB.select_dtypes(include=['object']).copy()

# Fit one encoder per column on the TRAINING data only. Previously a single
# encoder was re-fitted on every column of each frame, so the same category
# could receive different integer codes in train and test.
encoders = {col: LabelEncoder().fit(cattrain[col]) for col in cattrain.columns}

traincat = pd.DataFrame(
    {col: encoders[col].transform(cattrain[col]) for col in cattrain.columns},
    index=cattrain.index,
)

# Encode the test frame with the training vocabulary. LabelEncoder.classes_ is
# sorted, so Categorical codes built on it match the encoder's integers; a
# category never seen in training becomes -1 instead of raising.
# Assumes every object column of the test frame also exists in the training
# frame -- TODO confirm against the CSV schemas.
trainBcat = pd.DataFrame(
    {
        col: pd.Categorical(cattrainB[col], categories=encoders[col].classes_)
        .codes.astype('int64')
        for col in cattrainB.columns
    },
    index=cattrainB.index,
)

# separate target column from encoded data
enctrain = traincat.drop(['class'], axis=1)
cat_Ytrain = traincat[['class']].copy()


In [None]:
# Model inputs: scaled numeric columns side by side with the encoded
# categorical columns. The target keeps the raw 'class' labels from `train`.
train_y = train['class']
train_x = pd.concat([sc_traindf, enctrain], axis='columns')
train_x.shape

In [None]:
# Same assembly for the test frame: scaled numeric columns followed by the
# encoded categorical columns.
trainB_df = pd.concat([sc_trainBdf, trainBcat], axis='columns')

trainB_df.shape

# FEATURE SELECTION

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the importance ranking is reproducible across kernel restarts
# (an unseeded forest gives a different ranking on every run).
rfc = RandomForestClassifier(random_state=0)

# fit random forest classifier on the training set
rfc.fit(train_x, train_y)

# extract important features, rounded to 3 decimals and sorted high -> low
score = np.round(rfc.feature_importances_, 3)
importances = (
    pd.DataFrame({'feature': train_x.columns, 'importance': score})
    .sort_values('importance', ascending=False)
    .set_index('feature')
)

# plot importances
plt.rcParams['figure.figsize'] = (11, 4)
ax = importances.plot.bar()
ax.set(title='Random forest feature importances', ylabel='importance');

In [None]:
from sklearn.feature_selection import RFE
import itertools

# Fixed seed so recursive feature elimination picks the same columns each run.
rfc = RandomForestClassifier(random_state=0)

# create the RFE model and select 15 attributes
rfe = RFE(rfc, n_features_to_select=15)
rfe = rfe.fit(train_x, train_y)

# summarize the selection of the attributes: (kept?, column name) pairs.
# get_support() has one flag per column of train_x, so a plain zip suffices.
feature_map = list(zip(rfe.get_support(), train_x.columns))
selected_features = [name for kept, name in feature_map if kept]

len(selected_features)

# DATASET PARTITION

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: 80/20 split of the full data into a fitting pool and a held-out pool.
# (Restricting train_x to `selected_features` first is currently disabled.)
X_fit_pool, X_held_pool, Y_fit_pool, Y_held_pool = train_test_split(
    train_x, train_y, train_size=0.80, random_state=35
)

# Step 2: halve each pool. The first halves (X_train / X_test) serve the
# stand-alone models; the "B" halves are used by the combined models later.
X_train, X_trainB, Y_train, Y_trainB = train_test_split(
    X_fit_pool, Y_fit_pool, train_size=0.50, random_state=25
)
X_test, X_testB, Y_test, Y_testB = train_test_split(
    X_held_pool, Y_held_pool, train_size=0.50, random_state=25
)

for features, labels in (
    (X_train, Y_train),
    (X_trainB, Y_trainB),
    (X_test, Y_test),
    (X_testB, Y_testB),
):
    print(features.shape, labels.shape)

# FITTING MODELS

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# KNeighborsClassifier and LogisticRegression stay imported, but their
# training is disabled in this notebook.

# Train Bernoulli Naive Bayes Model
BNB_Classifier = BernoulliNB()
BNB_Classifier.fit(X_train, Y_train)

# Train Decision Tree Model (depth limited to 2, seeded).
# perf_counter is a monotonic clock meant for measuring short intervals.
t1 = time.perf_counter()
DTC_Classifier = tree.DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=10)
DTC_Classifier.fit(X_train, Y_train)
t2 = time.perf_counter()
print("Training time for Decision Tree: ", t2-t1)
print()

# Train XGBoost Model (5 boosting rounds).
# NOTE(review): Y_train holds the raw string labels; recent xgboost releases
# may reject non-numeric class labels -- confirm against the installed version.
t1 = time.perf_counter()
XGB_Classifier = XGBClassifier(base_score=0.3, n_estimators=5)
XGB_Classifier.fit(X_train, Y_train)
t2 = time.perf_counter()
print("Training time for XGBoost: ", t2-t1)
print()

# Train Random Forest Model (a single tree). Seeded so that the fitted model,
# and every metric derived from it below, is reproducible.
t1 = time.perf_counter()
RandomForest_Classifier = RandomForestClassifier(n_estimators=1, random_state=10)
RandomForest_Classifier.fit(X_train, Y_train)
t2 = time.perf_counter()
print("Training time for Random Forest: ", t2-t1)
print()


# EVALUATE MODELS

In [None]:
from sklearn import metrics
from sklearn.metrics import f1_score

# (display name, fitted estimator) pairs; later cells index into this list.
models = [
    ('Naive Baye Classifier', BNB_Classifier),
    ('Decision Tree Classifier', DTC_Classifier),
    ('XG Boost Classifier', XGB_Classifier),
    ('Random Forest Classifier', RandomForest_Classifier),
]

for i, v in models:
    # cross_val_score fits fresh clones of the estimator on folds of X_test; it
    # does not score the already-fitted model.
    # NOTE(review): cross-validating on the X_test partition makes this cell
    # overlap with the "validating" cell below -- confirm X_train was not meant.
    scores = cross_val_score(v, X_test, Y_test, cv=10)

    # Predict once and reuse for all three metrics (previously predicted 3x).
    predictions = v.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test, predictions)
    confusion_matrix = metrics.confusion_matrix(Y_test, predictions)
    classification = metrics.classification_report(Y_test, predictions, digits=8)
    print()
    print('============================== {} Model Evaluation =============================='.format(i))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification)
    print()
    

# VALIDATING MODELS

In [None]:
for i, v in models:
    # Time only the predict() call so the reported "prediction time" excludes
    # metric computation, and reuse the predictions for all three metrics.
    t1 = time.perf_counter()
    predictions = v.predict(X_test)
    t2 = time.perf_counter()

    accuracy = metrics.accuracy_score(Y_test, predictions)
    confusion_matrix = metrics.confusion_matrix(Y_test, predictions)
    classification = metrics.classification_report(Y_test, predictions)
    print()
    print('============================== {} Model Test Results =============================='.format(i))
    print()
    print("Prediction time of ", i, t2-t1)
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification)
    print()


# Functions to extract locations of FP, FN as a pandas series

# Function that generates the extra column (`prevPred`) from a first-stage model's predictions

In [None]:
def genPredRow(y_actual, y_pred):
    """Flag, row by row, where a prediction disagrees with the true label.

    Despite the historical "FP" naming, both kinds of error are flagged: a row
    gets 1 when the predicted label is 'anomaly' or 'normal' and differs from
    the actual label, and 0 otherwise (including any prediction that is
    neither of those two labels).

    Parameters
    ----------
    y_actual : sequence of labels (e.g. pandas Series)
        True labels, matched to ``y_pred`` by position; any index is ignored.
    y_pred : sequence of labels (e.g. numpy array or pandas Series)
        Predicted labels, same length as ``y_actual``.

    Returns
    -------
    pandas.Series
        Integer 0/1 flags with a fresh 0..n-1 index, one per input row.

    Raises
    ------
    ValueError
        If the two inputs do not have the same length.
    """
    # Compare positionally on plain object arrays so that neither input's
    # pandas index can cause a label-based lookup or misalignment.
    predicted = np.asarray(y_pred, dtype=object)
    actual = np.asarray(y_actual, dtype=object)
    if predicted.shape != actual.shape:
        raise ValueError(
            "y_actual and y_pred must have the same length, got {} and {}".format(
                actual.shape, predicted.shape
            )
        )

    is_known_label = (predicted == 'anomaly') | (predicted == 'normal')
    is_wrong = predicted != actual
    return pd.Series((is_known_label & is_wrong).astype(int))
    


# Combining Decision Tree and XG Boost

## Getting FP and FN row locations from the Decision Tree output as a pd.Series

In [None]:
"""models = []
models.append(('Naive Baye Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('XG Boost Classifier', XGB_Classifier))
models.append(('Random Forest Classifier', RandomForest_Classifier))

"""

# Start of the training timer; it is read again in the cell that fits XGBoost.
t1 = time.time()

## First-stage model: the decision tree stored at position 1 of `models`.
stage_one_tree = models[1][1]

# Work on copies so the shared "B" partitions stay untouched.
X_trainB_XGB_DCT, Y_trainB_XGB_DCT = X_trainB.copy(), Y_trainB.copy()
X_testB_XGB_DCT, Y_testB_XGB_DCT = X_testB.copy(), Y_testB.copy()

# One 0/1 flag per row marking where the tree's prediction was wrong.
# NOTE(review): the flag is computed from the true labels, so the 'prevPred'
# feature carries label information into the second-stage model -- including
# at test time. Confirm this is intended.
finalPred = genPredRow(Y_trainB_XGB_DCT, stage_one_tree.predict(X_trainB_XGB_DCT))
print("Size of number of FP:", finalPred.size)

finalPredTest = genPredRow(Y_testB_XGB_DCT, stage_one_tree.predict(X_testB_XGB_DCT))

# Attach the flags positionally as an extra feature column.
X_trainB_XGB_DCT['prevPred'] = finalPred.to_numpy()
X_testB_XGB_DCT['prevPred'] = finalPredTest.to_numpy()
print(X_trainB_XGB_DCT.shape)
print(X_testB.shape)



## PASSING X_TRAIN B TO TRAIN XGBOOST 


In [None]:
# Second-stage model: XGBoost trained on the "B" training half, which now
# includes the extra 'prevPred' column.
XGB_Classifier_DCT = XGBClassifier(n_estimators=5, base_score=0.3)
XGB_Classifier_DCT.fit(X_trainB_XGB_DCT, Y_trainB_XGB_DCT)
print()

# t1 was set when the preparation cell started, so the elapsed time covers
# both the first-stage predictions and this fit.
t2 = time.time()
elapsed_training = t2 - t1
print ("Training Time for Decision Tree and XG Boost: ", elapsed_training)

## Testing new XG Boost Model

In [None]:
# Time the second-stage prediction on the augmented "B" test half.
t1 = time.time()
Y_testB_XGB_DCT_pred = XGB_Classifier_DCT.predict(X_testB_XGB_DCT)
t2 = time.time()
print ("Prediction Time for Decision Tree and XG Boost: ", t2-t1)
print()

# 10-fold cross-validation plus accuracy, confusion matrix and per-class report.
scores_XGB_DCT = cross_val_score(XGB_Classifier_DCT, X_testB_XGB_DCT, Y_testB_XGB_DCT, cv=10)
accuracy_XGB_DCT = metrics.accuracy_score(Y_testB_XGB_DCT, Y_testB_XGB_DCT_pred)
confusion_matrix_XGB_DCT = metrics.confusion_matrix(Y_testB_XGB_DCT, Y_testB_XGB_DCT_pred)
classification_XGB_DCT = metrics.classification_report(Y_testB_XGB_DCT, Y_testB_XGB_DCT_pred, digits=8)

print()
print('============================== {} Model Evaluation =============================='.format('Decision tree + XG Boost'))
print()
# Each result is printed under its heading and followed by a blank line.
for heading, value in (
    ("Cross Validation Mean Score:" "\n", scores_XGB_DCT.mean()),
    ("Model Accuracy:" "\n", accuracy_XGB_DCT),
    ("Confusion matrix:" "\n", confusion_matrix_XGB_DCT),
    ("Classification report:" "\n", classification_XGB_DCT),
):
    print(heading, value)
    print()
 

# Combining Random Forest with XG BOOST 

In [None]:
"""models = []
models.append(('Naive Baye Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('XG Boost Classifier', XGB_Classifier))
models.append(('Random Forest Classifier', RandomForest_Classifier))

"""

# Start of the training timer; it is read again in the cell that fits XGBoost.
t1=time.time()

## USING RANDOM FOREST TO PREDICT PREVIOUS VALUES
# This section combines Random Forest with XGBoost, so the first-stage
# predictions must come from the random forest. The previous code used
# models[1][1], which is the decision tree (copy-paste from the section above).
X_testB_XGB_RF = X_testB.copy()
Y_testB_XGB_RF = Y_testB.copy()
X_trainB_XGB_RF = X_trainB.copy()
Y_trainB_XGB_RF = Y_trainB.copy()

# One 0/1 flag per row marking where the forest's prediction was wrong.
# NOTE(review): the flag is computed from the true labels, so 'prevPred'
# carries label information into the second-stage model, also at test time.
finalPred= genPredRow(Y_trainB_XGB_RF, RandomForest_Classifier.predict(X_trainB_XGB_RF))
print("Size of number of FP:", finalPred.size)

finalPredTest = genPredRow(Y_testB_XGB_RF, RandomForest_Classifier.predict(X_testB_XGB_RF))

# Attach the flags positionally as an extra feature column.
X_trainB_XGB_RF['prevPred'] = np.array(finalPred)
X_testB_XGB_RF['prevPred'] = np.array(finalPredTest)
print(X_trainB_XGB_RF.shape)
print(X_testB.shape)




## Passing updated dataframe to XG BOOST

In [None]:
# Second-stage model: XGBoost trained on the "B" training half, which now
# includes the extra 'prevPred' column.
XGB_Classifier_RF = XGBClassifier(n_estimators=5, base_score=0.3)
XGB_Classifier_RF.fit(X_trainB_XGB_RF, Y_trainB_XGB_RF)
print()

# t1 was set when the preparation cell started, so the elapsed time covers
# both the first-stage predictions and this fit.
t2 = time.time()
elapsed_training = t2 - t1
print ("Training Time for Random Forest with XG BOOST: ", elapsed_training)
print()

## Testing new XG Boost Model

In [None]:
# Time the second-stage prediction on the augmented "B" test half.
t1 = time.time()
Y_testB_XGB_RF_pred = XGB_Classifier_RF.predict(X_testB_XGB_RF)
t2 = time.time()
print ("Prediction Time for Random Forest with XG BOOST: ", t2-t1)
print()

# 10-fold cross-validation plus accuracy, confusion matrix and per-class report.
scores_XGB_RF = cross_val_score(XGB_Classifier_RF, X_testB_XGB_RF, Y_testB_XGB_RF, cv=10)
accuracy_XGB_RF = metrics.accuracy_score(Y_testB_XGB_RF, Y_testB_XGB_RF_pred)
confusion_matrix_XGB_RF = metrics.confusion_matrix(Y_testB_XGB_RF, Y_testB_XGB_RF_pred)
classification_XGB_RF = metrics.classification_report(Y_testB_XGB_RF, Y_testB_XGB_RF_pred, digits=8)

print()
print('============================== {} Model Evaluation =============================='.format('Random Forest + XG Boost'))
print()
# Each result is printed under its heading and followed by a blank line.
for heading, value in (
    ("Cross Validation Mean Score:" "\n", scores_XGB_RF.mean()),
    ("Model Accuracy:" "\n", accuracy_XGB_RF),
    ("Confusion matrix:" "\n", confusion_matrix_XGB_RF),
    ("Classification report:" "\n", classification_XGB_RF),
):
    print(heading, value)
    print()

# Random Forest + Decision Tree

In [None]:
"""models = []
models.append(('Naive Baye Classifier', BNB_Classifier))
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('XG Boost Classifier', XGB_Classifier))
models.append(('Random Forest Classifier', RandomForest_Classifier))

"""

# Start of the training timer; it is read again in the cell that fits the tree.
t1=time.time()

## USING RANDOM FOREST TO PREDICT PREVIOUS VALUES
# This section combines Random Forest with a Decision Tree, so the first-stage
# predictions must come from the random forest. The previous code used
# models[1][1] (the decision tree), which left the forest out entirely.
X_testB_DCT_RF = X_testB.copy()
Y_testB_DCT_RF = Y_testB.copy()
X_trainB_DCT_RF = X_trainB.copy()
Y_trainB_DCT_RF = Y_trainB.copy()

# One 0/1 flag per row marking where the forest's prediction was wrong.
# NOTE(review): the flag is computed from the true labels, so 'prevPred'
# carries label information into the second-stage model, also at test time.
finalPred= genPredRow(Y_trainB_DCT_RF, RandomForest_Classifier.predict(X_trainB_DCT_RF))
print("Size of number of FP:", finalPred.size)

finalPredTest = genPredRow(Y_testB_DCT_RF, RandomForest_Classifier.predict(X_testB_DCT_RF))

# Attach the flags positionally as an extra feature column.
X_trainB_DCT_RF['prevPred'] = np.array(finalPred)
X_testB_DCT_RF['prevPred'] = np.array(finalPredTest)
print(X_trainB_DCT_RF.shape)
print(X_testB.shape)


## passing updated dataframe to Decision tree classifier

In [None]:
# Second-stage model: a depth-2 decision tree trained on the "B" training
# half, which now includes the extra 'prevPred' column.
DTC_Classifier_RF = tree.DecisionTreeClassifier(criterion='gini',max_depth=2, random_state=10)
DTC_Classifier_RF.fit(X_trainB_DCT_RF, Y_trainB_DCT_RF)
print()

# t1 was set when the preparation cell started. The label now names the model
# actually trained here (it previously said "XG BOOST").
t2=time.time()
print ("Training Time for Random Forest with Decision Tree: ", t2-t1)
print()

## Testing new Decision Tree Model

In [None]:
# Evaluate the decision tree trained on the augmented frame. The previous code
# predicted and cross-validated with XGB_Classifier_RF, so DTC_Classifier_RF
# was never evaluated and this section repeated the Random Forest + XGBoost
# results.
t1=time.time()
Y_testB_DCT_RF_pred = DTC_Classifier_RF.predict(X_testB_DCT_RF)
t2=time.time()
print ("Prediction Time for Random Forest with Decision Tree: ", t2-t1)
print()

# 10-fold cross-validation plus accuracy, confusion matrix and per-class report.
scores_DCT_RF = cross_val_score(DTC_Classifier_RF, X_testB_DCT_RF, Y_testB_DCT_RF, cv=10)
accuracy_DCT_RF = metrics.accuracy_score(Y_testB_DCT_RF, Y_testB_DCT_RF_pred)
confusion_matrix_DCT_RF = metrics.confusion_matrix(Y_testB_DCT_RF,Y_testB_DCT_RF_pred)
classification_DCT_RF = metrics.classification_report(Y_testB_DCT_RF,Y_testB_DCT_RF_pred, digits=8)
print()
print('============================== {} Model Evaluation =============================='.format('Random Forest + Decision Tree'))
print()
print ("Cross Validation Mean Score:" "\n", scores_DCT_RF.mean())
print()
print ("Model Accuracy:" "\n", accuracy_DCT_RF)
print()
print("Confusion matrix:" "\n", confusion_matrix_DCT_RF)
print()
print("Classification report:" "\n", classification_DCT_RF)
print()