In [None]:
# import relevant modules
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import imblearn

# Ignore warnings
import sys
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
# The threshold must be a real number: NumPy rejects np.nan with a ValueError.
# sys.maxsize is the documented value for "never summarise arrays".
np.set_printoptions(threshold=sys.maxsize, precision=3)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# LOAD DATA

In [None]:
# Read the training and test CSV files into DataFrames.
# Both paths are relative, so they resolve against the current working directory.
train = pd.read_csv("../input/Train_data.csv")
test = pd.read_csv("../input/Test_data.csv")

In [None]:
# Preview the first four training rows, then report the size of the frame
print(train.head(4))

print("Training data has {} rows & {} columns".format(*train.shape))

In [None]:
# Preview the first four test rows, then report the size of the frame
print(test.head(4))

print("Testing data has {} rows & {} columns".format(*test.shape))

# EXPLORATORY ANALYSIS

In [None]:
# Descriptive statistics of the training set, shown as the cell's output
train.describe()

In [None]:
# Show how the values of 'num_outbound_cmds' are distributed, first in the
# training set and then in the test set
for frame in (train, test):
    print(frame['num_outbound_cmds'].value_counts())

In [None]:
# 'num_outbound_cmds' is a redundant column so remove it from both train & test datasets.
# Rebinding the names (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError because the column is already gone.
train = train.drop(columns=['num_outbound_cmds'], errors='ignore')
test = test.drop(columns=['num_outbound_cmds'], errors='ignore')

In [None]:
# Attack class distribution: number of training rows per value of the 'class' column
train['class'].value_counts()

# SCALING NUMERICAL ATTRIBUTES

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# numerical attributes are identified on the training set; the same columns
# are then taken from the test set so both matrices share one layout
cols = train.select_dtypes(include=['float64','int64']).columns

# Learn mean / variance from the training data ONLY and reuse those statistics
# for the test data. Re-fitting on the test set (fit_transform) would scale the
# two sets differently and make the trained models see shifted features.
sc_train = scaler.fit_transform(train[cols])
sc_test = scaler.transform(test[cols])

# turn the result back to a dataframe, keeping the source row index so the
# later column-wise concat with the encoded categoricals aligns row by row
sc_traindf = pd.DataFrame(sc_train, columns=cols, index=train.index)
sc_testdf = pd.DataFrame(sc_test, columns=cols, index=test.index)

# ENCODING CATEGORICAL ATTRIBUTES

In [None]:
from sklearn.preprocessing import LabelEncoder

# extract categorical attributes from both training and test sets
cattrain = train.select_dtypes(include=['object']).copy()
cattest = test.select_dtypes(include=['object']).copy()

# Fit one encoder per column on the training data only. Re-fitting a shared
# encoder on each test column would assign codes from the test set's own
# vocabulary, so the same category could get different integers in train/test.
encoders = {col: LabelEncoder().fit(cattrain[col]) for col in cattrain.columns}

# encode the training columns with their fitted encoders
traincat = pd.DataFrame(
    {col: encoders[col].transform(cattrain[col]) for col in cattrain.columns},
    index=cattrain.index,
)

# encode the test columns with the TRAINING vocabulary; pd.Categorical yields
# the same integer codes as the encoder (classes_ is sorted) and maps values
# that never occurred in training to -1 instead of raising
testcat = pd.DataFrame(
    {
        col: pd.Categorical(cattest[col], categories=encoders[col].classes_)
        .codes.astype('int64')
        for col in cattest.columns
    },
    index=cattest.index,
)

# separate target column from encoded data
enctrain = traincat.drop(['class'], axis=1)
cat_Ytrain = traincat[['class']].copy()


In [None]:
# Model inputs: scaled numeric columns placed side by side with the encoded
# categorical columns. The target keeps its original string labels.
train_y = train['class']
train_x = pd.concat([sc_traindf, enctrain], axis='columns')
train_x.shape

In [None]:
# Same layout for the test data: scaled numeric columns followed by the
# encoded categorical columns
test_df = pd.concat([sc_testdf, testcat], axis='columns')
test_df.shape

# FEATURE SELECTION

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A fixed seed makes the importance ranking reproducible from run to run;
# an unseeded forest can reorder the bars on every execution.
rfc = RandomForestClassifier(random_state=0)

# fit random forest classifier on the training set
rfc.fit(train_x, train_y);
# extract feature importances, rounded to three decimals
score = np.round(rfc.feature_importances_, 3)
importances = (
    pd.DataFrame({'feature': train_x.columns, 'importance': score})
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
# plot importances, most important feature first
plt.rcParams['figure.figsize'] = (11, 4)
importances.plot.bar();

In [None]:
from sklearn.feature_selection import RFE
import itertools
# Seeded so that the set of selected features is reproducible.
rfc = RandomForestClassifier(random_state=0)

# create the RFE model and keep the 15 highest-ranked attributes
rfe = RFE(rfc, n_features_to_select=15)
rfe = rfe.fit(train_x, train_y)

# get_support() is a boolean mask aligned with train_x.columns, so it can
# index the column labels directly
selected_features = list(train_x.columns[rfe.get_support()])

# NOTE(review): the cells below split and train on all of train_x, not on
# selected_features -- confirm whether the selection was meant to be applied.
selected_features

# DATASET PARTITION

In [None]:
from sklearn.model_selection import train_test_split

# Keep 70% of the rows for fitting; the remaining rows form the hold-out
# split. The fixed random_state makes the partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_x,
    train_y,
    train_size=0.70,
    random_state=2,
)

# FITTING MODELS

In [None]:
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Instantiate the four classifiers that are compared below
KNN_Classifier = KNeighborsClassifier(n_jobs=-1)                 # k-nearest neighbours
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=0)   # logistic regression
BNB_Classifier = BernoulliNB()                                   # Bernoulli Naive Bayes
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)  # decision tree

# Fit each of them on the training split, in the same order as above
for clf in (KNN_Classifier, LGR_Classifier, BNB_Classifier):
    clf.fit(X_train, Y_train)

# Last expression of the cell: fits the tree and shows the fitted estimator
DTC_Classifier.fit(X_train, Y_train)

# EVALUATE MODELS

In [None]:
from sklearn import metrics

# (display name, fitted estimator) pairs; later cells index into this list
models = [
    ('Naive Baye Classifier', BNB_Classifier),
    ('Decision Tree Classifier', DTC_Classifier),
    ('KNeighborsClassifier', KNN_Classifier),
    ('LogisticRegression', LGR_Classifier),
]

for name, model in models:
    # 10-fold cross-validation on the training split
    scores = cross_val_score(model, X_train, Y_train, cv=10)
    # Predict once and reuse the result for all three reports; the original
    # called predict() three times per model.
    train_pred = model.predict(X_train)
    # These three are computed on the data the model was fitted on, i.e. they
    # are training-set figures.
    accuracy = metrics.accuracy_score(Y_train, train_pred)
    confusion_matrix = metrics.confusion_matrix(Y_train, train_pred)
    classification = metrics.classification_report(Y_train, train_pred)
    print()
    print('============================== {} Model Evaluation =============================='.format(name))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification)
    print()
    print()

# VALIDATING MODELS

In [None]:
# Report accuracy, confusion matrix and classification report of every model
# on the hold-out split.
for name, model in models:
    # Predict once and reuse the result; the original called predict() three
    # times per model.
    test_pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test, test_pred)
    confusion_matrix = metrics.confusion_matrix(Y_test, test_pred)
    classification = metrics.classification_report(Y_test, test_pred)
    print()
    print('============================== {} Model Test Results =============================='.format(name))
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification)
    print()


In [None]:
# PREDICTING FOR TEST DATA (test_df) with each of the four fitted classifiers:
# k-nearest neighbours, Bernoulli Naive Bayes, logistic regression, decision tree
pred_knn = KNN_Classifier.predict(test_df)
pred_NB = BNB_Classifier.predict(test_df)
pred_log = LGR_Classifier.predict(test_df)
pred_dt = DTC_Classifier.predict(test_df)

# Extracting TP FP TN FN

In [None]:
def perf_measure(y_actual, y_pred, positive='anomaly', negative='normal'):
    """Count the confusion-matrix cells for a two-label prediction.

    Parameters
    ----------
    y_actual, y_pred : array-like of labels, equal length
        True and predicted labels. Any 1-D array-like is accepted (list,
        ndarray, pandas Series with any index); rows are matched by position.
    positive, negative : label, default 'anomaly' / 'normal'
        Labels treated as the positive and negative class.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)`` where
        TP = predicted positive and correct,
        FP = predicted positive and wrong,
        TN = predicted negative and correct,
        FN = predicted negative and wrong.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # object dtype keeps the comparisons element-wise for any label type and
    # makes the function independent of a Series' index labels
    actual = np.asarray(y_actual, dtype=object)
    predicted = np.asarray(y_pred, dtype=object)
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_actual and y_pred must have the same shape, got {} and {}".format(
                actual.shape, predicted.shape
            )
        )

    correct = actual == predicted
    said_positive = predicted == positive
    said_negative = predicted == negative

    TP = int(np.count_nonzero(said_positive & correct))
    FP = int(np.count_nonzero(said_positive & ~correct))
    TN = int(np.count_nonzero(said_negative & correct))
    FN = int(np.count_nonzero(said_negative & ~correct))

    return (TP, FP, TN, FN)


# Print the TP / FP / TN / FN counts of every fitted model on the hold-out split
for model_name, model in models:
    print("For model:", model_name)
    counts = perf_measure(Y_test, model.predict(X_test))
    TP, FP, TN, FN = counts
    print ("TP:", TP, "\tFP:", FP, "\t\tTN:", TN, "\tFN:", FN)
    print()
    

In [None]:
# Spot-check a single hold-out row for every model.
# Position 2 is the THIRD row (positions are 0-based).
row_pos = 2
for name, model in models:
    print("For model: ", name)
    # Predict only the inspected row (double brackets keep it 2-D) instead of
    # predicting the whole hold-out split to read one value.
    print ("Expected: ", Y_test.iloc[row_pos], "\tPredicted: ", model.predict(X_test.iloc[[row_pos]])[0] )
    print()

In [None]:
# Show the container type of the hold-out labels
type(Y_test)

# Functions to extract locations of FP, FN as a pandas series

In [None]:
def find_FP(y_actual, y_pred, positive='anomaly'):
    """Return the positions of the false positives as a pandas Series.

    A false positive is a row predicted as ``positive`` whose true label is
    different from the prediction.

    Parameters
    ----------
    y_actual, y_pred : array-like of labels, equal length
        True and predicted labels; rows are matched by position, so lists,
        ndarrays and Series with any index are accepted.
    positive : label, default 'anomaly'
        Label treated as the positive class.

    Returns
    -------
    pandas.Series of int
        0-based row positions in ascending order (integer dtype even when
        empty), suitable for ``.iloc`` / ``.iat``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(y_actual, dtype=object)
    predicted = np.asarray(y_pred, dtype=object)
    if actual.shape != predicted.shape:
        raise ValueError("y_actual and y_pred must have the same shape")

    is_fp = (predicted == positive) & (actual != predicted)
    return pd.Series(np.flatnonzero(is_fp))
    
def find_FN(y_actual, y_pred, negative='normal'):
    """Return the positions of the false negatives as a pandas Series.

    A false negative is a row predicted as ``negative`` whose true label is
    different from the prediction.

    Parameters
    ----------
    y_actual, y_pred : array-like of labels, equal length
        True and predicted labels; rows are matched by position, so lists,
        ndarrays and Series with any index are accepted.
    negative : label, default 'normal'
        Label treated as the negative class.

    Returns
    -------
    pandas.Series of int
        0-based row positions in ascending order (integer dtype even when
        empty), suitable for ``.iloc`` / ``.iat``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(y_actual, dtype=object)
    predicted = np.asarray(y_pred, dtype=object)
    if actual.shape != predicted.shape:
        raise ValueError("y_actual and y_pred must have the same shape")

    is_fn = (predicted == negative) & (actual != predicted)
    return pd.Series(np.flatnonzero(is_fn))


# Combining Naive Bayes and Decision Tree

## Getting FP and FN row location from NB output as pd.Series

In [None]:
# Predict once with the Naive Bayes model and reuse the result for both
# look-ups. The model is referenced by name rather than by its position in
# `models`, so reordering that list cannot silently switch the classifier.
nb_test_pred = BNB_Classifier.predict(X_test)

FP_NB = find_FP(Y_test, nb_test_pred)
print("Size of number of FP:", FP_NB.size)
FN_NB = find_FN(Y_test, nb_test_pred)
print("Size of number of FN:", FN_NB.size)

# Testing
FP_NB.head(4)

## Getting FP FN row entry from X_test and Y_test as pd.DataFrame and pd.Series respectively

In [None]:
# Row positions Naive Bayes got wrong: false positives first, then false
# negatives (same order as before). astype(int) also covers the case where one
# of the two Series is empty and therefore not integer-typed.
error_positions = pd.concat([FP_NB, FN_NB]).astype(int).to_numpy()

# Positional indexing selects all rows in one step and preserves each column's
# dtype; rebuilding the frame from individual row Series upcast every column
# to a common dtype. X_test_sub keeps the original row labels, as before.
X_test_sub = X_test.iloc[error_positions]
# Y_test_sub gets a fresh 0..n-1 index, matching the previous pd.Series(list)
Y_test_sub = Y_test.iloc[error_positions].reset_index(drop=True)

print("Size of X_test_sub:", X_test_sub.shape[0])
print("Size of Y_test_sub:", Y_test_sub.size)

# To check for each false positive
#for i in FP_NB:
#    print ("Expected: ", Y_test.iloc[i], "Predicted: ", models[1][1].predict(X_test).reshape(1, -1)[0][i] )

In [None]:
# Inspect the hold-out features: container type and first four rows
print(type(X_test)) 
X_test.head(4)

In [None]:
# Inspect the misclassified-row subset of the features: container type and first four rows
print(type(X_test_sub))
X_test_sub.head(4)


In [None]:
# Inspect the hold-out labels: container type and first four values
print(type(Y_test)) 
Y_test.head(4)

In [None]:
# Inspect the labels of the misclassified-row subset: container type and first four values
print(type(Y_test_sub))
Y_test_sub.head(4)

# Validating the combined model

## Validating results for the FP FN subset in the combined model

In [None]:
# Score the decision tree on the rows Naive Bayes misclassified. Predict once
# and reuse the result; the tree is referenced by name rather than by its
# position in `models`, so reordering that list cannot switch the classifier.
dt_sub_pred = DTC_Classifier.predict(X_test_sub)

accuracy = metrics.accuracy_score(Y_test_sub, dt_sub_pred)
confusion_matrix = metrics.confusion_matrix(Y_test_sub, dt_sub_pred)
classification = metrics.classification_report(Y_test_sub, dt_sub_pred)
print()
print('============================== {} Model Test Results =============================='.format("NB -> DT"))
print()
print ("Model Accuracy:" "\n", accuracy)
print()
print("Confusion matrix:" "\n", confusion_matrix)
print()
print("Classification report:" "\n", classification)
print()

## Validating hybrid model, NB + DT

In [None]:
# Stage 1: confusion counts of Naive Bayes alone on the hold-out split
print("For Naive Bayes:")
nb_counts = perf_measure(Y_test, BNB_Classifier.predict(X_test))
TP_old, FP_old, TN_old, FN_old = nb_counts
print ("TP:", TP_old, "\tFP:", FP_old, "\t\tTN:", TN_old, "\tFN:", FN_old)

# Stage 2: confusion counts of the decision tree on the subset rows
print()
print("For Naive Bayes -> Decision Tress:")
dt_counts = perf_measure(Y_test_sub, DTC_Classifier.predict(X_test_sub))
TP_new, FP_new, TN_new, FN_new = dt_counts
print ("TP:", TP_new, "\tFP:", FP_new, "\t\tTN:", TN_new, "\tFN:", FN_new)

# Combined: correct counts of both stages are added together, while the
# remaining errors are only those the tree still makes on the subset.
# NOTE(review): the subset was picked using the true labels, so this looks
# like an optimistic estimate of a deployable hybrid -- confirm intent.
print()
print("For Naive Bayes + Decision Tress:")
tp, tn = TP_old + TP_new, TN_old + TN_new
fp, fn = FP_new, FN_new
print ("TP:", tp, "\tFP:", fp, "\t\tTN:", tn, "\tFN:", fn)

## 1. Accuracy (all correct / all) = (TP + TN) / (TP + TN + FP + FN)
## 2. Misclassification (all incorrect / all) = (FP + FN) / (TP + TN + FP + FN)
## 3. Precision (true positives / predicted positives) = TP / (TP + FP)
## 4. Sensitivity aka Recall (true positives / all actual positives) = TP / (TP + FN)
## 5. Specificity (true negatives / all actual negatives) = TN / (TN + FP)

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def _confusion_metrics(n_tp, n_fp, n_tn, n_fn):
    """Return (accuracy, misclassification, precision, sensitivity, specificity)."""
    total = n_tp + n_fp + n_tn + n_fn
    return (
        _safe_ratio(n_tp + n_tn, total),   # accuracy
        _safe_ratio(n_fp + n_fn, total),   # misclassification
        _safe_ratio(n_tp, n_tp + n_fp),    # precision
        _safe_ratio(n_tp, n_tp + n_fn),    # sensitivity (recall)
        _safe_ratio(n_tn, n_tn + n_fp),    # specificity
    )


# Same formulas for both models. A zero denominator (e.g. a model that never
# predicts the positive class) now yields NaN instead of a ZeroDivisionError.
acc_old, mis_old, prec_old, sen_old, spec_old = _confusion_metrics(TP_old, FP_old, TN_old, FN_old)
acc, mis, prec, sen, spec = _confusion_metrics(tp, fp, tn, fn)

print ("Accuracy")
print ("Old: ", acc_old, "\tNew: ", acc)
print ("\nMisclassification")
print ("Old: ", mis_old, "\tNew: ", mis)
print ("\nPrecision")
print ("Old: ", prec_old, "\tNew: ", prec)
print ("\nSensitivity")
print ("Old: ", sen_old, "\tNew: ", sen)
print ("\nSpecificity")
print ("Old: ", spec_old, "\tNew: ", spec)

# Plotting Results

In [None]:
# set width of bar
# set width of bar
barWidth = 0.25
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is
# the Figure and all drawing goes through the explicit Axes
fig, ax = plt.subplots(figsize=(12, 8))

# set height of bar
NB = [TP_old, FP_old, TN_old, FN_old]
NBandDT = [tp, fp, tn, fn]

# Set position of bar on X axis: the second series sits one bar width to the right
br1 = np.arange(len(NB))
br2 = [x + barWidth for x in br1]

# Make the plot
ax.bar(br1, NB, color='b', width=barWidth, edgecolor='grey', label='Naive Bayes')
ax.bar(br2, NBandDT, color='g', width=barWidth, edgecolor='grey', label='Naive Bayes and Decision Tree')

# Axis labels; ticks are centred between the two bars of each group
ax.set_xlabel('Confusion Matrix Element', fontweight='bold', fontsize=15)
ax.set_ylabel('Value', fontweight='bold', fontsize=15)
ax.set_xticks(br1 + barWidth / 2)
ax.set_xticklabels(['TP', 'FP', 'TN', 'FN'])

ax.legend()
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# set width of bar
# set width of bar
barWidth = 0.25
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is
# the Figure and all drawing goes through the explicit Axes
fig, ax = plt.subplots(figsize=(12, 8))

# set height of bar
Old = [acc_old, mis_old, prec_old, sen_old, spec_old]
New = [acc, mis, prec, sen, spec]
metric_names = ['Accuracy', 'Misclassification', 'Precision', 'Sensitivity', 'Specificity']

# Set position of bar on X axis: the second series sits one bar width to the right
br1 = np.arange(len(Old))
br2 = [x + barWidth for x in br1]

# Make the plot
ax.bar(br1, Old, color='b', width=barWidth, edgecolor='grey', label='Old')
ax.bar(br2, New, color='g', width=barWidth, edgecolor='grey', label='New')

# Axis labels. Tick positions come from this cell's own five-element series;
# the original used len(NB) (four entries from the previous cell), which gives
# four positions for five labels and makes matplotlib raise a ValueError.
ax.set_xlabel('Performance Metrics', fontweight='bold', fontsize=15)
ax.set_ylabel('Value', fontweight='bold', fontsize=15)
ax.set_xticks(br1 + barWidth / 2)
ax.set_xticklabels(metric_names)

ax.legend()
ax.set_title("Comparison of performance metrics")
plt.show()

In [None]:
# Relative change of every metric, in percent of the Naive Bayes value.
# Misclassification is reported as a decrease, so its difference is reversed.
accPercent = (acc - acc_old) / acc_old * 100
misPercent = (mis_old - mis) / mis_old * 100
precPercent = (prec - prec_old) / prec_old * 100
senPercent = (sen - sen_old) / sen_old * 100
specPercent = (spec - spec_old) / spec_old * 100

report_lines = (
    ("Accuracy increase percentage: ", accPercent),
    ("Missclasification decrease percentage: ", misPercent),
    ("Precison increase percentage: ", precPercent),
    ("Sensitivity increase percentage: ", senPercent),
    ("Specificity increase percentage: ", specPercent),
)
for label, value in report_lines:
    print (label, value, "%")