# This notebook starts from the preprocessed DataFrame, which is stored in "DF_Spark.csv".

In [1]:
import pandas as pd

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset file can be read from it (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the preprocessed dataset on the mounted Google Drive.
Path = '/content/drive/My Drive/Data/DF_Spark.csv'
# Read the CSV into a DataFrame; the saved index comes back as an "Unnamed: 0" column.
df_spark = pd.read_csv(filepath_or_buffer=Path)

In [4]:
# Preview the first rows of the frame exactly as loaded from the CSV.
df_spark.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,7.0,21.0,30.0,2.0,3.0,30.0,2.0,3.0,59.0,7.0,2.0,0.0
1,1,7.0,32.0,70.0,2.0,4.0,68.0,2.0,4.0,134.0,7.0,2.0,0.0
2,2,7.0,15.0,1.0,2.0,2.0,0.0,2.0,2.0,0.0,7.0,2.0,0.0
3,3,7.0,33.0,77.0,2.0,7.0,73.0,2.0,7.0,144.0,7.0,2.0,0.0
4,4,7.0,55.0,78.0,3.0,7.0,74.0,3.0,7.0,146.0,8.0,2.0,0.0


In [5]:
# Remove the index column that was written into the CSV when the frame was saved.
# errors="ignore" makes this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df_spark = df_spark.drop(columns="Unnamed: 0", errors="ignore")

In [6]:
# Preview the frame again to confirm the "Unnamed: 0" column is gone.
df_spark.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,7.0,21.0,30.0,2.0,3.0,30.0,2.0,3.0,59.0,7.0,2.0,0.0
1,7.0,32.0,70.0,2.0,4.0,68.0,2.0,4.0,134.0,7.0,2.0,0.0
2,7.0,15.0,1.0,2.0,2.0,0.0,2.0,2.0,0.0,7.0,2.0,0.0
3,7.0,33.0,77.0,2.0,7.0,73.0,2.0,7.0,144.0,7.0,2.0,0.0
4,7.0,55.0,78.0,3.0,7.0,74.0,3.0,7.0,146.0,8.0,2.0,0.0


# In the following code, X contains the features and y contains the labels

In [7]:
# The first column is used as the class label; all remaining columns are features.
label_column = df_spark.iloc[:, 0]
feature_columns = df_spark.iloc[:, 1:]
# Convert to plain NumPy arrays for scikit-learn.
y = label_column.values
X = feature_columns.values

# The whole dataset is split in an 80:20 ratio. X_train contains 80% of the feature rows and X_test the remaining 20%; y_train and y_test contain the corresponding labels for X_train and X_test, respectively

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np

# 5-Fold Cross validation Estimation for Logistic Regression

In [10]:
# Standardise the features, then fit an L2-penalised logistic regression.
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', random_state=0)),
])

# 5-fold learning curve over the full data set at five training-set fractions
# from 20% to 100%; n_jobs=-1 uses all available cores.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X,
    y=y,
    train_sizes=np.linspace(0.2, 1.0, 5),
    cv=5,
    n_jobs=-1,
)

# Score arrays have one row per training size and one column per fold,
# so reducing over axis 1 summarises the folds for each size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [None]:
# Absolute number of training samples used at each learning-curve step.
for n_samples in train_sizes:
    print(n_samples)

57270
114540
171811
229081
286352


In [None]:
# Logistic regression: fold-averaged training accuracy per training size.
for mean_accuracy in train_mean:
    print(mean_accuracy)

0.9864361795006111
0.986911122751877
0.9868308781160694
0.9865846578284538
0.9883136838576296


In [None]:
# Logistic regression: fold-averaged validation accuracy per training size.
for mean_accuracy in test_mean:
    print(mean_accuracy)

0.8969772041405518
0.8779824156058529
0.9781109282961822
0.9846064215159688
0.9880902458937426


# 5-Fold Cross validation Estimation for SVM

In [11]:
from sklearn.svm import LinearSVC

In [12]:
# Standardise the features, then fit a linear support-vector classifier.
# random_state is fixed so repeated runs give the same scores, in line with the
# seeded logistic-regression and MLP models used elsewhere in this notebook.
pipe_svc = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LinearSVC(random_state=0)),
])

# 5-fold learning curve over the full data set at five training-set fractions
# from 20% to 100%; n_jobs=-1 uses all available cores.
train_sizes_svc, train_scores_svc, test_scores_svc = learning_curve(
    estimator=pipe_svc,
    X=X,
    y=y,
    train_sizes=np.linspace(0.2, 1.0, 5),
    cv=5,
    n_jobs=-1,
)

# One row per training size, one column per fold: reduce over the folds.
train_mean_svc = np.mean(train_scores_svc, axis=1)
train_std_svc = np.std(train_scores_svc, axis=1)
test_mean_svc = np.mean(test_scores_svc, axis=1)
test_std_svc = np.std(test_scores_svc, axis=1)

In [None]:
# SVM: fold-averaged training accuracy per training size.
for mean_accuracy in train_mean_svc:
    print(mean_accuracy)

0.9826540946394273
0.981639601885804
0.9856668082951614
0.9823267752454372
0.9829803877744873


In [None]:
# SVM: fold-averaged validation accuracy per training size.
for mean_accuracy in test_mean_svc:
    print(mean_accuracy)

0.9603062688826753
0.9711907748218677
0.9774767477765417
0.9796111876180309
0.9825781652457415


# 5-Fold Cross validation Estimation for Decision Tree

In [13]:
from sklearn import tree

In [14]:
# Standardise the features, then fit a decision tree.
# random_state is fixed because the tree permutes features randomly at each
# split; without a seed, repeated runs can produce different trees and scores.
pipe_tree = Pipeline([
    ('scl', StandardScaler()),
    ('clf', tree.DecisionTreeClassifier(random_state=0)),
])

# 5-fold learning curve over the full data set at five training-set fractions
# from 20% to 100%; n_jobs=-1 uses all available cores.
train_sizes_tree, train_scores_tree, test_scores_tree = learning_curve(
    estimator=pipe_tree,
    X=X,
    y=y,
    train_sizes=np.linspace(0.2, 1.0, 5),
    cv=5,
    n_jobs=-1,
)

# One row per training size, one column per fold: reduce over the folds.
train_mean_tree = np.mean(train_scores_tree, axis=1)
train_std_tree = np.std(train_scores_tree, axis=1)
test_mean_tree = np.mean(test_scores_tree, axis=1)
test_std_tree = np.std(test_scores_tree, axis=1)

In [None]:
# Decision tree: fold-averaged training accuracy per training size.
for mean_accuracy in train_mean_tree:
    print(mean_accuracy)

In [None]:
# Decision tree: fold-averaged validation accuracy per training size.
for mean_accuracy in test_mean_tree:
    print(mean_accuracy)

# 5-Fold Cross validation Estimation for Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Standardise the features, then fit a 10-tree random forest.
# random_state is fixed because the forest bootstraps rows and samples features
# at random; without a seed every run yields a different model and scores.
pipe_rnd = Pipeline([
    ('scl', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=10, random_state=0)),
])

# 5-fold learning curve over the full data set at five training-set fractions
# from 20% to 100%; n_jobs=-1 uses all available cores.
train_sizes_rnd, train_scores_rnd, test_scores_rnd = learning_curve(
    estimator=pipe_rnd,
    X=X,
    y=y,
    train_sizes=np.linspace(0.2, 1.0, 5),
    cv=5,
    n_jobs=-1,
)

# One row per training size, one column per fold: reduce over the folds.
train_mean_rnd = np.mean(train_scores_rnd, axis=1)
train_std_rnd = np.std(train_scores_rnd, axis=1)
test_mean_rnd = np.mean(test_scores_rnd, axis=1)
test_std_rnd = np.std(test_scores_rnd, axis=1)

In [None]:
# Random forest: fold-averaged training accuracy per training size.
for mean_accuracy in train_mean_rnd:
    print(mean_accuracy)

0.9866876200453991
0.9884634188929631
0.9906862231534834
0.992856643967173
0.9942168674698795


In [None]:
# Random forest: fold-averaged validation accuracy per training size.
for mean_accuracy in test_mean_rnd:
    print(mean_accuracy)

0.9439088829638763
0.9685588432554251
0.9801726981234566
0.9884841598922233
0.9932225637013902


# 5-Fold Cross validation Estimation for ANN

In [17]:
from sklearn.neural_network import MLPClassifier
# MLP with one hidden layer of 50 units, trained with SGD; random_state=1 fixes the seed.
# verbose=10 turns on the per-iteration loss log that appears when the model is fitted.
# NOTE(review): max_iter=10 caps training at 10 iterations (the recorded log stops at
# iteration 10), so the optimiser may not have converged — confirm this is intentional.
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4, solver='sgd', verbose=10, tol=1e-4, random_state=1, learning_rate_init=.1)

In [None]:
# Standardise the features, then fit the MLP configured above.
pipe_mlp = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', mlp),
])

# 5-fold learning curve over the full data set at five training-set fractions
# from 20% to 100%; n_jobs=-1 uses all available cores.
train_sizes_mlp, train_scores_mlp, test_scores_mlp = learning_curve(
    estimator=pipe_mlp,
    X=X,
    y=y,
    train_sizes=np.linspace(0.2, 1.0, 5),
    cv=5,
    n_jobs=-1,
)

# One row per training size, one column per fold: reduce over the folds.
train_mean_mlp, train_std_mlp = train_scores_mlp.mean(axis=1), train_scores_mlp.std(axis=1)
test_mean_mlp, test_std_mlp = test_scores_mlp.mean(axis=1), test_scores_mlp.std(axis=1)

In [None]:
# ANN: fold-averaged training accuracy per training size.
for mean_accuracy in train_mean_mlp:
    print(mean_accuracy)

0.985339619346953
0.9879745067225423
0.990625691170479
0.9928016413480008
0.9941484197660205


In [None]:
# ANN: fold-averaged validation accuracy per training size.
for mean_accuracy in test_mean_mlp:
    print(mean_accuracy)

0.9150290340041607
0.9029206196671373
0.9803263726837024
0.9884702046127953
0.9930130422658466


# The means of the training and testing accuracies, and the standard deviations of the training and testing accuracies, are given below

In [None]:
# Training accuracy averaged over all learning-curve sizes, one value per model
# (order: logistic regression, SVM, decision tree, random forest, ANN).
tuple(
    np.mean(per_size)
    for per_size in (train_mean, train_mean_svc, train_mean_tree, train_mean_rnd, train_mean_mlp)
)

(0.9825350341636725,
 0.9827354393092291,
 0.9899853295348444,
 0.9899843194255066,
 0.988992106220478)

In [None]:
# Fold-to-fold standard deviation of training accuracy, averaged over all sizes
# (order: logistic regression, SVM, decision tree, random forest, ANN).
tuple(
    np.mean(per_size)
    for per_size in (train_std, train_std_svc, train_std_tree, train_std_rnd, train_std_mlp)
)

(0.0012669016539291768,
 0.0015004132547957309,
 0.000812529982066585,
 0.0008124067495914715,
 0.0013105445891091817)

In [None]:
# Validation accuracy averaged over all learning-curve sizes, one value per model
# (order: logistic regression, SVM, decision tree, random forest, ANN).
tuple(
    np.mean(per_size)
    for per_size in (test_mean, test_mean_svc, test_mean_tree, test_mean_rnd, test_mean_mlp)
)

(0.9745988472667129,
 0.9762471837085901,
 0.9707884454993927,
 0.9725458663799749,
 0.9611862286384287)

In [None]:
# Fold-to-fold standard deviation of validation accuracy, averaged over all sizes
# (order: logistic regression, SVM, decision tree, random forest, ANN).
tuple(
    np.mean(per_size)
    for per_size in (test_std, test_std_svc, test_std_tree, test_std_rnd, test_std_mlp)
)

(0.005597696152259861,
 0.006471826851518314,
 0.016850233251830146,
 0.014866038573080196,
 0.021343692164590724)

# Evaluation Metrics Calculations for Logistic Regression

In [None]:
# Fit the logistic-regression pipeline on the 80% training split.
pipe_lr = pipe_lr.fit(X_train, y_train)

In [20]:
# Predictions on the training split. NOTE: y_pred_train is reassigned by every later model section.
y_pred_train = pipe_lr.predict(X_train)

In [21]:
# Predictions on the held-out split. NOTE: y_pred_test is reassigned by every later model section.
y_pred_test = pipe_lr.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
# Fraction of training rows whose predicted label matches the true label.
accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9885455662960273

In [24]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.9881406361312493

In [25]:
from sklearn.metrics import classification_report

In [26]:
# Display names for the eight classes, passed to classification_report.
# NOTE(review): presumably listed in the order of the integer class codes in y —
# verify against the preprocessing step that encoded the labels.
target_names = [
    'DoSattack',
    'dataProbing',
    'malitiousControl',
    'malitiousOperation',
    'scan',
    'spying',
    'wrongSetUp',
    'Normal',
]

In [27]:
# Per-class precision / recall / F1 on the training split.
# zero_division=0 keeps the 0.00 score for classes that are never predicted but
# states that choice explicitly, so no UndefinedMetricWarning clutters the output.
print(classification_report(y_train, y_pred_train, target_names=target_names, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

         DoSattack       0.96      0.65      0.78      4602
       dataProbing       1.00      0.59      0.74       279
  malitiousControl       0.98      0.96      0.97       720
malitiousOperation       0.97      0.48      0.64       650
              scan       0.90      0.47      0.62      1242
            spying       0.00      0.00      0.00       412
        wrongSetUp       0.90      1.00      0.95        94
            Normal       0.99      1.00      0.99    278353

          accuracy                           0.99    286352
         macro avg       0.84      0.64      0.71    286352
      weighted avg       0.99      0.99      0.99    286352



In [28]:
# Per-class precision / recall / F1 on the held-out split.
# zero_division=0 keeps the 0.00 score for classes that are never predicted but
# states that choice explicitly, so no UndefinedMetricWarning clutters the output.
print(classification_report(y_test, y_pred_test, target_names=target_names, zero_division=0))

                    precision    recall  f1-score   support

         DoSattack       0.95      0.66      0.78      1178
       dataProbing       1.00      0.57      0.73        63
  malitiousControl       0.98      0.97      0.98       169
malitiousOperation       0.99      0.50      0.67       155
              scan       0.88      0.41      0.56       305
            spying       0.00      0.00      0.00       120
        wrongSetUp       0.93      1.00      0.97        28
            Normal       0.99      1.00      0.99     69571

          accuracy                           0.99     71589
         macro avg       0.84      0.64      0.71     71589
      weighted avg       0.99      0.99      0.99     71589



  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
from sklearn.metrics import confusion_matrix
import itertools

In [None]:
# Confusion matrix of the logistic-regression pipeline on the held-out split
# (rows = true class, columns = predicted class).
# Predictions are taken straight from pipe_lr instead of the shared y_pred_test
# variable, which every model section overwrites; this keeps the matrix tied to
# logistic regression even when cells are executed out of order.
cnf_matrix = confusion_matrix(y_test, pipe_lr.predict(X_test))

In [None]:
# Print one matrix row per line, every count followed by '&'
# (presumably for pasting into a LaTeX table — confirm).
for matrix_row in cnf_matrix:
    print(''.join(f'{count}&' for count in matrix_row))

775&0&0&0&0&0&0&403&
0&0&0&0&0&0&0&63&
0&0&10&0&0&0&0&159&
0&0&0&78&0&0&0&77&
5&0&2&0&0&0&0&298&
0&0&0&0&0&0&0&120&
0&0&0&0&0&0&0&28&
34&0&0&9&0&0&0&69528&


# Evaluation Metrics Calculations for SVM

In [29]:
# Fit the SVM pipeline on the training split; fit() returns the pipeline itself,
# so the name does not need to be rebound.
pipe_svc.fit(X_train, y_train)
# Overwrite the shared prediction variables with the SVM's predictions.
y_pred_train, y_pred_test = pipe_svc.predict(X_train), pipe_svc.predict(X_test)



In [30]:
# (training accuracy, held-out accuracy) for the predictions currently stored.
accuracy_score(y_true=y_train, y_pred=y_pred_train), accuracy_score(y_true=y_test, y_pred=y_pred_test)

(0.9826332625579706, 0.9827627149422398)

In [31]:
# Per-class precision / recall / F1 on the training split.
# zero_division=0 keeps the 0.00 score for classes that are never predicted but
# states that choice explicitly, so no UndefinedMetricWarning clutters the output.
print(classification_report(y_train, y_pred_train, target_names=target_names, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

         DoSattack       0.97      0.65      0.78      4602
       dataProbing       0.00      0.00      0.00       279
  malitiousControl       0.78      0.04      0.07       720
malitiousOperation       1.00      0.16      0.27       650
              scan       0.00      0.00      0.00      1242
            spying       0.00      0.00      0.00       412
        wrongSetUp       0.00      0.00      0.00        94
            Normal       0.98      1.00      0.99    278353

          accuracy                           0.98    286352
         macro avg       0.47      0.23      0.26    286352
      weighted avg       0.98      0.98      0.98    286352



In [32]:
# Per-class precision / recall / F1 on the held-out split.
# zero_division=0 keeps the 0.00 score for classes that are never predicted but
# states that choice explicitly, so no UndefinedMetricWarning clutters the output.
print(classification_report(y_test, y_pred_test, target_names=target_names, zero_division=0))

                    precision    recall  f1-score   support

         DoSattack       0.96      0.66      0.78      1178
       dataProbing       0.00      0.00      0.00        63
  malitiousControl       0.83      0.06      0.11       169
malitiousOperation       1.00      0.21      0.35       155
              scan       0.00      0.00      0.00       305
            spying       0.00      0.00      0.00       120
        wrongSetUp       0.00      0.00      0.00        28
            Normal       0.98      1.00      0.99     69571

          accuracy                           0.98     71589
         macro avg       0.47      0.24      0.28     71589
      weighted avg       0.98      0.98      0.98     71589



  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
# Confusion matrix on the held-out split (rows = true class, columns = predicted class).
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
# One matrix row per line, every count followed by '&'.
for matrix_row in cnf_matrix:
    print(''.join(f'{count}&' for count in matrix_row))

775&0&0&0&0&0&0&403&
0&0&0&0&0&0&0&63&
0&0&10&0&0&0&0&159&
0&0&0&33&0&0&0&122&
0&0&2&0&0&0&0&303&
0&0&0&0&0&0&0&120&
0&0&0&0&0&0&0&28&
34&0&0&0&0&0&0&69537&


# Evaluation Metrics Calculations for Decision Tree

In [36]:
# Fit the decision-tree pipeline on the training split; fit() returns the
# pipeline itself, so the name does not need to be rebound.
pipe_tree.fit(X_train, y_train)
# Overwrite the shared prediction variables with the tree's predictions.
y_pred_train, y_pred_test = pipe_tree.predict(X_train), pipe_tree.predict(X_test)

In [37]:
# (training accuracy, held-out accuracy) for the predictions currently stored.
accuracy_score(y_true=y_train, y_pred=y_pred_train), accuracy_score(y_true=y_test, y_pred=y_pred_test)

(0.9942413532994356, 0.994091271005322)

In [38]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred_train, target_names=target_names))

                    precision    recall  f1-score   support

         DoSattack       0.98      0.65      0.78      4602
       dataProbing       1.00      1.00      1.00       279
  malitiousControl       1.00      1.00      1.00       720
malitiousOperation       1.00      1.00      1.00       650
              scan       1.00      1.00      1.00      1242
            spying       1.00      1.00      1.00       412
        wrongSetUp       1.00      1.00      1.00        94
            Normal       0.99      1.00      1.00    278353

          accuracy                           0.99    286352
         macro avg       1.00      0.96      0.97    286352
      weighted avg       0.99      0.99      0.99    286352



In [39]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred_test, target_names=target_names))

                    precision    recall  f1-score   support

         DoSattack       0.98      0.66      0.79      1178
       dataProbing       1.00      1.00      1.00        63
  malitiousControl       1.00      1.00      1.00       169
malitiousOperation       1.00      1.00      1.00       155
              scan       1.00      1.00      1.00       305
            spying       0.98      1.00      0.99       120
        wrongSetUp       1.00      1.00      1.00        28
            Normal       0.99      1.00      1.00     69571

          accuracy                           0.99     71589
         macro avg       0.99      0.96      0.97     71589
      weighted avg       0.99      0.99      0.99     71589



In [40]:
# Confusion matrix on the held-out split (rows = true class, columns = predicted class).
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
# One matrix row per line, every count followed by '&'.
for matrix_row in cnf_matrix:
    print(''.join(f'{count}&' for count in matrix_row))

775&0&0&0&0&0&0&403&
0&63&0&0&0&0&0&0&
0&0&169&0&0&0&0&0&
0&0&0&155&0&0&0&0&
0&0&0&0&305&0&0&0&
0&0&0&0&0&120&0&0&
0&0&0&0&0&0&28&0&
18&0&0&0&0&2&0&69551&


# Evaluation Metrics Calculations for Random Forest

In [58]:
# Fit the random-forest pipeline on the training split; fit() returns the
# pipeline itself, so the name does not need to be rebound.
pipe_rnd.fit(X_train, y_train)
# Overwrite the shared prediction variables with the forest's predictions.
y_pred_train, y_pred_test = pipe_rnd.predict(X_train), pipe_rnd.predict(X_test)

In [59]:
# NOTE(review): this cell repeats the two predict calls already made in the cell
# that fits pipe_rnd, so it appears to be redundant — consider removing it.
y_pred_train = pipe_rnd.predict(X_train)
y_pred_test = pipe_rnd.predict(X_test)

In [60]:
# (training accuracy, held-out accuracy) for the predictions currently stored.
accuracy_score(y_true=y_train, y_pred=y_pred_train), accuracy_score(y_true=y_test, y_pred=y_pred_test)

(0.9942413532994356, 0.994105239631787)

In [61]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred_train, target_names=target_names))

                    precision    recall  f1-score   support

         DoSattack       0.98      0.65      0.78      4602
       dataProbing       1.00      1.00      1.00       279
  malitiousControl       1.00      1.00      1.00       720
malitiousOperation       1.00      1.00      1.00       650
              scan       1.00      1.00      1.00      1242
            spying       1.00      1.00      1.00       412
        wrongSetUp       1.00      1.00      1.00        94
            Normal       0.99      1.00      1.00    278353

          accuracy                           0.99    286352
         macro avg       1.00      0.96      0.97    286352
      weighted avg       0.99      0.99      0.99    286352



In [62]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred_test, target_names=target_names))


                    precision    recall  f1-score   support

         DoSattack       0.98      0.66      0.79      1178
       dataProbing       1.00      1.00      1.00        63
  malitiousControl       1.00      1.00      1.00       169
malitiousOperation       1.00      1.00      1.00       155
              scan       1.00      1.00      1.00       305
            spying       1.00      1.00      1.00       120
        wrongSetUp       1.00      1.00      1.00        28
            Normal       0.99      1.00      1.00     69571

          accuracy                           0.99     71589
         macro avg       1.00      0.96      0.97     71589
      weighted avg       0.99      0.99      0.99     71589



In [46]:
# Confusion matrix on the held-out split (rows = true class, columns = predicted class).
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
# One matrix row per line, every count followed by '&'.
for matrix_row in cnf_matrix:
    print(''.join(f'{count}&' for count in matrix_row))

775&0&0&0&0&0&0&403&
0&63&0&0&0&0&0&0&
0&0&169&0&0&0&0&0&
0&0&0&155&0&0&0&0&
0&0&0&0&305&0&0&0&
0&0&0&0&0&120&0&0&
0&0&0&0&0&0&28&0&
18&0&0&0&0&0&0&69553&


# Evaluation Metrics Calculations for ANN

In [47]:
# Fit the MLP pipeline on the training split (the classifier's verbose setting
# logs the loss of each iteration); fit() returns the pipeline itself, so the
# name does not need to be rebound.
pipe_mlp.fit(X_train, y_train)
# Overwrite the shared prediction variables with the MLP's predictions.
y_pred_train, y_pred_test = pipe_mlp.predict(X_train), pipe_mlp.predict(X_test)

Iteration 1, loss = 0.05584959
Iteration 2, loss = 0.02541104
Iteration 3, loss = 0.01882284
Iteration 4, loss = 0.01670067
Iteration 5, loss = 0.01501241
Iteration 6, loss = 0.01462482
Iteration 7, loss = 0.01410045
Iteration 8, loss = 0.01378341
Iteration 9, loss = 0.01346050
Iteration 10, loss = 0.01328968




In [48]:
# (training accuracy, held-out accuracy) for the predictions currently stored.
accuracy_score(y_true=y_train, y_pred=y_pred_train), accuracy_score(y_true=y_test, y_pred=y_pred_test)

(0.9942273844778454, 0.994077302378857)

In [49]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred_train, target_names=target_names))

                    precision    recall  f1-score   support

         DoSattack       0.98      0.65      0.78      4602
       dataProbing       1.00      1.00      1.00       279
  malitiousControl       1.00      1.00      1.00       720
malitiousOperation       1.00      1.00      1.00       650
              scan       1.00      1.00      1.00      1242
            spying       1.00      1.00      1.00       412
        wrongSetUp       1.00      1.00      1.00        94
            Normal       0.99      1.00      1.00    278353

          accuracy                           0.99    286352
         macro avg       1.00      0.96      0.97    286352
      weighted avg       0.99      0.99      0.99    286352



In [51]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred_test, target_names=target_names))

                    precision    recall  f1-score   support

         DoSattack       0.98      0.66      0.79      1178
       dataProbing       1.00      1.00      1.00        63
  malitiousControl       0.99      1.00      1.00       169
malitiousOperation       1.00      1.00      1.00       155
              scan       1.00      1.00      1.00       305
            spying       0.98      1.00      0.99       120
        wrongSetUp       1.00      1.00      1.00        28
            Normal       0.99      1.00      1.00     69571

          accuracy                           0.99     71589
         macro avg       0.99      0.96      0.97     71589
      weighted avg       0.99      0.99      0.99     71589



In [55]:
from sklearn.metrics import confusion_matrix
import itertools

In [54]:
# Confusion matrix of the MLP pipeline on the held-out split
# (rows = true class, columns = predicted class).
# Predictions are taken straight from pipe_mlp instead of the shared y_pred_test
# variable, which every model section overwrites; this keeps the matrix tied to
# the ANN even when cells are executed out of order.
cnf_matrix = confusion_matrix(y_test, pipe_mlp.predict(X_test))

In [53]:
# Display the confusion matrix as an array.
# NOTE(review): the recorded execution counts in this section run backwards
# (55, 54, 53, 52), and the stored output is identical to the matrix printed in
# the Random Forest section — re-run the cells in order to refresh it.
cnf_matrix

array([[  775,     0,     0,     0,     0,     0,     0,   403],
       [    0,    63,     0,     0,     0,     0,     0,     0],
       [    0,     0,   169,     0,     0,     0,     0,     0],
       [    0,     0,     0,   155,     0,     0,     0,     0],
       [    0,     0,     0,     0,   305,     0,     0,     0],
       [    0,     0,     0,     0,     0,   120,     0,     0],
       [    0,     0,     0,     0,     0,     0,    28,     0],
       [   18,     0,     0,     0,     0,     0,     0, 69553]])

In [52]:
# Print one matrix row per line, every count followed by a single space.
for matrix_row in cnf_matrix:
    print(''.join(f'{count} ' for count in matrix_row))

775 0 0 0 0 0 0 403 
0 63 0 0 0 0 0 0 
0 0 169 0 0 0 0 0 
0 0 0 155 0 0 0 0 
0 0 0 0 305 0 0 0 
0 0 0 0 0 120 0 0 
0 0 0 0 0 0 28 0 
18 0 0 0 0 0 0 69553 
