In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float16 or float32 when their min and max fit,
      otherwise to float64.
    * Object and pandas string columns are converted to ``category``.
    * Every other dtype (bool, unsigned int, datetime, category, nullable
      extension types, ...) is left untouched.

    The memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    # NOTE: float16 keeps only about three significant decimal digits, so this
    # downcast is lossy for most real-valued features.
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int ('i') and float ('f') columns are
        # downcast. Without this guard bool/unsigned columns were silently
        # turned into floats and datetime columns raised on the comparison.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Values (or NaN/inf extremes) do not fit a narrower float.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and optimise its memory usage.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after it has been passed through ``reduce_mem_usage``.
    """
    # `keep_date_col` only applies when `parse_dates` combines several columns
    # into one. With `parse_dates=True` it had no effect, and the argument is
    # deprecated in recent pandas releases, so it is no longer passed.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

# Load the dataset through import_data so column dtypes are downcast on load.
data = import_data('../input/10kfile/10k.csv')

In [None]:
# Remove leading/trailing whitespace from every column name.
data.columns = data.columns.str.strip()

#### I. Data Analysis

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Per-column dtype and non-null count.
data.info()

In [None]:
# Column names as they stand before the rename below.
data.columns

In [None]:
# NOTE(review): repeats the `data.columns` display of the previous cell.
data.columns

In [None]:
# Replace spaces, '/' and '.' in the raw column names with underscores so the
# columns can be used with attribute access (e.g. data.Source_IP).
# errors='raise' makes a missing source column fail loudly, which also means
# this cell cannot be re-run on an already renamed frame.
# Fix: 'CWE Flag Count' used to map to 'SYN_Flag_Count', producing two columns
# with the same name; it now gets its own name.
data.rename(columns={
    'Unnamed: 0': 'unnamed',
    'Flow ID': 'Flow_Id',
    'Source IP': 'Source_IP',
    'Source Port': 'Source_Port',
    'Destination IP': 'Destination_IP',
    'Destination Port': 'Destination_Port',
    'Flow Duration': 'Flow_Duration',
    'Total Fwd Packets': 'Total_Fwd_Packets',
    'Total Backward Packets': 'Total_Backward_Packets',
    'Total Length of Bwd Packets': 'Total_Length_of_Bwd_Packets',
    'Fwd Packet Length Max': 'Fwd_Packet_Length_Max',
    'Fwd Packet Length Min': 'Fwd_Packet_Length_Min',
    'Fwd Packet Length Mean': 'Fwd_Packet_Length_Mean',
    'Fwd Packet Length Std': 'Fwd_Packet_Length_Std',
    'Bwd Packet Length Max': 'Bwd_Packet_Length_Max',
    'Bwd Packet Length Min': 'Bwd_Packet_Length_Min',
    'Bwd Packet Length Mean': 'Bwd_Packet_Length_Mean',
    'Bwd Packet Length Std': 'Bwd_Packet_Length_Std',
    'Flow Bytes/s': 'Flow_Bytes',
    'Flow Packets/s': 'Flow_Packets',
    'Flow IAT Mean': 'Flow_IAT_Mean',
    'Flow IAT Std': 'Flow_IAT_Std',
    'Flow IAT Max': 'Flow_IAT_Max',
    'Flow IAT Min': 'Flow_IAT_Min',
    'Fwd IAT Total': 'Fwd_IAT_Total',
    'Fwd IAT Mean': 'Fwd_IAT_Mean',
    'Fwd IAT Std': 'Fwd_IAT_Std',
    'Fwd IAT Max': 'Fwd_IAT_Max',
    'Fwd IAT Min': 'Fwd_IAT_Min',
    'Bwd IAT Total': 'Bwd_IAT_Total',
    'Bwd IAT Mean': 'Bwd_IAT_Mean',
    'Bwd IAT Std': 'Bwd_IAT_Std',
    'Bwd IAT Max': 'Bwd_IAT_Max',
    'Bwd IAT Min': 'Bwd_IAT_Min',
    'Fwd PSH Flags': 'Fwd_PSH_Flags',
    'Bwd PSH Flags': 'Bwd_PSH_Flags',
    'Fwd URG Flags': 'Fwd_URG_Flags',
    'Bwd URG Flags': 'Bwd_URG_Flags',
    'Fwd Header Length': 'Fwd_Header_Length',
    'Bwd Header Length': 'Bwd_Header_Length',
    'Fwd Packets/s': 'Fwd_Packets',
    'Bwd Packets/s': 'Bwd_Packets',
    'Min Packet Length': 'Min_Packet_Length',
    'Max Packet Length': 'Max_Packet_Length',
    'Packet Length Mean': 'Packet_Length_Mean',
    'Packet Length Std': 'Packet_Length_Std',
    'Packet Length Variance': 'Packet_Length_Variance',
    'FIN Flag Count': 'FIN_Flag_Count',
    'SYN Flag Count': 'SYN_Flag_Count',
    'RST Flag Count': 'RST_Flag_Count',
    'PSH Flag Count': 'PSH_Flag_Count',
    'ACK Flag Count': 'ACK_Flag_Count',
    'URG Flag Count': 'URG_Flag_Count',
    'CWE Flag Count': 'CWE_Flag_Count',
    'ECE Flag Count': 'ECE_Flag_Count',
    'Down/Up Ratio': 'Down_Up_Ratio',
    'Average Packet Size': 'Average_Packet_Size',
    'Avg Fwd Segment Size': 'Avg_Fwd_Segment_Size',
    'Avg Bwd Segment Size': 'Avg_Bwd_Segment_Size',
    'Fwd Header Length.1': 'Fwd_Header_Length_1',
    'Fwd Avg Bytes/Bulk': 'Fwd_Avg_Bytes_Bulk',
    'Fwd Avg Packets/Bulk': 'Fwd_Avg_Packets_Bulk',
    'Fwd Avg Bulk Rate': 'Fwd_Avg_Bulk_Rate',
    'Bwd Avg Bytes/Bulk': 'Bwd_Avg_Bytes_Bulk',
    'Bwd Avg Packets/Bulk': 'Bwd_Avg_Packets_Bulk',
    'Bwd Avg Bulk Rate': 'Bwd_Avg_Bulk_Rate',
    'Subflow Fwd Packets': 'Subflow_Fwd_Packets',
    'Subflow Fwd Bytes': 'Subflow_Fwd_Bytes',
    'Subflow Bwd Packets': 'Subflow_Bwd_Packets',
    'Subflow Bwd Bytes': 'Subflow_Bwd_Bytes',
    'Active Mean': 'Active_Mean',
    'Active Std': 'Active_Std',
    'Active Max': 'Active_Max',
    'Active Min': 'Active_Min',
    'Idle Mean': 'Idle_Mean',
    'Idle Std': 'Idle_Std',
    'Idle Max': 'Idle_Max',
    'Idle Min': 'Idle_Min',
},
    inplace=True, errors='raise')

In [None]:
##### The label is a two-class string column: 'BENIGN' for normal traffic and 'Syn' for attack traffic
data.Label.unique()

In [None]:
# Number of rows per class.
data.Label.value_counts()

In [None]:
# Bar chart of the class distribution.
label_dict = dict(data.Label.value_counts())
# Pass the Series as `x=`: current seaborn releases treat the first positional
# argument as `data`, so the positional form no longer plots label counts.
sns.countplot(x=data.Label)

In [None]:
# Pie chart of the class shares.
# Fix: the wedge sizes used to be the figsize tuple (13, 8) rather than the
# class counts, so the chart showed a constant 61.9% / 38.1% split regardless
# of the data. Sizes and labels now both come from the same value_counts()
# result, which keeps them in matching order.
label_counts = data.Label.value_counts()
labels = list(label_counts.index)
plt.figure(figsize = (13,8))
plt.pie(label_counts.values, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.legend(labels)
plt.title('The percentage of Benign and Syn Requests in dataset')
plt.show()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Visualise the features that contain null values.
# Fix: the previous filter (`>= 0`) was always true and therefore plotted every
# column, not just the ones with nulls as the title says.
null_counts = data.isna().sum()
null_counts = null_counts[null_counts > 0].sort_values()
if null_counts.empty:
    # Plotting an empty Series raises, so report the result as text instead.
    print('No feature contains null values')
else:
    figure(figsize=(9, 5), dpi=80)
    null_counts.plot.bar()
    plt.title("Features which has NuLL values")

In [None]:
# Count of missing values per column.
data.isnull().sum()

In [None]:
#### Split the columns into NUMERIC and OBJECT (non-numeric) groups

# reduce_mem_usage() downcasts numbers to int8/16/32 and float16/32 and turns
# object columns into `category`, so selecting only int64/float64/object
# would miss almost every column. Select by dtype family instead.
numeric_df = data.select_dtypes(include='number')
object_df = data.select_dtypes(include=['object', 'category'])
numeric_cols = numeric_df.columns
object_cols = object_df.columns
print('Numeric Columns: ')
print(numeric_cols, '\n')
print('Object Columns: ')
print(object_cols, '\n')
print('Number of Numeric Features: ', len(numeric_cols))
print('Number of Object Features: ', len(object_cols))

In [None]:
# Preview the columns held in object_df.
object_df.head()

In [None]:
#### Object columns (Source, Destination, Protocol): requests per source IP

# Count once and reuse the mapping for both the bars and their labels.
source_ip_counts = dict(data.Source_IP.value_counts())

figure(figsize=(12, 7), dpi=80)
plt.barh(list(source_ip_counts.keys()), source_ip_counts.values(), color='lawngreen')

# Write each count at the end of its bar.
for idx, val in enumerate(source_ip_counts.values()):
    plt.text(x = val, y = idx-0.2, s = str(val), color='r', size = 13)

plt.xlabel('Number of Requests')
plt.ylabel('IP addres of sender')
plt.title('Number of all reqests')

In [None]:
# Requests per source IP, restricted to rows labelled "Syn".
syn_source_ip_counts = dict(data[data.Label == "Syn"].Source_IP.value_counts())

figure(figsize=(12, 7), dpi=80)
plt.barh(list(syn_source_ip_counts.keys()), syn_source_ip_counts.values(), color='blue')

# Write each count at the end of its bar.
for idx, val in enumerate(syn_source_ip_counts.values()):
    plt.text(x = val, y = idx-0.2, s = str(val), color='r', size = 13)

plt.xlabel('Number of Requests')
plt.ylabel('IP addres of sender')
plt.title('Number of Attack requests')

In [None]:
# Overlay of all requests (green) and "Syn" requests (blue) per source IP.
total_ip_counts = data.Source_IP.value_counts()
# Align the Syn counts to the order of the totals. The y position of a bar is
# fixed by the first barh() call, so the text labels (placed by enumerate
# index) only land on the right bar when both series share the same order.
syn_ip_counts = (
    data[data.Label == "Syn"].Source_IP.value_counts()
    .reindex(total_ip_counts.index, fill_value=0)
)
ip_names = list(total_ip_counts.index)

figure(figsize=(12, 7), dpi=80)
plt.barh(ip_names, total_ip_counts.values, color='lawngreen')
plt.barh(ip_names, syn_ip_counts.values, color='blue')

for idx, val in enumerate(total_ip_counts.values):
    plt.text(x = val, y = idx-0.2, s = str(val), color='r', size = 13)

for idx, val in enumerate(syn_ip_counts.values):
    plt.text(x = val, y = idx-0.2, s = str(val), color='w', size = 13)


plt.xlabel('Number of Requests')
plt.ylabel('IP addres of sender')
plt.legend(['All','malicious'])
plt.title('Number of requests from different IP adress')

In [None]:
# Requests per protocol: all traffic (red) with "Syn" traffic (blue) on top.
# Fix: the annotation values were hard-coded numbers (41321, 33588, ...) that
# are not derived from `data`, and the text x positions (0, 1, 2) only match
# the bars when the x axis is categorical. Counts are now computed from the
# data and the protocols are plotted as string categories.
protocol_counts = data.Protocol.value_counts()
syn_protocol_counts = (
    data[data.Label == "Syn"].Protocol.value_counts()
    .reindex(protocol_counts.index, fill_value=0)
)
protocol_names = [str(protocol) for protocol in protocol_counts.index]

figure(figsize=(10, 6), dpi=80)
plt.bar(protocol_names, protocol_counts.values, color='r')
plt.bar(protocol_names, syn_protocol_counts.values, color='b')

# Small gap between the top of a bar and its label, relative to the y range.
text_offset = 0.01 * protocol_counts.max()
for idx, (total, syn) in enumerate(zip(protocol_counts.values, syn_protocol_counts.values)):
    plt.text(x = idx - 0.15, y = total + text_offset, s = str(total), color='black', size=17)
    plt.text(x = idx - 0.15, y = syn + text_offset, s = str(syn), color='w', size=17)

plt.xlabel('Protocol')
plt.ylabel('Count')
plt.legend(['All', 'malicious'])
plt.title('The number of requests from different protocols')

In [None]:
# Work on an independent copy so `data` itself is not modified.
df = data.copy()

In [None]:
# Correlation heatmap of the numeric features.
# `df` still holds categorical columns (IPs, label, ...); DataFrame.corr()
# raises on those in pandas >= 2.0, so restrict it to numeric/bool columns.
f,ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(),
            annot=True, linewidths=0.9, fmt= '.1f',ax=ax)

#### II. Classical ML models

In [None]:

class Model:
    """Fit and report several classical classifiers on one shared train/test split.

    The feature table is split 70/30 with a fixed seed and then standardised
    with statistics learned from the training rows only. The resulting arrays
    are stored as ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

    Each public method trains one model family, prints the test accuracy, a
    classification report and the elapsed wall-clock time, and returns None.
    """

    def __init__(self, data, target=None):
        """Split and scale the data.

        Parameters
        ----------
        data : array-like or DataFrame of shape (n_samples, n_features)
            Numeric feature matrix.
        target : array-like of shape (n_samples,), optional
            Class labels. When omitted, the module-level variable ``y`` is
            used, which is how this class was originally wired up.
        """
        self.data = data
        if target is None:
            # Backward-compatible fallback to the notebook-global target.
            target = y

        X_train, X_test, self.y_train, self.y_test = train_test_split(
            self.data, target, random_state=42, test_size=0.3)

        # Fit the scaler on the training rows only. Fitting it on the full
        # table before splitting leaks test-set statistics into training.
        scaler = preprocessing.StandardScaler().fit(X_train)
        self.X_train = scaler.transform(X_train)
        self.X_test = scaler.transform(X_test)

    def LogisticRegression(self):
        """Try every solver, keep the most accurate model and report on it.

        NOTE(review): the solver is chosen by test-set accuracy, so the
        reported score is optimistically biased.
        """
        solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

        start_time = time.time()
        best_model, best_solver, best_accuracy = None, None, -1.0
        for solver in solvers:
            model = LogisticRegression(C=0.03, solver=solver).fit(self.X_train, self.y_train)
            accuracy = accuracy_score(self.y_test, model.predict(self.X_test))
            # Strict '>' keeps the first solver on ties, like list.index(max(...)).
            if accuracy > best_accuracy:
                best_model, best_solver, best_accuracy = model, solver, accuracy

        # Reuse the fitted winner instead of training the same configuration again.
        predicted_lr = best_model.predict(self.X_test)
        print("Accuracy: %.2f%%" % (best_accuracy * 100.0), '\n')
        print("########################################################################")
        print('Best solver is : ', best_solver)
        print("########################################################################")
        # classification_report expects (y_true, y_pred); the swapped order
        # exchanged precision and recall in the printed table.
        print(classification_report(self.y_test, predicted_lr), '\n')
        print("########################################################################")
        print("--- %s seconds --- time for LogisticRegression" % (time.time() - start_time))

    def SupportVectorMachine(self):
        """Try every SVC kernel, print each accuracy and report on the best one.

        NOTE(review): the kernel is chosen by test-set accuracy, so the
        reported score is optimistically biased.
        """
        start_time = time.time()
        kernels = ['linear', 'poly','rbf', 'sigmoid']
        best_model, best_kernel, best_accuracy = None, None, -1.0
        for kernel in kernels:
            model = svm.SVC(kernel=kernel).fit(self.X_train, self.y_train)
            accuracy = accuracy_score(self.y_test, model.predict(self.X_test))
            print("Accuracy: %.2f%%" % round((accuracy * 100.0),2))
            print('######################################################################')
            if accuracy > best_accuracy:
                best_model, best_kernel, best_accuracy = model, kernel, accuracy

        predicted_svm = best_model.predict(self.X_test)
        # Scale to percent before rounding; round(acc, 2) * 100 turned 99.70% into 100.0%.
        print(f"Accuracy of SVM model {round(best_accuracy * 100, 2)}%", '\n')
        print("########################################################################")
        print('best kernel is : ', best_kernel)
        print("########################################################################")
        print(classification_report(self.y_test, predicted_svm))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))

    def KNearetsNeighbor(self):
        """Plot accuracy for K = 1..11, then grid-search KNN and report the best model."""
        start_time = time.time()
        Ks = 12
        accuracy_by_k = np.zeros((Ks-1))
        std_acc = np.zeros((Ks-1))
        for n in range(1,Ks):
            neigh = KNeighborsClassifier(n_neighbors = n).fit(self.X_train,self.y_train)
            yhat = neigh.predict(self.X_test)
            accuracy_by_k[n-1] = metrics.accuracy_score(self.y_test, yhat)
            # Standard error of the per-sample hit/miss indicator.
            std_acc[n-1] = np.std(yhat==self.y_test)/np.sqrt(yhat.shape[0])

        plt.figure(figsize=(10,6))
        plt.plot(range(1,Ks),accuracy_by_k,'g')
        plt.fill_between(range(1,Ks),accuracy_by_k - 1 * std_acc,accuracy_by_k + 1 * std_acc, alpha=0.10)
        plt.fill_between(range(1,Ks),accuracy_by_k - 3 * std_acc,accuracy_by_k + 3 * std_acc, alpha=0.10,color="green")
        plt.legend(('Accuracy ', '+/- 1xstd','+/- 3xstd'))
        plt.ylabel('Accuracy ')
        plt.xlabel('Number of Neighbors (K)')
        plt.tight_layout()
        plt.show()

        knnc_search = GridSearchCV(KNeighborsClassifier(),
                                   param_grid={'n_neighbors': [3, 5, 10],
                                               'weights': ['uniform', 'distance'],
                                               'metric': ['euclidean', 'manhattan']},
                                   n_jobs=-1, cv=3, scoring='accuracy', verbose=2)
        knnc_search.fit(self.X_train, self.y_train)

        # GridSearchCV refits the best configuration on the whole training set
        # by default, so best_estimator_ replaces the manual refit.
        predicted_knn = knnc_search.best_estimator_.predict(self.X_test)
        accuracy_knn = metrics.accuracy_score(self.y_test, predicted_knn)
        print(f"Accuracy of KNN model {round(accuracy_knn * 100, 2)}%", '\n')
        print("########################################################################")
        print(classification_report(self.y_test, predicted_knn))
        print("########################################################################")
        print("--- %s seconds ---" % (time.time() - start_time))

    def DecisionTree(self):
        """Grid-search a decision tree (5-fold CV) and report on the best model."""
        start_time = time.time()
        dt_search = GridSearchCV(DecisionTreeClassifier(),
                                 param_grid={'criterion' : ['gini', 'entropy'],
                                             'max_depth' : [2,3,4,5,6,7,8, 9, 10],
                                             'max_leaf_nodes' : [2,3,4,5,6,7,8,9,10, 11]},
                                 n_jobs=-1, cv=5, scoring='accuracy', verbose=2)
        dt_search.fit(self.X_train, self.y_train)

        criterion = dt_search.best_params_['criterion']
        max_depth = dt_search.best_params_['max_depth']
        max_leaf_nodes = dt_search.best_params_['max_leaf_nodes']

        # best_estimator_ is already refit on the full training set.
        predicted_dt = dt_search.best_estimator_.predict(self.X_test)
        accuracy_dt = metrics.accuracy_score(self.y_test, predicted_dt)
        print(f"criterion: {criterion}, max depth: {max_depth}, max_leaf: {max_leaf_nodes}")
        print(f"The Accuracy is : {round(accuracy_dt * 100,2)}%")
        print("########################################################################")
        print(classification_report(self.y_test, predicted_dt))
        print("########################################################################")

        print("--- %s seconds ---" % (time.time() - start_time))

    def RandomForest(self):
        """Fit a 500-tree random forest with fixed hyper-parameters and report on it."""
        start_time = time.time()
        RF = RandomForestClassifier(criterion='gini',
                                     n_estimators=500,
                                     min_samples_split=10,
                                     # 'auto' meant sqrt(n_features) for classifiers and
                                     # was removed from scikit-learn; 'sqrt' is equivalent.
                                     max_features='sqrt',
                                     oob_score=True,
                                     random_state=1,
                                     n_jobs=-1).fit(self.X_train, self.y_train)

        predicted_rf = RF.predict(self.X_test)
        accuracy_rf = accuracy_score(self.y_test, predicted_rf)
        print(f"Accuracy of RF is : {round(accuracy_rf*100,2)}%", '\n')
        print("########################################################################")
        print(classification_report(self.y_test, predicted_rf))
        print("########################################################################")

        print("--- %s seconds ---" % (time.time() - start_time))

"""
Decision Tree works Well
Support Vector Machine works well
Logistic Regression works well
KNN works well
Random Forest works well
"""

#### III. Prediction Without Feature Selection

##### III-1. Data Preprocessing

In [None]:
# Fresh working copy of the data with every row containing a missing value removed.
df = data.copy().dropna()

In [None]:
# Check dtypes and non-null counts after dropping rows with missing values.
df.info()

In [None]:
# Features: everything except the IP address columns, the target and the two rate columns.
# NOTE(review): the reason for excluding Flow_Packets / Flow_Bytes is not stated here — confirm.
X = df.drop(['Source_IP','Destination_IP', 'Label','Flow_Packets', 'Flow_Bytes'], axis=1)
# Target: the class label column.
y = df.Label

In [None]:
# One-hot encode the remaining non-numeric columns so every feature is numeric.
X = pd.get_dummies(X)

In [None]:
# Correlation heatmap of the one-hot encoded feature matrix, annotated to one decimal place.
f,ax = plt.subplots(figsize=(20, 20))
sns.heatmap(X.corr(), annot=True, linewidths=0.9, fmt= '.1f',ax=ax)

In [None]:
# Build the shared train/test split from the encoded features; the labels come from the global `y`.
M = Model(X)

In [None]:
## Logistic Regression (without feature selection)
# NOTE: the run recorded below took about 258 seconds.
M.LogisticRegression()

**The Accuracy of Logistic Regression: 99.33%**

########################################################################
Best solver is :  newton-cg
########################################################################
              precision    recall  f1-score   support

      BENIGN       0.20      1.00      0.33         5
         Syn       1.00      0.99      1.00      2995

    accuracy                           0.99      3000
   macro avg       0.60      1.00      0.66      3000
weighted avg       1.00      0.99      1.00      3000
 

########################################################################
--- 257.8119032382965 seconds --- time for LogisticRegression


In [None]:
## Support Vector Machine (without feature selection)
# NOTE: the run recorded below took about 2,221 seconds.
M.SupportVectorMachine()

Accuracy: 99.63%
######################################################################
Accuracy: 99.17%
######################################################################
Accuracy: 99.17%
######################################################################
Accuracy: 99.70%
######################################################################
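**Accuracy of Support Vector Machine model 99.70%** (the originally printed "100.0%" came from rounding the accuracy to two decimals before converting it to a percentage; the best kernel above scored 99.70%)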

########################################################################
best kernel is :  sigmoid
########################################################################
              precision    recall  f1-score   support

      BENIGN       0.72      0.90      0.80        20
         Syn       1.00      1.00      1.00      2980

    accuracy                           1.00      3000
   macro avg       0.86      0.95      0.90      3000
weighted avg       1.00      1.00      1.00      3000

########################################################################
--- 2220.7574150562286 seconds ---

In [None]:
## Decision Tree (without feature selection)
# NOTE: the run recorded below took about 1,458 seconds.
M.DecisionTree()

**The Accuracy of Decision Tree : 100.0%**
########################################################################
              precision    recall  f1-score   support

      BENIGN       1.00      1.00      1.00        25
         Syn       1.00      1.00      1.00      2975

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

########################################################################
--- 1457.577705860138 seconds ---

In [None]:
## Random Forest classification (without feature selection)
# NOTE: the run recorded below took about 60 seconds.
M.RandomForest()

**Accuracy of Random Forest is: 99.83%**

########################################################################
              precision    recall  f1-score   support

      BENIGN       0.88      0.92      0.90        24
         Syn       1.00      1.00      1.00      2976

    accuracy                           1.00      3000
   macro avg       0.94      0.96      0.95      3000
weighted avg       1.00      1.00      1.00      3000

########################################################################
--- 60.41777563095093 seconds ---

In [None]:
## K-Nearest Neighbors (without feature selection)
# NOTE: the run recorded below took about 12,905 seconds (roughly 3.6 hours).
M.KNearetsNeighbor()

**Accuracy of KNN model 100.0%**

########################################################################
              precision    recall  f1-score   support

      BENIGN       0.80      0.95      0.87        21
         Syn       1.00      1.00      1.00      2979

    accuracy                           1.00      3000
   macro avg       0.90      0.98      0.93      3000
weighted avg       1.00      1.00      1.00      3000

########################################################################
--- 12904.510899543762 seconds ---