## Importing our Needed Modules

In [36]:
import pandas as pd
import numpy as np
import time
import warnings
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

# Lift pandas' display truncation so all columns and all rows are rendered.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

## Setting up our columns for the dataframes

In [37]:
# Column names passed to `pd.read_csv(..., usecols=cols)` when the CSV files are loaded.
# NOTE: many names start with a space. `usecols` matches header names exactly, so the
# whitespace in these literals must not be "tidied up".
# ' Label' is the last entry; `testing_all` reads it as the target column.
cols = [' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count',
       ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count',
       ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count',
       ' ECE Flag Count', ' Down/Up Ratio', ' Average Packet Size',
       ' Avg Fwd Segment Size', ' Avg Bwd Segment Size',
       ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk',
       ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk',
       'Bwd Avg Bulk Rate', 'Subflow Fwd Packets', ' Subflow Fwd Bytes',
       ' Subflow Bwd Packets', ' Subflow Bwd Bytes', 'Init_Win_bytes_forward',
       ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward', 'Active Mean', ' Active Std', ' Active Max',
       ' Active Min', 'Idle Mean', ' Idle Std', ' Idle Max', ' Idle Min',' Label']

## Importing our Data

In [46]:
# Directory holding the per-capture CSV files; change this one line to relocate the data.
DATA_DIR = "/workspaces/codespaces-jupyter/data/MachineLearningCVE"

# Files to load, in the order they are stacked into `df`.
CSV_FILES = [
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
]

# Optional cap on the rows read from each file (e.g. 50000 for a quick run);
# None reads every row.
MAX_ROWS_PER_FILE = None

# Read every file once and concatenate in a single call: this avoids re-copying
# the growing frame on each step, and `ignore_index=True` gives the combined
# frame unique row labels instead of repeating each file's 0..n-1 index.
frames = [
    pd.read_csv(f"{DATA_DIR}/{file_name}", usecols=cols, nrows=MAX_ROWS_PER_FILE)
    for file_name in CSV_FILES
]
df = pd.concat(frames, ignore_index=True)

# Release the per-file frames so only the combined frame stays in memory.
del frames

## Cleaning data to build out LinearRegression Model

In [48]:
# LinearRegression cannot be fitted on NaN or inf values. First discard every
# column that contains a NaN, then discard ' Flow Packets/s', which holds such
# values. Both steps return new frames, so `df` itself is left untouched.
data = df.dropna(axis=1).drop(columns=[' Flow Packets/s'])

## Function to train the model on a random split and measure its F1 score

In [49]:
def testing_all(data, random_state=None):
    """Fit a LinearRegression benign-vs-attack classifier on one random split and return its F1 score.

    The ' Label' column is turned into a numeric target: rows labelled 'BENIGN'
    become 1 and every other label becomes -1. All remaining columns are used as
    features. The model is fitted on a random 80% of the rows, and its
    continuous predictions on the other 20% are thresholded at 0 (> 0 -> 1,
    otherwise -1).

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature columns plus a ' Label' column of string labels.
    random_state : int or None, optional
        Forwarded to `train_test_split`. The default None keeps the split
        random on every call; pass an int for a reproducible split.

    Returns
    -------
    float
        F1 score with label 1 (BENIGN) as the positive class, or 0.0 when
        there are no true positives (precision/recall would be undefined or zero).
    """
    benign_label = 'BENIGN'

    # Select features by name rather than by position, so the result does not
    # depend on ' Label' being the last column.
    features = data.drop(columns=[' Label'])

    # Treat every non-benign label as an attack. A hard-coded attack list leaves
    # any label it does not mention as a raw string in the target, which breaks
    # the fit.
    target = np.where(data[' Label'] == benign_label, 1, -1)

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, train_size=0.8, test_size=0.2, random_state=random_state
    )

    model = LinearRegression()
    model.fit(x_train, y_train)

    # Threshold the regression output at 0 to obtain class labels (vectorised).
    y_pred = np.where(model.predict(x_test) > 0, 1, -1)

    # Passing `labels` pins the matrix to 2x2 even when a class is absent from
    # the test split or the predictions, so the 4-way unpack below cannot fail.
    # With labels ordered [-1, 1], label 1 (BENIGN) is the positive class.
    cf_matrix = confusion_matrix(y_test, y_pred, labels=[-1, 1])
    tn, fp, fn, tp = cf_matrix.ravel()

    # Without true positives precision and/or recall is 0 or 0/0; report 0.0
    # instead of propagating NaN into the caller's average.
    if tp == 0:
        return 0.0

    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    return 2 * (precision * recall) / (precision + recall)

## Average F1 score over 20 random train/test splits

In [50]:
# Number of independent random train/test splits to average over.
N_RUNS = 20

# Each call to testing_all() draws a fresh random split, so averaging several
# runs smooths out split-to-split variation in the F1 score.
# The scores are collected in a list instead of being accumulated in a variable
# named `sum`, which would shadow the builtin for the rest of the kernel session.
f1_scores = [testing_all(data) for _ in range(N_RUNS)]
avg = np.mean(f1_scores)
print("average f1 score:",avg)

average f1 score: 0.9612069992569886
