In [None]:
"""
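Extract connections from the CICIDS 2017 dataset, dropping the rows labeled BENIGN
from every input file except the Monday capture, whose rows are all kept
Labels are converted to integer classes: BENIGN is 1 and each of the 14 malicious connection types is a separate class, 2 - 15
The input and output are in local directories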
"""

In [None]:
# Load the top modules that are used in multiple places
import os

import numpy as np
import pandas as pd

In [None]:
# Mapping from the original data's column headers to compact names.
# The headers are listed in numbering order: the i-th entry (1-based) becomes
# feature 'X<i>', and the ' Label' column becomes 'YY'.
# Leading spaces are part of the header text and must be kept verbatim.
feature_map = {
    raw_header: 'X{}'.format(position)
    for position, raw_header in enumerate(
        [
            ' Destination Port',
            ' Flow Duration',
            ' Total Fwd Packets',
            ' Total Backward Packets',
            'Total Length of Fwd Packets',
            ' Total Length of Bwd Packets',
            ' Fwd Packet Length Max',
            ' Fwd Packet Length Min',
            ' Fwd Packet Length Mean',
            ' Fwd Packet Length Std',
            'Bwd Packet Length Max',
            ' Bwd Packet Length Min',
            ' Bwd Packet Length Mean',
            ' Bwd Packet Length Std',
            'Flow Bytes/s',
            ' Flow Packets/s',
            ' Flow IAT Mean',
            ' Flow IAT Std',
            ' Flow IAT Max',
            ' Flow IAT Min',
            'Fwd IAT Total',
            ' Fwd IAT Mean',
            ' Fwd IAT Std',
            ' Fwd IAT Max',
            ' Fwd IAT Min',
            'Bwd IAT Total',
            ' Bwd IAT Mean',
            ' Bwd IAT Std',
            ' Bwd IAT Max',
            ' Bwd IAT Min',
            'Fwd PSH Flags',
            ' Bwd PSH Flags',
            ' Fwd URG Flags',
            ' Bwd URG Flags',
            ' Fwd Header Length',
            ' Bwd Header Length',
            'Fwd Packets/s',
            ' Bwd Packets/s',
            ' Min Packet Length',
            ' Max Packet Length',
            ' Packet Length Mean',
            ' Packet Length Std',
            ' Packet Length Variance',
            'FIN Flag Count',
            ' SYN Flag Count',
            ' RST Flag Count',
            ' PSH Flag Count',
            ' ACK Flag Count',
            ' URG Flag Count',
            ' CWE Flag Count',
            ' ECE Flag Count',
            ' Down/Up Ratio',
            ' Average Packet Size',
            ' Avg Fwd Segment Size',
            ' Avg Bwd Segment Size',
            ' Fwd Header Length.1',
            'Fwd Avg Bytes/Bulk',
            ' Fwd Avg Packets/Bulk',
            ' Fwd Avg Bulk Rate',
            ' Bwd Avg Bytes/Bulk',
            ' Bwd Avg Packets/Bulk',
            'Bwd Avg Bulk Rate',
            'Subflow Fwd Packets',
            ' Subflow Fwd Bytes',
            ' Subflow Bwd Packets',
            ' Subflow Bwd Bytes',
            'Init_Win_bytes_forward',
            ' Init_Win_bytes_backward',
            ' act_data_pkt_fwd',
            ' min_seg_size_forward',
            'Active Mean',
            ' Active Std',
            ' Active Max',
            ' Active Min',
            'Idle Mean',
            ' Idle Std',
            ' Idle Max',
            ' Idle Min',
        ],
        start=1,
    )
}
# The label column comes last so it stays the final entry of the mapping
feature_map[' Label'] = 'YY'

In [None]:
# Label strings found in the YY column, paired with their numeric class ids.
# Labels are listed in id order: the first gets 1, the last gets 15.
# NOTE(review): the '�' inside the three Web Attack labels is kept exactly as
# written; presumably it is how the raw label text decodes when the CSVs are
# read - confirm before changing it.
label_map = dict(
    zip(
        (
            'BENIGN',
            'FTP-Patator',
            'SSH-Patator',
            'DoS slowloris',
            'DoS Slowhttptest',
            'DoS Hulk',
            'DoS GoldenEye',
            'Heartbleed',
            'Web Attack � Brute Force',
            'Web Attack � XSS',
            'Web Attack � Sql Injection',
            'Infiltration',
            'Bot',
            'PortScan',
            'DDoS',
        ),
        range(1, 16),
    )
)

# Class names in class-id order (position i holds the name of class i + 1).
# The three Web Attack classes use shortened names here.
ids_classes = [
    'BENIGN',
    'FTP-Patator',
    'SSH-Patator',
    'DoS slowloris',
    'DoS Slowhttptest',
    'DoS Hulk',
    'DoS GoldenEye',
    'Heartbleed',
    'Brute Force',
    'XSS',
    'Sql Injection',
    'Infiltration',
    'Bot',
    'PortScan',
    'DDoS',
]

# Number of classes (15) and number of X** feature columns
num_ids_classes = len(ids_classes)
num_ids_features = 78

In [None]:
# Directory the raw CSVs are read from and directory the result is written to
indir = './MachineLearningCVE/raw/'
outdir = './MachineLearningCVE/restart/'

# File name of the combined output
mal_data = 'mal.csv'

# Input files, in processing order; every name shares the same suffix
in_filenames = [
    capture + '.pcap_ISCX.csv'
    for capture in (
        'Thursday-WorkingHours-Afternoon-Infilteration',
        'Monday-WorkingHours',
        'Friday-WorkingHours-Morning',
        'Friday-WorkingHours-Afternoon-PortScan',
        'Friday-WorkingHours-Afternoon-DDos',
        'Tuesday-WorkingHours',
        'Wednesday-workingHours',
        'Thursday-WorkingHours-Morning-WebAttacks',
    )
]

In [None]:
# Compact column names (the values of feature_map), in mapping order
feature_names = feature_map.values()

# Empty frame carrying only those column names; the per-file loop fills it
df_out = pd.DataFrame(data=None, columns=feature_names)

In [None]:
# Read every input file, normalise it, and collect the pieces so they can be
# combined with a single concat. Concatenating onto df_out inside the loop
# would re-copy all previously accumulated rows on every iteration.
frames = []
for f in in_filenames:
    # Load one file and convert long column headers to compact format
    df_in = pd.read_csv(indir + f).rename(columns=feature_map)

    # BENIGN rows are kept only for the Monday file and dropped everywhere else
    if f != 'Monday-WorkingHours.pcap_ISCX.csv':
        df_in = df_in.loc[df_in['YY'] != 'BENIGN']

    # Convert string labels to numeric. The result is assigned back rather than
    # calling replace(..., inplace=True) on df_in['YY']: an in-place call on a
    # column selection is chained assignment, which pandas warns about and
    # which leaves df_in unchanged under Copy-on-Write.
    df_in = df_in.assign(YY=df_in['YY'].replace(label_map))
    print(df_in.shape)

    frames.append(df_in)

# add to output dataframe
# df_out.iloc[:0] contributes df_out's column layout but none of its rows, so
# re-running this cell rebuilds the result instead of appending duplicates.
df_out = pd.concat([df_out.iloc[:0]] + frames, ignore_index=True)

In [None]:
# Sanity check: (rows, columns) of the combined output frame
df_out.shape

In [None]:
# export to csv
# Create the output directory first: to_csv does not create missing parent
# directories and fails if outdir does not exist yet.
os.makedirs(outdir, exist_ok=True)
df_out.to_csv(outdir + mal_data, index=False)

In [None]:
# NOTE(review): scratch arithmetic on hard-coded counts - presumably the combined
# row count minus the Monday file's row count; confirm, or derive it from df_out
1087564 - 529918