# Extracting features from the UNB CIC IDS 2018 dataset

Include external libraries and tools.

In [243]:
#basic
import re
import os
import sys
import math
import json
import pandas as pd
import time
import datetime
from dateutil import parser
import numpy as np

#evtx
import Evtx.Evtx as evtx
import Evtx.Views as e_views

#xml
import xmltodict

Define global variables.

In [244]:
# Directory holding the raw Windows event logs (.evtx) handed to readData.
dataDir = "../data/original"

# Capture day -> list of IP addresses associated with that day.
# NOTE(review): presumably attacker / victim addresses — confirm against the dataset description.
filters = {
    "Wednesday-28-02-2018": ["18.221.148.137", "172.31.69.24"],
    "Thursday-01-03-2018": ["18.216.254.154", "172.31.69.13"],
}

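Function: readData ||| Req: dataDir, filters ||| Purpose: Parse the `.evtx` event logs found in the given dir (currently only the first file is parsed and `filters` is not applied yet) ||| Ret: list of event records (one dict per event)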

In [245]:
def readData(dataDir, filters, max_files=1):
    """Parse Windows event-log (.evtx) files from a directory into a list of dicts.

    Args:
        dataDir: Directory that contains the ``.evtx`` files.
        filters: Currently unused; kept so existing callers keep working.
        max_files: Maximum number of files to parse. Defaults to 1, which is
            what the previous hard-coded ``break`` did. Pass ``None`` to parse
            every ``.evtx`` file in the directory.

    Returns:
        A list with one entry per event record: the value stored under the
        ``"Event"`` key of the record's XML after conversion by xmltodict.
    """
    dirList = os.listdir(dataDir)
    # Anything that is not an event log (notes, checkpoints, ...) would make
    # the evtx parser fail, so only keep *.evtx entries.
    dataFiles = [os.path.join(dataDir, x) for x in dirList
                 if x.lower().endswith(".evtx")]
    data = []

    parsed_files = 0
    for file in dataFiles:
        if max_files is not None and parsed_files >= max_files:
            break

        print(file)
        with evtx.Evtx(file) as log:
            for record in log.records():
                data_dict = dict(xmltodict.parse(record.xml()))
                data.append(data_dict["Event"])
        parsed_files += 1

    return data

Run `readData` on the configured dataset directory.

In [246]:
# Show which directory is scanned, then parse the event log(s) into `data`.
print("dataset Dirs: ", dataDir)
data = readData(dataDir, filters)

dataset Dirs:  ../data/original
../data/original/EC2AMAZ-O4EL3NG-172.31.69.24.evtx


Main section.

In [247]:
# Flatten the nested event dicts into one row per event; nested keys become
# dotted column names.
# `pd.io.json.json_normalize` is deprecated in favour of the top-level
# `pd.json_normalize` (pandas >= 1.0); fall back to the old location so the
# cell keeps working on older installations.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# json_normalize already returns a DataFrame, so no extra wrapping is needed.
# errors='ignore' keeps this consistent with the guarded column drops used in
# the following cells.
data_list = _json_normalize(data).drop(columns=['@xmlns'], errors='ignore')

In [248]:
# Show up to 100 rows and every column when a frame is displayed.
pd.set_option('display.max_rows', 100, 'display.max_columns', None)

# Columns removed from the event frame before the remaining ones are encoded.
drop_columns = ['UserData.LogFileCleared.BackupPath', 'EventData', 'System.Provider.@Guid',
                'System.Execution.@ProcessID', 'System.Execution.@ThreadID', 'EventData.Binary',
                'System.EventID.@Qualifiers', 'System.Keywords', 'System.EventRecordID',
                'System.Correlation.@ActivityID', 'System.Correlation.@RelatedActivityID', 'System.Channel',
                'UserData.LogFileCleared.@xmlns', 'UserData.LogFileCleared.SubjectUserName',
                'UserData.LogFileCleared.SubjectDomainName', 'UserData.LogFileCleared.Channel',
                'UserData.InstallDeviceID.@xmlns', 'UserData.InstallDeviceID.DriverName',
                'UserData.InstallDeviceID.DriverVersion', 'UserData.InstallDeviceID.DriverProvider',
                'UserData.InstallDeviceID.DeviceInstanceID', 'UserData.InstallDeviceID.SetupClass',
                'UserData.InstallDeviceID.RebootOption', 'UserData.InstallDeviceID.UpgradeDevice',
                'UserData.InstallDeviceID.IsDriverOEM', 'UserData.InstallDeviceID.InstallStatus',
                'UserData.InstallDeviceID.DriverDescription',
                'EventData.Data.@Name', 'EventData.Data.#text', 'EventData.@Name']

# One drop call instead of one DataFrame copy per column; errors='ignore'
# skips columns that are absent from this particular log.
data_list = data_list.drop(columns=drop_columns, errors='ignore')

In [249]:
# Text of the second <Data> element -> numeric service state.
_SERVICE_STATES = {'running': 1, 'stopped': 0}


def _decode_service_event(event_data):
    """Return ``[service, status]`` for one 'EventData.Data' cell.

    Only two-element lists whose second element's '#text' is 'running' or
    'stopped' are decoded. For such a record, each element that has exactly two
    keys and a '@Name' containing 'param' contributes its '#text' to the slot
    with the same position, with 'running'/'stopped' mapped to 1/0.
    Slots that are not decoded stay at -1.
    """
    decoded = [-1, -1]
    if not isinstance(event_data, list) or len(event_data) != 2:
        return decoded

    state_item = event_data[1]
    if not isinstance(state_item, dict) or state_item.get('#text') not in _SERVICE_STATES:
        return decoded

    for position, item in enumerate(event_data):
        # Malformed elements are skipped instead of raising KeyError.
        if not isinstance(item, dict) or len(item) != 2 or '#text' not in item:
            continue
        if 'param' not in str(item.get('@Name') or ''):
            continue
        text = item['#text']
        decoded[position] = _SERVICE_STATES.get(text, text)
    return decoded


# Iterate over the single column that is needed and collect results by
# position, so the outcome does not depend on the frame's index labels.
_service_rows = [_decode_service_event(cell) for cell in data_list['EventData.Data']]

# services[0] -> first decoded field, services[1] -> second decoded field.
services = [[pair[0] for pair in _service_rows],
            [pair[1] for pair in _service_rows]]

# Position of the last processed row; the next cell prints temp_len + 1.
temp_len = len(_service_rows) - 1

In [250]:
# Attach the two decoded lists to the event frame as new columns.
data_list['service'], data_list['service.status'] = services

# Sanity checks: rows visited, values collected, and rows whose status is not
# the -1 placeholder.
visited_rows = temp_len + 1
print(visited_rows)
print(len(services[0]))
decoded_states = [state for state in services[1] if state != -1]
print(len(decoded_states))

43221
43221
18775


In [251]:
# The raw column is no longer needed once the service columns exist;
# errors='ignore' makes this a no-op when the column is already gone.
data_list = data_list.drop(columns=['EventData.Data'], errors='ignore')

In [252]:
# Known event source names; the list position is the numeric code.
# Position 0 (NaN) stands for records without an EventSourceName attribute.
event_source = [np.nan, 'Service Control Manager', 'DCOM', 'WinRM', 'User32', 'TermService']
_event_source_codes = {name: code for code, name in enumerate(event_source)
                       if isinstance(name, str)}


def _encode_event_source(value):
    """Map an event source name to its position in ``event_source``.

    Integers are returned unchanged so the cell can be re-run on an already
    encoded column. Missing values are detected with ``pd.isna`` rather than a
    list lookup, which only matches NaN when it is the very same object.

    Raises:
        ValueError: if the name is not listed in ``event_source``.
    """
    if isinstance(value, (int, np.integer)):
        return value
    if isinstance(value, str) and value in _event_source_codes:
        return _event_source_codes[value]
    if pd.isna(value):
        return 0
    raise ValueError("Unknown event source name: %r" % (value,))


# One pass over the column instead of a per-row .loc assignment.
data_list['System.Provider.@EventSourceName'] = (
    data_list['System.Provider.@EventSourceName'].map(_encode_event_source)
)

In [253]:
def _system_time_to_epoch(value):
    """Convert one 'System.TimeCreated.@SystemTime' value to epoch seconds.

    Numeric values are returned unchanged so the cell can be re-run after the
    column has been converted.

    NOTE(review): ``time.mktime`` interprets the parsed wall-clock time in the
    local timezone of the machine running the notebook, exactly as the previous
    loop did — confirm this matches the timezone of the logs.
    """
    if isinstance(value, (int, float, np.integer, np.floating)):
        return value
    return time.mktime(parser.parse(value).timetuple())


# One pass over the column instead of a per-row .loc assignment.
data_list['System.TimeCreated.@SystemTime'] = (
    data_list['System.TimeCreated.@SystemTime'].map(_system_time_to_epoch)
)

In [254]:
# Known security identifiers; the list position is the numeric code.
user_ids = ['', 'S-1-5-18', 'S-1-5-19', 'S-1-5-21-2658501782-1119487302-2855053826-500', 'S-1-5-21-798533991-437016557-2328494185-500']
_user_id_codes = {sid: code for code, sid in enumerate(user_ids)}


def _encode_user_id(value):
    """Map a user SID to its position in ``user_ids``.

    Integers are returned unchanged so the cell can be re-run on an already
    encoded column.

    Raises:
        ValueError: if the SID is not listed in ``user_ids``.
    """
    if isinstance(value, (int, np.integer)):
        return value
    try:
        return _user_id_codes[value]
    except (KeyError, TypeError):
        raise ValueError("Unknown user id: %r" % (value,))


# One pass over the column with a dict lookup instead of a per-row .loc
# assignment combined with a linear list search.
data_list["System.Security.@UserID"] = data_list["System.Security.@UserID"].map(_encode_user_id)

In [255]:
# Known computer names; the list position is the numeric code.
computers = ["EC2AMAZ-O4EL3NG", "EC2AMAZ-9ENFQRN"]
_computer_codes = {name: code for code, name in enumerate(computers)}


def _encode_computer(value):
    """Map a computer name to its position in ``computers``.

    Integers are returned unchanged so the cell can be re-run on an already
    encoded column.

    Raises:
        ValueError: if the name is not listed in ``computers``.
    """
    if isinstance(value, (int, np.integer)):
        return value
    try:
        return _computer_codes[value]
    except (KeyError, TypeError):
        raise ValueError("Unknown computer name: %r" % (value,))


# One pass over the column instead of a per-row .loc assignment.
data_list["System.Computer"] = data_list["System.Computer"].map(_encode_computer)

In [256]:
# Known event provider names; the list position is the numeric code.
systemProviderName = ['Microsoft-Windows-Eventlog', 'Service Control Manager',
       'Microsoft-Windows-UserModePowerService', 'User32', 'EventLog',
       'Microsoft-Windows-Kernel-General',
       'Microsoft-Windows-DHCPv6-Client', 'Microsoft-Windows-Dhcp-Client',
       'Microsoft-Windows-Kernel-Power', 'Microsoft-Windows-Kernel-Boot',
       'Microsoft-Windows-FilterManager', 'Microsoft-Windows-Ntfs',
       'Microsoft-Windows-Kernel-Processor-Power', 'vxn',
       'Microsoft-Windows-Wininit',
       'Microsoft-Windows-Directory-Services-SAM',
       'Microsoft-Windows-WinRM', 'Microsoft-Windows-Time-Service',
       'Microsoft-Windows-Setup', 'Microsoft-Windows-Iphlpsvc',
       'Microsoft-Windows-TerminalServices-RemoteConnectionManager',
       'Microsoft-Windows-TPM-WMI', 'Virtual Disk Service',
       'Microsoft-Windows-DistributedCOM', 'Microsoft-Windows-Winlogon',
       'Microsoft-Windows-GroupPolicy', 'Lfsvc',
       'Microsoft-Windows-WindowsUpdateClient', 'srv',
       'Microsoft-Windows-UserPnp']
_provider_codes = {name: code for code, name in enumerate(systemProviderName)}


def _encode_provider(value):
    """Map a provider name to its position in ``systemProviderName``.

    Integers are returned unchanged so the cell can be re-run on an already
    encoded column.

    Raises:
        ValueError: if the name is not listed in ``systemProviderName``.
    """
    if isinstance(value, (int, np.integer)):
        return value
    try:
        return _provider_codes[value]
    except (KeyError, TypeError):
        raise ValueError("Unknown provider name: %r" % (value,))


# One pass over the column with a dict lookup instead of a per-row .loc
# assignment combined with a linear list search.
data_list["System.Provider.@Name"] = data_list["System.Provider.@Name"].map(_encode_provider)

In [257]:
# Order the encoded events by their creation time; `data_list` itself is left
# untouched because sort_values returns a new frame.
event_time_column = 'System.TimeCreated.@SystemTime'
sys_data = data_list.sort_values(by=event_time_column)

### Extracting Network Features

In [258]:
# Directory scanned by readNetworkData for the network-flow CSV files.
# NOTE(review): this rebinds `dataDir`, which previously pointed at "../data/original".
dataDir = "../data/processed"
# Substring a file name must contain for the file to be loaded.
filter_ = "Wednesday"

In [259]:
def readNetworkData(dataDir, filter_, combine=False):
    """Load network-flow CSV data from files whose name contains ``filter_``.

    Args:
        dataDir: Directory that contains the CSV files.
        filter_: Substring a file name must contain to be considered.
        combine: When False (default) only the last matching file, in
            ``os.listdir`` order, is loaded. That is the frame the previous
            implementation ended up returning, but earlier matches are no
            longer read and thrown away. When True, every matching file is
            loaded and the rows are concatenated under a fresh 0..n-1 index.

    Returns:
        A pandas DataFrame with the loaded rows.

    Raises:
        FileNotFoundError: if no file name in ``dataDir`` contains ``filter_``
            (previously this surfaced as an UnboundLocalError).
    """
    dataFiles = [os.path.join(dataDir, x) for x in os.listdir(dataDir) if filter_ in x]
    if not dataFiles:
        raise FileNotFoundError(
            "No file in %r has a name containing %r" % (dataDir, filter_))

    if combine:
        frames = [pd.read_csv(file) for file in dataFiles]
        return pd.concat(frames, ignore_index=True)

    return pd.read_csv(dataFiles[-1])

In [260]:
# Show the directory being read, then load the network-flow CSV selected by `filter_`.
# NOTE(review): this rebinds `data`, which earlier held the list returned by readData.
print("dataset Dirs: ", dataDir)
data = readNetworkData(dataDir, filter_)

dataset Dirs:  ../data/processed


  if (await self.run_code(code, result,  async_=asy)):


In [261]:
# Order the network flows by their 'Timestamp' column; sort_values returns a
# new frame, so `data` keeps its original row order.
# NOTE(review): if 'Timestamp' still holds text, this is a lexicographic sort — confirm.
net_data = data.sort_values(by='Timestamp')

In [262]:
# Display settings: at most 100 rows, and no limit on the number of columns.
pd.set_option('display.max_row', 100)
pd.set_option('display.max_columns', None)

In [267]:
# Row count, frequency of each value in the 'service' column, then the frame itself.
sys_row_total = len(sys_data)
print(sys_row_total)
service_counts = sys_data["service"].value_counts()
print(service_counts)
display(sys_data)

43221
-1                                                24446
Amazon SSM Agent                                  11358
Software Protection                                2668
WinHTTP Web Proxy Auto-Discovery Service            377
Update Orchestrator Service for Windows Update      260
Windows Update                                      181
Windows Insider Service                             134
Microsoft Account Sign-in Assistant                 129
wlidsvc                                              82
sppsvc                                               82
wisvc                                                76
NetSetupSvc                                          67
Data Sharing Service                                 67
RemoteRegistry                                       53
Distributed Transaction Coordinator                  53
MapsBroker                                           50
Device Setup Manager                                 46
Connected Devices Platform Service        

Unnamed: 0,System.Provider.@Name,System.EventID.#text,System.Version,System.Level,System.Task,System.Opcode,System.TimeCreated.@SystemTime,System.Computer,System.Security.@UserID,System.Provider.@EventSourceName,service,service.status
2,1,7036,0,4,0,0,1515832337,1,0,1,AppX Deployment Service (AppXSVC),1
0,0,104,0,4,104,0,1515832338,1,4,0,-1,-1
1,0,104,0,4,104,0,1515832338,1,4,0,-1,-1
3,0,104,0,4,104,0,1515832338,1,4,0,-1,-1
4,1,7036,0,4,0,0,1515832340,1,0,1,Windows Update,0
5,1,7036,0,4,0,0,1515832344,1,0,1,Portable Device Enumerator Service,0
6,1,7036,0,4,0,0,1515832345,1,0,1,Connected Devices Platform Service,0
7,1,7036,0,4,0,0,1515832345,1,0,1,Diagnostic Policy Service,1
8,1,7036,0,4,0,0,1515832345,1,0,1,Update Orchestrator Service for Windows Update,0
9,1,7036,0,4,0,0,1515832345,1,0,1,Downloaded Maps Manager,1


In [268]:
# Row count, frequency of each value in the 'Label' column, then the frame itself.
net_row_total = len(net_data)
print(net_row_total)
label_counts = net_data["Label"].value_counts()
print(label_counts)
display(net_data)

613104
Benign           544200
Infilteration     68871
Label                33
Name: Label, dtype: int64


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
355677,3386,6,28/02/2018 01:00:00,529751,2,1,0,0,0,0,0,0,0,0,0,0,0,5.6630379178,264875.5,374562.240169107,529731,20,529751,529751,0,529751,529751,0,0,0,0,0,0,0,0,0,40,20,3.7753586119,1.8876793059,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,1024,0,0,20,0,0,0,0,0,0,0,0,Benign
223312,53,17,28/02/2018 01:00:01,154058,1,1,40,167,40,40,40,0,167,167,167,0,1343.6497942333,12.9821236158,154058,0,154058,154058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8,6.4910618079,6.4910618079,40,167,82.3333333333,73.3234841871,5376.3333333333,0,0,0,0,0,0,0,0,1,123.5,40,167,0,0,0,0,0,0,1,40,1,167,-1,-1,0,8,0,0,0,0,0,0,0,0,Benign
209980,443,6,28/02/2018 01:00:01,782742,9,8,1818,3898,1078,0,202,345.734,1460,0,487.25,639.506,7302.5339128346,21.7185228338,48921.4,106435,406093,1,761401,95175.1,142201,406093,44,760785,108684,212394,570093,1,0,0,0,0,192,172,11.498,10.2205,0,1460,317.556,501.2,251201,0,0,1,1,0,0,0,1,0,336.235,202,487.25,0,0,0,0,0,0,9,1818,8,3898,8192,8046,5,20,0,0,0,0,0,0,0,0,Benign
367625,0,0,28/02/2018 01:00:01,112640736,3,0,0,0,0,0,0,0,0,0,0,0,0,0.0266333487,56320368,18.3847763109,56320381,56320355,112640736,56320368,18.3847763109,56320381,56320355,0,0,0,0,0,0,0,0,0,0,0,0.0266333487,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,-1,-1,0,0,0,0,0,0,56320368,18.3847763109,56320381,56320355,Benign
446188,443,6,28/02/2018 01:00:02,19816595,6,8,717,3695,387,0,119.5,160.565,1460,0,461.875,652.749,222.6416799,0.706478585,1.52435e+06,5.43569e+06,19600000,1,122612,24522.4,20792,40151,887,19800000,2.82534e+06,7.40371e+06,19600000,1,0,0,0,0,132,172,0.302777,0.403702,0,1460,294.133,507.52,257577,0,0,1,1,0,0,0,1,1,315.143,119.5,461.875,0,0,0,0,0,0,6,717,8,3695,8192,7475,3,20,201350,0,201350,201350,1.96e+07,0,19600000,19600000,Benign
234807,443,6,28/02/2018 01:00:02,69927,5,5,676,219,388,0,135.2,171.697,141,0,43.8,63.9781,12799.0618788165,143.0062779756,7769.67,10666.6,22434,22,44685,11171.2,12363.8,21943,258,48006,12001.5,12302,23430,99,0,0,0,0,112,112,71.5031,71.5031,0,388,81.3636,127.459,16245.9,0,0,1,1,0,0,0,1,1,89.5,135.2,43.8,0,0,0,0,0,0,5,676,5,219,8192,7516,3,20,0,0,0,0,0,0,0,0,Benign
347976,6001,6,28/02/2018 01:00:02,19,1,1,0,0,0,0,0,0,0,0,0,0,0,105263.157894737,19,0,19,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,20,52631.5789473684,52631.5789473684,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,41062,0,0,20,0,0,0,0,0,0,0,0,Benign
244799,53,17,28/02/2018 01:00:02,208805,2,2,80,292,40,40,40,0,146,146,146,0,1781.5665333685,19.1566293911,69601.7,95536.9,178524,0,30281,30281,0,30281,30281,0,0,0,0,0,0,0,0,0,16,16,9.57831,9.57831,40,146,82.4,58.0586,3370.8,0,0,0,0,0,0,0,0,1,103,40,146,0,0,0,0,0,0,2,80,2,292,-1,-1,1,8,0,0,0,0,0,0,0,0,Benign
234809,443,6,28/02/2018 01:00:02,2250028,11,12,2008,4434,1078,0,182.545,315.125,1460,0,369.5,539.465,2863.0754817273,10.2220950139,102274,413135,1949745,1,299978,29997.8,27301.1,71621,45,2203048,200277,588493,1973252,1,0,0,0,0,232,252,4.88883,5.33327,0,1460,268.417,440.852,194351,0,0,1,1,0,0,0,1,1,280.087,182.545,369.5,0,0,0,0,0,0,11,2008,12,4434,8192,7856,6,20,0,0,0,0,0,0,0,0,Benign
209981,443,6,28/02/2018 01:00:02,136,2,0,0,0,0,0,0,0,0,0,0,0,0,14705.8823529412,136,0,136,136,136,136,0,136,136,0,0,0,0,0,0,0,0,0,40,0,14705.9,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,257,-1,0,20,0,0,0,0,0,0,0,0,Benign


In [265]:
# Each distinct timestamp string is parsed once and reused for every row that
# carries it.
_network_epoch_cache = {}


def _network_time_to_epoch(raw):
    """Convert one 'Timestamp' value to epoch seconds.

    * Non-string values (already converted numbers, NaN) are returned as-is so
      the cell can be re-run.
    * ``dayfirst=True`` makes dateutil read ambiguous dates such as
      '01/03/2018' as day/month/year, matching values like '28/02/2018' seen in
      this column; dateutil's default would read them month-first.
    * Strings that are not dates (for example a repeated CSV header row, where
      the value is the literal column name) become NaN instead of aborting the
      whole conversion.

    NOTE(review): ``time.mktime`` uses the local timezone of the machine running
    the notebook, as the previous loop did — confirm this is intended.
    """
    if not isinstance(raw, str):
        return raw
    if raw not in _network_epoch_cache:
        try:
            moment = parser.parse(raw, dayfirst=True)
            _network_epoch_cache[raw] = time.mktime(moment.timetuple())
        except (ValueError, OverflowError):
            _network_epoch_cache[raw] = np.nan
    return _network_epoch_cache[raw]


# One pass over the column instead of a per-row .loc assignment.
data['Timestamp'] = data['Timestamp'].map(_network_time_to_epoch)

KeyboardInterrupt: 