## Query Output Preprocessing

The graph database Dgraph returns query results in JSON format. The queries consist of getting all `originated` and all `responded` connections of a specified host. The `query_handler` tool converts these JSON outputs to CSV files (two CSV files for each host with some IP address - one for each connection direction (`originated`, `responded`)). 

This Jupyter notebook is used to:

1. Compute the neighbourhoods of these hosts. *(For each connection, compute its neighbourhood which is given by connections in a given time interval.)*
2. Concatenate the resulting DataFrames into one final DataFrame.
3. Assign labels. 
4. Write the result to a single file, ready for ML preprocessing (data preparation).

## Neighbourhood Computation

### 0. Load the data

In [1]:
import os

# Root directory that holds the input data of all attack scenarios.
PREFIX_PATH = '/home/sramkova/diploma_thesis_data/cicids2017/attacks'

# The last two directory names of the current working directory correspond
# to the directory names of the input data.
attack_dir_path = '/'.join(os.getcwd().rsplit('/', 2)[-2:])
PREFIX = '{}/{}/'.format(PREFIX_PATH, attack_dir_path)
print(PREFIX)

/home/sramkova/diploma_thesis_data/cicids2017/attacks/3_friday/ddos/


In [2]:
import pandas as pd
import numpy as np

DIR_PATH_ORIG = PREFIX + 'originated'
DIR_PATH_RESP = PREFIX + 'responded'


file_list_orig = []
file_list_resp = []

def get_file_names(file_list, dir_path):
    """Append the names of the IPv4 CSV files found in `dir_path` to `file_list`.

    A file name containing an 'f' is taken to hold the connections of a host
    with an IPv6 address and is skipped. `file_list` is modified in place;
    nothing is returned.
    """
    file_list.extend(
        name
        for name in os.listdir(dir_path)
        if name.endswith('.csv') and 'f' not in name
    )

# load filenames to lists:
get_file_names(file_list_orig, DIR_PATH_ORIG)
get_file_names(file_list_resp, DIR_PATH_RESP)

print(len(file_list_orig))
print(len(file_list_resp))

15
1136


In [3]:
# load as dataframes to a dictionary for easier processing:

# elements of the dictionary are in a form: { host.ip -> df with connections of corresponding host }
dfs_orig = {}
dfs_resp = {}

def load_files_to_dfs(dfs_dict, file_list, dir_path, prefix):
    """Load every CSV in `file_list` into `dfs_dict`, keyed by host IP.

    The host IP is what remains of the file name after removing the
    'output-' + `prefix` part and the '.csv' extension. Each loaded frame
    gets a parsed `connection.time` column (from `connection.ts`), and
    missing `connection.service` values are replaced by 'none'.
    `dfs_dict` is modified in place; nothing is returned.
    """
    prefix_name = 'output-' + prefix
    for filename in file_list:
        file_ip = filename.replace(prefix_name, '').replace('.csv', '')
        df_conns = pd.read_csv(dir_path + '/' + filename)

        df_conns['connection.time'] = pd.to_datetime(df_conns['connection.ts'])

        # missing connection.service value means that Zeek wasn't able to extract the service => nulls can
        # be treated as a new category
        # (assign the result back: an in-place fillna on a selected column is a chained assignment
        # that does not reach the frame when pandas Copy-on-Write is active)
        df_conns['connection.service'] = df_conns['connection.service'].fillna('none')

        dfs_dict[file_ip] = df_conns

load_files_to_dfs(dfs_orig, file_list_orig, DIR_PATH_ORIG, 'o-')
load_files_to_dfs(dfs_resp, file_list_resp, DIR_PATH_RESP, 'r-')

print(len(dfs_orig))
print(len(dfs_resp))

15
1136


In [4]:
# max, min times to check if they correspond to available attack times (considering the time shift):

# Gather the timestamps of all originated connections and take the overall
# extremes. This avoids seeding the scan from one hard-coded host, which
# fails with a KeyError when that host is not part of the data set.
all_orig_times = pd.concat([o_df['connection.time'] for o_df in dfs_orig.values()])
o_min = all_orig_times.min()
o_max = all_orig_times.max()

print(o_min)
print(o_max)

2017-07-07 18:55:00.685029+00:00
2017-07-07 19:24:56.015158+00:00


### 1. Compute neighbourhoods for each row based on a time interval

(e.g. time interval: +- 5 minutes)

In [5]:
# various stat functions on attributes from neighbourhood:

def get_counts(df, prefix):
    """Return the total number of rows in `df` and the per-protocol counts.

    The tcp/udp/icmp counts are taken from the `connection.proto` column and
    are 0 when the column or the protocol is absent. All keys are prefixed
    with `prefix`.
    """
    tcp = udp = icmp = 0

    if 'connection.proto' in df:
        per_proto = df['connection.proto'].value_counts()
        tcp = per_proto.get('tcp', 0)
        udp = per_proto.get('udp', 0)
        icmp = per_proto.get('icmp', 0)

    counts = {prefix + '_total': len(df.index)}
    counts[prefix + '_proto_tcp_count'] = tcp
    counts[prefix + '_proto_udp_count'] = udp
    counts[prefix + '_proto_icmp_count'] = icmp
    return counts

def get_modes(df, prefix):
    """Return the most frequent protocol, service and connection state in `df`.

    The value is '-' when the column is missing or holds no non-null value
    (`.mode()` is empty in that case, so indexing it would raise).
    All keys are prefixed with `prefix`.
    """
    def _mode_or_default(column):
        if column not in df:
            return '-'
        modes = df[column].mode()
        # with several equally frequent values, the first one reported by pandas is used
        return modes.iloc[0] if not modes.empty else '-'

    return {prefix + '_connection.protocol_mode': _mode_or_default('connection.proto'),
            prefix + '_connection.service_mode': _mode_or_default('connection.service'),
            prefix + '_connection.conn_state_mode': _mode_or_default('connection.conn_state')
           }

def get_means(df, prefix, cur_time=None):
    """Return the means of the numerical connection attributes in `df`.

    A missing numerical column yields 0. A missing `connection.time` column
    yields `cur_time` when it is given and `pd.NaT` otherwise (previously
    this fallback referenced a name that is not defined in this function).
    Port columns are not averaged. All keys are prefixed with `prefix`.
    """
    time_fallback = pd.NaT if cur_time is None else cur_time
    means = {prefix + '_connection.time_mean':
             df['connection.time'].mean() if 'connection.time' in df else time_fallback}

    numeric_columns = ('connection.duration',
                       'connection.orig_bytes',
                       'connection.orig_pkts',
                       'connection.resp_bytes',
                       'connection.resp_pkts')
    for column in numeric_columns:
        means[prefix + '_' + column + '_mean'] = df[column].mean() if column in df else 0
    return means

def get_stats_means(df, prefix):
    """Return the means of the per-connection dns/ssh/http/ssl/files counts.

    A column missing from `df` yields 0. All keys are prefixed with `prefix`.
    """
    means = {}
    for column in ('dns_count', 'ssh_count', 'http_count', 'ssl_count', 'files_count'):
        if column in df:
            means[prefix + '_' + column + '_mean'] = df[column].mean()
        else:
            means[prefix + '_' + column + '_mean'] = 0
    return means

def get_medians(df, prefix, cur_time=None):
    """Return the medians of the numerical connection attributes in `df`.

    A missing numerical column yields 0. A missing `connection.time` column
    yields `cur_time` when it is given and `pd.NaT` otherwise (previously
    this fallback referenced a name that is not defined in this function).
    Port columns are not included. All keys are prefixed with `prefix`.
    """
    time_fallback = pd.NaT if cur_time is None else cur_time
    medians = {prefix + '_connection.time_median':
               df['connection.time'].median() if 'connection.time' in df else time_fallback}

    numeric_columns = ('connection.duration',
                       'connection.orig_bytes',
                       'connection.orig_pkts',
                       'connection.resp_bytes',
                       'connection.resp_pkts')
    for column in numeric_columns:
        medians[prefix + '_' + column + '_median'] = df[column].median() if column in df else 0
    return medians

def get_orig_ports(df, prefix):
    """Count the originator ports in `df` by category.

    Ports below 1024 are counted as well-known, all others as registered or
    dynamic. Missing port values are ignored. The counts are computed with
    one vectorised comparison instead of a lookup per distinct port.
    All keys are prefixed with `prefix`.
    """
    ports = df['connection.orig_p'].dropna()
    well_known_count = int((ports < 1024).sum())

    return {prefix + '_orig_p_well_known_count': well_known_count,
            prefix + '_orig_p_reg_or_dyn_count': len(ports) - well_known_count}

def get_resp_ports(df, prefix):
    """Count the responder ports in `df` by category.

    Ports 21, 22, 53, 80, 123, 443 and 3389 get their own counter. Every
    other port is counted as well-known (< 1024), registered (< 49152) or
    dynamic. Missing port values are ignored (`value_counts` drops them).
    All keys are prefixed with `prefix`.
    """
    common_ports = {21: 0,
                    22: 0,
                    53: 0,
                    80: 0,
                    123: 0,
                    443: 0,
                    3389: 0}
    resp_well_known = 0
    resp_reg = 0
    resp_dyn = 0

    # a single pass over the distinct ports and their frequencies
    for port, count in df['connection.resp_p'].value_counts().items():
        count = int(count)
        if port in common_ports:
            common_ports[port] += count
        elif port < 1024:
            resp_well_known += count
        elif port < 49152:
            resp_reg += count
        else:
            resp_dyn += count

    result = {prefix + '_resp_p_{}_count'.format(port): count
              for port, count in common_ports.items()}
    result[prefix + '_resp_p_well_known_count'] = resp_well_known
    result[prefix + '_resp_p_reg_count'] = resp_reg
    result[prefix + '_resp_p_dyn_count'] = resp_dyn
    return result

In [6]:
def generate_duration_filter(duration_val):
    """Return the (lower, upper) duration bounds of connections similar to `duration_val`.

    `None` stands for an open bound. The bucket limits are based on constants
    from data_exploration.ipynb. The two open-ended buckets used to be
    returned with lower and upper swapped, which excluded a zero-duration or
    a very long connection from its own bucket. An unknown (NaN) duration
    imposes no constraint.
    """
    # NaN is the only value that is not equal to itself
    if duration_val != duration_val:
        return None, None
    if duration_val <= 0.0:
        return None, 0.000001

    # (largest duration that falls into the bucket, lower bound, upper bound)
    buckets = (
        (0.0001, 0.000001, 0.001),
        (0.009, 0.001, 0.05),
        (0.5, 0.05, 1.5),
        (5, 1.5, 10),
        (15, 10, 20),
        (30, 20, 40),
        (50, 40, 60),
        (75, 60, 90),
        (100, 75, 110),
    )
    for threshold, lower, upper in buckets:
        if duration_val <= threshold:
            return lower, upper
    return 100, None

def generate_bytes_filter(bytes_val):
    """Return the (lower, upper) byte-count bounds of connections similar to `bytes_val`.

    `None` stands for an open bound. Zero matches only zero, values up to
    1450 get a +-50 band and values up to 35000 a +-500 band. Larger values
    only get a lower bound (it used to be returned in the upper position,
    which excluded the connection itself). An unknown (NaN) value imposes no
    constraint.
    """
    # NaN is the only value that is not equal to itself
    if bytes_val != bytes_val:
        return None, None
    if bytes_val == 0:
        return 0, 0
    if bytes_val <= 1450:
        return bytes_val - 50, bytes_val + 50
    if bytes_val <= 35000:
        return bytes_val - 500, bytes_val + 500
    return bytes_val - 1000, None

In [7]:
def get_similar_count(df, row, prefix):
    """Count the connections in `df` that are similar to the connection `row`.

    A connection is similar when it has the same protocol, service and
    connection state, and its duration, payload bytes and IP bytes fall into
    the bands derived from `row`. The connection `row` itself is not counted.

    The byte bands now come from `generate_bytes_filter` (they were derived
    with the duration buckets before), and a bound is applied whenever it is
    not `None`, so that a bound of 0 is no longer skipped.
    """
    def _within(frame, column, lower, upper):
        # keep the rows whose value lies inside the (optionally open) interval
        if lower is not None:
            frame = frame.loc[frame[column] >= lower]
        if upper is not None:
            frame = frame.loc[frame[column] <= upper]
        return frame

    # protocol, service and conn_state have to be equal
    mask = ((df['connection.proto'] == row['connection.proto'])
            & (df['connection.service'] == row['connection.service'])
            & (df['connection.conn_state'] == row['connection.conn_state']))
    df_filtered = df.loc[mask]

    # duration filter
    lower, upper = generate_duration_filter(row['connection.duration'])
    df_filtered = _within(df_filtered, 'connection.duration', lower, upper)

    # _bytes filter
    for column in ('connection.orig_bytes', 'connection.resp_bytes'):
        lower, upper = generate_bytes_filter(row[column])
        df_filtered = _within(df_filtered, column, lower, upper)

    # _ip_bytes filter
    for column in ('connection.orig_ip_bytes', 'connection.resp_ip_bytes'):
        df_filtered = _within(df_filtered, column, row[column] - 50, row[column] + 50)

    # remove original connection from neighbourhood (empty will have size 0 instead of 1)
    df_filtered = df_filtered.loc[df_filtered['connection.uid'] != row['connection.uid']]

    return {prefix + '_similar_conns_count': df_filtered.shape[0]}

In [8]:
def check_attr_value(x, attr_str, row_attr_vals_list):
    """Tell whether `x` shares at least one element with `row_attr_vals_list`.

    False is returned when `x` is NaN, an empty list or the string '[]', and
    when `row_attr_vals_list` is not a non-empty list. `attr_str` is not used.
    """
    is_missing = isinstance(x, float) and np.isnan(x)
    is_empty_list = isinstance(x, list) and not x
    is_empty_literal = isinstance(x, str) and x == '[]'
    if is_missing or is_empty_list or is_empty_literal:
        return False

    if not (isinstance(row_attr_vals_list, list) and row_attr_vals_list):
        return False

    return any(attribute in row_attr_vals_list for attribute in x)

def get_similar_attributes_count(df, row, prefix):
    """Count, per attribute, the connections in `df` that share a value with `row`.

    For each listed attribute the result holds the number of connections
    (other than `row` itself) for which `check_attr_value` reports a common
    value. The count is 0 when the attribute value of `row` is empty.
    All keys are prefixed with `prefix`.
    """
    attributes = ('dns_qtype', 'dns_rcode', 'ssh_auth_attempts', 'ssh_host_key', 'http_method',
                  'http_status_code', 'http_user_agent', 'ssl_version', 'ssl_cipher', 'ssl_curve',
                  'ssl_validation_status', 'files_source', 'file_md5')
    counts = {}

    for attr in attributes:
        key = prefix + '_similar_' + attr + '_count'
        row_values = row[attr]

        if not row_values:
            # attribute value list is empty, no similarity is counted
            counts[key] = 0
            continue

        # connections that share at least one attribute value with the row
        shares_value = df[attr].apply(lambda x: check_attr_value(x, attr, row_values))
        matches = df.loc[shares_value]

        # remove original connection from neighbourhood (empty will have size 0 instead of 1)
        others = matches.loc[matches['connection.uid'] != row['connection.uid']]
        counts[key] = others.shape[0]

    return counts

In [9]:
def compute_time_neighbourhood(host_ip, dfs_list, time_col_name, cur_time, time_start, time_end, row, prefix):
    """Compute the neighbourhood statistics of the connection `row` for one host.

    The neighbourhood consists of the connections of `host_ip` in `dfs_list`
    whose `time_col_name` value lies in the interval (time_start, time_end].
    When the host is unknown or the interval holds no connection, a
    dictionary of default values with the same keys is returned.
    All keys are prefixed with `prefix`.
    """
    if host_ip in dfs_list:
        host_df = dfs_list[host_ip]
        timestamps = host_df[time_col_name]
        window = host_df.loc[(timestamps > time_start) & (timestamps <= time_end)]

        if len(window) > 0:
            # (medians are not computed at the moment)
            parts = (
                get_counts(window, prefix),
                get_modes(window, prefix),
                get_means(window, prefix),
                get_orig_ports(window, prefix),
                get_resp_ports(window, prefix),
                get_stats_means(window, prefix),
                get_similar_count(window, row, prefix),
                get_similar_attributes_count(window, row, prefix),
            )
            neighbourhood_dict = {}
            for part in parts:
                neighbourhood_dict.update(part)
            return neighbourhood_dict

    # defaults for an empty neighbourhood, in the same key order as above
    count_suffixes = ('_total', '_proto_tcp_count', '_proto_udp_count', '_proto_icmp_count')
    mode_suffixes = ('_connection.protocol_mode', '_connection.service_mode',
                     '_connection.conn_state_mode')
    remaining_suffixes = (
        '_connection.duration_mean',
        '_connection.orig_bytes_mean',
        '_connection.orig_pkts_mean',
        '_connection.resp_bytes_mean',
        '_connection.resp_pkts_mean',
        '_orig_p_well_known_count',
        '_orig_p_reg_or_dyn_count',
        '_resp_p_21_count',
        '_resp_p_22_count',
        '_resp_p_53_count',
        '_resp_p_80_count',
        '_resp_p_123_count',
        '_resp_p_443_count',
        '_resp_p_3389_count',
        '_resp_p_well_known_count',
        '_resp_p_reg_count',
        '_resp_p_dyn_count',
        '_dns_count_mean',
        '_ssh_count_mean',
        '_http_count_mean',
        '_ssl_count_mean',
        '_files_count_mean',
        '_similar_conns_count',
        '_similar_dns_qtype_count',
        '_similar_dns_rcode_count',
        '_similar_ssh_auth_attempts_count',
        '_similar_ssh_host_key_count',
        '_similar_http_method_count',
        '_similar_http_status_code_count',
        '_similar_http_user_agent_count',
        '_similar_ssl_version_count',
        '_similar_ssl_cipher_count',
        '_similar_ssl_curve_count',
        '_similar_ssl_validation_status_count',
        '_similar_files_source_count',
        '_similar_file_md5_count',
    )

    defaults = {}
    for suffix in count_suffixes:
        defaults[prefix + suffix] = 0
    for suffix in mode_suffixes:
        defaults[prefix + suffix] = '-'
    # time_mean: 0 could not be here => problem later with time conversion (missing year)
    # (but does it make sense as a default value?)
    defaults[prefix + '_connection.time_mean'] = cur_time
    for suffix in remaining_suffixes:
        defaults[prefix + suffix] = 0
    return defaults

In [10]:
# half-width (in minutes) of the time window around a connection, per connection direction
NEIGHBOURHOOD_TIME_WINDOW_MINUTES_ORIG_DIRECTION = 5
NEIGHBOURHOOD_TIME_WINDOW_MINUTES_RESP_DIRECTION = 2

def compute_neighbourhoods(cur_orig_ip, dfs_list_orig, dfs_list_resp):
    """Extend every connection originated by `cur_orig_ip` with its neighbourhoods.

    For each connection four neighbourhoods are computed: the originated and
    the responded connections of the originator, and the originated and the
    responded connections of the responder. The result is a DataFrame with
    one row per connection, holding the original attributes followed by the
    neighbourhood attributes. A connection whose neighbourhood computation
    fails is reported and left out.
    """
    # imported here so the function does not depend on the execution order of the notebook cells
    from datetime import datetime

    orig_window = pd.Timedelta(minutes=NEIGHBOURHOOD_TIME_WINDOW_MINUTES_ORIG_DIRECTION)
    resp_window = pd.Timedelta(minutes=NEIGHBOURHOOD_TIME_WINDOW_MINUTES_RESP_DIRECTION)
    host_df = dfs_list_orig[cur_orig_ip]

    print('[{}]: Computing neighbourhood for connections of originator {:15} ({})'.format(datetime.now().strftime("%H:%M:%S"), cur_orig_ip, str(len(host_df))))

    # rows are collected in a list and turned into a DataFrame once at the end
    # (concatenating inside the loop copies the whole result for every row)
    result_rows = []

    # iterate over rows in originated connections df of host with cur_orig_ip IP address:
    for index, row in host_df.iterrows():
        cur_time = row['connection.time']
        ip_responder = row['responded_ip']

        # (host, connections to search, half-width of the window, prefix of the resulting keys)
        neighbourhood_specs = (
            (cur_orig_ip, dfs_list_orig, orig_window, 'orig_orig'),
            (cur_orig_ip, dfs_list_resp, resp_window, 'orig_resp'),
            (ip_responder, dfs_list_orig, orig_window, 'resp_orig'),
            (ip_responder, dfs_list_resp, resp_window, 'resp_resp'),
        )
        try:
            cur_row_dict = row.to_dict()
            for host_ip, dfs_list, window, prefix in neighbourhood_specs:
                cur_row_dict.update(
                    compute_time_neighbourhood(host_ip, dfs_list, 'connection.time', cur_time,
                                               cur_time - window, cur_time + window, row, prefix))
            result_rows.append(cur_row_dict)
        except Exception as exc:
            # best effort: skip the connection, but report what went wrong
            # (KeyboardInterrupt / SystemExit are no longer swallowed)
            print('Problem with originator {} and responder {} ({}): {!r}'.format(cur_orig_ip, ip_responder, row['connection.uid'], exc))

    return pd.DataFrame(result_rows)

In [None]:
from datetime import datetime
import multiprocessing
from multiprocessing import Pool
from functools import partial
from contextlib import contextmanager

@contextmanager
def poolcontext(*args, **kwargs):
    """Context manager around `multiprocessing.Pool` that always terminates the pool.

    All arguments are passed on to `multiprocessing.Pool`. The pool is
    terminated when the `with` block is left, also when it is left because
    of an exception, so the worker processes are not leaked.
    """
    pool = multiprocessing.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        pool.terminate()

# compute neighbourhoods in parallel worker processes, one task per originator host (time optimization):
print('Start at {}.'.format(datetime.now().strftime("%H:%M:%S")))
with poolcontext(processes=32) as pool:
    dfs_with_neighbourhoods = pool.map(
        partial(compute_neighbourhoods, dfs_list_orig=dfs_orig, dfs_list_resp=dfs_resp),
        dfs_orig.keys(),
    )

print('Done at {}.'.format(datetime.now().strftime("%H:%M:%S")))

Start at 01:36:30.
[01:36:33]: Computing neighbourhood for connections of originator 192.168.10.25   (242)
[01:36:34]: Computing neighbourhood for connections of originator 192.168.10.8    (682)
[01:36:35]: Computing neighbourhood for connections of originator 210.14.132.70   (1)
[01:36:36]: Computing neighbourhood for connections of originator 192.168.10.51   (85)
[01:36:37]: Computing neighbourhood for connections of originator 192.168.10.50   (313)
[01:36:39]: Computing neighbourhood for connections of originator 192.168.10.12   (6182)
[01:36:39]: Computing neighbourhood for connections of originator 192.168.10.17   (698)
[01:36:41]: Computing neighbourhood for connections of originator 192.168.10.3    (3424)
[01:36:42]: Computing neighbourhood for connections of originator 192.168.10.16   (1814)
[01:36:45]: Computing neighbourhood for connections of originator 192.168.10.19   (269)
[01:36:46]: Computing neighbourhood for connections of originator 192.168.10.9    (1489)
[01:36:49]: 

In [None]:
print(type(dfs_with_neighbourhoods))

In [None]:
print(len(dfs_with_neighbourhoods))

In [None]:
dfs_with_neighbourhoods[0].head()

### 2. Concatenate to one final DataFrame

In [None]:
def concat_dfs(df_neighourhoods):
    """Concatenate the per-host DataFrames into one DataFrame.

    The rows keep their original index labels. An empty input yields an
    empty DataFrame. `pd.concat` is used because `DataFrame.append` is no
    longer available in pandas 2.x, and appending in a loop copies the
    accumulated result for every frame.
    """
    frames = list(df_neighourhoods)
    if not frames:
        # pd.concat raises on an empty sequence
        return pd.DataFrame()
    return pd.concat(frames)

df_result = concat_dfs(dfs_with_neighbourhoods)

In [None]:
df_result

In [None]:
# backup:

from datetime import date

df_result.to_csv(PREFIX + 'query_output_preprocessing_checkpoint_' + date.today().strftime("%d_%m") + '.csv', index=False, header=True)

# from datetime import datetime
# import pandas as pd
# df_result = pd.read_csv(PREFIX + 'query_output_preprocessing_checkpoint_' + date.today().strftime("%d_%m") + '.csv')
# df_result['connection.time'] = pd.to_datetime(df_result['connection.time'])

In [None]:
df_result

### 3. Assign attacker labels

In [None]:
df_result['attacker_label'] = 'No'
df_result['victim_label'] = 'No'

In [None]:
# assign labels to input data as:
# 'No'  - not from/ to attacker
# 'Yes' - originated from/ responded to attacker

# a connection is labelled 'Yes' when either of its endpoints is the attacker
df_result.loc[
    (df_result['responded_ip'] == '172.16.0.1') | (df_result['originated_ip'] == '172.16.0.1'),
    'attacker_label',
] = 'Yes'

In [None]:
df_result

In [None]:
df_result['attacker_label'].value_counts()

In [None]:
# assign labels to input data as:
# 'No'  - not from/ to victim
# 'Yes' - originated from/ responded to victim

# a connection is labelled 'Yes' when either of its endpoints is the victim
df_result.loc[
    (df_result['responded_ip'] == '192.168.10.50') | (df_result['originated_ip'] == '192.168.10.50'),
    'victim_label',
] = 'Yes'

In [None]:
df_result['victim_label'].value_counts()

### 4. Write to file

In [None]:
print(len(df_result))

In [None]:
df_result.to_csv(PREFIX + 'query_output_processing.csv', index=False, header=True)

In [None]:
for col in df_result.columns:
    print(col)