In [41]:
import os
import datetime
from os import makedirs
from os.path import join, dirname, realpath
from copy import deepcopy

import pandas as pd
import torch
import torch.nn as nn
import numpy as np

from util import ip_lst, timestr_to_timestamp, filter_dest_port, filter_ip, header, parallelize_dataframe, \
    convert_timestamp

## Combine All duplicate traffics

In [2]:
unique_dict = {}

for root, _, files in os.walk("anomaly_time_series_july_week5"):
    unique_list = []
    for file in files:
        values = file[:-4].split("_")[-3:]
        key = "_".join(values)
        if key not in unique_dict:
            unique_dict[key] = [file]
        else:
            unique_dict[key].append(file)
    break

makedirs("anomaly_time_series_july_week5_merged", exist_ok=True)
print(unique_dict.keys())
for key, files in unique_dict.items():
    merged_file = None
    print("working on", key, "with", len(files), "in total")
    i = 0
    for file in files:
        if merged_file is None:
            merged_file = pd.read_csv(join("anomaly_time_series_july_week5", file)).set_index("Unnamed: 0")
            merged_file = merged_file.drop_duplicates()
        else:
            df = pd.read_csv(join("anomaly_time_series_july_week5", file)).set_index("Unnamed: 0")
            df = df.drop_duplicates()
            merged_file = merged_file.append(df[~df.index.isin(merged_file.index)])
        i += 1
        print(i)
    print("finished", key)
    merged_file = merged_file.sort_index()
    merged_file.to_csv(join("anomaly_time_series_july_week5_merged", key + ".csv"))

dict_keys(['src25_dest17_port24', 'src25_dest19_portNone', 'src23_dest25_port15', 'src25_dest23_portNone', 'src1_dest16_portNone', 'src23_dest25_portNone', 'src25_dest16_portNone', 'src17_dest25_port7', 'src25_dest21_portNone', 'src17_dest25_portNone', 'src19_dest56_port29', 'src21_dest25_port14', 'src23_dest25_port12', 'src45_dest19_portNone', 'src17_dest25_port6', 'src21_dest25_port15', 'src25_dest23_port24', 'src17_dest25_port14', 'src25_dest21_port5', 'src25_dest23_port5', 'src17_dest0_portNone', 'src59_dest19_portNone', 'src17_dest51_portNone', 'src21_dest25_port20', 'src59_dest21_portNone', 'src23_dest25_port14', 'src23_dest25_port13', 'src21_dest25_portNone', 'src23_dest25_port7', 'src21_dest54_portNone', 'src22_dest57_portNone', 'src19_dest45_portNone', 'src19_dest25_port22', 'src17_dest25_port11', 'src23_dest0_portNone', 'src19_dest25_portNone', 'src21_dest59_port18', 'src17_dest25_port15', 'src36_dest27_portNone', 'src25_dest16_port24', 'src19_dest25_port20', 'src23_dest25_po

## Create Report (each row is for one file)

In [42]:
def get_unique_values(entropy_str):
    start_str = entropy_str[:entropy_str.find(":")]
    lowest_entropy = entropy_str[entropy_str.find("["):]
    return start_str, eval(lowest_entropy)


def format_entropy_col(entropy_str):
    start_str, entropy_list = get_unique_values(entropy_str)
    if len(entropy_list) > 10:
        return start_str + ": " + str(len(entropy_list))
    return entropy_str


def report_to_csv(report, save_name):
    report["highest_entropy"] = report["highest_entropy"].map(format_entropy_col)
    report["src_port"] = report["src_port"].map(lambda x: len(eval(x)) if len(eval(x)) > 10 else x)
    report = report.replace({',': ''}, regex=True)
#     for col in ['start_time', 'end_time']:
#         report[col] = report[col].map(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
#     report.to_csv(save_name, index=False)
    

In [52]:
report = pd.DataFrame(columns=["num_flow", "start_time", "end_time", "lowest_entropy", "2nd_lowest_entropy", "highest_entropy", "protocol", "src_port", "flags", "label", "file_name"])
for root, _, files in os.walk("anomaly_time_series_july_week5_port6667_detect_abs10"):
    i = 0
    print("total file", len(files))
    for file in sorted(files):
        if file.endswith(".csv"):
#             df = pd.read_csv(join(root, file)).set_index("Unnamed: 0")
            df = pd.read_csv(join(root, file)).set_index("index")
            df = df.drop_duplicates()
            summary = {
                "num_flow": df.shape[0], 
                "start_time": df.timestamp.min(), 
                "end_time": df.timestamp.max(), 
                "protocol": str(sorted(df["protocol"].unique())), 
                "src_port": str(sorted(df["src_port"].unique())), 
                "flags": str(sorted(df["flags"].unique())), 
                "label": str([key + ":" + str(round(value * 100, 2)) + "%" for key, value in df.label.value_counts(normalize=True).items()]),
                "file_name": file
            }
            src_IP_unique = set()
            dest_IP_unique = set()
            dest_port_unique = set()
            with open(join(root, file[:-4] + ".txt")) as f:
                for (src_ip, dest_ip, dest_port) in eval(f.read()):
                    src_IP_unique.add(src_ip)
                    dest_IP_unique.add(dest_ip)
                    dest_port_unique.add(dest_port)
            src_IP_len = len(src_IP_unique)
            dest_IP_len = len(dest_IP_unique)
            dest_port_len = len(dest_port_unique)
            src_IP_status = True
            dest_IP_status = True
            dest_port_status = True
            
            if min(src_IP_len, dest_IP_len, dest_port_len) == src_IP_len:
                summary["lowest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
                src_IP_status = False
            elif min(src_IP_len, dest_IP_len, dest_port_len) == dest_IP_len:
                summary["lowest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
                dest_IP_status = False
            else:
                summary["lowest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
                dest_port_status = False
            
            if max(src_IP_len, dest_IP_len, dest_port_len) == dest_port_len:
                summary["highest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
                dest_port_status = False
            elif max(src_IP_len, dest_IP_len, dest_port_len) == dest_IP_len:
                summary["highest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
                dest_IP_status = False
            else:
                summary["highest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
                src_IP_status = False
                
            if src_IP_status:
                summary["2nd_lowest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
            elif dest_IP_status:
                summary["2nd_lowest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
            else:
                summary["2nd_lowest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
            report = report.append(summary, ignore_index=True)
            print(i)
            i += 1
    break
report = report.sort_values(by=["start_time", "end_time", "num_flow"]).reset_index(drop=True)

total file 80
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


In [6]:
for week in [1, 2, 3, 4, 5]:
    for root, _, files in os.walk(f"anomaly_time_series_august_week{week}_1min_mse4_port2.5_bidir_20.0_1hr_flag6_numBytes12_hardcode_dest_UDP"):
        i = 0
        print("total file", len(files))
        for file in sorted(files):
            if file.endswith(".csv") and "None" not in file:
                df = pd.read_csv(join(root, file)).set_index("Unnamed: 0")
                df = df.drop_duplicates()
                summary = {
                    "num_flow": df.shape[0], 
                    "start_time": df.timestamp.min(), 
                    "end_time": df.timestamp.max(), 
                    "protocol": str(sorted(df["protocol"].unique())), 
                    "src_port": str(sorted(df["src_port"].unique())), 
                    "flags": str(sorted(df["flags"].unique())), 
                    "label": str([key + ":" + str(round(value * 100, 2)) + "%" for key, value in df.label.value_counts(normalize=True).items()])
                }
                attributes = file.split("_")
                summary['t'] = int(attributes[0][1:])
                summary['src_idx'] = int(attributes[2][3:])
                summary['dest_idx'] = int(attributes[3][4:])
                summary['port_idx'] = int(attributes[4][4:-4])
                src_IP_unique = set()
                dest_IP_unique = set()
                dest_port_unique = set()
                with open(join(root, file[:-4] + ".txt")) as f:
                    for (src_ip, dest_ip, dest_port) in eval(f.read()):
                        src_IP_unique.add(src_ip)
                        dest_IP_unique.add(dest_ip)
                        dest_port_unique.add(dest_port)
                src_IP_len = len(src_IP_unique)
                dest_IP_len = len(dest_IP_unique)
                dest_port_len = len(dest_port_unique)
                src_IP_status = True
                dest_IP_status = True
                dest_port_status = True

                if min(src_IP_len, dest_IP_len, dest_port_len) == src_IP_len:
                    summary["lowest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
                    src_IP_status = False
                elif min(src_IP_len, dest_IP_len, dest_port_len) == dest_IP_len:
                    summary["lowest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
                    dest_IP_status = False
                else:
                    summary["lowest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
                    dest_port_status = False

                if max(src_IP_len, dest_IP_len, dest_port_len) == dest_port_len:
                    summary["highest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
                    dest_port_status = False
                elif max(src_IP_len, dest_IP_len, dest_port_len) == dest_IP_len:
                    summary["highest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
                    dest_IP_status = False
                else:
                    summary["highest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
                    src_IP_status = False

                if src_IP_status:
                    summary["2nd_lowest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
                elif dest_IP_status:
                    summary["2nd_lowest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
                else:
                    summary["2nd_lowest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
                report = report.append(summary, ignore_index=True)
                print(i)
                i += 1
        break
report = report.sort_values(by=["start_time", "end_time", "num_flow"]).reset_index(drop=True)

total file 15844
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
2

In [54]:
report = report[report["num_flow"] != 0]

In [55]:
def get_unique_values(entropy_str):
    start_str = entropy_str[:entropy_str.find(":")]
    lowest_entropy = entropy_str[entropy_str.find("["):]
    return start_str, eval(lowest_entropy)


def get_net_id(unique_values):
    net_id = None
    for ip_address in unique_values:
        tmp_net_id = ip_address.split(".")[0:3]
        if net_id is None:
            net_id = tmp_net_id
        elif net_id != tmp_net_id:
            return None
    return net_id


def combine_unique(unique_value1, unique_value2):
    if isinstance(unique_value1, list):
        unique_value1.extend(unique_value2)
        unique_value1 = list(set(unique_value1))
    else:
        unique_value1 = list(unique_value1.union(unique_value2))
    return sorted(unique_value1)


def process_flags(str1, str2):
    flags = ['.' for _ in range(6)]
    for i in range(6):
        if str1[i] != ".":
            flags[i] = str1[i]
        elif str2[i] != '.':
            flags[i] = str2[i]
    return "".join(flags)


def group_similar(curr_row, row_list):
    for row_index, row in enumerate(row_list):      
        # get all unique value list
        curr_l_start_str, l_curr_unique_values = get_unique_values(curr_row["lowest_entropy"])
        l_curr_unique_values = set(l_curr_unique_values)
        l_start_str, l_row_unique_values = get_unique_values(row["lowest_entropy"])
        l_row_unique_values = set(l_row_unique_values)
        
        curr_sl_start_str, sl_curr_unique_values = get_unique_values(curr_row["2nd_lowest_entropy"])
        sl_curr_unique_values = set(sl_curr_unique_values)
        sl_start_str, sl_row_unique_values = get_unique_values(row["2nd_lowest_entropy"])
        sl_row_unique_values = set(sl_row_unique_values)
        
        curr_h_start_str, h_curr_unique_values = get_unique_values(curr_row["highest_entropy"])
        h_curr_unique_values = set(h_curr_unique_values)
        h_start_str, h_row_unique_values = get_unique_values(row["highest_entropy"])
        h_row_unique_values = set(h_row_unique_values)
        
        # base on start str to check if one is a subset of the other
        l_condition = False
        for (start_str, unique_values) in [(curr_l_start_str, l_curr_unique_values), (curr_sl_start_str, sl_curr_unique_values), 
                                           (curr_h_start_str, h_curr_unique_values)]:
            if l_start_str == start_str:
                l_condition = l_row_unique_values.issubset(unique_values) or unique_values.issubset(l_row_unique_values)
                l_row_unique_values = combine_unique(l_row_unique_values, unique_values)
        sl_condition = False
        for (start_str, unique_values) in [(curr_l_start_str, l_curr_unique_values), (curr_sl_start_str, sl_curr_unique_values), 
                                           (curr_h_start_str, h_curr_unique_values)]:
            if sl_start_str == start_str:
                sl_condition = sl_row_unique_values.issubset(unique_values) or unique_values.issubset(sl_row_unique_values)
                sl_row_unique_values = combine_unique(sl_row_unique_values, unique_values)
        h_condition = False
        for (start_str, unique_values) in [(curr_l_start_str, l_curr_unique_values), (curr_sl_start_str, sl_curr_unique_values), 
                                           (curr_h_start_str, h_curr_unique_values)]:
            if h_start_str == start_str:
                h_condition = h_row_unique_values.issubset(unique_values) or unique_values.issubset(h_row_unique_values)
                h_row_unique_values = combine_unique(h_row_unique_values, unique_values)
        
        # use only lowest and 2nd lowest entropy columns
        cond_a = curr_l_start_str == l_start_str and curr_sl_start_str == sl_start_str
        cond_b = curr_l_start_str == sl_start_str and curr_sl_start_str == l_start_str
        if not (cond_a or cond_b):
            continue
                
        # if 2 out of 3 meet, combine them
        if (l_condition + sl_condition + h_condition) >= 2:
            # check if they have overlapping time range
            curr_st_time, curr_end_time = curr_row["start_time"], curr_row["end_time"]
            row_st_time, row_end_time = row_list[row_index]["start_time"], row_list[row_index]["end_time"]
#             if (row_st_time <= curr_st_time <= row_end_time) or (row_st_time <= curr_end_time <= row_end_time):
            row_list[row_index]["num_flow"] = "Ignored"
            row_list[row_index]["file_name"] += curr_row["file_name"]
            row_list[row_index]["start_time"] = min(curr_st_time, row_st_time)
            row_list[row_index]["end_time"] = max(curr_end_time, row_end_time)
            # lowest_entropy, 2nd_lowest_entropy and highest_entropy
            for (col, entro) in zip(["lowest_entropy", "2nd_lowest_entropy", "highest_entropy"], 
                sorted([(len(l_row_unique_values), "l"), (len(sl_row_unique_values), "sl"), (len(h_row_unique_values), "h")])):
                row_list[row_index][col] = eval(f"{entro[1]}_start_str") + ": " + str(eval(f"{entro[1]}_row_unique_values"))
            # protocol, src_port, flags, label
            for column in ["protocol", "src_port", "flags", "label"]:
                curr_unique_values = eval(curr_row[column])
                row_unique_values = eval(row[column])
                row_list[row_index][column] = str(combine_unique(row_unique_values, curr_unique_values))
            return row_list.pop(row_index)
    row_list.append(deepcopy(curr_row))
    return None

def group_similar_second(curr_row, row_list):
    for row_index, row in enumerate(row_list):      
        # check if they have overlapping time range
        curr_st_time, curr_end_time = curr_row["start_time"], curr_row["end_time"]
        row_st_time, row_end_time = row_list[row_index]["start_time"], row_list[row_index]["end_time"]
        if not ((row_st_time <= curr_st_time <= row_end_time) or (row_st_time <= curr_end_time <= row_end_time) or 
                (curr_st_time <= row_st_time and row_end_time <= curr_end_time)):
            continue
            
        # get all unique value list
        curr_l_start_str, l_curr_unique_values = get_unique_values(curr_row["lowest_entropy"])
        l_curr_unique_values = set(l_curr_unique_values)
        l_start_str, l_row_unique_values = get_unique_values(row["lowest_entropy"])
        l_row_unique_values = set(l_row_unique_values)
        
        curr_sl_start_str, sl_curr_unique_values = get_unique_values(curr_row["2nd_lowest_entropy"])
        sl_curr_unique_values = set(sl_curr_unique_values)
        sl_start_str, sl_row_unique_values = get_unique_values(row["2nd_lowest_entropy"])
        sl_row_unique_values = set(sl_row_unique_values)
        
        curr_h_start_str, h_curr_unique_values = get_unique_values(curr_row["highest_entropy"])
        h_curr_unique_values = set(h_curr_unique_values)
        h_start_str, h_row_unique_values = get_unique_values(row["highest_entropy"])
        h_row_unique_values = set(h_row_unique_values)
        
        # base on start str to check if one is a subset of the other
        l_condition = False
        for (start_str, unique_values) in [(curr_l_start_str, l_curr_unique_values), (curr_sl_start_str, sl_curr_unique_values), 
                                           (curr_h_start_str, h_curr_unique_values)]:
            if l_start_str == start_str:
                l_condition = l_row_unique_values.issubset(unique_values) or unique_values.issubset(l_row_unique_values)
                l_row_unique_values = combine_unique(l_row_unique_values, unique_values)
        sl_condition = False
        for (start_str, unique_values) in [(curr_l_start_str, l_curr_unique_values), (curr_sl_start_str, sl_curr_unique_values), 
                                           (curr_h_start_str, h_curr_unique_values)]:
            if sl_start_str == start_str:
                sl_condition = sl_row_unique_values.issubset(unique_values) or unique_values.issubset(sl_row_unique_values)
                sl_row_unique_values = combine_unique(sl_row_unique_values, unique_values)
        h_condition = False
        for (start_str, unique_values) in [(curr_l_start_str, l_curr_unique_values), (curr_sl_start_str, sl_curr_unique_values), 
                                           (curr_h_start_str, h_curr_unique_values)]:
            if h_start_str == start_str:
                h_condition = h_row_unique_values.issubset(unique_values) or unique_values.issubset(h_row_unique_values)
                h_row_unique_values = combine_unique(h_row_unique_values, unique_values)
                
        # use only lowest and 2nd lowest entropy columns
        cond_a = curr_l_start_str == l_start_str and curr_sl_start_str == sl_start_str
        cond_b = curr_l_start_str == sl_start_str and curr_sl_start_str == l_start_str
        # exact match
        if not ((cond_a and (curr_row["lowest_entropy"] == row["lowest_entropy"] or curr_row["2nd_lowest_entropy"] == row["2nd_lowest_entropy"])) or 
           (cond_b and (curr_row["lowest_entropy"] == row["2nd_lowest_entropy"] or curr_row["2nd_lowest_entropy"] == row["lowest_entropy"]))):
            continue
#         if not (cond_a or cond_b):
#             continue
            
        # if 1 out of 3 meet, combine them
        if (l_condition + sl_condition + h_condition) >= 1:                
            row_list[row_index]["num_flow"] = "Ignored"
            row_list[row_index]["file_name"] += curr_row["file_name"]
            row_list[row_index]["start_time"] = min(curr_st_time, row_st_time)
            row_list[row_index]["end_time"] = max(curr_end_time, row_end_time)
            # lowest_entropy, 2nd_lowest_entropy and highest_entropy
            for (col, entro) in zip(["lowest_entropy", "2nd_lowest_entropy", "highest_entropy"], 
                sorted([(len(l_row_unique_values), "l"), (len(sl_row_unique_values), "sl"), (len(h_row_unique_values), "h")])):
                row_list[row_index][col] = eval(f"{entro[1]}_start_str") + ": " + str(eval(f"{entro[1]}_row_unique_values"))                
            # protocol, src_port, flags, label
            for column in ["protocol", "src_port", "flags", "label"]:
                curr_unique_values = eval(curr_row[column])
                row_unique_values = eval(row[column])
                row_list[row_index][column] = str(combine_unique(row_unique_values, curr_unique_values))
            return row_list.pop(row_index)
    row_list.append(deepcopy(curr_row))
    return None

In [56]:
row_list = []
row_list_tmp = []
for i, row in report.iterrows():
    print(i)
    tmp = group_similar(row, row_list_tmp)
    while tmp is not None:
        tmp = group_similar(tmp, row_list_tmp)
print("Finshed combine rows first round!")
print(f"Total {len(row_list_tmp)} rows!")
for i, row in enumerate(row_list_tmp):
    print(i)
    tmp = group_similar_second(row, row_list)
    while tmp is not None:
        tmp = group_similar_second(tmp, row_list)
print("Finished combine rows!")
print("Total", len(row_list),"rows!")
new_report = pd.DataFrame(columns=["num_flow", "start_time", "end_time", "lowest_entropy", "2nd_lowest_entropy", 
                                   "highest_entropy", "protocol", "src_port", "flags", "label", "combined_files"])
i = 0
anomaly_series_folder = "anomaly_time_series_july_week5_port6667_detect_abs10"
for row in row_list:
    print("process row", i)
    # reformat the report to be human readable
    summary = {
        "start_time": datetime.datetime.utcfromtimestamp(row["start_time"]).strftime('%Y-%m-%d %H:%M:%S'), 
        "end_time": datetime.datetime.utcfromtimestamp(row["end_time"]).strftime('%Y-%m-%d %H:%M:%S'),
        "protocol": row["protocol"], 
        "src_port": len(eval(row["src_port"])) if len(eval(row["src_port"])) > 10 else row["src_port"], 
        "flags": row["flags"]
    }
    if isinstance(row["num_flow"], str):
        # combine the files to get the correct num_flows
        merged_file = None
        files = [f + ".csv" for f in row["file_name"].split(".csv")[:-1]]
        for file in files:
            if merged_file is None:
#                 merged_file = pd.read_csv(join(anomaly_series_folder, file)).set_index("Unnamed: 0")
                merged_file = pd.read_csv(join(anomaly_series_folder, file)).set_index("index")
                merged_file = merged_file.drop_duplicates()
            else:
#                 df = pd.read_csv(join(anomaly_series_folder, file)).set_index("Unnamed: 0")
                df = pd.read_csv(join(anomaly_series_folder, file)).set_index("index")
                df = df.drop_duplicates()
                merged_file = merged_file.append(df[~df.index.isin(merged_file.index)], sort=True)
        summary["num_flow"] = merged_file.shape[0]
        summary["combined_files"] = str(files)
        summary["label"] = [key + ":" + str(round(value * 100, 2)) + "%" for key, value in merged_file.label.value_counts(normalize=True).items()]
    else:
        summary["num_flow"] = row["num_flow"]
        summary["combined_files"] = "['" + row["file_name"] + "']"
#         merged_file = pd.read_csv(join(anomaly_series_folder, row["file_name"])).set_index("Unnamed: 0")
        merged_file = pd.read_csv(join(anomaly_series_folder, row["file_name"])).set_index("index")
        summary["label"] = [key + ":" + str(round(value * 100, 2)) + "%" for key, value in merged_file.label.value_counts(normalize=True).items()]
    
    # process lowest, 2nd lowest entropy and highest entropy columns
    src_IP_unique = set()
    dest_IP_unique = set()
    dest_port_unique = set()
    for file in eval(summary["combined_files"]):
        file = file[:-4] + ".txt"
        with open(join(anomaly_series_folder, file)) as f:
            for (src_ip, dest_ip, dest_port) in eval(f.read()):
                src_IP_unique.add(src_ip)
                dest_IP_unique.add(dest_ip)
                dest_port_unique.add(dest_port)
    src_IP_len = len(src_IP_unique)
    dest_IP_len = len(dest_IP_unique)
    dest_port_len = len(dest_port_unique)
    src_IP_status = True
    dest_IP_status = True
    dest_port_status = True

    if min(src_IP_len, dest_IP_len, dest_port_len) == src_IP_len:
        summary["lowest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
        src_IP_status = False
    elif min(src_IP_len, dest_IP_len, dest_port_len) == dest_IP_len:
        summary["lowest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
        dest_IP_status = False
    else:
        summary["lowest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
        dest_port_status = False

    if max(src_IP_len, dest_IP_len, dest_port_len) == dest_port_len:
        summary["highest_entropy"] = f"dest_port: {len(dest_port_unique)}"
        dest_port_status = False
    elif max(src_IP_len, dest_IP_len, dest_port_len) == dest_IP_len:
        summary["highest_entropy"] = f"dest_IP: {len(dest_IP_unique)}"
        dest_IP_status = False
    else:
        summary["highest_entropy"] = f"src_IP: {len(src_IP_unique)}"
        src_IP_status = False

    if src_IP_status:
        summary["2nd_lowest_entropy"] = "src_IP: " + str(sorted(list(src_IP_unique)))
    elif dest_IP_status:
        summary["2nd_lowest_entropy"] = "dest_IP: " + str(sorted(list(dest_IP_unique)))
    else:
        summary["2nd_lowest_entropy"] = "dest_port: " + str(sorted(list(dest_port_unique)))
        
    new_report = new_report.append(summary, ignore_index=True)
    i += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
Finshed combine rows first round!
Total 7 rows!
0
1
2
3
4
5
6
Finished combine rows!
Total 7 rows!
process row 0
process row 1
process row 2
process row 3
process row 4
process row 5
process row 6


In [8]:
new_report  # abs 1000 whole july week5 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,21304,2016-07-27 18:45:17,2016-07-27 20:23:34,dest_port: [25],"dest_IP: ['54.143.48.135', '54.143.48.199']",src_IP: 3,['TCP'],8899,"['.A..S.', '.A..SF', '.A.RS.', '.AP.SF', '.APR...",[anomaly-spam:100.0%],['t1469645032_mse19.562389373779297_src19_dest...
1,55265,2016-07-27 17:10:34,2016-07-27 20:23:12,"dest_IP: ['42.219.156.212', '42.219.156.213', ...","src_IP: ['193.26.243.174', '193.27.1.120', '19...",dest_port: 17612,['TCP'],17612,"['....S.', '.A..SF', '.A.RS.', '.AP.S.', '.AP....",[anomaly-spam:100.0%],['t1469640222_mse5.5011305809021_src56_dest19_...
2,2042,2016-07-27 22:13:54,2016-07-27 22:55:23,src_IP: ['42.219.156.182'],dest_IP: ['192.143.84.60'],dest_port: 1,['TCP'],1016,"['....S.', '...R..', '.A.RS.', '.AP.SF', '.APR...",[background:100.0%],['t1469659404_mse9.756455421447754_src19_dest5...
3,5763,2016-07-29 06:52:28,2016-07-29 07:02:27,src_IP: ['182.73.133.172'],dest_IP: ['42.219.154.108'],dest_port: 1,['TCP'],1402,"['....S.', '.A....', '.A...F', '.A..SF', '.A.R...",[background:100.0%],['t1469775148_mse5.862349033355713_src54_dest2...
4,3012,2016-07-29 10:43:39,2016-07-29 11:30:46,"dest_IP: ['42.219.156.176', '42.219.156.178', ...","src_IP: ['177.235.191.17', '194.192.119.230', ...",dest_port: 1459,['TCP'],1460,"['.AP.SF', '.APRS.']",[background:100.0%],['t1469790123_mse47.742698669433594_src63_dest...
5,5560,2016-07-30 10:14:51,2016-07-30 10:22:55,src_IP: ['42.219.158.179'],dest_IP: ['195.79.205.250'],dest_port: 1772,['TCP'],1773,"['...R..', '.A..SF', '.AP...', '.AP.SF']",[background:100.0%],['t1469873485_mse5.793478965759277_src17_dest5...
6,254730,2016-07-27 22:13:26,2016-07-31 12:59:03,dest_port: [25],"dest_IP: ['108.66.255.194', '108.66.255.199', ...",src_IP: 12,['TCP'],27961,"['....S.', '...R..', '.A..S.', '.A.R..', '.A.R...","[background:99.57%, anomaly-spam:0.43%]",['t1469788923_mse5.6126508712768555_src19_dest...
7,12776,2016-07-31 21:23:38,2016-07-31 22:13:05,src_IP: ['42.219.159.196'],dest_IP: ['131.85.90.155'],dest_port: 4735,['TCP'],4736,"['.AP.SF', '.APRSF']",[background:100.0%],['t1470001203_mse5.004255771636963_src16_dest4...
8,15806,2016-07-31 21:30:03,2016-07-31 22:13:05,src_IP: ['131.85.90.155'],dest_IP: ['42.219.159.196'],dest_port: 1,['TCP'],6277,"['....S.', '.A..S.', '.AP..F', '.AP.S.', '.AP....",[background:100.0%],['t1470002403_mse5.670547008514404_src48_dest1...
9,23970,2016-07-31 22:16:26,2016-07-31 22:47:36,src_IP: ['176.42.47.107'],dest_IP: ['42.219.158.160'],dest_port: 1,['TCP'],4840,"['.A...F', '.A..S.', '.A..SF', '.AP...', '.AP....",[background:100.0%],['t1470004203_mse7.145425796508789_src54_dest1...


In [48]:
new_report  # abs 1000 whole july week5 port 25 10 min 50% threshold

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,6040,2016-07-27 16:43:42,2016-07-27 18:13:41,src_IP: ['54.68.165.58'],dest_IP: ['42.219.153.35'],dest_port: 1,['TCP'],2749,['.AP.SF'],[background:100.0%],['t1469639622_mse33.9871711730957_src38_dest22...
1,3854,2016-07-27 19:23:52,2016-07-27 20:23:49,src_IP: ['54.68.165.58'],dest_IP: ['42.219.153.35'],dest_port: 1,['TCP'],1819,['.AP.SF'],[background:100.0%],['t1469649232_mse5.540788650512695_src38_dest2...
2,4846,2016-07-27 16:44:36,2016-07-27 20:21:03,src_IP: ['42.219.153.35'],dest_IP: ['54.68.165.58'],dest_port: 2149,['TCP'],2150,['.AP.SF'],[background:100.0%],['t1469639622_mse33.9871711730957_src22_dest38...
3,9081,2016-07-27 18:44:03,2016-07-27 20:22:18,"src_IP: ['121.106.2.63', '193.27.83.86', '54.1...","dest_IP: ['42.219.156.212', '42.219.156.213', ...",dest_port: 4080,['TCP'],4081,"['.AP.SF', '.APRS.']",[anomaly-spam:100.0%],['t1469648032_mse54.860626220703125_src56_dest...
4,3907,2016-07-28 18:02:32,2016-07-28 19:02:09,src_IP: ['42.219.154.98'],dest_IP: ['240.19.71.56'],dest_port: 774,['TCP'],775,"['.A...F', '.A..SF', '.AP...', '.AP.S.']",[background:100.0%],['t1469730752_mse22.53179359436035_src21_dest6...
5,33943,2016-07-28 07:02:41,2016-07-28 19:02:31,dest_IP: ['42.219.154.98'],dest_port: [25],src_IP: 2,['TCP'],6685,"['.A...F', '.A..SF', '.AP...', '.AP.S.', '.AP....",[background:100.0%],['t1469691161_mse6.597326755523682_src43_dest2...
6,18962,2016-07-28 18:02:32,2016-07-28 19:02:31,src_IP: ['240.19.71.56'],dest_IP: ['42.219.154.98'],dest_port: 1,['TCP'],4151,"['.A...F', '.A..SF', '.AP...', '.AP.S.']",[background:100.0%],['t1469730752_mse22.53179359436035_src62_dest2...
7,1345,2016-07-27 19:56:54,2016-07-28 18:45:15,src_IP: ['42.219.158.178'],dest_IP: ['64.169.168.27'],dest_port: 657,['TCP'],658,['.AP.SF'],[background:100.0%],['t1469649232_mse5.540788650512695_src17_dest4...
8,1006,2016-07-28 18:32:39,2016-07-28 18:45:19,src_IP: ['64.169.168.27'],dest_IP: ['42.219.158.178'],dest_port: 1,['TCP'],501,['.AP.SF'],[background:100.0%],['t1469730752_mse22.53179359436035_src40_dest1...
9,1010,2016-07-29 10:45:24,2016-07-29 11:20:47,"src_IP: ['177.235.191.18', '253.136.160.168']","dest_IP: ['42.219.156.176', '42.219.156.178', ...",dest_port: 498,['TCP'],499,"['.AP.SF', '.APRS.']",[background:100.0%],['t1469790723_mse6.452500820159912_src63_dest1...


In [8]:
new_report  # abs 1000 whole july week5 port 22 1 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,5081,2016-07-30 05:40:19,2016-07-30 05:41:06,src_IP: ['183.168.158.155'],dest_IP: ['42.219.154.133'],dest_port: 1,['TCP'],2328,"['...R..', '.A....', '.A..S.', '.A..SF', '.A.R...",[background:100.0%],['t1469857228_mse4.117676734924316_src54_dest2...
1,2722,2016-07-30 05:40:28,2016-07-30 05:41:06,src_IP: ['42.219.154.133'],dest_IP: ['183.168.158.155'],dest_port: 1337,['TCP'],1338,"['...R..', '.A....', '.A..S.', '.A..SF', '.A.R...",[background:100.0%],['t1469857228_mse4.117676734924316_src21_dest5...
2,6076,2016-07-31 15:42:28,2016-07-31 15:58:25,src_IP: ['221.177.81.0'],dest_port: [22],dest_IP: 38,['TCP'],5273,"['....S.', '.A....', '.AP...', '.AP.S.']",[background:100.0%],['t1469979992_mse3.7246243953704834_src59_dest...


In [15]:
new_report  # abs 1000 whole july week5 port 53413 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,1016,2016-07-29 10:19:31,2016-07-29 10:24:58,dest_port: [53413],"src_IP: ['225.142.10.236', '225.77.183.51']",dest_IP: 256,['UDP'],506,['.A....'],[background:100.0%],['t1469787424_mse4.13129997253418_src60_dest29...
1,1022,2016-07-31 20:34:04,2016-07-31 20:39:47,dest_port: [53413],"src_IP: ['201.98.177.108', '202.147.152.15']",dest_IP: 256,['UDP'],504,['.A....'],[background:100.0%],['t1469996994_mse3.5395774841308594_src57_dest...


In [8]:
new_report  # abs 500 whole july week5 port 53413 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,334683,2016-07-27 13:57:26,2016-07-31 23:48:03,dest_port: [53413],"src_IP: ['105.245.236.250', '107.166.108.201',...",dest_IP: 4081,['UDP'],30152,['.A....'],"[background:99.98%, blacklist:0.02%, nerisbotn...",['t1469989794_mse2.927644729614258_src36_dest2...


In [8]:
new_report  # abs 200 whole july week5 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,1440,2016-07-27 13:53:29,2016-07-27 14:03:28,src_IP: ['42.219.158.161'],"dest_IP: ['102.206.84.64', '102.251.250.156', ...",dest_port: 710,['TCP'],711,"['.AP.SF', '.APRSF']",[background:100.0%],['t1469627609_mse2.0469613075256348_src17_dest...
1,1202,2016-07-27 14:33:31,2016-07-27 14:43:28,src_IP: ['42.219.158.161'],"dest_IP: ['100.15.163.5', '101.140.194.232', '...",dest_port: 597,['TCP'],598,"['.AP.SF', '.APRSF']",[background:100.0%],['t1469630009_mse2.045530319213867_src17_dest6...
2,294,2016-07-27 14:35:42,2016-07-27 14:43:19,"src_IP: ['192.22.2.165', '192.22.27.234']","dest_IP: ['42.219.156.194', '42.219.156.198']",dest_port: 147,['TCP'],148,"['.AP.SF', '.APRS.']",[background:100.0%],['t1469630009_mse2.045530319213867_src56_dest1...
3,1256,2016-07-27 14:43:30,2016-07-27 14:53:28,src_IP: ['42.219.158.161'],"dest_IP: ['101.200.96.70', '103.126.97.42', '1...",dest_port: 624,['TCP'],625,"['.AP.SF', '.APRSF']",[background:100.0%],['t1469630609_mse2.087506055831909_src17_dest6...
4,370,2016-07-27 14:45:45,2016-07-27 14:53:06,"src_IP: ['192.22.2.164', '192.22.27.234']","dest_IP: ['42.219.156.197', '42.219.156.199']",dest_port: 185,['TCP'],186,"['.AP.SF', '.APRS.']",[background:100.0%],['t1469630609_mse2.087506055831909_src56_dest1...
...,...,...,...,...,...,...,...,...,...,...,...
223,1252,2016-07-31 23:20:11,2016-08-01 00:10:02,"dest_IP: ['42.219.156.194', '42.219.156.198']","src_IP: ['204.97.176.27', '204.97.188.20', '20...",dest_port: 620,['TCP'],621,"['.AP.SF', '.APRS.']",[background:100.0%],['t1470007203_mse2.828524112701416_src2_dest19...
224,22378,2016-07-27 17:23:47,2016-08-01 00:03:00,"dest_IP: ['42.219.156.194', '42.219.156.197', ...","src_IP: ['204.97.180.40', '204.97.180.41', '20...",dest_port: 9230,['TCP'],9231,"['.AP.S.', '.AP.SF', '.APR..', '.APRS.']",[background:100.0%],['t1469659404_mse9.756455421447754_src2_dest19...
225,7751,2016-07-31 22:10:03,2016-08-01 00:00:02,dest_IP: ['182.73.133.172'],"src_IP: ['42.219.154.101', '42.219.154.108', '...",dest_port: 1707,['TCP'],1707,"['....S.', '.A....', '.A...F', '.A..S.', '.A.....",[background:100.0%],['t1470003003_mse4.486227989196777_src21_dest5...
226,224,2016-07-31 23:50:05,2016-07-31 23:59:50,src_IP: ['42.219.158.161'],"dest_IP: ['112.31.62.26', '126.121.42.225', '1...",dest_port: 112,['TCP'],113,"['.AP.S.', '.APRS.']",[background:100.0%],['t1470009003_mse4.615067005157471_src17_dest4...


In [19]:
new_report  # abs 200 whole august week1 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,1342,2016-08-01 12:02:30,2016-08-01 12:12:02,src_IP: ['42.219.153.143'],dest_IP: ['37.5.105.46'],dest_port: 671,['TCP'],672,"['.AP.SF', '.APRS.', '.APRSF']",[background:100.0%],['t1470052950_mse50.95751953125_src22_dest36_b...
1,862,2016-08-01 12:02:31,2016-08-01 12:25:41,"src_IP: ['204.97.176.26', '204.97.71.84', '204...","dest_IP: ['42.219.156.196', '42.219.156.197', ...",dest_port: 428,['TCP'],429,"['.AP.SF', '.APRS.']",[background:100.0%],['t1470052950_mse50.95751953125_src2_dest19_by...
2,480,2016-08-01 12:04:05,2016-08-01 12:06:35,src_IP: ['42.219.154.98'],dest_IP: ['131.138.59.188'],dest_port: 240,['TCP'],241,"['.AP.SF', '.APRSF']",[background:100.0%],['t1470052950_mse50.95751953125_src21_dest48_b...
3,256,2016-08-01 12:12:40,2016-08-01 12:22:13,"src_IP: ['204.97.194.148', '204.97.71.84']","dest_IP: ['42.219.156.197', '42.219.156.199']",dest_port: 128,['TCP'],129,"['.AP.SF', '.APRS.']",[background:100.0%],['t1470053550_mse47.897525787353516_src2_dest1...
4,1820,2016-08-01 20:34:03,2016-08-01 21:23:07,src_IP: ['42.219.158.164'],dest_port: [25],dest_IP: 2,['TCP'],896,"['.AP.SF', '.APRSF']",[background:100.0%],['t1470084843_mse5.797444820404053_src17_dest2...
...,...,...,...,...,...,...,...,...,...,...,...
131,29962,2016-08-07 23:32:59,2016-08-08 00:06:20,dest_port: [25],"dest_IP: ['253.139.127.225', '253.139.127.226']",src_IP: 4,['TCP'],11803,"['.AP.SF', '.APRS.']",[anomaly-spam:100.0%],['t1470612981_mse76.75270080566406_src19_dest6...
132,408,2016-08-07 23:37:09,2016-08-07 23:46:21,dest_IP: ['79.215.212.124'],"src_IP: ['42.219.154.116', '42.219.154.128']",dest_port: 204,['TCP'],205,['.AP.SF'],[background:100.0%],['t1470612981_mse76.75270080566406_src21_dest4...
133,17549032,2016-08-01 11:38:23,2016-08-08 00:10:01,"dest_port: [25, 48766]","src_IP: ['42.219.156.180', '42.219.156.181', '...",dest_IP: 29,['TCP'],28234,"['....S.', '...R..', '.A....', '.A...F', '.A.....","[anomaly-spam:95.14%, background:4.86%]",['t1470512143_mse5.066888332366943_src19_dest5...
134,19509331,2016-08-01 12:08:29,2016-08-08 00:10:01,"dest_IP: ['42.219.156.180', '42.219.156.181', ...","src_IP: ['108.66.255.194', '108.66.255.199', '...",dest_port: 28233,['TCP'],28234,"['....S.', '...R..', '.A....', '.A...F', '.A.....","[anomaly-spam:95.53%, background:4.47%]",['t1470512143_mse5.066888332366943_src63_dest1...


In [8]:
new_report  # abs 1000 whole august week1 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,81836,2016-08-02 07:22:59,2016-08-02 09:27:57,dest_IP: ['42.219.154.105'],dest_port: [25],src_IP: 6,['TCP'],11722,"['....S.', '...R..', '.A...F', '.A..S.', '.A.....",[background:100.0%],['t1470122769_mse7.693587303161621_src55_dest2...
1,84787,2016-08-04 02:18:23,2016-08-05 23:50:38,dest_port: [25],"dest_IP: ['42.219.154.154', '42.219.154.98']",src_IP: 3,['TCP'],14741,"['...R..', '.A...F', '.A..SF', '.AP...', '.AP....",[background:100.0%],['t1470301109_mse5.865317344665527_src39_dest2...
2,26640,2016-08-05 23:38:28,2016-08-05 23:50:38,src_IP: ['213.221.249.49'],dest_IP: ['42.219.154.98'],dest_port: 1,['TCP'],6151,"['.A...F', '.A..SF', '.AP...', '.AP.S.']",[background:100.0%],['t1470440493_mse21.938262939453125_src58_dest...
3,6444,2016-08-06 05:08:29,2016-08-06 05:19:02,src_IP: ['43.25.209.191'],dest_IP: ['42.219.154.108'],dest_port: 1,['TCP'],1578,"['.A...F', '.A..SF', '.AP...', '.AP..F', '.AP....",[background:100.0%],['t1470460067_mse7.145481109619141_src37_dest2...
4,216,2016-08-06 23:33:40,2016-08-06 23:58:24,"dest_IP: ['192.141.228.249', '192.22.25.41', '...","dest_port: [25, 37069, 64285]",src_IP: 5,['TCP'],109,"['.AP.SF', '.APRS.']",[background:100.0%],['t1470527057_mse288.6785888671875_src19_dest5...
5,146,2016-08-07 02:00:09,2016-08-07 02:04:48,dest_IP: ['192.22.7.102'],dest_port: [25],src_IP: 4,['TCP'],74,"['.AP.SF', '.APRS.']",[background:100.0%],['t1470535031_mse396.47772216796875_src19_dest...
6,98278,2016-08-07 04:28:40,2016-08-07 05:14:13,src_IP: ['42.219.156.223'],dest_IP: ['194.192.119.36'],dest_port: 1,['TCP'],23818,"['.AP.SF', '.APRS.']",[anomaly-spam:100.0%],['t1470545664_mse479.4194030761719_src19_dest5...
7,12212,2016-08-07 11:16:50,2016-08-07 11:45:47,dest_port: [25],"src_IP: ['42.219.156.176', '42.219.156.177', '...",dest_IP: 4,['TCP'],5501,"['.AP.S.', '.AP.SF', '.APRS.']",[background:100.0%],['t1470569272_mse7.131330490112305_src19_dest5...
8,28,2016-08-07 17:25:13,2016-08-07 17:33:25,"src_IP: ['192.22.2.165', '194.192.119.36']","dest_IP: ['42.219.156.194', '42.219.156.196', ...",dest_port: 14,['TCP'],15,"['.AP.SF', '.APRS.']","[anomaly-spam:71.43%, background:28.57%]",['t1470590655_mse21.90513801574707_src56_dest1...
9,24,2016-08-07 17:34:46,2016-08-07 17:42:58,"src_IP: ['192.22.2.165', '194.192.119.38']","dest_IP: ['42.219.156.194', '42.219.156.196', ...",dest_port: 12,['TCP'],13,"['.AP.SF', '.APRS.']","[background:50.0%, anomaly-spam:50.0%]",['t1470591255_mse11.44264030456543_src56_dest1...


In [18]:
new_report  # abs 200 whole august week2 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,1825,2016-08-08 00:26:14,2016-08-08 00:37:55,src_IP: ['42.219.158.191'],dest_IP: ['57.2.123.71'],dest_port: 891,['TCP'],892,"['...R..', '.AP.SF']",[background:100.0%],['t1470615601_mse20.605634689331055_src17_dest...
1,903,2016-08-08 00:53:41,2016-08-08 01:00:16,dest_IP: ['138.198.215.122'],"src_IP: ['42.219.154.101', '42.219.154.108', '...",dest_port: 202,['TCP'],203,['.AP.SF'],[background:100.0%],['t1470617401_mse26.26667594909668_src21_dest4...
2,735,2016-08-08 01:10:01,2016-08-08 01:19:12,src_IP: ['42.219.154.122'],dest_IP: ['189.48.88.172'],dest_port: 363,['TCP'],364,"['.A...F', '.AP.S.', '.AP.SF']",[background:100.0%],['t1470618601_mse19.887840270996094_src21_dest...
3,1082,2016-08-08 02:10:25,2016-08-08 02:14:24,"src_IP: ['42.219.158.161', '42.219.158.188']","dest_IP: ['200.91.0.26', '202.210.121.46', '61...",dest_port: 273,['TCP'],274,"['.A...F', '.A..SF', '.AP...', '.AP.S.', '.APR...","[blacklist:99.08%, background:0.92%]",['t1470622201_mse10.916491508483887_src17_dest...
4,920,2016-08-08 01:49:37,2016-08-08 02:49:58,src_IP: ['42.219.154.104'],dest_IP: ['195.52.235.224'],dest_port: 457,['TCP'],458,"['.AP.S.', '.APRS.']",[background:100.0%],['t1470621001_mse17.054393768310547_src19_dest...
...,...,...,...,...,...,...,...,...,...,...,...
136,3984,2016-08-14 22:22:13,2016-08-14 22:26:41,src_IP: ['218.121.114.193'],dest_IP: ['42.219.153.26'],dest_port: 1,['TCP'],997,"['.A...F', '.A..SF', '.AP...', '.AP.S.']",[background:100.0%],['t1471213371_mse6.346940040588379_src59_dest2...
137,3380,2016-08-14 22:22:51,2016-08-14 22:26:41,dest_IP: ['218.121.114.193'],"src_IP: ['42.219.153.26', '42.219.154.156']",dest_port: 845,['TCP'],846,"['.A...F', '.A..SF', '.AP...', '.AP.S.']",[background:100.0%],['t1471213371_mse6.346940040588379_src22_dest5...
138,25224170,2016-08-08 00:10:02,2016-08-15 00:11:25,"dest_port: [25, 54231, 59644]","src_IP: ['42.219.156.180', '42.219.156.181', '...",dest_IP: 94,['TCP'],28234,"['....S.', '...R..', '.A....', '.A...F', '.A.....","[anomaly-spam:98.87%, background:1.13%]",['t1470616201_mse16.14398956298828_src19_dest5...
139,8291,2016-08-14 23:35:17,2016-08-15 00:11:25,dest_port: [25],"dest_IP: ['108.66.255.199', '108.66.255.250', ...",src_IP: 7,['TCP'],3870,"['...R..', '.A.RS.', '.AP.SF']","[anomaly-spam:99.71%, background:0.29%]",['t1471219371_mse5.268787384033203_src19_dest4...


In [8]:
new_report  # abs 200 whole august week3 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,699,2016-08-15 02:41:53,2016-08-15 02:45:54,dest_IP: ['199.136.69.162'],"src_IP: ['42.219.157.16', '84.223.10.202']",dest_port: 118,['TCP'],119,"['.A...F', '.AP.SF']",[background:100.0%],['t1471228875_mse5.67778205871582_src18_dest56...
1,1006,2016-08-15 10:32:39,2016-08-15 10:38:06,src_IP: ['42.219.154.97'],dest_IP: ['176.131.55.229'],dest_port: 503,['TCP'],504,['.AP.SF'],[background:100.0%],['t1471257039_mse5.494534969329834_src21_dest5...
2,1876,2016-08-15 12:32:47,2016-08-15 12:42:08,"src_IP: ['42.219.154.128', '42.219.154.165']","dest_IP: ['100.251.87.141', '110.54.52.62', '1...",dest_port: 912,['TCP'],913,"['.AP.S.', '.AP.SF', '.APRS.']",[background:100.0%],['t1471264365_mse6.492796421051025_src21_dest3...
3,2111,2016-08-15 16:57:54,2016-08-15 17:14:38,src_IP: ['221.150.185.186'],dest_IP: ['42.219.155.190'],dest_port: 1,['TCP'],1026,"['.A....', '.AP.SF']",[background:100.0%],['t1471279976_mse7.274672985076904_src59_dest2...
4,450,2016-08-15 16:57:58,2016-08-15 17:12:07,src_IP: ['42.219.155.190'],dest_IP: ['221.150.185.186'],dest_port: 216,['TCP'],217,['.AP.SF'],[background:100.0%],['t1471279976_mse7.274672985076904_src20_dest5...
...,...,...,...,...,...,...,...,...,...,...,...
156,158,2016-08-21 22:55:07,2016-08-21 23:05:00,src_IP: ['42.219.153.35'],"dest_IP: ['136.103.139.20', '165.141.151.49', ...",dest_port: 79,['TCP'],80,"['.AP.S.', '.APRS.']",[background:100.0%],['t1471820100_mse12.416109085083008_src22_dest...
157,306,2016-08-21 23:05:06,2016-08-21 23:14:58,src_IP: ['42.219.158.161'],"dest_IP: ['100.15.118.131', '101.140.13.248', ...",dest_port: 152,['TCP'],153,"['.AP.S.', '.APRS.']",[background:100.0%],['t1471820700_mse13.54406452178955_src17_dest4...
158,6887980,2016-08-15 00:11:26,2016-08-21 23:51:57,"dest_IP: ['42.219.156.180', '42.219.156.181', ...","src_IP: ['108.66.255.194', '108.66.255.199', '...",dest_port: 28233,['TCP'],28234,"['....S.', '...R..', '.A....', '.A...F', '.A.....","[anomaly-spam:83.77%, background:16.23%]",['t1471279976_mse7.274672985076904_src45_dest1...
159,3776882,2016-08-15 02:13:32,2016-08-22 00:04:15,dest_port: [25],"dest_IP: ['108.66.255.194', '108.66.255.199', ...",src_IP: 32386,['TCP'],62144,"['....S.', '...R..', '...RS.', '.A....', '.A.....","[anomaly-spam:66.57%, background:33.43%]",['t1471800130_mse59.31736755371094_src19_dest2...


In [15]:
new_report  # abs 200 whole august week4 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,12145,2016-08-22 10:24:16,2016-08-22 11:06:51,src_IP: ['42.219.158.160'],dest_IP: ['217.240.120.194'],dest_port: 5026,['TCP'],5027,['.AP.SF'],[background:100.0%],['t1471861455_mse5.429193019866943_src17_dest5...
1,2224,2016-08-22 12:22:20,2016-08-22 12:48:48,src_IP: ['42.219.154.97'],dest_IP: ['150.248.191.98'],dest_port: 869,['TCP'],870,['.AP.SF'],[background:100.0%],['t1471869255_mse18.130054473876953_src21_dest...
2,698,2016-08-22 14:04:16,2016-08-22 14:08:49,src_IP: ['42.219.158.188'],dest_IP: ['196.59.32.51'],dest_port: 349,['TCP'],350,['.AP.SF'],[blacklist:100.0%],['t1471874331_mse18.4974308013916_src17_dest56...
3,1178,2016-08-22 14:08:32,2016-08-22 14:19:15,src_IP: ['42.219.155.106'],dest_IP: ['199.92.56.136'],dest_port: 580,['TCP'],581,['.AP.SF'],[background:100.0%],['t1471874931_mse21.966402053833008_src20_dest...
4,3324,2016-08-22 13:44:19,2016-08-22 14:58:50,src_IP: ['42.219.155.103'],dest_IP: ['152.85.201.226'],dest_port: 1096,['TCP'],1097,"['.A..SF', '.AP...', '.AP.SF']",[background:100.0%],['t1471873131_mse19.472702026367188_src20_dest...
...,...,...,...,...,...,...,...,...,...,...,...
210,5281,2016-08-28 21:41:48,2016-08-28 22:29:53,dest_IP: ['177.235.191.18'],dest_port: [25],src_IP: 2,['TCP'],2524,"['.AP.SF', '.APRS.']",[background:100.0%],['t1472422279_mse15.049588203430176_src19_dest...
211,323036,2016-08-26 19:23:23,2016-08-28 22:36:58,"dest_port: [25, 35679, 46752]","src_IP: ['42.219.145.18', '42.219.152.126', '4...",dest_IP: 18,['TCP'],28140,"['....S.', '...R..', '.A..S.', '.A.R..', '.A.R...",[background:100.0%],['t1472240637_mse14.888297080993652_src19_dest...
212,242,2016-08-28 22:04:40,2016-08-28 22:18:25,"src_IP: ['42.219.154.108', '42.219.154.165']","dest_IP: ['129.94.240.245', '168.61.9.41', '17...",dest_port: 120,['TCP'],121,"['.AP.S.', '.AP.SF', '.APRS.']",[background:100.0%],['t1472422279_mse15.049588203430176_src21_dest...
213,5763958,2016-08-22 10:24:16,2016-08-28 22:37:26,"dest_IP: ['42.219.156.176', '42.219.156.177', ...","src_IP: ['108.66.255.194', '108.66.255.199', '...",dest_port: 28233,['TCP'],28234,"['....S.', '...R..', '...RS.', '.A....', '.A.....","[anomaly-spam:73.26%, background:26.74%]",['t1472133597_mse5.7090840339660645_src56_dest...


In [16]:
new_report  # abs 200 whole august week5 port 25 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,11727,2016-08-29 00:32:22,2016-08-29 02:32:20,src_IP: ['255.220.127.141'],dest_IP: ['42.219.155.123'],dest_port: 1,['TCP'],5598,['.AP.SF'],[background:100.0%],['t1472432541_mse23.42551040649414_src63_dest2...
1,95830,2016-08-29 01:01:43,2016-08-29 02:21:12,"dest_IP: ['42.219.156.186', '42.219.156.187', ...","src_IP: ['177.235.191.17', '177.235.191.18', '...",dest_port: 22974,['TCP'],22975,"['.AP.SF', '.APRS.']",[background:100.0%],['t1472433741_mse8.93779468536377_src63_dest19...
2,6936,2016-08-29 00:32:31,2016-08-29 02:32:13,src_IP: ['42.219.155.123'],dest_IP: ['255.220.127.141'],dest_port: 3334,['TCP'],3335,['.AP.SF'],[background:100.0%],['t1472432541_mse23.42551040649414_src20_dest6...
3,10732,2016-08-29 01:13:15,2016-08-29 02:32:17,dest_port: [25],"dest_IP: ['194.192.119.38', '253.136.160.168',...",src_IP: 6,['TCP'],4583,"['....S.', '...R..', '.AP..F', '.AP.S.', '.AP....",[background:100.0%],['t1472434941_mse12.951377868652344_src52_dest...
4,2304,2016-08-29 01:45:02,2016-08-29 01:49:32,src_IP: ['42.219.158.190'],dest_IP: ['165.33.158.134'],dest_port: 755,['TCP'],756,"['....S.', '.AP..F', '.AP.SF']",[background:100.0%],['t1472434941_mse12.951377868652344_src17_dest...
5,139837,2016-08-29 01:02:21,2016-08-29 02:32:17,dest_port: [25],"src_IP: ['165.33.158.134', '42.219.156.186', '...",dest_IP: 19,['TCP'],26012,"['....S.', '...R..', '.A...F', '.A..S.', '.A.R...",[background:100.0%],['t1472432541_mse23.42551040649414_src19_dest6...
6,9669,2016-08-29 08:08:31,2016-08-29 08:56:08,dest_port: [25],"src_IP: ['42.219.156.194', '42.219.156.196', '...",dest_IP: 5,['TCP'],4445,"['.AP.SF', '.APRS.']",[background:100.0%],['t1472459169_mse5.782985687255859_src19_dest4...
7,2804,2016-08-29 08:12:04,2016-08-29 08:34:46,"src_IP: ['204.97.176.27', '204.97.68.212']","dest_IP: ['42.219.156.194', '42.219.156.198', ...",dest_port: 1348,['TCP'],1349,"['.AP.SF', '.APRS.']",[background:100.0%],['t1472459169_mse5.782985687255859_src2_dest19...
8,1394,2016-08-29 08:24:52,2016-08-29 08:52:27,"src_IP: ['85.194.210.36', '85.194.79.216', '85...","dest_IP: ['42.219.156.196', '42.219.156.198', ...",dest_port: 686,['TCP'],687,"['.AP.SF', '.APRS.']",[background:100.0%],['t1472459169_mse5.782985687255859_src42_dest1...
9,2292,2016-08-29 08:30:30,2016-08-29 08:55:46,src_IP: ['42.219.158.179'],dest_IP: ['92.23.46.32'],dest_port: 522,['TCP'],523,"['.A...F', '.A..SF', '.AP...', '.AP.S.']",[background:100.0%],['t1472459169_mse5.782985687255859_src17_dest4...


In [8]:
new_report  # abs 10 whole july week5 port 6667 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,34,2016-07-31 06:50:10,2016-07-31 06:52:55,src_IP: ['42.219.150.242'],dest_IP: ['42.219.154.69'],dest_port: 1,['TCP'],34,['....S.'],[scan44:100.0%],['t1469947570_mse0.16283731162548065_src25_des...
1,400,2016-07-28 01:09:29,2016-07-31 07:14:44,"src_IP: ['178.125.252.68', '178.151.76.180', '...","dest_port: [2074, 2349, 2719, 2827]",dest_IP: 20,['TCP'],"[2074, 2349, 2719, 2827, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1469668108_mse0.3264482021331787_src54_dest...
2,480,2016-07-28 01:10:17,2016-07-31 07:15:01,"src_IP: ['218.94.106.74', '221.130.69.202']","dest_port: [1632, 2044, 2417, 2522, 2855]",dest_IP: 20,['TCP'],"[1632, 2044, 2417, 2522, 2855, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1469668108_mse0.3264482021331787_src59_dest...
3,520,2016-07-28 01:24:28,2016-07-31 07:25:51,"src_IP: ['61.167.116.133', '61.17.216.4']","dest_port: [1486, 3606, 4370, 4418]",dest_IP: 20,['TCP'],"[1486, 3606, 4370, 4418, 6667]",['.AP.SF'],[nerisbotnet:100.0%],['t1469668708_mse0.2660191059112549_src39_dest...
4,15274,2016-07-28 01:00:09,2016-07-31 07:52:58,dest_port: [6667],"src_IP: ['42.219.150.242', '42.219.152.18', '4...",dest_IP: 157,['TCP'],273,"['....S.', '.A.R..', '.AP.SF']","[nerisbotnet:99.52%, scan44:0.48%]",['t1469853994_mse0.1643877923488617_src25_dest...
5,574,2016-07-28 00:49:52,2016-07-31 19:08:13,dest_IP: ['42.219.150.247'],"src_IP: ['42.219.156.30', '42.219.158.16']",dest_port: 285,['TCP'],286,"['....S.', '...RS.', '.A..S.', '.A.R..']","[scan44:99.65%, dos:0.35%]",['t1469666908_mse0.166036918759346_src19_dest2...
6,576,2016-07-28 00:49:52,2016-07-31 19:08:13,src_IP: ['42.219.150.247'],dest_IP: ['42.219.156.30'],dest_port: 1,['TCP'],287,"['....S.', '.A.R..']",[scan44:100.0%],['t1469666908_mse0.166036918759346_src25_dest1...
7,308,2016-07-28 00:49:54,2016-07-31 19:08:13,src_IP: ['42.219.150.243'],dest_IP: ['42.219.152.20'],dest_port: 1,['TCP'],302,['....S.'],[scan44:100.0%],['t1469666908_mse0.166036918759346_src25_dest2...
8,305,2016-07-28 00:49:55,2016-07-31 19:08:17,src_IP: ['42.219.150.246'],dest_IP: ['42.219.158.16'],dest_port: 1,['TCP'],304,['....S.'],[scan44:100.0%],['t1469666908_mse0.166036918759346_src25_dest1...
9,631,2016-07-28 00:49:54,2016-07-31 19:12:50,dest_port: [6667],"dest_IP: ['218.248.255.243', '218.57.210.214',...",src_IP: 22,['TCP'],248,"['....S.', '.A.R..']","[nerisbotnet:63.39%, scan44:30.9%, scan11:5.71%]",['t1469950276_mse0.25693637132644653_src19_des...


In [57]:
new_report  # abs 10 whole july week5 port 6667 10 min 50% threshold

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,74,2016-07-30 04:49:51,2016-07-30 16:26:50,src_IP: ['42.219.150.242'],dest_IP: ['42.219.154.69'],dest_port: 1,['TCP'],74,['....S.'],[scan44:100.0%],['t1469853994_mse0.1643877923488617_src25_dest...
1,34,2016-07-31 06:50:10,2016-07-31 06:52:55,src_IP: ['42.219.150.242'],dest_IP: ['42.219.154.69'],dest_port: 1,['TCP'],34,['....S.'],[scan44:100.0%],['t1469947570_mse0.16283731162548065_src25_des...
2,574,2016-07-28 00:49:52,2016-07-31 19:08:13,dest_IP: ['42.219.150.247'],"src_IP: ['42.219.156.30', '42.219.158.16']",dest_port: 285,['TCP'],286,"['....S.', '...RS.', '.A..S.', '.A.R..']","[scan44:99.65%, dos:0.35%]",['t1469666908_mse0.166036918759346_src19_dest2...
3,576,2016-07-28 00:49:52,2016-07-31 19:08:13,src_IP: ['42.219.150.247'],dest_IP: ['42.219.156.30'],dest_port: 1,['TCP'],287,"['....S.', '.A.R..']",[scan44:100.0%],['t1469666908_mse0.166036918759346_src25_dest1...
4,308,2016-07-28 00:49:54,2016-07-31 19:08:13,src_IP: ['42.219.150.243'],dest_IP: ['42.219.152.20'],dest_port: 1,['TCP'],302,['....S.'],[scan44:100.0%],['t1469666908_mse0.166036918759346_src25_dest2...
5,305,2016-07-28 00:49:55,2016-07-31 19:08:17,src_IP: ['42.219.150.246'],dest_IP: ['42.219.158.16'],dest_port: 1,['TCP'],304,['....S.'],[scan44:100.0%],['t1469666908_mse0.166036918759346_src25_dest1...
6,231,2016-07-28 00:49:54,2016-07-31 19:12:50,dest_IP: ['42.219.154.69'],dest_port: [6667],src_IP: 2,['TCP'],229,['....S.'],"[scan44:84.42%, scan11:15.58%]",['t1469666908_mse0.166036918759346_src25_dest2...


In [8]:
new_report  # abs 10 whole august week1 port 6667 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,360,2016-08-01 09:25:44,2016-08-03 13:31:53,"src_IP: ['117.204.73.246', '117.207.5.248', '1...","dest_port: [1454, 3217, 4006]",dest_IP: 20,['TCP'],"[1454, 3217, 4006, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1470043487_mse0.3203742504119873_src46_dest...
1,282,2016-08-04 11:51:03,2016-08-04 11:57:05,src_IP: ['37.205.133.177'],"dest_IP: ['42.219.144.136', '42.219.144.149', ...",dest_port: 118,['TCP'],"[6667, 9564, 10911, 31612, 31878, 44771]","['...R..', '.A..S.']",[background:100.0%],['t1470311115_mse0.11019156873226166_src36_des...
2,120,2016-08-04 15:09:54,2016-08-04 15:18:52,"src_IP: ['91.214.134.197', '93.178.250.214', '...","dest_port: [2390, 2985, 3249]",dest_IP: 20,['TCP'],"[2390, 2985, 3249, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1470323361_mse0.32141339778900146_src43_des...
3,360,2016-08-02 11:04:23,2016-08-05 17:11:05,"src_IP: ['209.127.193.138', '212.12.200.158', ...","dest_port: [1899, 2127, 2494]",dest_IP: 20,['TCP'],"[1899, 2127, 2494, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1470135837_mse0.3152831792831421_src58_dest...
4,640,2016-08-01 09:06:18,2016-08-06 19:14:44,"src_IP: ['178.125.252.68', '178.151.76.180', '...","dest_port: [2074, 2349, 2719, 2827]",dest_IP: 20,['TCP'],"[2074, 2349, 2719, 2827, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1470042287_mse0.32442039251327515_src54_des...
5,720,2016-08-01 09:06:03,2016-08-06 19:15:01,src_IP: ['221.130.69.202'],"dest_port: [2044, 2417, 2522, 2855]",dest_IP: 20,['TCP'],"[2044, 2417, 2522, 2855, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1470042287_mse0.32442039251327515_src59_des...
6,680,2016-08-01 09:24:51,2016-08-06 19:25:51,"src_IP: ['61.167.116.133', '61.17.216.4']","dest_port: [1486, 3606, 4370, 4418]",dest_IP: 20,['TCP'],"[1486, 3606, 4370, 4418, 6667]",['.AP.SF'],[nerisbotnet:100.0%],['t1470043487_mse0.3203742504119873_src39_dest...
7,120,2016-08-06 19:28:20,2016-08-06 19:37:31,"src_IP: ['117.207.5.248', '118.96.124.137', '1...","dest_port: [3217, 3892, 4006]",dest_IP: 20,['TCP'],"[3217, 3892, 4006, 6667]","['....S.', '.A.R..']",[nerisbotnet:100.0%],['t1470511679_mse0.2903989851474762_src46_dest...
8,23820,2016-08-01 09:00:09,2016-08-06 19:39:09,dest_port: [6667],"src_IP: ['42.219.152.18', '42.219.152.20', '42...",dest_IP: 175,['TCP'],218,"['....S.', '.A.R..', '.A.R.F', '.AP.SF']",[nerisbotnet:100.0%],['t1470042287_mse0.32442039251327515_src17_des...
9,2274,2016-08-01 08:49:44,2016-08-07 06:42:05,dest_port: [6667],"dest_IP: ['178.150.146.54', '183.83.108.230', ...",src_IP: 24,['TCP'],1591,"['....S.', '...RS.', '.A..S.', '.A.R..']","[scan44:92.08%, nerisbotnet:7.92%]",['t1470511679_mse0.2903989851474762_src19_dest...


In [8]:
new_report  # abs 10 whole august week2 port 6667 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,366,2016-08-08 08:24:07,2016-08-08 22:52:38,dest_port: [6667],"src_IP: ['42.219.150.242', '42.219.150.243', '...",dest_IP: 4,['TCP'],294,"['....S.', '.A.R..']",[scan44:100.0%],['t1470644586_mse0.1591031700372696_src25_dest...
1,68,2016-08-09 11:09:02,2016-08-09 11:11:50,src_IP: ['42.219.154.69'],"dest_IP: ['42.219.150.241', '42.219.150.246']",dest_port: 34,['TCP'],35,"['...RS.', '.A..S.']","[scan11:97.06%, dos:2.94%]",['t1470740942_mse0.12922777235507965_src21_des...
2,252,2016-08-08 08:24:07,2016-08-09 11:18:59,"src_IP: ['42.219.154.69', '42.219.156.30']","dest_IP: ['42.219.150.246', '42.219.150.247']",dest_port: 126,['TCP'],127,"['....S.', '...RS.', '.A..S.', '.A.R..']","[scan44:73.81%, scan11:26.19%]",['t1470644586_mse0.1591031700372696_src19_dest...
3,72,2016-08-09 11:17:15,2016-08-09 11:20:16,src_IP: ['42.219.150.247'],dest_IP: ['42.219.156.30'],dest_port: 1,['TCP'],37,"['....S.', '.A.R..']",[scan44:100.0%],['t1470740942_mse0.12922777235507965_src25_des...
4,38,2016-08-09 11:17:16,2016-08-09 11:20:23,src_IP: ['42.219.150.243'],dest_IP: ['42.219.152.20'],dest_port: 1,['TCP'],38,['....S.'],[scan44:100.0%],['t1470740942_mse0.12922777235507965_src25_des...
5,107,2016-08-09 11:08:46,2016-08-09 11:20:20,dest_IP: ['42.219.154.69'],dest_port: [6667],src_IP: 2,['TCP'],71,"['....S.', '...RS.', '.A..S.']","[scan11:67.29%, scan44:32.71%]",['t1470740942_mse0.12922777235507965_src25_des...
6,35,2016-08-09 11:17:23,2016-08-09 11:20:18,src_IP: ['42.219.150.246'],dest_IP: ['42.219.158.16'],dest_port: 1,['TCP'],35,['....S.'],[scan44:100.0%],['t1470740942_mse0.12922777235507965_src25_des...
7,220,2016-08-13 09:32:34,2016-08-13 09:40:20,src_IP: ['240.49.164.230'],"dest_IP: ['42.219.144.104', '42.219.144.12', '...",dest_port: 217,['TCP'],[6667],['...R..'],[background:100.0%],['t1471080623_mse0.11410357058048248_src62_des...
8,239,2016-08-13 09:40:23,2016-08-13 09:50:21,src_IP: ['240.49.164.230'],"dest_IP: ['42.219.144.105', '42.219.144.124', ...",dest_port: 239,['TCP'],[6667],['...R..'],[background:100.0%],['t1471081223_mse0.12063635885715485_src62_des...
9,204,2016-08-13 09:50:30,2016-08-13 10:00:22,src_IP: ['240.49.164.230'],"dest_IP: ['42.219.144.117', '42.219.144.133', ...",dest_port: 204,['TCP'],[6667],['...R..'],[background:100.0%],['t1471081823_mse0.10206861048936844_src62_des...


In [8]:
new_report  # abs 10 whole august week4 port 6667 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,294,2016-08-27 20:30:19,2016-08-27 20:33:55,src_IP: ['216.199.110.239'],"dest_IP: ['42.219.144.0', '42.219.144.1', '42....",dest_port: 288,['TCP'],"[6667, 9927, 11119, 15130, 24300, 25173, 59348]","['...R..', '.A..S.']",[background:100.0%],['t1472329594_mse0.1447688639163971_src59_dest...


In [8]:
new_report  # abs 10 whole august week5 port 6667 10 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,521,2016-08-29 08:32:33,2016-08-29 09:26:21,src_IP: ['93.179.86.89'],dest_port: [6667],dest_IP: 510,['TCP'],"[6667, 51522]","['....S.', '.A.R..']",[background:100.0%],['t1472459182_mse0.20239125192165375_src43_des...


In [8]:
new_report  # abs 1000 whole july week5 port 53 1 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,112,2016-07-29 11:48:00,2016-07-29 12:38:02,src_IP: ['132.5.60.62'],"dest_IP: ['42.219.156.191', '42.219.156.198', ...",dest_port: 56,['UDP'],57,['.A....'],[background:100.0%],['t1469792823_mse5.101457595825195_src0_dest19...
1,4967,2016-07-29 10:05:03,2016-07-29 15:47:59,dest_IP: ['42.219.157.222'],"src_IP: ['61.0.246.175', '63.128.22.194']",dest_port: 2407,['UDP'],2398,['.A....'],[background:100.0%],['t1469786703_mse4.758784770965576_src39_dest1...
2,2348,2016-07-30 01:10:00,2016-07-30 01:10:05,src_IP: ['42.219.158.190'],"dest_IP: ['143.64.217.64', '143.64.217.65', '1...",dest_port: 1168,['UDP'],1169,['.A....'],[background:100.0%],['t1469840980_mse4.730398654937744_src17_dest4...
3,25803,2016-07-27 17:51:42,2016-07-31 05:12:26,src_IP: ['176.42.238.201'],dest_port: [53],dest_IP: 2,['UDP'],13866,['.A....'],[background:100.0%],['t1469641902_mse6.211724758148193_src56_dest2...
4,3026,2016-07-31 05:12:27,2016-07-31 05:13:25,src_IP: ['176.42.238.201'],dest_IP: ['42.219.154.105'],dest_port: 1,['UDP'],2046,['.A....'],[background:100.0%],['t1469941947_mse4.975587844848633_src54_dest2...
5,150949,2016-07-28 01:51:17,2016-07-31 07:54:07,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.147.178', '42.219.152.18', '...",dest_port: 26374,['UDP'],26360,['.A....'],"[nerisbotnet:96.37%, background:3.63%, blackli...",['t1469951588_mse8.96233081817627_src0_dest19_...
6,2108,2016-07-31 07:54:08,2016-07-31 07:55:03,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.147.178', '42.219.152.21', '...",dest_port: 1045,['UDP'],1046,['.A....'],"[nerisbotnet:99.72%, background:0.28%]",['t1469951648_mse5.189189910888672_src0_dest21...
7,4006,2016-07-31 07:55:26,2016-07-31 07:56:00,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.154.101', '42.219.154.71', '...",dest_port: 1964,['UDP'],1965,['.A....'],"[nerisbotnet:99.75%, background:0.25%]",['t1469951708_mse5.286794662475586_src0_dest21...
8,5952,2016-07-31 07:56:18,2016-07-31 07:57:02,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.152.20', '42.219.152.22', '4...",dest_port: 2853,['UDP'],2854,['.A....'],"[nerisbotnet:99.93%, background:0.07%]",['t1469951768_mse6.879902362823486_src0_dest23...
9,4914,2016-07-31 11:37:57,2016-07-31 16:18:31,src_IP: ['42.219.158.161'],dest_IP: ['143.72.8.137'],dest_port: 1,['UDP'],2374,['.A....'],[background:100.0%],['t1469965077_mse6.778693675994873_src17_dest0...


In [8]:
new_report  # abs 1000 whole july week5 port 53 1 min with 50% threshold value

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,1504,2016-07-30 01:10:00,2016-07-30 01:10:05,src_IP: ['42.219.158.190'],"dest_IP: ['143.64.217.64', '143.64.217.65', '1...",dest_port: 750,['UDP'],751,['.A....'],[background:100.0%],['t1469840980_mse4.730398654937744_src17_dest4...
1,69188,2016-07-30 05:19:34,2016-07-30 19:22:05,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.147.178', '42.219.152.18', '...",dest_port: 20417,['UDP'],20398,['.A....'],"[nerisbotnet:94.51%, background:5.49%]",['t1469858128_mse5.916626453399658_src0_dest23...
2,22356,2016-07-31 05:10:27,2016-07-31 05:12:26,src_IP: ['176.42.238.201'],dest_IP: ['42.219.154.105'],dest_port: 1,['UDP'],12503,['.A....'],[background:100.0%],['t1469941827_mse14.282188415527344_src54_dest...
3,39825,2016-07-31 07:19:41,2016-07-31 07:20:19,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.152.18', '42.219.152.20', '4...",dest_port: 14408,['UDP'],14409,['.A....'],"[nerisbotnet:99.96%, background:0.04%]",['t1469949567_mse25.009124755859375_src0_dest1...
4,5265,2016-07-31 07:52:08,2016-07-31 07:53:07,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.154.117', '42.219.154.69', '...",dest_port: 2539,['UDP'],2539,['.A....'],"[nerisbotnet:99.81%, background:0.11%, blackli...",['t1469951528_mse6.7670793533325195_src0_dest2...
5,11710,2016-07-31 07:53:08,2016-07-31 07:54:07,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.152.21', '42.219.152.23', '4...",dest_port: 5368,['UDP'],5368,['.A....'],"[nerisbotnet:99.8%, background:0.17%, blacklis...",['t1469951588_mse8.96233081817627_src0_dest19_...
6,5952,2016-07-31 07:56:18,2016-07-31 07:57:02,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.152.20', '42.219.152.22', '4...",dest_port: 2853,['UDP'],2854,['.A....'],"[nerisbotnet:99.93%, background:0.07%]",['t1469951768_mse6.879902362823486_src0_dest23...
7,378339,2016-07-27 17:50:42,2016-07-31 08:50:07,dest_port: [53],"dest_IP: ['143.72.4.250', '143.72.8.137', '42....",src_IP: 94,"['TCP', 'UDP']",43009,"['.A....', '.AP.SF']","[nerisbotnet:53.35%, background:46.62%, blackl...",['t1469840980_mse4.730398654937744_src49_dest1...
8,6251,2016-07-31 08:53:08,2016-07-31 08:55:07,src_IP: ['42.219.145.114'],dest_IP: ['143.72.8.137'],dest_port: 1,['UDP'],2829,['.A....'],[background:100.0%],['t1469955188_mse4.903547763824463_src30_dest0...
9,2607,2016-07-31 08:55:08,2016-07-31 08:56:05,src_IP: ['42.219.145.114'],dest_IP: ['143.72.8.137'],dest_port: 1,['UDP'],1259,['.A....'],[background:100.0%],['t1469955308_mse4.154321670532227_src30_dest0...


In [38]:
new_report  # abs 1000 whole july week5 port 53 1 min with 50% threshold value new

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,1504,2016-07-30 01:10:00,2016-07-30 01:10:05,src_IP: ['42.219.158.190'],"dest_IP: ['143.64.217.64', '143.64.217.65', '1...",dest_port: 750,['UDP'],751,['.A....'],[background:100.0%],['t1469840980_mse4.730398654937744_src17_dest4...
1,75583,2016-07-30 05:03:12,2016-07-30 19:51:01,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.147.178', '42.219.152.18', '...",dest_port: 20417,"['TCP', 'UDP']",20418,"['.A....', '.AP.SF']","[nerisbotnet:86.59%, background:13.41%]",['t1469857888_mse11.930277824401855_src0_dest1...
2,42740,2016-07-31 04:44:59,2016-07-31 05:13:25,src_IP: ['176.42.238.201'],dest_IP: ['42.219.154.105'],dest_port: 1,['UDP'],18350,['.A....'],[background:100.0%],['t1469941827_mse14.282188415527344_src54_dest...
3,39829,2016-07-31 06:59:46,2016-07-31 07:20:19,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.152.18', '42.219.152.20', '4...",dest_port: 14408,['UDP'],14409,['.A....'],"[nerisbotnet:99.96%, background:0.04%]",['t1469949567_mse25.009124755859375_src0_dest1...
4,22943,2016-07-31 07:47:20,2016-07-31 08:13:11,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.152.20', '42.219.152.21', '4...",dest_port: 9519,['UDP'],9520,['.A....'],"[nerisbotnet:99.79%, background:0.17%, blackli...",['t1469951588_mse8.96233081817627_src0_dest19_...
5,1653057,2016-07-27 17:50:30,2016-07-31 09:10:10,dest_port: [53],"dest_IP: ['143.72.4.250', '143.72.8.137', '42....",src_IP: 94,"['TCP', 'UDP']",58549,"['....S.', '.A....', '.AP..F', '.AP.SF']","[background:86.21%, nerisbotnet:13.73%, blackl...",['t1469840980_mse4.730398654937744_src49_dest1...
6,65951,2016-07-31 08:23:08,2016-07-31 09:20:48,src_IP: ['42.219.145.114'],dest_IP: ['143.72.8.137'],dest_port: 1,['UDP'],10756,['.A....'],[background:100.0%],['t1469955188_mse4.903547763824463_src30_dest0...
7,61754,2016-07-31 08:25:12,2016-07-31 09:20:48,src_IP: ['42.219.145.114'],dest_IP: ['143.72.8.137'],dest_port: 1,['UDP'],10638,['.A....'],[background:100.0%],['t1469955308_mse4.154321670532227_src30_dest0...
8,243746,2016-07-31 20:40:48,2016-07-31 21:20:51,src_IP: ['201.111.38.45'],dest_IP: ['42.219.153.191'],dest_port: 1,['UDP'],47689,['.A....'],[background:100.0%],['t1469997653_mse8.891632080078125_src57_dest2...
9,102780,2016-07-31 21:00:03,2016-07-31 21:20:51,src_IP: ['201.111.38.45'],dest_IP: ['42.219.153.191'],dest_port: 1,['UDP'],33161,['.A....'],[background:100.0%],['t1469999883_mse5.135331153869629_src57_dest2...


In [8]:
new_report  # abs 1000 whole August week1 port 53 1 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,3202,2016-08-01 04:07:28,2016-08-01 04:08:27,src_IP: ['42.219.154.105'],"dest_IP: ['176.42.236.233', '176.42.238.201']",dest_port: 1557,['UDP'],1519,['.A....'],[background:100.0%],['t1470024448_mse9.689338684082031_src21_dest5...
1,23261,2016-08-01 22:43:04,2016-08-01 22:44:02,dest_IP: ['42.219.152.249'],"dest_port: [53, 60038, 61498]",src_IP: 73,['UDP'],19647,['.A....'],[background:100.0%],['t1470091383_mse20.65939712524414_src37_dest2...
2,2979,2016-08-02 16:28:22,2016-08-02 16:29:21,src_IP: ['42.219.154.149'],dest_IP: ['176.42.238.201'],dest_port: 1485,['UDP'],1469,['.A....'],[background:100.0%],['t1470155302_mse8.011963844299316_src21_dest5...
3,15463,2016-08-01 15:24:33,2016-08-03 07:46:37,dest_IP: ['42.219.157.222'],"src_IP: ['143.72.4.250', '150.142.71.64', '61....",dest_port: 7078,['UDP'],7078,['.A....'],[background:100.0%],['t1470065072_mse4.587278842926025_src39_dest1...
4,19613,2016-08-01 04:07:28,2016-08-04 05:00:40,"src_IP: ['42.219.153.62', '42.219.154.135']","dest_IP: ['176.245.153.73', '176.245.47.31', '...",dest_port: 7660,['UDP'],7658,['.A....'],[background:100.0%],['t1470286781_mse10.524224281311035_src22_dest...
5,3004,2016-08-04 06:04:41,2016-08-04 06:05:40,src_IP: ['42.219.154.116'],"dest_IP: ['176.42.236.233', '176.42.238.201']",dest_port: 1393,['UDP'],1386,['.A....'],[background:100.0%],['t1470290681_mse8.847150802612305_src21_dest5...
6,4267,2016-08-04 06:04:41,2016-08-04 06:05:40,src_IP: ['42.219.153.62'],"dest_IP: ['176.42.236.233', '176.42.238.201', ...",dest_port: 1989,['UDP'],1989,['.A....'],[background:100.0%],['t1470290681_mse8.847150802612305_src22_dest5...
7,10029,2016-08-04 16:11:35,2016-08-04 16:12:32,dest_IP: ['42.219.152.249'],"dest_port: [53, 59985, 60015, 60398, 60405, 60...",src_IP: 85,['UDP'],4763,['.A....'],[background:100.0%],['t1470327094_mse12.168952941894531_src2_dest2...
8,3802,2016-08-04 16:11:37,2016-08-04 16:12:19,src_IP: ['42.219.159.194'],"dest_IP: ['195.226.116.53', '204.97.142.192', ...",dest_port: 1853,['UDP'],1854,['.A....'],[background:100.0%],['t1470327094_mse12.168952941894531_src16_dest...
9,6324,2016-08-04 16:11:40,2016-08-04 16:12:32,src_IP: ['42.219.152.249'],"dest_IP: ['139.13.255.124', '183.168.95.85', '...",dest_port: 3045,['UDP'],3046,['.A....'],[background:100.0%],['t1470327094_mse12.168952941894531_src23_dest...


In [27]:
new_report  # abs 1000 whole August week2 port 53 1 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,5184,2016-08-10 05:47:58,2016-08-10 05:48:14,src_IP: ['42.219.158.190'],"dest_IP: ['234.191.30.213', '235.30.109.208', ...",dest_port: 2545,['UDP'],2546,['.A....'],[background:100.0%],['t1470808065_mse8.333586692810059_src17_dest3...
1,3186,2016-08-11 15:51:46,2016-08-11 15:52:45,src_IP: ['43.214.118.41'],dest_IP: ['42.219.154.105'],dest_port: 1,['UDP'],2134,['.A....'],[background:100.0%],['t1470930706_mse6.916683673858643_src37_dest2...
2,4888,2016-08-11 15:51:46,2016-08-11 15:52:45,dest_IP: ['43.214.118.41'],"src_IP: ['42.219.153.62', '42.219.154.105']",dest_port: 2374,['UDP'],2374,['.A....'],[background:100.0%],['t1470930706_mse6.916683673858643_src21_dest3...
3,17014,2016-08-08 07:38:52,2016-08-12 14:22:49,dest_IP: ['42.219.157.222'],"src_IP: ['61.0.246.175', '63.128.22.194']",dest_port: 7633,['UDP'],7605,['.A....'],[background:100.0%],['t1470641891_mse4.012929439544678_src39_dest1...
4,2170,2016-08-13 17:02:01,2016-08-13 17:03:00,"src_IP: ['42.219.154.149', '42.219.154.55', '4...","dest_IP: ['43.214.118.54', '43.214.118.60', '4...",dest_port: 1073,['UDP'],1074,['.A....'],[background:100.0%],['t1471107721_mse7.375539302825928_src21_dest3...
5,5206,2016-08-13 17:01:01,2016-08-13 17:03:00,"dest_port: [53, 48186]","dest_IP: ['42.219.154.149', '42.219.154.55', '...",src_IP: 11,['UDP'],3125,['.A....'],[background:100.0%],['t1471107661_mse5.583077430725098_src37_dest2...
6,1032,2016-08-13 17:39:02,2016-08-13 17:40:00,src_IP: ['42.219.157.222'],dest_IP: ['63.128.22.194'],dest_port: 1,['UDP'],491,['.A....'],[background:100.0%],['t1471109941_mse5.648496150970459_src18_dest3...
7,5598,2016-08-13 15:37:08,2016-08-13 18:50:12,dest_IP: ['42.219.157.12'],"src_IP: ['60.11.148.225', '60.43.80.161', '60....",dest_port: 1702,['UDP'],1703,['.A....'],[background:100.0%],['t1471102621_mse5.000566005706787_src39_dest1...
8,433,2016-08-13 20:00:21,2016-08-13 20:01:03,src_IP: ['42.219.157.12'],dest_IP: ['60.44.175.17'],dest_port: 1,['UDP'],214,['.A....'],[background:100.0%],['t1471118420_mse5.2807722091674805_src18_dest...
9,433,2016-08-13 20:00:21,2016-08-13 20:01:03,src_IP: ['60.44.175.17'],dest_IP: ['42.219.157.12'],dest_port: 213,['UDP'],214,['.A....'],[background:100.0%],['t1471118420_mse5.2807722091674805_src39_dest...


In [16]:
new_report  # abs 1000 whole August week3 port 53 1 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,15636,2016-08-15 00:26:16,2016-08-15 00:29:15,"src_IP: ['42.219.153.62', '42.219.154.149']","dest_IP: ['212.200.69.148', '212.200.70.162', ...",dest_port: 6935,['UDP'],6934,['.A....'],[background:100.0%],['t1471220776_mse8.511595726013184_src21_dest3...
1,26152,2016-08-15 23:48:47,2016-08-15 23:55:46,src_IP: ['42.219.153.191'],dest_IP: ['36.255.131.32'],dest_port: 11656,['UDP'],11657,['.A....'],[background:100.0%],['t1471304927_mse4.278139114379883_src22_dest3...
2,2467,2016-08-18 17:38:18,2016-08-18 17:38:52,src_IP: ['85.229.37.234'],dest_IP: ['42.219.152.249'],dest_port: 1140,['UDP'],1141,['.A....'],[background:100.0%],['t1471541886_mse4.882055759429932_src42_dest2...
3,4012,2016-08-19 01:46:48,2016-08-19 01:47:21,src_IP: ['42.219.158.190'],"dest_IP: ['234.191.30.213', '235.30.109.208', ...",dest_port: 1971,['UDP'],1972,['.A....'],[background:100.0%],['t1471571194_mse6.933218955993652_src17_dest3...
4,5128,2016-08-19 01:46:51,2016-08-19 01:47:28,src_IP: ['42.219.153.7'],"dest_IP: ['204.97.95.72', '234.191.30.213', '2...",dest_port: 2521,['UDP'],2521,['.A....'],[background:100.0%],['t1471571194_mse6.933218955993652_src22_dest3...
5,15051,2016-08-16 06:46:36,2016-08-19 06:48:48,dest_IP: ['42.219.157.222'],"src_IP: ['143.72.4.250', '224.152.255.187', '2...",dest_port: 6966,['UDP'],6957,['.A....'],[background:100.0%],['t1471329996_mse4.132896423339844_src39_dest1...
6,86456,2016-08-16 14:02:33,2016-08-21 03:45:11,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.145.114', '42.219.147.137', ...",dest_port: 22481,['UDP'],22466,['.A....'],"[background:91.65%, blacklist:8.35%]",['t1471676011_mse5.555728435516357_src0_dest21...
7,1820,2016-08-21 14:19:16,2016-08-21 14:20:13,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.153.89', '42.219.154.100', '...",dest_port: 891,['UDP'],892,['.A....'],[background:100.0%],['t1471789156_mse7.381007671356201_src0_dest21...
8,5630,2016-08-21 22:31:00,2016-08-21 22:33:59,src_IP: ['42.219.158.161'],dest_IP: ['143.72.8.137'],dest_port: 1,['UDP'],2722,['.A....'],[background:100.0%],['t1471818660_mse10.604832649230957_src17_dest...
9,2830,2016-08-21 22:48:00,2016-08-21 22:48:59,src_IP: ['42.219.158.161'],dest_IP: ['143.72.8.137'],dest_port: 1,['UDP'],1398,['.A....'],[background:100.0%],['t1471819680_mse10.457809448242188_src17_dest...


In [23]:
new_report  # abs 1000 whole August week4 port 53 1 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,17013,2016-08-23 09:16:14,2016-08-24 19:40:19,dest_IP: ['42.219.154.149'],"dest_port: [53, 35516, 53340]",src_IP: 18,['UDP'],10356,['.A....'],[background:100.0%],['t1471943774_mse5.122874736785889_src56_dest2...
1,2934,2016-08-24 21:52:29,2016-08-24 21:53:17,dest_IP: ['42.219.152.249'],"src_IP: ['52.114.87.45', '85.229.36.60', '85.2...",dest_port: 1193,['UDP'],1194,['.A....'],[background:100.0%],['t1472075549_mse5.412864685058594_src42_dest2...
2,1037,2016-08-24 22:18:57,2016-08-24 22:19:28,dest_port: [53],"src_IP: ['176.42.236.233', '176.42.238.201']",dest_IP: 2,['UDP'],1026,['.A....'],[background:100.0%],['t1472077109_mse4.071473598480225_src54_dest1...
3,160385,2016-08-22 04:07:57,2016-08-26 08:03:38,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.145.18', '42.219.145.19', '4...",dest_port: 26962,['UDP'],26952,['.A....'],"[background:99.99%, blacklist:0.01%]",['t1472131677_mse6.883640289306641_src0_dest21...
4,27703,2016-08-22 06:42:53,2016-08-26 12:58:27,dest_IP: ['42.219.157.222'],"src_IP: ['61.0.246.175', '63.128.22.194']",dest_port: 11932,['UDP'],11866,['.A....'],[background:100.0%],['t1471848172_mse4.7952423095703125_src39_dest...
5,1384,2016-08-26 15:44:19,2016-08-26 15:45:18,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.153.156', '42.219.153.89', '...",dest_port: 686,['UDP'],687,['.A....'],[background:100.0%],['t1472226259_mse5.438696384429932_src0_dest21...
6,1440,2016-08-26 15:48:19,2016-08-26 15:49:18,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.147.178', '42.219.153.156', ...",dest_port: 717,['UDP'],718,['.A....'],[background:100.0%],['t1472226499_mse4.739689350128174_src0_dest21...
7,1828,2016-08-26 15:49:19,2016-08-26 15:50:18,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.147.178', '42.219.154.100', ...",dest_port: 903,['UDP'],904,['.A....'],[background:100.0%],['t1472226559_mse4.358358383178711_src0_dest21...
8,1518,2016-08-26 15:51:19,2016-08-26 15:52:18,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.145.88', '42.219.153.89', '4...",dest_port: 752,['UDP'],751,['.A....'],[background:100.0%],['t1472226679_mse4.449367523193359_src0_dest21...
9,775,2016-08-26 16:07:30,2016-08-26 16:08:29,src_IP: ['143.72.8.137'],"dest_IP: ['42.219.147.178', '42.219.153.156', ...",dest_port: 376,['UDP'],372,['.A....'],[background:100.0%],['t1472227650_mse7.948718547821045_src0_dest19...


In [24]:
new_report  # abs 1000 whole August week5 port 53 1 min

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,12350,2016-08-29 01:25:21,2016-08-29 01:38:20,src_IP: ['42.219.156.247'],"dest_IP: ['202.252.114.232', '84.231.196.47', ...",dest_port: 5902,['UDP'],5903,['.A....'],[background:100.0%],['t1472433921_mse4.8107008934021_src19_dest42_...
1,20188,2016-08-29 02:02:38,2016-08-29 02:17:20,src_IP: ['42.219.156.247'],"dest_IP: ['200.244.165.98', '251.188.212.28', ...",dest_port: 9420,['UDP'],9421,['.A....'],[background:100.0%],['t1472436681_mse5.5250935554504395_src19_dest...
2,2988,2016-08-29 02:37:21,2016-08-29 02:38:20,src_IP: ['42.219.156.247'],dest_IP: ['196.71.41.147'],dest_port: 1479,['UDP'],1480,['.A....'],[background:100.0%],['t1472438241_mse4.720265865325928_src19_dest5...
3,2324,2016-08-29 02:39:21,2016-08-29 02:40:20,src_IP: ['42.219.156.247'],dest_IP: ['207.35.85.181'],dest_port: 1157,['UDP'],1158,['.A....'],[background:100.0%],['t1472438361_mse4.545443058013916_src19_dest5...
4,3100,2016-08-29 02:42:21,2016-08-29 02:43:19,src_IP: ['42.219.156.247'],dest_IP: ['243.157.88.89'],dest_port: 1538,['UDP'],1539,['.A....'],[background:100.0%],['t1472438541_mse5.491912364959717_src19_dest6...
5,5566,2016-08-29 02:46:25,2016-08-29 02:48:20,src_IP: ['42.219.156.247'],dest_IP: ['152.247.140.233'],dest_port: 2737,['UDP'],2738,['.A....'],[background:100.0%],['t1472438781_mse4.213651657104492_src19_dest5...
6,5600,2016-08-29 04:31:09,2016-08-29 04:33:08,src_IP: ['42.219.156.247'],dest_IP: ['203.248.224.138'],dest_port: 2756,['UDP'],2757,['.A....'],[background:100.0%],['t1472445069_mse4.517996788024902_src19_dest5...
7,6750,2016-08-29 05:27:09,2016-08-29 05:29:08,src_IP: ['42.219.156.247'],dest_IP: ['193.3.240.134'],dest_port: 3298,['UDP'],3299,['.A....'],[background:100.0%],['t1472448429_mse5.305490493774414_src19_dest5...
8,6344,2016-08-29 05:40:09,2016-08-29 05:42:08,src_IP: ['42.219.156.247'],dest_IP: ['206.132.29.139'],dest_port: 3108,['UDP'],3109,['.A....'],[background:100.0%],['t1472449209_mse5.769664287567139_src19_dest5...
9,5894,2016-08-29 05:48:09,2016-08-29 05:49:58,src_IP: ['42.219.156.247'],dest_IP: ['253.241.242.186'],dest_port: 2896,['UDP'],2897,['.A....'],[background:100.0%],['t1472449689_mse5.799064636230469_src19_dest6...


In [9]:
new_report_t = new_report.replace({',': ''}, regex=True)
new_report_t.to_csv("august_week1_port25_detect_abs1000_report.csv", index=False)

## Combine Report rows

In [58]:
anomaly_time_series_dir = "anomaly_time_series_july_week5_port6667_detect_abs10"
makedirs(f"{anomaly_time_series_dir}_merged", exist_ok=True)
i = 1
for file_str in new_report["combined_files"]:    
    merged_file = None
    print("working on row", i, "with", len(eval(file_str)), "in total")
    for file in eval(file_str):
        if merged_file is None:
#             merged_file = pd.read_csv(join(anomaly_time_series_dir, file)).set_index("Unnamed: 0")
            merged_file = pd.read_csv(join(anomaly_time_series_dir, file)).set_index("index")
            merged_file = merged_file.drop_duplicates()
        else:
#             df = pd.read_csv(join(anomaly_time_series_dir, file)).set_index("Unnamed: 0")
            df = pd.read_csv(join(anomaly_time_series_dir, file)).set_index("index")
            df = df.drop_duplicates()
            merged_file = merged_file.append(df[~df.index.isin(merged_file.index)])
    merged_file = merged_file.sort_index()
    merged_file.to_csv(join(f"{anomaly_time_series_dir}_merged", "row_" + str(i) + ".csv"))
    i += 1

working on row 1 with 2 in total
working on row 2 with 1 in total
working on row 3 with 8 in total
working on row 4 with 8 in total
working on row 5 with 8 in total
working on row 6 with 8 in total
working on row 7 with 5 in total


## June Week 3 Report

In [6]:
new_report

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,49830,2016-07-27 17:04:08,2016-07-27 18:37:47,dest_port: [25],"src_IP: ['42.219.145.18', '42.219.145.19', '42...",dest_IP: 210,['TCP'],23469,['.APRS.' '.A.RS.' '.APRSF' '.AP.SF' '.A..S.' ...,"[anomaly-spam:86.23%, background:13.44%, black...",[src19_dest56_port29.csv]
1,601677,2016-07-28 00:08:04,2016-07-30 06:11:47,dest_port: [80],"dest_IP: ['42.219.153.12', '42.219.153.15', '4...",src_IP: 15554,['TCP'],63549,['....S.' '.A...F' '.A.R..' '.APRSF' '.AP.SF' ...,"[background:100.0%, blacklist:0.0%]",[src25_dest22_port24.csv]
2,1274265,2016-07-28 00:08:04,2016-07-30 06:11:47,dest_IP: ['42.219.156.211'],dest_port: [80],src_IP: 3241,['TCP'],56571,['.AP.SF' '.APRS.' '.APRSF' '.AP.S.' '.AP...' ...,[background:100.0%],[src25_dest19_port24.csv]
3,1566836,2016-07-28 00:08:04,2016-07-31 19:56:33,dest_port: [80],"dest_IP: ['42.219.158.16', '42.219.158.160', '...",src_IP: 32954,['TCP'],64255,['....S.' '...R..' '...RS.' '.A....' '.A...F' ...,"[background:55.32%, dos:44.36%, blacklist:0.27...","['src25_dest17_port24.csv', 'src20_dest17_port..."
4,20509,2016-07-28 00:08:06,2016-07-31 19:56:29,dest_port: [80],"dest_IP: ['42.219.159.103', '42.219.159.104', ...",src_IP: 449,['TCP'],16094,['....S.' '...R..' '...RS.' '.A....' '.A...F' ...,[background:100.0%],"['src25_dest16_port24.csv', 'src20_dest16_port..."
5,569904,2016-07-28 00:09:52,2016-07-31 19:32:32,src_IP: ['42.219.152.20'],"dest_IP: ['42.219.150.242', '42.219.150.243']",dest_port: 19553,['ICMP' 'TCP'],[ 0 22 80],['.A....' '.A..S.' '.A.R..'],"[dos:99.99%, scan44:0.01%]","['src23_dest25_port6.csv', 'src23_dest25_port7..."
6,701122,2016-07-28 00:09:52,2016-07-31 19:32:32,"src_IP: ['42.219.158.16', '42.219.158.160', '4...","dest_IP: ['152.13.150.132', '152.138.227.72', ...",dest_port: 25056,['ICMP' 'TCP' 'UDP'],40,['...R..' '.A....' '.A...F' '.A..S.' '.A..SF' ...,"[dos:98.29%, background:1.7%, scan44:0.01%]","['src17_dest25_port6.csv', 'src17_dest25_port7..."
7,571876,2016-07-28 00:09:52,2016-07-31 19:32:32,dest_IP: ['42.219.152.20'],dest_port: [80],src_IP: 2,['TCP'],19568,['...RS.' '....S.' '.A....'],"[dos:99.99%, scan44:0.01%]",[src25_dest23_port24.csv]
8,343503,2016-07-28 00:09:53,2016-07-31 19:32:32,"src_IP: ['42.219.154.100', '42.219.154.101', '...","dest_IP: ['152.103.34.229', '152.105.244.214',...",dest_port: 46251,['ICMP' 'TCP' 'UDP'],1654,['....S.' '...R..' '.A....' '.A...F' '.A..S.' ...,"[dos:62.13%, background:29.12%, scan11:8.75%, ...","['src21_dest25_port15.csv', 'src21_dest25_port..."
9,288594,2016-07-28 00:49:52,2016-07-31 19:08:13,src_IP: ['42.219.156.30'],dest_IP: ['42.219.150.247'],dest_port: 570,['ICMP' 'TCP'],1001,['...R..' '.A....' '.A..S.' '.A.R..'],[scan44:100.0%],"['src19_dest25_port20.csv', 'src19_dest25_port..."


## June Week 4 Report

In [8]:
new_report

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,7039297,2016-06-20 00:07:06,2016-06-25 18:59:56,dest_port: [25],"src_IP: ['42.219.145.88', '42.219.152.126', '4...",dest_IP: 221,['TCP'],28363,['....S.' '...R..' '...RS.' '.A....' '.A..S.' ...,"[anomaly-spam:92.21%, background:7.78%, blackl...","['src19_dest56_port29.csv', 'src19_dest63_port..."
1,927220,2016-06-20 01:33:58,2016-06-26 13:29:19,dest_port: [25],"dest_IP: ['104.1.239.51', '104.130.105.35', '1...",src_IP: 72,['TCP'],28606,['....S.' '...R..' '.A....' '.A..S.' '.A..SF' ...,"[anomaly-spam:71.3%, background:28.69%, blackl...","['src19_dest45_port29.csv', 'src19_dest54_port..."
2,156064,2016-06-20 01:52:40,2016-06-26 13:06:13,"src_IP: ['108.66.255.194', '108.66.255.199', '...","dest_IP: ['42.219.156.180', '42.219.156.181', ...",dest_port: 28145,['TCP'],[25],['.A..S.' '.AP..F' '.AP.S.' '.AP.SF'],"[background:97.82%, anomaly-spam:2.18%]","['src45_dest19_port20.csv', 'src45_dest19_port..."
3,2990739,2016-06-20 01:52:41,2016-06-25 18:09:25,"dest_IP: ['42.219.156.180', '42.219.156.184', ...","src_IP: ['192.143.84.56', '192.143.84.60', '19...",dest_port: 28235,['TCP'],[ 25 34536 64396],['....S.' '...R..' '.A...F' '.A..S.' '.A..SF' ...,"[anomaly-spam:98.26%, background:1.74%]","['src56_dest19_port18.csv', 'src63_dest19_port..."
4,4261707,2016-06-20 17:03:28,2016-06-25 18:09:25,"src_IP: ['192.128.230.96', '192.143.84.56', '1...","dest_IP: ['42.219.144.100', '42.219.144.202', ...",dest_port: 28241,['TCP' 'UDP'],[ 25 1130 26459 31674 34680 40002 40016 577...,['....S.' '...R..' '.A....' '.A...F' '.A..S.' ...,"[anomaly-spam:99.48%, background:0.52%]","['src63_dest19_port21.csv', 'src56_dest21_port..."
5,26212,2016-06-21 19:52:46,2016-06-21 21:31:56,dest_port: [53],"dest_IP: ['42.219.155.100', '42.219.155.102', ...",src_IP: 9454,['TCP' 'UDP'],19914,['...RS.' '.A....'],"[background:99.99%, blacklist:0.01%]","['src36_dest20_port23.csv', 'src38_dest20_port..."
6,1971061,2016-06-22 10:53:42,2016-06-23 06:54:32,"src_IP: ['192.22.25.141', '194.231.140.151', '...","dest_port: [1000, 1001, 1002, 1003, 1004, 1005...",dest_IP: 4109,['ICMP' 'TCP' 'UDP'],152,['....S.' '...R..' '...RS.' '.A....' '.A..S.' ...,"[background:99.98%, blacklist:0.02%]","['src52_dest36_port6.csv', 'src58_dest29_port8..."
7,544618,2016-06-22 12:00:39,2016-06-23 04:08:49,src_IP: ['210.97.208.13'],"dest_port: [5000, 5001, 5002, 5003, 5004, 5005...",dest_IP: 4096,['UDP'],[5080 5093],['.A....'],"[background:99.97%, blacklist:0.03%]","['src58_dest21_port8.csv', 'src58_dest16_port8..."
8,14996,2016-06-23 01:45:36,2016-06-23 03:50:45,dest_IP: ['42.219.159.82'],dest_port: [8000],src_IP: 13,['TCP'],315,['.AP...' '.AP.S.' '.A...F' '.AP..F' '.AP.SF' ...,[background:100.0%],[src59_dest19_port11.csv]
9,549915,2016-06-23 05:59:24,2016-06-23 07:51:17,"dest_port: [6, 8, 9, 10, 11, 12, 13, 14, 15, 1...","src_IP: ['208.100.215.134', '208.104.108.241',...",dest_IP: 4096,['TCP' 'UDP'],51630,['....S.' '...R..' '...RS.' '.A....' '.A...F' ...,"[background:99.97%, blacklist:0.03%]","['src58_dest25_port0.csv', 'src58_dest26_port0..."


In [9]:
new_report.to_csv("june_week_4_first_three_oct_report.csv", index=False)

## July Week 5

In [13]:
new_report

Unnamed: 0,num_flow,start_time,end_time,lowest_entropy,2nd_lowest_entropy,highest_entropy,protocol,src_port,flags,label,combined_files
0,14877,2016-07-27 17:04:16,2016-07-27 17:37:17,dest_port: [25],"dest_IP: ['193.27.1.120', '193.27.6.165']",src_IP: 21,['TCP'],11667,['....S.' '.A.RS.' '.AP..F' '.AP.S.' '.AP.SF' ...,"[anomaly-spam:99.39%, background:0.61%]",['t1469653842_mse5.122787952423096_src19_dest5...
1,37686,2016-07-27 17:10:31,2016-07-27 17:16:15,"src_IP: ['193.27.1.120', '193.27.1.135', '193....","dest_IP: ['193.27.1.120', '193.27.1.135', '193...",dest_port: 14000,['TCP'],14000,['....S.' '.A..S.' '.A..SF' '.A.RS.' '.AP.S.' ...,[anomaly-spam:100.0%],['t1469653952_mse5.289257526397705_src19_dest5...
2,100226,2016-07-27 17:21:10,2016-07-31 05:42:30,"src_IP: ['176.206.157.1', '176.42.236.233', '1...","dest_IP: ['176.206.157.1', '176.42.236.233', '...",dest_port: 21209,['ICMP' 'TCP' 'UDP'],25107,['....S.' '.A....' '.A...F' '.A..SF' '.A.R..' ...,[background:100.0%],['t1469656262_mse6.292412757873535_src21_dest5...
3,340046,2016-07-27 18:51:24,2016-07-31 23:07:52,dest_port: [25],"src_IP: ['42.219.153.8', '42.219.156.178', '42...",dest_IP: 46,['TCP'],28233,['....S.' '...R..' '.A....' '.A..S.' '.A.R..' ...,[background:100.0%],['t1469659842_mse7.91537618637085_src19_dest45...
4,305927,2016-07-27 18:51:24,2016-07-31 23:14:46,"src_IP: ['108.66.255.194', '108.66.255.199', '...","dest_IP: ['108.66.255.194', '108.66.255.199', ...",dest_port: 28117,['TCP'],28119,['....S.' '...R..' '.A....' '.A..S.' '.A.R..' ...,[background:100.0%],['t1469659842_mse7.91537618637085_src19_dest45...
...,...,...,...,...,...,...,...,...,...,...,...
74,4,2016-07-31 15:47:27,2016-07-31 15:49:14,dest_IP: ['42.219.154.167'],dest_port: [53],src_IP: 3,['UDP'],[29364 65099 28771 50103],['.A....'],[background:100.0%],[t1469994387_mse6.667959690093994_src59_dest21...
75,255194,2016-07-31 20:10:22,2016-07-31 21:49:23,"dest_IP: ['42.219.153.191', '42.219.153.62']","dest_port: [53, 80]",src_IP: 11559,['TCP' 'UDP'],63553,['....S.' '...R..' '...RS.' '.A....' '.A...F' ...,"[background:100.0%, blacklist:0.0%]",['t1470012022_mse5.0016303062438965_src57_dest...
76,336991,2016-07-31 20:10:22,2016-07-31 21:49:23,dest_port: [53],"dest_IP: ['42.219.153.191', '42.219.153.26', '...",src_IP: 16812,['TCP' 'UDP'],64688,['.A....' '.AP.SF'],"[background:99.99%, blacklist:0.01%]",['t1470012022_mse5.0016303062438965_src57_dest...
77,244829,2016-07-31 20:10:33,2016-07-31 21:37:18,"src_IP: ['200.17.220.137', '201.111.38.45', '4...","dest_IP: ['200.17.220.137', '201.111.38.45', '...",dest_port: 47947,['TCP' 'UDP'],47954,['.A....' '.A...F' '.A..S.' '.A..SF' '.AP..F' ...,[background:100.0%],['t1470012022_mse5.0016303062438965_src22_dest...


In [10]:
new_report.to_csv("july_week_5_mse4_biIP_20_1hr_report.csv", index=False)

In [5]:
total_traffic = 0
anomaly_traffic = 0
scan_44 = 0
scan_11 = 0
dos = 0
anomaly_spam = 0
anomaly_udpscan = 0
anomaly_sshscan = 0
blacklist = 0
nerisbotnet = 0
i = 0
for df in pd.read_csv("july.week5.csv.uniqblacklistremoved", names=header, usecols=header, chunksize=10000000):
    total_traffic += df.shape[0]
    anomaly_traffic += df[df.label != "background"].shape[0]
    scan_44 += df[df.label == "scan44"].shape[0]
    scan_11 += df[df.label == "scan11"].shape[0]
    dos += df[df.label == "dos"].shape[0]
    anomaly_spam += df[df.label == "anomaly-spam"].shape[0]
    anomaly_udpscan += df[df.label == "anomaly-udpscan"].shape[0]
    anomaly_sshscan += df[df.label == "anomaly-sshscan"].shape[0]
    blacklist += df[df.label == "blacklist"].shape[0]
    nerisbotnet += df[df.label == "nerisbotnet"].shape[0]
    print(i)
    i += 1
print("Originally there are {} traffic with {}({}%) being anomaly traffic".format(total_traffic, anomaly_traffic, round(anomaly_traffic / float(total_traffic) * 100, 2)))
print("Scan 44 {}, Scan 11 {}, DOS {}, anomaly-udpscan {}, anomaly-sshscan {}, anomaly-spam {}, nerisbotnet {}, blacklist {}".format(scan_44, scan_11, dos, anomaly_udpscan, anomaly_sshscan, anomaly_spam, nerisbotnet, blacklist))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
Originally there are 539490894 traffic with 7515794(1.39%) being anomaly traffic
Scan 44 1496504, Scan 11 323120, DOS 3134728, anomaly-udpscan 0, anomaly-sshscan 10, anomaly-spam 503658, nerisbotnet 607036, blacklist 1450738


In [3]:
total_traffic = 0
anomaly_traffic = 0
scan_44 = 0
scan_11 = 0
dos = 0
anomaly_spam = 0
anomaly_udpscan = 0
anomaly_sshscan = 0
blacklist = 0
nerisbotnet = 0
i = 0
for df in pd.read_csv("august.week1.csv", names=header, usecols=header, chunksize=10000000):
    total_traffic += df.shape[0]
    anomaly_traffic += df[df.label != "background"].shape[0]
    scan_44 += df[df.label == "scan44"].shape[0]
    scan_11 += df[df.label == "scan11"].shape[0]
    dos += df[df.label == "dos"].shape[0]
    anomaly_spam += df[df.label == "anomaly-spam"].shape[0]
    anomaly_udpscan += df[df.label == "anomaly-udpscan"].shape[0]
    anomaly_sshscan += df[df.label == "anomaly-sshscan"].shape[0]
    blacklist += df[df.label == "blacklist"].shape[0]
    nerisbotnet += df[df.label == "nerisbotnet"].shape[0]
    print(i)
    i += 1
print("Originally there are {} traffic with {}({}%) being anomaly traffic".format(total_traffic, anomaly_traffic, round(anomaly_traffic / float(total_traffic) * 100, 2)))
print("Scan 44 {}, Scan 11 {}, DOS {}, anomaly-udpscan {}, anomaly-sshscan {}, anomaly-spam {}, nerisbotnet {}, blacklist {}".format(scan_44, scan_11, dos, anomaly_udpscan, anomaly_sshscan, anomaly_spam, nerisbotnet, blacklist))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
Originally there are 851614534 traffic with 40526401(4.76%) being anomaly traffic
Scan 44 2477203, Scan 11 539018, DOS 5093132, anomaly-udpscan 989872, anomaly-sshscan 16, anomaly-spam 27970000, nerisbotnet 992575, blacklist 2464585


In [71]:
total_traffic = 0
anomaly_traffic = 0
scan_44 = 0
scan_11 = 0
dos = 0
anomaly_spam = 0
anomaly_udpscan = 0
anomaly_sshscan = 0
blacklist = 0
nerisbotnet = 0
i = 0
for df in pd.read_csv("august.week5.csv", names=header, usecols=header, chunksize=10000000):
    total_traffic += df.shape[0]
    anomaly_traffic += df[df.label != "background"].shape[0]
    scan_44 += df[df.label == "scan44"].shape[0]
    scan_11 += df[df.label == "scan11"].shape[0]
    dos += df[df.label == "dos"].shape[0]
    anomaly_spam += df[df.label == "anomaly-spam"].shape[0]
    anomaly_udpscan += df[df.label == "anomaly-udpscan"].shape[0]
    anomaly_sshscan += df[df.label == "anomaly-sshscan"].shape[0]
    blacklist += df[df.label == "blacklist"].shape[0]
    nerisbotnet += df[df.label == "nerisbotnet"].shape[0]
    print(i)
    i += 1
print("Originally there are {} traffic with {}({}%) being anomaly traffic".format(total_traffic, anomaly_traffic, round(anomaly_traffic / float(total_traffic) * 100, 2)))
print(f"Background {total_traffic - anomaly_traffic}, anomaly {anomaly_traffic}, anomaly w/o blacklist {anomaly_traffic - blacklist}")
print("Scan 44 {}, Scan 11 {}, DOS {}, anomaly-udpscan {}, anomaly-sshscan {}, anomaly-spam {}, nerisbotnet {}, blacklist {}".format(scan_44, scan_11, dos, anomaly_udpscan, anomaly_sshscan, anomaly_spam, nerisbotnet, blacklist))

0
1
2
3
4
Originally there are 40289595 traffic with 136403(0.34%) being anomaly traffic
Background 40153192, anomaly 136403, anomaly w/o blacklist 1
Scan 44 0, Scan 11 0, DOS 0, anomaly-udpscan 0, anomaly-sshscan 1, anomaly-spam 0, nerisbotnet 0, blacklist 136402


In [87]:
"Scan 44: Actual: 1496504 detected: 1494553(99.87%)"     "MSE 15, Abs Error 2000, 80%"
                                    1494553(99.87%)      "MSE 7,  Abs Error 2000, 80%"
                                    1494553(99.87%)      "MSE 7,  Abs Error 2000, new dimension, 80%"
                                    1494553(99.87%)      "MSE 0,  Abs Error 2000, new dimension, 80%"
            
                                    1496504(100%)        "MSE 4,  Abs Error 2000, bidirection ip detection with new dimension, 50% rule used instead of 80%"
                                    1496504(100%)        "MSE 4,  Abs Error 2000, bidirection ip detection with new dimension, 80%"      
                    
                                    1496504(100%)        "MSE 4,  Abs Error 2000, bidirection ip detection with new dimension, 80%, 1hour"
                                    1494553(99.87%)      "MSE 4,  Abs Error 2000, new dimension, 80%, 1hour"
                                    1494553(99.87%)      "MSE 4,  Abs Error 2000, new dimension, above 20%, 1hour"
                                    1496504(100%)        "MSE 4,  Abs Error 2000, bidirection ip detection with new dimension, above 20%, 1hour"
                                    
                                    1496504(100%)        "MSE 4,  Abs Error 2000, bidirection ip detection with new dimension 10% threshold, above 20%, 1hour"
                                    
                                    1496419(99.99%)      "MSE 4,  Abs Error 2000, bidirection detection with new dimension 5% threshold, above 20%, 1hour"
                                    1496419(99.99%)      "MSE 4,  Abs Error 2000, bidirection detection with new dimension 0% threshold, above 20%, 1hour"
                                    1496419(99.99%)      "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%, 1hour"
                                    1479569(98.87%)      "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%"
                                    
                                    1496413(99.99%)      "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%, 1hour, flag 6, bytes 12"
                                    1491748(99.68%)      "With UDP rule applied"
                                    1496349(99.99%)      "With UDP rule applied, bytes 100"
                                    1496411(99.99%)      "without 20% on protocol, bytes 100"
                                    1491748(99.68%)      "without 20% on protocol, bytes 12, first 99%"
                                    1496413(99.99%)      "without 20% on protocol, bytes 12, all values"
                                    
                2477203             2477071(99.99%)      "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%, 1hour, flag 6, bytes 12"
                                    2477071(99.99%)
                 547468              547423(99.99%)      "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%, 1hour, flag 6, bytes 12"
                                     547423(99.99%)
                      0                   0(-%)          "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%, 1hour, flag 6, bytes 12"
                                          0(-%)
                      0                   0(-%)          "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%, 1hour, flag 6, bytes 12"
                                          0(-%)
                      0                   0(-%)          "MSE 4,  Abs Error 2000, bidirection detection with new dimension 2.5% threshold, above 20%, 1hour, flag 6, bytes 12"
                                          0(-%)

In [None]:
"Scan 11: Actual: 323120 detected: 102348(31.67%)"     "DOS: Actual: 3134728 detected: 2881705(91.93%)"
                                   244562(75.69%)                                      3113442(99.32%)
                                   244562(75.69%)                                      3113442(99.32%)
                                   244562(75.69%)                                      3113442(99.32%)
            
                                   291058(90.08%)                                      2544529(81.17%)
                                   323120(100%)                                        3131256(99.89%)
                    
                                   323120(100%)                                        3127784(99.78%)
                                   322489(99.80%)                                      3109970(99.21%)
                                   322489(99.80%)                                      3039587(96.96%)
                                   323120(100%)                                        3127784(99.78%)
                                   
                                   323120(100%)                                        3127784(99.78%)
                                   
                                   323092(99.99%)                                      3133385(99.96%)
                                   323092(99.99%)                                      3133385(99.96%)
                                   323092(99.99%)                                      3133385(99.96%)
                                   307101(95.04%)                                      3086860(98.47%)
                                
                                   323087(99.99%)                                      3126345(99.73%)
                                   322154(99.70%)                                      3127318(99.76%)
                                   323073(99.99%)                                      3127663(99.77%)
                                   323087(99.99%)                                      3127663(99.77%)
                                   322154(99.70%)                                      3126975(99.75%)
                                   323087(99.99%)                                      3127636(99.77%)
                  
                539018             538977(99.99%)                    5093132           5069518(99.54%)
                                   538975(99.99%)                                      5069534(99.54%)
                140541             140530(99.99%)                    1028245           1027076(99.89%)
                                   140530(99.99%)                                      1027076(99.89%)
                     0                  0(-%)                              0                 0(-%)
                                        0(-%)                                                0(-%)
                     0                  0(-%)                              0                 0(-%)
                                        0(-%)                                                0(-%)                    
                     0                  0(-%)                              0                 0(-%)
                                        0(-%)                                                0(-%)

In [None]:
"anomaly-spam: Actual: 503658 detected: 42968(8.53%)"   "nerisbotnet: Actual: 607036 detected: 40368(6.65%)"
                                        42938(8.53%)                                          141370(23.29%)
                                        42938(8.53%)                                          141370(23.29%)
                                        42938(8.53%)                                          141370(23.29%)
            
                                        36209(7.19%)                                          169636(27.94%)
                                        57082(11.33%)                                         190960(31.46%)
                    
                                        56996(11.32%)                                         180907(29.80%)
                                        31933(6.34%)                                          180907(29.80%)
                                        25359(5.03%)                                          141370(23.29%)
                                        50422(10.01%)                                         141370(23.29%)
                                         
                                        6309(1.25%)                                           141370(23.29%)
                                        
                                        71592(14.21%)                                         203000(33.44%)
                                        71592(14.21%)                                         203005(33.44%)
                                        71592(14.21%)                                         203005(33.44%)
                                        50985(10.12%)                                         167943(27.67%)
                                        
                                        71582(14.21%)                                         203005(33.44%)
                                        71464(14.19%)                                         156527(25.79%)
                                        71354(14.17%)                                         163759(26.98%)
                                        71354(14.17%)                                         163759(26.98%)
                                        71464(14.19%)                                         156527(25.79%)
                                        71591(14.21%)                                         156558(25.79%)
                      
                    27970000         27281404(97.54%)                         992575          319358(32.17%)
                                     27245350(97.41%), anomaly-udpscan 989730(99.99%),        186348(18.77%)
                    36796698         34029761(92.48%)                          81918           50492(61.64%)
                                     33962362(92.30%), anomaly-udpscan 0(0.00%),               44464(54.28%)
                     7059245          3574305(50.63%)                              0               0(-%)
                                      3625682(51.36%), anomaly-udpscan 0(0.00%)                    0(-%)
                     5287316          1389479(26.28%)                              0               0(-%)
                                      1463443(27.68%), anomaly-udpscan 0(0.00%)                    0(-%)
                           0                0(-%)                                  0               0(-%)
                                            0(-%)                                                  0(-%)

In [None]:
"blacklist: Actual: 1450738 detected: 7519(0.52%)"      "Background: Actual: 531975100 unlabed Deta: 5017613(0.94%)"
                                     18109(1.25%)                                                    7263038(1.37%)
                                     18109(1.25%)                                                   12318012(2.32%)
                                     18234(1.26%)                                                   12388694(2.33%)
            
                                     16295(1.12%)                                                   15186264(2.85%)
                                     32732(2.26%)                                                   27031010(5.08%)
                    
                                     19927(1.37%)                                                   17168935(3.23%)
                                     19883(1.37%)                                                   16513290(3.10%)
                                      8691(0.60%)                                                    9873775(1.86%)
                                      8691(0.60%)                                                   10133579(1.90%)
                                     
                                      8673(0.60%)                                                    9635405(1.81%)
                                    
                                     15602(1.06%)                                                   15138312(2.85%)
                                     15742(1.09%)                                                   16311350(3.07%)
                                     15602(1.06%)                                                   15218029(2.86%)
                                      2236(0.15%)                                                    2345733(0.44%)
                                    
                                     15075(1.06%)                                                    4870604(0.92%)
                                      2223(0.15%)                                                    2685323(0.50%)
                                      3590(0.25%)                                                    3014293(0.57%)
                                      3588(0.25%)                                                    2789581(0.52%)
                                      2209(0.15%)                                                    2434271(0.46%)
                                      2209(0.15%)                                                    2431905(0.46%)
                                        
                    2464585         298302(12.10%)                           811088133              10112128(1.25%)
                                    273673(11.10%)                                                   5888004(0.73%)
                    5728174         167287(2.92%)                            793518109               4366264(0.55%)
                                    101987(1.78%)                                                    3612411(0.46%)
                    5141634            422(0.01%)                            806857029               1679317(0.21%)
                                       422(0.01%)                                                    1510079(0.19%)
                    3069118            481(0.02%)                            854170414               3884811(0.45%)
                                       473(0.02%)                                                    3701660(0.43%)
                     136402              4(0.00%)                            40153192                 306744(0.76%)
                                         4(0.00%)                                                     264100(0.66%)

In [None]:
"Anomaly: Actual: 7515794 detected: 4569461(60.80%)"   "Anomaly without blacklist: Actual: 6065046 detected: 4569461(60.80%)"
                                    5054974(67.26%)                                                          5054974(83.35%)
                                    5054974(67.26%)                                                          5054974(83.35%)
                                    5055099(67.26%)                                                          5055099(83.35%)
            
                                    4554231(60.60%)                                                          4554231(75.09%)
                                    5231654(69.61%)                                                          5198922(85.72%)
                    
                                    5205238(69.26%)                                                          5198922(85.49%)
                                    5159735(68.65%)                                                          5139852(84.75%)
                                    5032049(66.95%)                                                          5023358(82.82%)
                                    5147891(68.49%)                                                          5139200(84.73%)
                                    
                                    5103760(67.91%)                                                          5095087(84.01%)
                                    
                                    5243090(69.76%)                                                          5227488(86.19%)
                                    5243235(69.76%)                                                          5227493(86.19%)
                                    5243095(69.76%)                                                          5227493(86.19%)
                                    5094694(67.79%)                                                          5092458(83.96%)
                                    
                                    5235507(69.66%)                                                          5220432(86.07%)
                                    5171434(68.81%)                                                          5169211(85.23%)
                                    5185788(69.00%)                                                          5182198(85.44%)
                                    5185862(69.00%)                                                          5182274(85.44%)
                                    5171077(68.80%)                                                          5168868(85.22%)
                                    5177494(68.89%)                                                          5175285(85.33%)
                                    
                  40526401         36724843(90.62%)                                        38061816         36426541(95.70%)
                                   36780681(90.76%)                                                         36507008(95.92%)
                  44323049         35962569(81.14%)                                        38594875         35795282(92.75%)
                                   35823842(80.82%)                                                         35721855(92.56%)
                  12200899          3574727(29.30%)                                         7059265          3574305(50.63%)
                                    3626104(29.72%)                                                          3625682(51.36%)
                   8356446          1389960(16.63%)                                         5287328          1389479(26.28%)
                                    1463916(17.52%)                                                          1463443(27.68%)
                    136403                4(0.00%)                                                1                0(0.00%)
                                          4(0.00%)                                                                 0(0.00%)

In [26]:
anomaly_spam_tot[~anomaly_spam_tot.index.isin(anomaly_spam)]

Unnamed: 0,timestamp,duration,src_IP,dest_IP,src_port,dest_port,protocol,flags,forwarding_status,type_of_service,packets_exchanged,number_of_bytes,label
2871345,2016-07-27 14:10:50,1.500,193.27.6.165,42.219.156.223,25,56713,TCP,.AP.SF,0,0,16,1582,anomaly-spam
2871369,2016-07-27 14:10:50,1.676,42.219.156.223,193.27.6.165,56713,25,TCP,.APRS.,0,0,26,21709,anomaly-spam
2874609,2016-07-27 14:10:52,1.468,193.26.243.129,42.219.156.223,25,43980,TCP,.AP.SF,0,0,17,1627,anomaly-spam
2874649,2016-07-27 14:10:52,1.644,42.219.156.223,193.26.243.129,43980,25,TCP,.APRS.,0,0,26,21667,anomaly-spam
2878185,2016-07-27 14:10:54,1.608,193.27.1.120,42.219.156.212,25,45886,TCP,.AP.SF,0,0,18,1686,anomaly-spam
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265855083,2016-07-29 13:53:07,0.000,42.219.156.214,192.143.87.124,38523,25,TCP,....S.,0,0,1,60,anomaly-spam
265945303,2016-07-29 13:54:04,0.200,108.66.255.194,42.219.156.214,25,42128,TCP,.AP.SF,0,72,3,230,anomaly-spam
265945377,2016-07-29 13:54:04,0.320,42.219.156.214,108.66.255.194,42128,25,TCP,.A.RS.,0,0,4,216,anomaly-spam
265947041,2016-07-29 13:54:05,1.032,192.143.87.95,42.219.156.214,25,50636,TCP,.AP.SF,0,72,12,1215,anomaly-spam


In [35]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_july_week5_1min_mse4_port2.5_bidir_20.0_1hr_flag6_numBytes12_hardcode_dest_UDP_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.02 0.04 0.05 0.07 0.09 0.11 0.13 0.15 0.16 0.18 0.2 0.22 0.24 0.25 0.27 0.29 0.31 0.33 0.35 0.36 0.38 0.4 0.42 0.44 0.45 0.47 0.49 0.51 0.53 0.55 0.56 0.58 0.6 0.62 0.64 0.65 0.67 0.69 0.71 0.73 0.75 0.76 0.78 0.8 0.82 0.84 0.85 0.87 0.89 0.91 0.93 0.95 0.96 0.98 1.0 
There are 7609399 traffic detected as anomaly 5177494(68.04%) being labeled anomaly traffic
Scan 44 1496413(99.99%), Scan 11 323087(99.99%), DOS 3127636(99.77%),
anomaly-spam 71591(14.21%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 156558(25.79%), blacklist 2209(0.15%)
background misclassfied 2431905(0.46%),
anomaly detected 5177494(68.89%), anomaly detected(w/o blacklist) 5175285(85.33%)



In [10]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_july_week5_port6667_detect_abs10_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 
There are 17310 traffic detected as anomaly 17310(100.0%) being labeled anomaly traffic
Scan 44 1492(0.10%), Scan 11 36(0.01%), DOS 2(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 15780(2.60%), blacklist 0(0.00%)
background misclassfied 0(0.00%),
anomaly detected 17310(0.23%), anomaly detected(w/o blacklist) 17310(0.29%)



In [59]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_july_week5_port6667_detect_abs10_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.14 0.29 0.43 0.57 0.71 0.86 1.0 
There are 1530 traffic detected as anomaly 1530(100.0%) being labeled anomaly traffic
Scan 44 1492(0.10%), Scan 11 36(0.01%), DOS 2(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 0(0.00%)
background misclassfied 0(0.00%),
anomaly detected 1530(0.02%), anomaly detected(w/o blacklist) 1530(0.03%)



In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
mse 2 abs 200 july
mse 5 abs 200 august


for root, _, files in os.walk("anomaly_time_series_july_week5_port25_detect_abs200_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.0 0.01 0.01 0.02 0.02 0.03 0.03 0.04 0.04 0.04 0.05 0.05 0.06 0.06 0.07 0.07 0.07 0.08 0.08 0.09 0.09 0.1 0.1 0.11 0.11 0.11 0.12 0.12 0.13 0.13 0.14 0.14 0.14 0.15 0.15 0.16 0.16 0.17 0.17 0.18 0.18 0.18 0.19 0.19 0.2 0.2 0.21 0.21 0.21 0.22 0.22 0.23 0.23 0.24 0.24 0.25 0.25 0.25 0.26 0.26 0.27 0.27 0.28 0.28 0.29 0.29 0.29 0.3 0.3 0.31 0.31 0.32 0.32 0.32 0.33 0.33 0.34 0.34 0.35 0.35 0.36 0.36 0.36 0.37 0.37 0.38 0.38 0.39 0.39 0.39 0.4 0.4 0.41 0.41 0.42 0.42 0.43 0.43 0.43 0.44 0.44 0.45 0.45 0.46 0.46 0.46 0.47 0.47 0.48 0.48 0.49 0.49 0.5 0.5 0.5 0.51 0.51 0.52 0.52 0.53 0.53 0.54 0.54 0.54 0.55 0.55 0.56 0.56 0.57 0.57 0.57 0.58 0.58 0.59 0.59 0.6 0.6 0.61 0.61 0.61 0.62 0.62 0.63 0.63 0.64 0.64 0.64 0.65 0.65 0.66 0.66 0.67 0.67 0.68 0.68 0.68 0.69 0.69 0.7 0.7 0.71 0.71 0.71 0.72 0.72 0.73 0.73 0.74 0.74 0.75 0.75 0.75 0.76 0.76 0.77 0.77 0.78 0.78 0.79 0.79 0.79 0.8 0.8 0.81 0.81 0.82 0.82 0.82 0.83 0.83 0.84 0.84 0.85 0.85 0.86 0.86 0.86 0.87 0.87 0.88 0.88 0.89 0.89 0.8

In [51]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None


for root, _, files in os.walk("anomaly_time_series_july_week5_port25_detect_abs200_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.03 0.06 0.08 0.11 0.14 0.17 0.19 0.22 0.25 0.28 0.31 0.33 0.36 0.39 0.42 0.44 0.47 0.5 0.53 0.56 0.58 0.61 0.64 0.67 0.69 0.72 0.75 0.78 0.81 0.83 0.86 0.89 0.92 0.94 0.97 1.0 
There are 1013563 traffic detected as anomaly 9081(0.9%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 9081(1.80%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 0(0.00%)
background misclassfied 1004482(0.19%),
anomaly detected 9081(0.12%), anomaly detected(w/o blacklist) 9081(0.15%)



In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_july_week5_port25_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.08 0.15 0.23 0.31 0.38 0.46 0.54 0.62 0.69 0.77 0.85 0.92 1.0 
There are 1261303 traffic detected as anomaly 98125(7.78%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 98125(19.48%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 0(0.00%)
background misclassfied 1163178(0.22%),
anomaly detected 98125(1.31%), anomaly detected(w/o blacklist) 98125(1.62%)



In [32]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_july_week5_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.06 0.11 0.17 0.22 0.28 0.33 0.39 0.44 0.5 0.56 0.61 0.67 0.72 0.78 0.83 0.89 0.94 1.0 
There are 3230793 traffic detected as anomaly 298589(9.24%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 292765(48.23%), blacklist 5824(0.40%)
background misclassfied 2932204(0.55%),
anomaly detected 298589(3.97%), anomaly detected(w/o blacklist) 292765(4.83%)



In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_july_week5_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.08 0.15 0.23 0.31 0.38 0.46 0.54 0.62 0.69 0.77 0.85 0.92 1.0 
There are 659908 traffic detected as anomaly 217087(32.9%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 216957(35.74%), blacklist 130(0.01%)
background misclassfied 442821(0.08%),
anomaly detected 217087(2.89%), anomaly detected(w/o blacklist) 216957(3.58%)



In [40]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_july_week5_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
#             df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.09 0.18 0.27 0.36 0.45 0.55 0.64 0.73 0.82 0.91 1.0 
There are 1896811 traffic detected as anomaly 228030(12.02%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 227046(37.40%), blacklist 984(0.07%)
background misclassfied 1668781(0.31%),
anomaly detected 228030(3.03%), anomaly detected(w/o blacklist) 227046(3.74%)



In [36]:
for port, abs_value in [(6667, 10), (25, 200), (53, 1000)]:
    for root, _, files in os.walk(f"anomaly_time_series_july_week5_port{port}_detect_abs{abs_value}_merged"):
        i = 0
        total = len(files)
        for f in sorted(files):
            if f.endswith(".csv"):
                df = pd.read_csv(join(root, f)).set_index('index')
                df = df.drop_duplicates()
                if total_traffic is None:
                    total_traffic = df.index
                    anomaly_traffic = df[df.label != "background"].index
                    scan_44 = df[df.label == "scan44"].index
                    scan_11 = df[df.label == "scan11"].index
                    dos = df[df.label == "dos"].index
                    anomaly_spam = df[df.label == "anomaly-spam"].index
                    anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                    anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                    nerisbotnet = df[df.label == "nerisbotnet"].index
                    blacklist = df[df.label == "blacklist"].index
                else:
                    total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                    anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                    scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                    scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                    dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                    anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                    anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                    anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                    nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                    blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
                i += 1
                print(round(i/total, 2), end=" ")
        break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1496504 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 323120 * 100,
    dos.shape[0], dos.shape[0] / 3134728 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 503658 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 607036 * 100,
    blacklist.shape[0], blacklist.shape[0] / 1450738 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 531975100 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 7515794 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 6065046 * 100,
))

0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 0.0 0.01 0.01 0.02 0.02 0.03 0.03 0.04 0.04 0.04 0.05 0.05 0.06 0.06 0.07 0.07 0.07 0.08 0.08 0.09 0.09 0.1 0.1 0.11 0.11 0.11 0.12 0.12 0.13 0.13 0.14 0.14 0.14 0.15 0.15 0.16 0.16 0.17 0.17 0.18 0.18 0.18 0.19 0.19 0.2 0.2 0.21 0.21 0.21 0.22 0.22 0.23 0.23 0.24 0.24 0.25 0.25 0.25 0.26 0.26 0.27 0.27 0.28 0.28 0.29 0.29 0.29 0.3 0.3 0.31 0.31 0.32 0.32 0.32 0.33 0.33 0.34 0.34 0.35 0.35 0.36 0.36 0.36 0.37 0.37 0.38 0.38 0.39 0.39 0.39 0.4 0.4 0.41 0.41 0.42 0.42 0.43 0.43 0.43 0.44 0.44 0.45 0.45 0.46 0.46 0.46 0.47 0.47 0.48 0.48 0.49 0.49 0.5 0.5 0.5 0.51 0.51 0.52 0.52 0.53 0.53 0.54 0.54 0.54 0.55 0.55 0.56 0.56 0.57 0.57 0.57 0.58 0.58 0.59 0.59 0.6 0.6 0.61 0.61 0.61 0.62 0.62 0.63 0.63 0.64 0.64 0.64 0.65 0.65 0.66 0.66 0.67 0.67 0.68 0.68 0.68 0.69 0.69 0.7 0.7 0.71 0.71 0.71 0.72 0.72 0.73 0.73 0.74 0.74 0.75 0.75 0.75 0.76 0.76 0.77 0.77 0.78 0.78 0.79 0.79 0.79 0.8 0.8 0.81 0.81 0.82 0.82 0.82 0.83 0.83 0.84 0.84 0.85 0.85 0.86 0.8

In [12]:
(312596 + 15780 + 94895) / 607036 * 100

69.72749556863185

Unnamed: 0.1,Unnamed: 0,timestamp,duration,src_IP,dest_IP,src_port,dest_port,protocol,flags,forwarding_status,type_of_service,packets_exchanged,number_of_bytes,label
0,64598679,2016-07-28 00:59:07,0.000,77.156.211.73,42.219.156.28,36178,53413,UDP,.A....,0,0,2,197,nerisbotnet
1,64598680,2016-07-28 00:59:07,0.000,77.156.211.73,42.219.156.29,40366,53413,UDP,.A....,0,0,2,197,nerisbotnet
2,64598681,2016-07-28 00:59:07,0.000,77.156.211.73,42.219.156.30,40539,53413,UDP,.A....,0,0,2,197,nerisbotnet
3,64598682,2016-07-28 00:59:07,0.000,77.156.211.73,42.219.156.31,53340,53413,UDP,.A....,0,0,2,197,nerisbotnet
4,64598799,2016-07-28 00:59:07,0.004,77.156.211.73,42.219.156.27,59177,53413,UDP,.A....,0,0,2,197,nerisbotnet
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607031,466939775,2016-07-31 07:59:58,9.013,42.219.158.16,209.85.225.27,1145,25,TCP,....S.,0,0,6,288,nerisbotnet
607032,466939776,2016-07-31 07:59:58,9.013,42.219.158.17,209.85.225.27,1145,25,TCP,....S.,0,0,6,288,nerisbotnet
607033,466939777,2016-07-31 07:59:58,9.013,42.219.158.18,209.85.225.27,1145,25,TCP,....S.,0,0,6,288,nerisbotnet
607034,466939778,2016-07-31 07:59:58,9.013,42.219.158.19,209.85.225.27,1145,25,TCP,....S.,0,0,6,288,nerisbotnet


In [37]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week1_1min_mse4_port2.5_bidir_20.0_1hr_flag6_numBytes12_hardcode_dest_UDP_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 2477203 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 539018 * 100,
    dos.shape[0], dos.shape[0] / 5093132 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 27970000 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 989872 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 16 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 992575 * 100,
    blacklist.shape[0], blacklist.shape[0] / 2464585 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 811088133 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 40526401 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38061816 * 100,
))

0.01 0.02 0.03 0.04 0.05 0.06 0.08 0.09 0.1 0.11 0.12 0.13 0.14 0.15 0.16 0.17 0.18 0.19 0.2 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29 0.3 0.31 0.32 0.33 0.34 0.35 0.37 0.38 0.39 0.4 0.41 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.51 0.52 0.53 0.54 0.55 0.56 0.57 0.58 0.59 0.6 0.61 0.62 0.63 0.65 0.66 0.67 0.68 0.69 0.7 0.71 0.72 0.73 0.74 0.75 0.76 0.77 0.78 0.8 0.81 0.82 0.83 0.84 0.85 0.86 0.87 0.88 0.89 0.9 0.91 0.92 0.94 0.95 0.96 0.97 0.98 0.99 1.0 
There are 42668685 traffic detected as anomaly 36780681(86.2%) being labeled anomaly traffic
Scan 44 2477071(99.99%), Scan 11 538975(99.99%), DOS 5069534(99.54%),
anomaly-spam 27245350(97.41%), anomaly-udpscan 989730(99.99%), anomaly-sshscan 0(0.00%),
nerisbotnet 186348(18.77%), blacklist 273673(11.10%)
background misclassfied 5888004(0.73%),
anomaly detected 36780681(90.76%), anomaly detected(w/o blacklist) 36507008(95.92%)



In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week1_port6667_detect_abs10_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 2477203 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 539018 * 100,
    dos.shape[0], dos.shape[0] / 5093132 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 27970000 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 989872 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 16 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 992575 * 100,
    blacklist.shape[0], blacklist.shape[0] / 2464585 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 811088133 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 40526401 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38061816 * 100,
))

0.07 0.13 0.2 0.27 0.33 0.4 0.47 0.53 0.6 0.67 0.73 0.8 0.87 0.93 1.0 
There are 26866 traffic detected as anomaly 26584(98.95%) being labeled anomaly traffic
Scan 44 2282(0.09%), Scan 11 0(0.00%), DOS 2(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 24300(2.45%), blacklist 0(0.00%)
background misclassfied 282(0.00%),
anomaly detected 26584(0.07%), anomaly detected(w/o blacklist) 26584(0.07%)



In [22]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week1_port25_detect_abs200_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 2477203 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 539018 * 100,
    dos.shape[0], dos.shape[0] / 5093132 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 27970000 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 989872 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 16 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 992575 * 100,
    blacklist.shape[0], blacklist.shape[0] / 2464585 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 811088133 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 40526401 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38061816 * 100,
))

0.01 0.01 0.02 0.03 0.04 0.04 0.05 0.06 0.07 0.07 0.08 0.09 0.1 0.1 0.11 0.12 0.12 0.13 0.14 0.15 0.15 0.16 0.17 0.18 0.18 0.19 0.2 0.21 0.21 0.22 0.23 0.24 0.24 0.25 0.26 0.26 0.27 0.28 0.29 0.29 0.3 0.31 0.32 0.32 0.33 0.34 0.35 0.35 0.36 0.37 0.38 0.38 0.39 0.4 0.4 0.41 0.42 0.43 0.43 0.44 0.45 0.46 0.46 0.47 0.48 0.49 0.49 0.5 0.51 0.51 0.52 0.53 0.54 0.54 0.55 0.56 0.57 0.57 0.58 0.59 0.6 0.6 0.61 0.62 0.62 0.63 0.64 0.65 0.65 0.66 0.67 0.68 0.68 0.69 0.7 0.71 0.71 0.72 0.73 0.74 0.74 0.75 0.76 0.76 0.77 0.78 0.79 0.79 0.8 0.81 0.82 0.82 0.83 0.84 0.85 0.85 0.86 0.87 0.88 0.88 0.89 0.9 0.9 0.91 0.92 0.93 0.93 0.94 0.95 0.96 0.96 0.97 0.98 0.99 0.99 1.0 
There are 23916934 traffic detected as anomaly 22231234(92.95%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 22191401(79.34%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 23120(2.33%), blacklist 16713(0.68%)
background misclassfied 1685700(0.21%),
anomaly detected 

In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week1_port25_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 2477203 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 539018 * 100,
    dos.shape[0], dos.shape[0] / 5093132 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 27970000 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 989872 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 16 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 992575 * 100,
    blacklist.shape[0], blacklist.shape[0] / 2464585 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 811088133 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 40526401 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38061816 * 100,
))

0.05 0.11 0.16 0.21 0.26 0.32 0.37 0.42 0.47 0.53 0.58 0.63 0.68 0.74 0.79 0.84 0.89 0.95 1.0 
There are 23058984 traffic detected as anomaly 21933391(95.12%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 21926087(78.39%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 7304(0.30%)
background misclassfied 1125593(0.14%),
anomaly detected 21933391(54.12%), anomaly detected(w/o blacklist) 21926087(57.61%)



In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week1_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 2477203 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 539018 * 100,
    dos.shape[0], dos.shape[0] / 5093132 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 27970000 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 989872 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 16 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 992575 * 100,
    blacklist.shape[0], blacklist.shape[0] / 2464585 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 811088133 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 40526401 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38061816 * 100,
))

0.05 0.09 0.14 0.18 0.23 0.27 0.32 0.36 0.41 0.45 0.5 0.55 0.59 0.64 0.68 0.73 0.77 0.82 0.86 0.91 0.95 1.0 
There are 2845707 traffic detected as anomaly 530100(18.63%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 505575(50.94%), blacklist 24525(1.00%)
background misclassfied 2315607(0.29%),
anomaly detected 530100(1.31%), anomaly detected(w/o blacklist) 505575(1.33%)



In [38]:
for port, abs_value in [(6667, 10), (25, 200), (53, 1000)]:
    for root, _, files in os.walk(f"anomaly_time_series_august_week1_port{port}_detect_abs{abs_value}_merged"):
        i = 0
        total = len(files)
        for f in sorted(files):
            if f.endswith(".csv"):
                df = pd.read_csv(join(root, f)).set_index('index')
                df = df.drop_duplicates()
                if total_traffic is None:
                    total_traffic = df.index
                    anomaly_traffic = df[df.label != "background"].index
                    scan_44 = df[df.label == "scan44"].index
                    scan_11 = df[df.label == "scan11"].index
                    dos = df[df.label == "dos"].index
                    anomaly_spam = df[df.label == "anomaly-spam"].index
                    anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                    anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                    nerisbotnet = df[df.label == "nerisbotnet"].index
                    blacklist = df[df.label == "blacklist"].index
                else:
                    total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                    anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                    scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                    scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                    dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                    anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                    anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                    anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                    nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                    blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
                i += 1
                print(round(i/total, 2), end=" ")
        break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 2477203 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 539018 * 100,
    dos.shape[0], dos.shape[0] / 5093132 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 27970000 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 989872 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 16 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 992575 * 100,
    blacklist.shape[0], blacklist.shape[0] / 2464585 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 811088133 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 40526401 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38061816 * 100,
))

0.07 0.13 0.2 0.27 0.33 0.4 0.47 0.53 0.6 0.67 0.73 0.8 0.87 0.93 1.0 0.01 0.01 0.02 0.03 0.04 0.04 0.05 0.06 0.07 0.07 0.08 0.09 0.1 0.1 0.11 0.12 0.12 0.13 0.14 0.15 0.15 0.16 0.17 0.18 0.18 0.19 0.2 0.21 0.21 0.22 0.23 0.24 0.24 0.25 0.26 0.26 0.27 0.28 0.29 0.29 0.3 0.31 0.32 0.32 0.33 0.34 0.35 0.35 0.36 0.37 0.38 0.38 0.39 0.4 0.4 0.41 0.42 0.43 0.43 0.44 0.45 0.46 0.46 0.47 0.48 0.49 0.49 0.5 0.51 0.51 0.52 0.53 0.54 0.54 0.55 0.56 0.57 0.57 0.58 0.59 0.6 0.6 0.61 0.62 0.62 0.63 0.64 0.65 0.65 0.66 0.67 0.68 0.68 0.69 0.7 0.71 0.71 0.72 0.73 0.74 0.74 0.75 0.76 0.76 0.77 0.78 0.79 0.79 0.8 0.81 0.82 0.82 0.83 0.84 0.85 0.85 0.86 0.87 0.88 0.88 0.89 0.9 0.9 0.91 0.92 0.93 0.93 0.94 0.95 0.96 0.96 0.97 0.98 0.99 0.99 1.0 0.05 0.09 0.14 0.18 0.23 0.27 0.32 0.36 0.41 0.45 0.5 0.55 0.59 0.64 0.68 0.73 0.77 0.82 0.86 0.91 0.95 1.0 
There are 46758788 traffic detected as anomaly 37686960(80.6%) being labeled anomaly traffic
Scan 44 2477071(99.99%), Scan 11 538975(99.99%), DOS 5069534(9

In [39]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week2_1min_mse4_port2.5_bidir_20.0_1hr_flag6_numBytes12_hardcode_dest_UDP_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 547468 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 140541 * 100,
    dos.shape[0], dos.shape[0] / 1028245 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 36796698 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 5 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 81918 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5728174 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 793518109 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 44323049 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38594875 * 100,
))

0.01 0.02 0.03 0.04 0.05 0.07 0.08 0.09 0.1 0.11 0.12 0.13 0.14 0.15 0.16 0.17 0.18 0.2 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29 0.3 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4 0.41 0.42 0.43 0.45 0.46 0.47 0.48 0.49 0.5 0.51 0.52 0.53 0.54 0.55 0.57 0.58 0.59 0.6 0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.7 0.71 0.72 0.73 0.74 0.75 0.76 0.77 0.78 0.79 0.8 0.82 0.83 0.84 0.85 0.86 0.87 0.88 0.89 0.9 0.91 0.92 0.93 0.95 0.96 0.97 0.98 0.99 1.0 
There are 39436253 traffic detected as anomaly 35823842(90.84%) being labeled anomaly traffic
Scan 44 547423(99.99%), Scan 11 140530(99.99%), DOS 1027076(99.89%),
anomaly-spam 33962362(92.30%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 44464(54.28%), blacklist 101987(1.78%)
background misclassfied 3612411(0.46%),
anomaly detected 35823842(80.82%), anomaly detected(w/o blacklist) 35721855(92.56%)



In [12]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week2_port6667_detect_abs10_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 547468 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 140541 * 100,
    dos.shape[0], dos.shape[0] / 1028245 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 36796698 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 5 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 81918 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5728174 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 793518109 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 44323049 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38594875 * 100,
))

0.08 0.15 0.23 0.31 0.38 0.46 0.54 0.62 0.69 0.77 0.85 0.92 1.0 
There are 2208 traffic detected as anomaly 621(28.12%) being labeled anomaly traffic
Scan 44 546(0.10%), Scan 11 72(0.05%), DOS 2(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 1(0.00%)
background misclassfied 1587(0.00%),
anomaly detected 621(0.00%), anomaly detected(w/o blacklist) 620(0.00%)



In [22]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week2_port25_detect_abs200_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 547468 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 140541 * 100,
    dos.shape[0], dos.shape[0] / 1028245 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 36796698 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 5 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 81918 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5728174 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 793518109 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 44323049 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38594875 * 100,
))

0.01 0.01 0.02 0.03 0.04 0.04 0.05 0.06 0.06 0.07 0.08 0.09 0.09 0.1 0.11 0.11 0.12 0.13 0.13 0.14 0.15 0.16 0.16 0.17 0.18 0.18 0.19 0.2 0.21 0.21 0.22 0.23 0.23 0.24 0.25 0.26 0.26 0.27 0.28 0.28 0.29 0.3 0.3 0.31 0.32 0.33 0.33 0.34 0.35 0.35 0.36 0.37 0.38 0.38 0.39 0.4 0.4 0.41 0.42 0.43 0.43 0.44 0.45 0.45 0.46 0.47 0.48 0.48 0.49 0.5 0.5 0.51 0.52 0.52 0.53 0.54 0.55 0.55 0.56 0.57 0.57 0.58 0.59 0.6 0.6 0.61 0.62 0.62 0.63 0.64 0.65 0.65 0.66 0.67 0.67 0.68 0.69 0.7 0.7 0.71 0.72 0.72 0.73 0.74 0.74 0.75 0.76 0.77 0.77 0.78 0.79 0.79 0.8 0.81 0.82 0.82 0.83 0.84 0.84 0.85 0.86 0.87 0.87 0.88 0.89 0.89 0.9 0.91 0.91 0.92 0.93 0.94 0.94 0.95 0.96 0.96 0.97 0.98 0.99 0.99 1.0 
There are 36108769 traffic detected as anomaly 33619686(93.11%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 33604623(91.33%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 15063(0.26%)
background misclassfied 2489083(0.31%

In [30]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week2_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 547468 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 140541 * 100,
    dos.shape[0], dos.shape[0] / 1028245 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 36796698 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 5 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 81918 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5728174 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 793518109 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 44323049 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38594875 * 100,
))

0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9 0.95 1.0 
There are 1813690 traffic detected as anomaly 82438(4.55%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 70868(86.51%), blacklist 11570(0.20%)
background misclassfied 1731252(0.22%),
anomaly detected 82438(0.19%), anomaly detected(w/o blacklist) 70868(0.18%)



In [40]:
for port, abs_value in [(6667, 10), (25, 200), (53, 1000)]:
    for root, _, files in os.walk(f"anomaly_time_series_august_week2_port{port}_detect_abs{abs_value}_merged"):
        i = 0
        total = len(files)
        for f in sorted(files):
            if f.endswith(".csv"):
                df = pd.read_csv(join(root, f)).set_index('index')
                df = df.drop_duplicates()
                if total_traffic is None:
                    total_traffic = df.index
                    anomaly_traffic = df[df.label != "background"].index
                    scan_44 = df[df.label == "scan44"].index
                    scan_11 = df[df.label == "scan11"].index
                    dos = df[df.label == "dos"].index
                    anomaly_spam = df[df.label == "anomaly-spam"].index
                    anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                    anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                    nerisbotnet = df[df.label == "nerisbotnet"].index
                    blacklist = df[df.label == "blacklist"].index
                else:
                    total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                    anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                    scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                    scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                    dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                    anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                    anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                    anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                    nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                    blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
                i += 1
                print(round(i/total, 2), end=" ")
        break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 547468 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 140541 * 100,
    dos.shape[0], dos.shape[0] / 1028245 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 36796698 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 5 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 81918 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5728174 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 793518109 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 44323049 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 38594875 * 100,
))

0.08 0.15 0.23 0.31 0.38 0.46 0.54 0.62 0.69 0.77 0.85 0.92 1.0 0.01 0.01 0.02 0.03 0.04 0.04 0.05 0.06 0.06 0.07 0.08 0.09 0.09 0.1 0.11 0.11 0.12 0.13 0.13 0.14 0.15 0.16 0.16 0.17 0.18 0.18 0.19 0.2 0.21 0.21 0.22 0.23 0.23 0.24 0.25 0.26 0.26 0.27 0.28 0.28 0.29 0.3 0.3 0.31 0.32 0.33 0.33 0.34 0.35 0.35 0.36 0.37 0.38 0.38 0.39 0.4 0.4 0.41 0.42 0.43 0.43 0.44 0.45 0.45 0.46 0.47 0.48 0.48 0.49 0.5 0.5 0.51 0.52 0.52 0.53 0.54 0.55 0.55 0.56 0.57 0.57 0.58 0.59 0.6 0.6 0.61 0.62 0.62 0.63 0.64 0.65 0.65 0.66 0.67 0.67 0.68 0.69 0.7 0.7 0.71 0.72 0.72 0.73 0.74 0.74 0.75 0.76 0.77 0.77 0.78 0.79 0.79 0.8 0.81 0.82 0.82 0.83 0.84 0.84 0.85 0.86 0.87 0.87 0.88 0.89 0.89 0.9 0.91 0.91 0.92 0.93 0.94 0.94 0.95 0.96 0.96 0.97 0.98 0.99 0.99 1.0 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9 0.95 1.0 
There are 44065947 traffic detected as anomaly 37921241(86.06%) being labeled anomaly traffic
Scan 44 547423(99.99%), Scan 11 140530(99.99%), DOS 1027076(9

In [41]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week3_1min_mse4_port2.5_bidir_20.0_1hr_flag6_numBytes12_hardcode_dest_UDP_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 7059245 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 20 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5141634 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 806857029 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 12200899 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 7059265 * 100,
))

0.02 0.04 0.06 0.08 0.1 0.12 0.14 0.16 0.18 0.2 0.22 0.24 0.25 0.27 0.29 0.31 0.33 0.35 0.37 0.39 0.41 0.43 0.45 0.47 0.49 0.51 0.53 0.55 0.57 0.59 0.61 0.63 0.65 0.67 0.69 0.71 0.73 0.75 0.76 0.78 0.8 0.82 0.84 0.86 0.88 0.9 0.92 0.94 0.96 0.98 1.0 
There are 5136183 traffic detected as anomaly 3626104(70.6%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 3625682(51.36%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 422(0.01%)
background misclassfied 1510079(0.19%),
anomaly detected 3626104(29.72%), anomaly detected(w/o blacklist) 3625682(51.36%)



In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week3_port25_detect_abs200_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 7059245 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 20 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5141634 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 806857029 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 12200899 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 7059265 * 100,
))

0.01 0.01 0.02 0.02 0.03 0.04 0.04 0.05 0.06 0.06 0.07 0.07 0.08 0.09 0.09 0.1 0.11 0.11 0.12 0.12 0.13 0.14 0.14 0.15 0.16 0.16 0.17 0.17 0.18 0.19 0.19 0.2 0.2 0.21 0.22 0.22 0.23 0.24 0.24 0.25 0.25 0.26 0.27 0.27 0.28 0.29 0.29 0.3 0.3 0.31 0.32 0.32 0.33 0.34 0.34 0.35 0.35 0.36 0.37 0.37 0.38 0.39 0.39 0.4 0.4 0.41 0.42 0.42 0.43 0.43 0.44 0.45 0.45 0.46 0.47 0.47 0.48 0.48 0.49 0.5 0.5 0.51 0.52 0.52 0.53 0.53 0.54 0.55 0.55 0.56 0.57 0.57 0.58 0.58 0.59 0.6 0.6 0.61 0.61 0.62 0.63 0.63 0.64 0.65 0.65 0.66 0.66 0.67 0.68 0.68 0.69 0.7 0.7 0.71 0.71 0.72 0.73 0.73 0.74 0.75 0.75 0.76 0.76 0.77 0.78 0.78 0.79 0.8 0.8 0.81 0.81 0.82 0.83 0.83 0.84 0.84 0.85 0.86 0.86 0.87 0.88 0.88 0.89 0.89 0.9 0.91 0.91 0.92 0.93 0.93 0.94 0.94 0.95 0.96 0.96 0.97 0.98 0.98 0.99 0.99 1.0 
There are 9013156 traffic detected as anomaly 6691594(74.24%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 6686021(94.71%), anomaly-udpscan 0(0.00%), anomaly-sshsc

In [19]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week3_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 7059245 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 20 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5141634 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 806857029 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 12200899 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 7059265 * 100,
))

0.08 0.17 0.25 0.33 0.42 0.5 0.58 0.67 0.75 0.83 0.92 1.0 
There are 3238551 traffic detected as anomaly 94498(2.92%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 94498(1.84%)
background misclassfied 3144053(0.39%),
anomaly detected 94498(0.77%), anomaly detected(w/o blacklist) 0(0.00%)



In [42]:
for port, abs_value in [(6667, 10), (25, 200), (53, 1000)]:
    for root, _, files in os.walk(f"anomaly_time_series_august_week3_port{port}_detect_abs{abs_value}_merged"):
        i = 0
        total = len(files)
        for f in sorted(files):
            if f.endswith(".csv"):
                df = pd.read_csv(join(root, f)).set_index('index')
                df = df.drop_duplicates()
                if total_traffic is None:
                    total_traffic = df.index
                    anomaly_traffic = df[df.label != "background"].index
                    scan_44 = df[df.label == "scan44"].index
                    scan_11 = df[df.label == "scan11"].index
                    dos = df[df.label == "dos"].index
                    anomaly_spam = df[df.label == "anomaly-spam"].index
                    anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                    anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                    nerisbotnet = df[df.label == "nerisbotnet"].index
                    blacklist = df[df.label == "blacklist"].index
                else:
                    total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                    anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                    scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                    scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                    dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                    anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                    anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                    anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                    nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                    blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
                i += 1
                print(round(i/total, 2), end=" ")
        break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 7059245 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 20 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 5141634 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 806857029 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 12200899 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 7059265 * 100,
))

0.01 0.01 0.02 0.02 0.03 0.04 0.04 0.05 0.06 0.06 0.07 0.07 0.08 0.09 0.09 0.1 0.11 0.11 0.12 0.12 0.13 0.14 0.14 0.15 0.16 0.16 0.17 0.17 0.18 0.19 0.19 0.2 0.2 0.21 0.22 0.22 0.23 0.24 0.24 0.25 0.25 0.26 0.27 0.27 0.28 0.29 0.29 0.3 0.3 0.31 0.32 0.32 0.33 0.34 0.34 0.35 0.35 0.36 0.37 0.37 0.38 0.39 0.39 0.4 0.4 0.41 0.42 0.42 0.43 0.43 0.44 0.45 0.45 0.46 0.47 0.47 0.48 0.48 0.49 0.5 0.5 0.51 0.52 0.52 0.53 0.53 0.54 0.55 0.55 0.56 0.57 0.57 0.58 0.58 0.59 0.6 0.6 0.61 0.61 0.62 0.63 0.63 0.64 0.65 0.65 0.66 0.66 0.67 0.68 0.68 0.69 0.7 0.7 0.71 0.71 0.72 0.73 0.73 0.74 0.75 0.75 0.76 0.76 0.77 0.78 0.78 0.79 0.8 0.8 0.81 0.81 0.82 0.83 0.83 0.84 0.84 0.85 0.86 0.86 0.87 0.88 0.88 0.89 0.89 0.9 0.91 0.91 0.92 0.93 0.93 0.94 0.94 0.95 0.96 0.96 0.97 0.98 0.98 0.99 0.99 1.0 0.08 0.17 0.25 0.33 0.42 0.5 0.58 0.67 0.75 0.83 0.92 1.0 
There are 13217962 traffic detected as anomaly 6812269(51.54%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-sp

In [43]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week4_1min_mse4_port2.5_bidir_20.0_1hr_flag6_numBytes12_hardcode_dest_UDP_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 5287316 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 12 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 854170414 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 8356446 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 5287328 * 100,
))

0.01 0.02 0.04 0.05 0.06 0.07 0.09 0.1 0.11 0.12 0.13 0.15 0.16 0.17 0.18 0.2 0.21 0.22 0.23 0.24 0.26 0.27 0.28 0.29 0.3 0.32 0.33 0.34 0.35 0.37 0.38 0.39 0.4 0.41 0.43 0.44 0.45 0.46 0.48 0.49 0.5 0.51 0.52 0.54 0.55 0.56 0.57 0.59 0.6 0.61 0.62 0.63 0.65 0.66 0.67 0.68 0.7 0.71 0.72 0.73 0.74 0.76 0.77 0.78 0.79 0.8 0.82 0.83 0.84 0.85 0.87 0.88 0.89 0.9 0.91 0.93 0.94 0.95 0.96 0.98 0.99 1.0 
There are 5165576 traffic detected as anomaly 1463916(28.34%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 1463443(27.68%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 473(0.02%)
background misclassfied 3701660(0.43%),
anomaly detected 1463916(17.52%), anomaly detected(w/o blacklist) 1463443(27.68%)



In [10]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week4_port6667_detect_abs10_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 5287316 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 12 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 854170414 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 8356446 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 5287328 * 100,
))

1.0 
There are 294 traffic detected as anomaly 0(0.0%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 0(0.00%)
background misclassfied 294(0.00%),
anomaly detected 0(0.00%), anomaly detected(w/o blacklist) 0(0.00%)



In [18]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week4_port25_detect_abs200_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 5287316 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 12 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 854170414 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 8356446 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 5287328 * 100,
))

0.0 0.01 0.01 0.02 0.02 0.03 0.03 0.04 0.04 0.05 0.05 0.06 0.06 0.07 0.07 0.07 0.08 0.08 0.09 0.09 0.1 0.1 0.11 0.11 0.12 0.12 0.13 0.13 0.13 0.14 0.14 0.15 0.15 0.16 0.16 0.17 0.17 0.18 0.18 0.19 0.19 0.2 0.2 0.2 0.21 0.21 0.22 0.22 0.23 0.23 0.24 0.24 0.25 0.25 0.26 0.26 0.27 0.27 0.27 0.28 0.28 0.29 0.29 0.3 0.3 0.31 0.31 0.32 0.32 0.33 0.33 0.33 0.34 0.34 0.35 0.35 0.36 0.36 0.37 0.37 0.38 0.38 0.39 0.39 0.4 0.4 0.4 0.41 0.41 0.42 0.42 0.43 0.43 0.44 0.44 0.45 0.45 0.46 0.46 0.47 0.47 0.47 0.48 0.48 0.49 0.49 0.5 0.5 0.51 0.51 0.52 0.52 0.53 0.53 0.53 0.54 0.54 0.55 0.55 0.56 0.56 0.57 0.57 0.58 0.58 0.59 0.59 0.6 0.6 0.6 0.61 0.61 0.62 0.62 0.63 0.63 0.64 0.64 0.65 0.65 0.66 0.66 0.67 0.67 0.67 0.68 0.68 0.69 0.69 0.7 0.7 0.71 0.71 0.72 0.72 0.73 0.73 0.73 0.74 0.74 0.75 0.75 0.76 0.76 0.77 0.77 0.78 0.78 0.79 0.79 0.8 0.8 0.8 0.81 0.81 0.82 0.82 0.83 0.83 0.84 0.84 0.85 0.85 0.86 0.86 0.87 0.87 0.87 0.88 0.88 0.89 0.89 0.9 0.9 0.91 0.91 0.92 0.92 0.93 0.93 0.93 0.94 0.94 0.95 0.9

In [26]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week4_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 5287316 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 12 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 854170414 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 8356446 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 5287328 * 100,
))

0.02 0.04 0.05 0.07 0.09 0.11 0.13 0.15 0.16 0.18 0.2 0.22 0.24 0.25 0.27 0.29 0.31 0.33 0.35 0.36 0.38 0.4 0.42 0.44 0.45 0.47 0.49 0.51 0.53 0.55 0.56 0.58 0.6 0.62 0.64 0.65 0.67 0.69 0.71 0.73 0.75 0.76 0.78 0.8 0.82 0.84 0.85 0.87 0.89 0.91 0.93 0.95 0.96 0.98 1.0 
There are 3362614 traffic detected as anomaly 26255(0.78%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 26255(0.86%)
background misclassfied 3336359(0.39%),
anomaly detected 26255(0.31%), anomaly detected(w/o blacklist) 0(0.00%)



In [44]:
for port, abs_value in [(6667, 10), (25, 200), (53, 1000)]:
    for root, _, files in os.walk(f"anomaly_time_series_august_week4_port{port}_detect_abs{abs_value}_merged"):
        i = 0
        total = len(files)
        for f in sorted(files):
            if f.endswith(".csv"):
                df = pd.read_csv(join(root, f)).set_index('index')
                df = df.drop_duplicates()
                if total_traffic is None:
                    total_traffic = df.index
                    anomaly_traffic = df[df.label != "background"].index
                    scan_44 = df[df.label == "scan44"].index
                    scan_11 = df[df.label == "scan11"].index
                    dos = df[df.label == "dos"].index
                    anomaly_spam = df[df.label == "anomaly-spam"].index
                    anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                    anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                    nerisbotnet = df[df.label == "nerisbotnet"].index
                    blacklist = df[df.label == "blacklist"].index
                else:
                    total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                    anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                    scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                    scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                    dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                    anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                    anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                    anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                    nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                    blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
                i += 1
                print(round(i/total, 2), end=" ")
        break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 5287316 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 12 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 854170414 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 8356446 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 5287328 * 100,
))

1.0 0.0 0.01 0.01 0.02 0.02 0.03 0.03 0.04 0.04 0.05 0.05 0.06 0.06 0.07 0.07 0.07 0.08 0.08 0.09 0.09 0.1 0.1 0.11 0.11 0.12 0.12 0.13 0.13 0.13 0.14 0.14 0.15 0.15 0.16 0.16 0.17 0.17 0.18 0.18 0.19 0.19 0.2 0.2 0.2 0.21 0.21 0.22 0.22 0.23 0.23 0.24 0.24 0.25 0.25 0.26 0.26 0.27 0.27 0.27 0.28 0.28 0.29 0.29 0.3 0.3 0.31 0.31 0.32 0.32 0.33 0.33 0.33 0.34 0.34 0.35 0.35 0.36 0.36 0.37 0.37 0.38 0.38 0.39 0.39 0.4 0.4 0.4 0.41 0.41 0.42 0.42 0.43 0.43 0.44 0.44 0.45 0.45 0.46 0.46 0.47 0.47 0.47 0.48 0.48 0.49 0.49 0.5 0.5 0.51 0.51 0.52 0.52 0.53 0.53 0.53 0.54 0.54 0.55 0.55 0.56 0.56 0.57 0.57 0.58 0.58 0.59 0.59 0.6 0.6 0.6 0.61 0.61 0.62 0.62 0.63 0.63 0.64 0.64 0.65 0.65 0.66 0.66 0.67 0.67 0.67 0.68 0.68 0.69 0.69 0.7 0.7 0.71 0.71 0.72 0.72 0.73 0.73 0.73 0.74 0.74 0.75 0.75 0.76 0.76 0.77 0.77 0.78 0.78 0.79 0.79 0.8 0.8 0.8 0.81 0.81 0.82 0.82 0.83 0.83 0.84 0.84 0.85 0.85 0.86 0.86 0.87 0.87 0.87 0.88 0.88 0.89 0.89 0.9 0.9 0.91 0.91 0.92 0.92 0.93 0.93 0.93 0.94 0.94 0.95

In [33]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week5_1min_mse4_port2.5_bidir_20.0_1hr_flag6_numBytes12_hardcode_dest_UDP_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('Unnamed: 0')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 136402 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 40153192 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 136403 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 1 * 100,
))

0.12 0.25 0.38 0.5 0.62 0.75 0.88 1.0 
There are 264104 traffic detected as anomaly 4(0.0%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 4(0.00%)
background misclassfied 264100(0.66%),
anomaly detected 4(0.00%), anomaly detected(w/o blacklist) 0(0.00%)



In [11]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week5_port6667_detect_abs10_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 136402 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 40153192 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 136403 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 1 * 100,
))

1.0 
There are 521 traffic detected as anomaly 0(0.0%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 0(0.00%)
background misclassfied 521(0.00%),
anomaly detected 0(0.00%), anomaly detected(w/o blacklist) 0(0.00%)



In [19]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week5_port25_detect_abs200_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 136402 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 40153192 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 136403 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 1 * 100,
))

0.08 0.17 0.25 0.33 0.42 0.5 0.58 0.67 0.75 0.83 0.92 1.0 
There are 175440 traffic detected as anomaly 4(0.0%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 4(0.00%)
background misclassfied 175436(0.44%),
anomaly detected 4(0.00%), anomaly detected(w/o blacklist) 0(0.00%)



In [27]:
total_traffic = None
anomaly_traffic = None
scan_44 = None
scan_11 = None
dos = None
anomaly_udpscan = None
anomaly_sshscan = None
anomaly_spam = None
nerisbotnet = None
blacklist = None
for root, _, files in os.walk("anomaly_time_series_august_week5_port53_detect_abs1000_merged"):
    i = 0
    total = len(files)
    for f in sorted(files):
        if f.endswith(".csv"):
            df = pd.read_csv(join(root, f)).set_index('index')
            df = df.drop_duplicates()
            if total_traffic is None:
                total_traffic = df.index
                anomaly_traffic = df[df.label != "background"].index
                scan_44 = df[df.label == "scan44"].index
                scan_11 = df[df.label == "scan11"].index
                dos = df[df.label == "dos"].index
                anomaly_spam = df[df.label == "anomaly-spam"].index
                anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                nerisbotnet = df[df.label == "nerisbotnet"].index
                blacklist = df[df.label == "blacklist"].index
            else:
                total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
            i += 1
            print(round(i/total, 2), end=" ")
    break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 136402 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 40153192 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 136403 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 1 * 100,
))

0.03 0.06 0.09 0.11 0.14 0.17 0.2 0.23 0.26 0.29 0.31 0.34 0.37 0.4 0.43 0.46 0.49 0.51 0.54 0.57 0.6 0.63 0.66 0.69 0.71 0.74 0.77 0.8 0.83 0.86 0.89 0.91 0.94 0.97 1.0 
There are 296186 traffic detected as anomaly 0(0.0%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 0(0.00%)
background misclassfied 296186(0.74%),
anomaly detected 0(0.00%), anomaly detected(w/o blacklist) 0(0.00%)



In [34]:
for port, abs_value in [(6667, 10), (25, 200), (53, 1000)]:
    for root, _, files in os.walk(f"anomaly_time_series_august_week5_port{port}_detect_abs{abs_value}_merged"):
        i = 0
        total = len(files)
        for f in sorted(files):
            if f.endswith(".csv"):
                df = pd.read_csv(join(root, f)).set_index('index')
                df = df.drop_duplicates()
                if total_traffic is None:
                    total_traffic = df.index
                    anomaly_traffic = df[df.label != "background"].index
                    scan_44 = df[df.label == "scan44"].index
                    scan_11 = df[df.label == "scan11"].index
                    dos = df[df.label == "dos"].index
                    anomaly_spam = df[df.label == "anomaly-spam"].index
                    anomaly_udpscan = df[df.label == "anomaly-udpscan"].index
                    anomaly_sshscan = df[df.label == "anomaly-sshscan"].index
                    nerisbotnet = df[df.label == "nerisbotnet"].index
                    blacklist = df[df.label == "blacklist"].index
                else:
                    total_traffic = total_traffic.append(df[~df.index.isin(total_traffic)].index)
                    anomaly_traffic = anomaly_traffic.append(df[(df.label != "background") & (~df.index.isin(anomaly_traffic))].index)
                    scan_44 = scan_44.append(df[(df.label == "scan44") & (~df.index.isin(scan_44))].index)
                    scan_11 = scan_11.append(df[(df.label == "scan11") & (~df.index.isin(scan_11))].index)
                    dos = dos.append(df[(df.label == "dos") & (~df.index.isin(dos))].index)
                    anomaly_spam = anomaly_spam.append(df[(df.label == "anomaly-spam") & (~df.index.isin(anomaly_spam))].index)
                    anomaly_udpscan = anomaly_udpscan.append(df[(df.label == "anomaly-udpscan") & (~df.index.isin(anomaly_udpscan))].index)
                    anomaly_sshscan = anomaly_sshscan.append(df[(df.label == "anomaly-sshscan") & (~df.index.isin(anomaly_sshscan))].index)
                    nerisbotnet = nerisbotnet.append(df[(df.label == "nerisbotnet") & (~df.index.isin(nerisbotnet))].index)
                    blacklist = blacklist.append(df[(df.label == "blacklist") & (~df.index.isin(blacklist))].index)
                i += 1
                print(round(i/total, 2), end=" ")
        break
print("\nThere are {} traffic detected as anomaly {}({}%) being labeled anomaly traffic".format(
    total_traffic.shape[0], anomaly_traffic.shape[0], round(anomaly_traffic.shape[0] / float(total_traffic.shape[0]) * 100, 2)))
result = "Scan 44 {}({:.2f}%), Scan 11 {}({:.2f}%), DOS {}({:.2f}%),\n"
result += "anomaly-spam {}({:.2f}%), anomaly-udpscan {}({:.2f}%), anomaly-sshscan {}({:.2f}%),\n"
result += "nerisbotnet {}({:.2f}%), blacklist {}({:.2f}%)\n"
result += "background misclassfied {}({:.2f}%),\n"
result += "anomaly detected {}({:.2f}%), anomaly detected(w/o blacklist) {}({:.2f}%)\n"
print(result.format(
    scan_44.shape[0], scan_44.shape[0] / 1 * 100, 
    scan_11.shape[0], scan_11.shape[0] / 1 * 100,
    dos.shape[0], dos.shape[0] / 1 * 100,
    anomaly_spam.shape[0], anomaly_spam.shape[0] / 1 * 100,
    anomaly_udpscan.shape[0], anomaly_udpscan.shape[0] / 1 * 100,
    anomaly_sshscan.shape[0], anomaly_sshscan.shape[0] / 1 * 100,
    nerisbotnet.shape[0], nerisbotnet.shape[0] / 1 * 100,
    blacklist.shape[0], blacklist.shape[0] / 3069118 * 100,
    total_traffic.shape[0] - anomaly_traffic.shape[0], (total_traffic.shape[0] - anomaly_traffic.shape[0]) / 40153192 * 100,
    anomaly_traffic.shape[0], anomaly_traffic.shape[0] / 136403 * 100,
    anomaly_traffic.shape[0] - blacklist.shape[0], (anomaly_traffic.shape[0] - blacklist.shape[0]) / 1 * 100,
))

1.0 0.08 0.17 0.25 0.33 0.42 0.5 0.58 0.67 0.75 0.83 0.92 1.0 0.03 0.06 0.09 0.11 0.14 0.17 0.2 0.23 0.26 0.29 0.31 0.34 0.37 0.4 0.43 0.46 0.49 0.51 0.54 0.57 0.6 0.63 0.66 0.69 0.71 0.74 0.77 0.8 0.83 0.86 0.89 0.91 0.94 0.97 1.0 
There are 661469 traffic detected as anomaly 8(0.0%) being labeled anomaly traffic
Scan 44 0(0.00%), Scan 11 0(0.00%), DOS 0(0.00%),
anomaly-spam 0(0.00%), anomaly-udpscan 0(0.00%), anomaly-sshscan 0(0.00%),
nerisbotnet 0(0.00%), blacklist 8(0.00%)
background misclassfied 661461(1.65%),
anomaly detected 8(0.01%), anomaly detected(w/o blacklist) 0(0.00%)

