In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
def calculate_attack_type(filename, chunksize=200_000, low_memory=False, data_dir='data/CICDDos2019'):
    """Split one CSV into benign and non-benign rows, reading it chunk by chunk.

    Parameters
    ----------
    filename : str
        Name of the CSV file, resolved relative to ``data_dir``.
    chunksize : int, default 200_000
        Number of rows per chunk, forwarded to ``pd.read_csv``.
    low_memory : bool, default False
        Forwarded to ``pd.read_csv``.
    data_dir : str, default 'data/CICDDos2019'
        Directory that contains ``filename``. The default reproduces the
        path that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(num_benign, num_attack, benign_rows, attack_rows)`` where the two
        counts are row totals over the whole file and the two lists hold one
        filtered DataFrame per chunk read (a chunk without matching rows
        contributes an empty frame).

    Notes
    -----
    The label column is looked up as ``' Label'`` (with the leading space).
    Every row whose label is not equal to ``'BENIGN'`` is counted as an
    attack row. All filtered chunks are kept in memory until the caller
    drops the returned lists.
    """
    num_benign = 0
    num_attack = 0
    attack_rows = []
    benign_rows = []
    with pd.read_csv(f'{data_dir}/{filename}', chunksize=chunksize, low_memory=low_memory) as reader:
        for chunk in reader:
            # Build the mask once per chunk and reuse it for both partitions.
            is_benign = chunk[' Label'] == 'BENIGN'
            benign_row = chunk.loc[is_benign]
            attack_row = chunk.loc[~is_benign]
            num_benign += benign_row.shape[0]
            num_attack += attack_row.shape[0]
            attack_rows.append(attack_row)
            benign_rows.append(benign_row)

    return num_benign, num_attack, benign_rows, attack_rows

In [3]:
class Colors:
    """ANSI escape sequences used to style printed text.

    Wrap text as ``f"{Colors.BOLD}text{Colors.RESET}"``; the cells that
    print summaries end every styled span with ``RESET``.
    """

    # Text attributes
    RESET = "\033[0m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # Foreground colors
    BLACK = "\033[30m"
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"

    # Background colors
    BACKGROUND_RED = "\033[41m"

In [4]:
# CSV files processed by this notebook. Later cells index this list by
# position, so its order must match the per-file lists built below.
data_files = [
    'DrDoS_LDAP.csv',
    'DrDoS_NetBIOS.csv',
    'DrDoS_SNMP.csv',
    'DrDoS_UDP.csv',
    'TFTP.csv',
    'DrDoS_DNS.csv',
    'DrDoS_MSSQL.csv',
    'DrDoS_NTP.csv',
    'DrDoS_SSDP.csv',
    'Syn.csv',
    'UDPLag.csv',
]

# Running totals across all files.
benign_allfiles = 0
attack_allfiles = 0
total_time = 0

# One entry per file; each entry is that file's list of per-chunk DataFrames.
benign_dflist = []
attack_dflist = []

for data_file in data_files:
    start_time = time.time()
    benign, attack, benign_rows, attack_rows = calculate_attack_type(data_file)
    end_time = time.time()

    elapsed = end_time - start_time
    file_rows = benign + attack

    print(f'file: {Colors.BOLD}{data_file}{Colors.RESET}, time: {elapsed:<20.10f}s')
    print(f'    benign:  {benign:<20} proportion: {benign / file_rows:<20.10f}')
    print(f'    attack:  {attack:<20} proportion: {attack / file_rows:<20.10f}')
    print()

    benign_allfiles += benign
    attack_allfiles += attack
    total_time += elapsed

    benign_dflist.append(benign_rows)
    attack_dflist.append(attack_rows)

# Colored grand-total summary.
print(
    f'{Colors.BOLD}{Colors.RED}All Files, time{Colors.RESET}: '
    f'{Colors.BOLD}{Colors.BLUE}{total_time:<20.10f}{Colors.RESET}s'
)

rows_allfiles = benign_allfiles + attack_allfiles
for label, count in (('benign', benign_allfiles), ('attack', attack_allfiles)):
    print(
        f'    {Colors.GREEN}{label}{Colors.RESET}:  '
        f'{Colors.BOLD}{Colors.BLUE}{count:<20}{Colors.RESET} '
        f'{Colors.GREEN}proportion{Colors.RESET}:  '
        f'{Colors.BOLD}{Colors.BLUE}{count / rows_allfiles:<20.10f}{Colors.RESET}'
    )

file: [1mDrDoS_LDAP.csv[0m, time: 10.9098076820       s
    benign:  1612                 proportion: 0.0007389269        
    attack:  2179930              proportion: 0.9992610731        

file: [1mDrDoS_NetBIOS.csv[0m, time: 20.3643326759       s
    benign:  1707                 proportion: 0.0004168512        
    attack:  4093279              proportion: 0.9995831488        

file: [1mDrDoS_SNMP.csv[0m, time: 25.2348771095       s
    benign:  1507                 proportion: 0.0002919763        
    attack:  5159870              proportion: 0.9997080237        

file: [1mDrDoS_UDP.csv[0m, time: 16.3895571232       s
    benign:  2157                 proportion: 0.0006876430        
    attack:  3134645              proportion: 0.9993123570        

file: [1mTFTP.csv[0m, time: 100.7450129986      s
    benign:  25247                proportion: 0.0012555807        
    attack:  20082580             proportion: 0.9987444193        

file: [1mDrDoS_DNS.csv[0m, time: 25.6

In [5]:
def print_dflist(dflists, names=None, target=300_000):
    """Print a one-line chunk summary for each per-file list of DataFrames.

    Parameters
    ----------
    dflists : sequence of sequences of DataFrame
        One inner sequence of chunk DataFrames per file.
    names : sequence of str, optional
        Label printed for each entry of ``dflists`` (matched by position).
        Defaults to the notebook-level ``data_files`` list, looked up when
        the function is called.
    target : int, default 300_000
        Total row budget per file; ``select_per_df`` is this budget divided
        evenly over the file's chunks and rounded.

    Notes
    -----
    For each file the line shows the number of chunks, the rounded mean rows
    per chunk, the total rows, and ``select_per_df``. A file with no chunks
    is reported with zeros instead of raising.
    """
    if names is None:
        names = data_files
    for i, dflist in enumerate(dflists):
        # Count rows once and reuse the counts for every statistic.
        row_counts = [df_.shape[0] for df_ in dflist]
        num_dfs = len(row_counts)
        # No chunks: the mean is undefined and the budget split would divide by zero.
        mean_num_rows = round(np.mean(row_counts)) if num_dfs else 0
        select_per_df = round(target / num_dfs) if num_dfs else 0
        print(
            f'{names[i]:<18} '
            f'        num_dfs: {num_dfs:<5} '
            f'        mean_num_rows: {mean_num_rows:<5} '
            f'        total_rows: {sum(row_counts):<10}'
            f'        select_per_df: {select_per_df}'
        )

In [6]:
def clean_dflist(dflists):
    """Return copies of the frames with whitespace stripped from column labels.

    Parameters
    ----------
    dflists : sequence of sequences of DataFrame
        One inner sequence of DataFrames per file.

    Returns
    -------
    list of list of DataFrame
        Same nesting as ``dflists``. Each returned frame is a shallow copy
        (row data is not duplicated) whose string column labels have leading
        and trailing whitespace removed.

    Notes
    -----
    Each frame's own labels are stripped, so frames whose columns differ in
    order or number keep the correct label on each column. Empty outer or
    inner sequences are passed through as empty lists.
    """
    cleaned = []
    for dflist in dflists:
        new_dflist = []
        for df in dflist:
            # Shallow copy: a new frame object, so relabeling it leaves the
            # caller's frame untouched without duplicating the data.
            relabeled = df.copy(deep=False)
            relabeled.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
            new_dflist.append(relabeled)
        cleaned.append(new_dflist)
    return cleaned

In [7]:
# Rebind attack_dflist to the copies returned by clean_dflist, whose column
# labels have surrounding whitespace stripped.
attack_dflist = clean_dflist(attack_dflist)

In [8]:
# Inspect the first chunk of the first file: column names, dtypes and non-null counts.
attack_dflist[0][0].info()

<class 'pandas.core.frame.DataFrame'>
Index: 199957 entries, 0 to 199999
Data columns (total 88 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   199957 non-null  int64  
 1   Flow ID                      199957 non-null  object 
 2   Source IP                    199957 non-null  object 
 3   Source Port                  199957 non-null  int64  
 4   Destination IP               199957 non-null  object 
 5   Destination Port             199957 non-null  int64  
 6   Protocol                     199957 non-null  int64  
 7   Timestamp                    199957 non-null  object 
 8   Flow Duration                199957 non-null  int64  
 9   Total Fwd Packets            199957 non-null  int64  
 10  Total Backward Packets       199957 non-null  int64  
 11  Total Length of Fwd Packets  199957 non-null  float64
 12  Total Length of Bwd Packets  199957 non-null  float64
 13  Fwd 

In [9]:
def get_sample_size(dflist, target):
    """Compute how many rows to request from each chunk to reach ``target``.

    The budget is first split evenly over the chunks. If the final chunk has
    fewer rows than its even share, the rows it cannot supply are spread
    over the other chunks. Only the final chunk's size is checked; shortfalls
    in earlier chunks are not compensated.

    Parameters
    ----------
    dflist : sequence of DataFrame
        Chunks of one file.
    target : int
        Total number of rows wanted across all chunks.

    Returns
    -------
    int
        Per-chunk sample size. ``0`` when there are no chunks, and ``target``
        when there is exactly one chunk (there is nothing to redistribute to).
    """
    num_chunks = len(dflist)
    if num_chunks == 0:
        return 0
    if num_chunks == 1:
        # With a single chunk the redistribution step would divide by zero.
        return target

    sample_size = target // num_chunks
    last_chunk_take = min(sample_size, dflist[-1].shape[0])
    remaining = target - sample_size * (num_chunks - 1) - last_chunk_take
    return sample_size + remaining // (num_chunks - 1)

In [10]:
def sample_dflist(dflist, chunk_sample_size, random_state=42):
    """Sample rows from every chunk and concatenate the samples.

    Parameters
    ----------
    dflist : sequence of DataFrame
        Chunks to sample from. Must not be empty.
    chunk_sample_size : int
        Rows requested per chunk; a chunk with fewer rows contributes all of
        its rows.
    random_state : default 42
        Forwarded to ``DataFrame.sample`` for every chunk, so an integer
        seed is reused for each chunk.

    Returns
    -------
    DataFrame
        All samples stacked in chunk order, with a fresh 0..N-1 index so
        that row labels are unique across chunks.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``dflist`` is empty.
    """
    samples = [
        df.sample(
            min(chunk_sample_size, df.shape[0]),
            random_state=random_state,
            axis=0,
            ignore_index=True,
        )
        for df in dflist
    ]
    # Each sample is indexed from 0, so renumber here to avoid repeated labels.
    return pd.concat(samples, ignore_index=True)

In [11]:
# One sampled DataFrame per file, in the same order as attack_dflist; the
# per-chunk sample size is derived from a 250_000-row target per file.
sampled_attack_data = [
    sample_dflist(file_chunks, get_sample_size(file_chunks, 250_000))
    for file_chunks in attack_dflist
]


In [353]:
# Report how many rows were actually sampled for each file.
for file_name, sampled in zip(data_files, sampled_attack_data):
    print(f'{file_name}: {len(sampled)}')

DrDoS_LDAP.csv: 249997
DrDoS_NetBIOS.csv: 249984
DrDoS_SNMP.csv: 249990
DrDoS_UDP.csv: 250000
TFTP.csv: 249975
DrDoS_DNS.csv: 249990
DrDoS_MSSQL.csv: 249987
DrDoS_NTP.csv: 249995
DrDoS_SSDP.csv: 249989
Syn.csv: 250000
UDPLag.csv: 250000


In [12]:
def concat_benign_data(dflists):
    """Flatten the per-file chunk lists and stack every chunk into one frame.

    Chunks are concatenated in file order, then chunk order, and keep their
    original index labels.
    """
    every_chunk = [chunk for file_chunks in dflists for chunk in file_chunks]
    return pd.concat(every_chunk)
        

In [355]:
# Strip whitespace from the benign frames' column labels, then flatten the
# per-file chunk lists into a single benign DataFrame.
benign_dflist = clean_dflist(benign_dflist)
benign_df = concat_benign_data(benign_dflist)

In [373]:
# Preview the three-way split on the first file's sample and on the benign
# frame; the cell displays the resulting shapes.
df = sampled_attack_data[0]

attack_cut_1 = df.shape[0] // 3
attack_cut_2 = 2 * df.shape[0] // 3
df_1 = df.iloc[:attack_cut_1, :]
df_2 = df.iloc[attack_cut_1:attack_cut_2, :]
df_3 = df.iloc[attack_cut_2:, :]

benign_cut_1 = benign_df.shape[0] // 3
benign_cut_2 = 2 * benign_df.shape[0] // 3
(
    df.shape,
    df_1.shape,
    df_2.shape,
    df_3.shape,
    benign_df.iloc[:benign_cut_1, :].shape,
    benign_df.iloc[benign_cut_1:benign_cut_2, :].shape,
    benign_df.iloc[benign_cut_2:, :].shape,
)

((249997, 88),
 (83332, 88),
 (83332, 88),
 (83333, 88),
 (18954, 88),
 (18954, 88),
 (18955, 88))

In [375]:
# Split the benign rows into three consecutive parts; integer division puts
# any remainder into the later parts.
benign_cut_1 = benign_df.shape[0] // 3
benign_cut_2 = 2 * benign_df.shape[0] // 3
benign_1 = benign_df.iloc[:benign_cut_1, :]
benign_2 = benign_df.iloc[benign_cut_1:benign_cut_2, :]
benign_3 = benign_df.iloc[benign_cut_2:, :]
benign_parts = (benign_1, benign_2, benign_3)

# For every file, split its sampled attack rows the same way, append the
# matching benign part to each attack part, and write the three CSVs.
for i, df in enumerate(sampled_attack_data):
    # Computed outside the f-string: reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    file_stem = data_files[i].split('.')[0]

    cut_1 = df.shape[0] // 3
    cut_2 = 2 * df.shape[0] // 3
    attack_parts = (df.iloc[:cut_1, :], df.iloc[cut_1:cut_2, :], df.iloc[cut_2:, :])

    for part_number, (attack_part, benign_part) in enumerate(zip(attack_parts, benign_parts), start=1):
        combined = pd.concat([attack_part, benign_part])
        combined.to_csv(
            f'data/CICDDos2019/sampled_data/sampled_{file_stem}_{part_number}.csv',
            index=False,
        )

In [376]:
# Persist the aggregated benign rows without the index column.
# NOTE(review): the file name spelling ('aggreggated') is kept unchanged,
# because renaming it would change the output path.
benign_output_path = 'data/CICDDos2019/sampled_data/aggreggated_benign.csv'
benign_df.to_csv(benign_output_path, index=False)