In [206]:
import pandas as pd
import numpy as np
import time

In [207]:
def calculate_attack_type(filename, chunksize=200_000, low_memory=False,
                          data_dir='data/CICDDos2019', label_col=' Label',
                          benign_label='BENIGN'):
    """Split one CSV file into benign and attack rows, reading it chunk by chunk.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``data_dir``.
    chunksize : int
        Number of rows per chunk passed to ``pd.read_csv``.
    low_memory : bool
        Forwarded to ``pd.read_csv``.
    data_dir : str
        Directory that contains ``filename``.
    label_col : str
        Name of the label column exactly as it appears in the CSV header
        (the default keeps the leading space).
    benign_label : str
        Label value that marks a benign row; every other value, including
        a missing label, is counted as an attack.

    Returns
    -------
    tuple
        ``(num_benign, num_attack, benign_rows, attack_rows)`` where the two
        counters are ints and the two lists hold one DataFrame per chunk
        (a frame may be empty when a chunk has no rows of that kind).

    Notes
    -----
    All rows of the file are kept in memory through the returned lists.
    """
    num_benign = 0
    num_attack = 0
    attack_rows = []
    benign_rows = []
    with pd.read_csv(f'{data_dir}/{filename}', chunksize=chunksize, low_memory=low_memory) as reader:
        for chunk in reader:
            # Evaluate the label comparison once and reuse it for both halves.
            is_benign = chunk[label_col] == benign_label
            benign_row = chunk.loc[is_benign]
            attack_row = chunk.loc[~is_benign]
            num_benign += benign_row.shape[0]
            num_attack += attack_row.shape[0]
            attack_rows.append(attack_row)
            benign_rows.append(benign_row)

    return num_benign, num_attack, benign_rows, attack_rows

In [208]:
class Colors:
    """ANSI escape sequences used to style text printed by this notebook."""

    # Text attributes
    RESET = "\033[0m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # Foreground colours
    BLACK = "\033[30m"
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"

    # Background colours
    BACKGROUND_RED = "\033[41m"

In [209]:
# CSV files to scan; calculate_attack_type reads each one from data/CICDDos2019/.
data_files = ['DrDoS_LDAP.csv',
              'DrDoS_NetBIOS.csv',
              'DrDoS_SNMP.csv',
              'DrDoS_UDP.csv',
              'TFTP.csv',
              'DrDoS_DNS.csv',
              'DrDoS_MSSQL.csv',
              'DrDoS_NTP.csv',
              'DrDoS_SSDP.csv',
              'Syn.csv',
              'UDPLag.csv']


def _proportion(part, whole):
    """Return part/whole, or NaN when whole is zero (e.g. a file with no rows)."""
    return part / whole if whole else float('nan')


# Running totals across every file.
benign_allfiles = 0
attack_allfiles = 0
total_time = 0

# One entry per file; each entry is the list of per-chunk DataFrames
# returned by calculate_attack_type.
benign_dflist = []
attack_dflist = []

for data_file in data_files:
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    benign, attack, benign_rows, attack_rows = calculate_attack_type(data_file)
    end_time = time.perf_counter()
    elapsed = end_time - start_time
    file_rows = benign + attack

    print(f'file: {Colors.BOLD}{data_file}{Colors.RESET}, time: {elapsed :<20.10f}s')
    print(f'    benign:  {benign:<20} proportion: {_proportion(benign, file_rows):<20.10f}')
    print(f'    attack:  {attack:<20} proportion: {_proportion(attack, file_rows):<20.10f}')
    print()

    benign_allfiles += benign
    attack_allfiles += attack
    total_time += elapsed

    benign_dflist.append(benign_rows)
    attack_dflist.append(attack_rows)

all_rows = benign_allfiles + attack_allfiles

print(f'{Colors.BOLD}{Colors.RED}All Files, time{Colors.RESET}: {Colors.BOLD}{Colors.BLUE}{total_time :<20.10f}{Colors.RESET}s')

print(f'    {Colors.GREEN}benign{Colors.RESET}:  {Colors.BOLD}\
{Colors.BLUE}{benign_allfiles:<20}{Colors.RESET} \
{Colors.GREEN}proportion{Colors.RESET}:  \
{Colors.BOLD}{Colors.BLUE}{_proportion(benign_allfiles, all_rows) :<20.10f}{Colors.RESET}')

print(f'    {Colors.GREEN}attack{Colors.RESET}:  {Colors.BOLD}\
{Colors.BLUE}{attack_allfiles:<20}{Colors.RESET} \
{Colors.GREEN}proportion{Colors.RESET}:  \
{Colors.BOLD}{Colors.BLUE}{_proportion(attack_allfiles, all_rows) :<20.10f}{Colors.RESET}')

file: [1mDrDoS_LDAP.csv[0m, time: 10.5104079247       s
    benign:  1612                 proportion: 0.0007389269        
    attack:  2179930              proportion: 0.9992610731        

file: [1mDrDoS_NetBIOS.csv[0m, time: 18.9888818264       s
    benign:  1707                 proportion: 0.0004168512        
    attack:  4093279              proportion: 0.9995831488        

file: [1mDrDoS_SNMP.csv[0m, time: 24.6971669197       s
    benign:  1507                 proportion: 0.0002919763        
    attack:  5159870              proportion: 0.9997080237        

file: [1mDrDoS_UDP.csv[0m, time: 16.5309391022       s
    benign:  2157                 proportion: 0.0006876430        
    attack:  3134645              proportion: 0.9993123570        

file: [1mTFTP.csv[0m, time: 103.5478088856      s
    benign:  25247                proportion: 0.0012555807        
    attack:  20082580             proportion: 0.9987444193        

file: [1mDrDoS_DNS.csv[0m, time: 25.8

In [210]:
def print_dflist(dflists, names=None, target=300_000):
    """Print one summary line per group of chunk DataFrames.

    Parameters
    ----------
    dflists : list of list of DataFrame
        One inner list of DataFrames per file.
    names : sequence of str, optional
        Label printed for each group, indexed like ``dflists``. Defaults to
        the notebook-level ``data_files`` list.
    target : int
        Row budget per group; ``select_per_df`` is ``target`` divided by the
        number of frames in the group, rounded.

    Each line reports the number of frames, their mean and total row counts,
    and ``select_per_df``. A group with no frames is reported with zeros
    instead of raising.
    """
    if names is None:
        names = data_files
    pad = ' ' * 8
    for i, dflist in enumerate(dflists):
        row_counts = [df_.shape[0] for df_ in dflist]
        num_dfs = len(row_counts)
        total_rows = sum(row_counts)
        # Guard the two divisions so an empty group does not crash the report.
        mean_rows = round(total_rows / num_dfs) if num_dfs else 0
        per_df = round(target / num_dfs) if num_dfs else 0
        print(f'{names[i]:<18} {pad}'
              f'num_dfs: {num_dfs:<5} {pad}'
              f'mean_num_rows: {mean_rows:<5} {pad}'
              f'total_rows: {total_rows:<10}{pad}'
              f'select_per_df: {per_df}')

In [211]:
def clean_dflist(dflists):
    """Return a copy of ``dflists`` whose frames have whitespace-stripped column names.

    Parameters
    ----------
    dflists : list of list of DataFrame
        Groups of DataFrames.

    Returns
    -------
    list of list of DataFrame
        Same nesting as the input. Each frame is a shallow copy (the data is
        not duplicated) carrying its own stripped column labels, so the
        frames passed in are never relabelled.

    Each frame's own header is stripped, so the result does not depend on the
    first frame of the first group and empty groups are handled.
    """
    def _strip(label):
        # Leave non-string labels untouched instead of failing on .strip().
        return label.strip() if isinstance(label, str) else label

    cleaned = []
    for dflist in dflists:
        new_dflist = []
        for df in dflist:
            relabelled = df.copy(deep=False)
            relabelled.columns = [_strip(c) for c in relabelled.columns]
            new_dflist.append(relabelled)
        cleaned.append(new_dflist)
    return cleaned

In [330]:
# Rebind attack_dflist to the version returned by clean_dflist
# (column names stripped of surrounding whitespace).
attack_dflist = clean_dflist(attack_dflist)

In [None]:
# Print the column/dtype summary of the first chunk of the first file.
attack_dflist[0][0].info()

In [332]:
def get_sample_size(dflist, target):
    """Compute how many rows to draw from each frame to total about ``target`` rows.

    Parameters
    ----------
    dflist : list of DataFrame
        Frames that will be sampled. Only the last frame's row count is read.
    target : int
        Desired total number of sampled rows.

    Returns
    -------
    int
        Per-frame sample size. It starts as an even split of ``target``; the
        rows the last frame cannot supply are then spread over the other
        frames. (Presumably the last frame is the short tail chunk of a
        chunked read - confirm against the caller.) Returns 0 for an empty
        list and ``target`` for a single frame.
    """
    num_frames = len(dflist)
    if num_frames == 0:
        return 0

    sample_size = target // num_frames
    if num_frames == 1:
        # No other frame to redistribute to; avoids dividing by zero below.
        return sample_size

    last_rows = dflist[-1].shape[0]
    remaining = target - (sample_size * (num_frames - 1)) - min(sample_size, last_rows)
    sample_size += remaining // (num_frames - 1)
    return sample_size

In [333]:
def sample_dflist(dflist, chunk_sample_size, random_state=42):
    """Draw up to ``chunk_sample_size`` rows from every frame and concatenate them.

    Parameters
    ----------
    dflist : list of DataFrame
        Frames to sample from.
    chunk_sample_size : int
        Maximum number of rows taken from each frame; a frame with fewer rows
        contributes all of them.
    random_state : int
        Seed passed to every ``DataFrame.sample`` call.

    Returns
    -------
    DataFrame
        The sampled rows of all frames, in frame order. Each sample is
        re-indexed from 0, so index labels repeat across frames. An empty
        ``dflist`` yields an empty DataFrame.

    Notes
    -----
    The same seed is used for every frame, so frames of equal length have the
    same row positions selected.
    """
    if not dflist:
        # pd.concat raises on an empty sequence.
        return pd.DataFrame()

    chunks = [
        df.sample(min(chunk_sample_size, df.shape[0]),
                  random_state=random_state,
                  axis=0,
                  ignore_index=True)
        for df in dflist
    ]
    return pd.concat(chunks)

In [336]:
# For each file, size the per-chunk sample for a 250_000-row budget and
# collect the resulting sampled frame.
sampled_attack_data = []
for file_chunks in attack_dflist:
    per_chunk = get_sample_size(file_chunks, 250_000)
    sampled_attack_data.append(sample_dflist(file_chunks, per_chunk))


In [337]:
# Report how many attack rows were sampled for each file.
for idx, frame in enumerate(sampled_attack_data):
    n_rows = frame.shape[0]
    print(f'{data_files[idx]}: {n_rows}')

DrDoS_LDAP.csv: 249997
DrDoS_NetBIOS.csv: 249984
DrDoS_SNMP.csv: 249990
DrDoS_UDP.csv: 250000
TFTP.csv: 249975
DrDoS_DNS.csv: 249990
DrDoS_MSSQL.csv: 249987
DrDoS_NTP.csv: 249995
DrDoS_SSDP.csv: 249989
Syn.csv: 250000
UDPLag.csv: 250000


In [339]:
def concat_benign_data(dflists):
    """Flatten the nested list of frames and concatenate them into one DataFrame.

    Frames are joined in order (outer list first, then inner list); the
    original index labels of each frame are kept.
    """
    flattened = [frame for group in dflists for frame in group]
    return pd.concat(flattened)
        

In [340]:
# Strip the benign frames' column names, then merge every benign chunk from
# every file into a single DataFrame.
benign_dflist = clean_dflist(benign_dflist)
benign_df = concat_benign_data(benign_dflist)

In [342]:
# Build one dataset per file: that file's sampled attack rows followed by
# the full set of benign rows.
sampled_data = [
    pd.concat([attack_sample, benign_df], axis=0)
    for attack_sample in sampled_attack_data
]
    

In [343]:
# Report the row count of each combined (attack + benign) dataset.
for idx, frame in enumerate(sampled_data):
    n_rows = frame.shape[0]
    print(f'{data_files[idx]}: {n_rows}')

DrDoS_LDAP.csv: 306860
DrDoS_NetBIOS.csv: 306847
DrDoS_SNMP.csv: 306853
DrDoS_UDP.csv: 306863
TFTP.csv: 306838
DrDoS_DNS.csv: 306853
DrDoS_MSSQL.csv: 306850
DrDoS_NTP.csv: 306858
DrDoS_SSDP.csv: 306852
Syn.csv: 306863
UDPLag.csv: 306863


In [346]:
# Show the relative frequency of each label in the first combined dataset.
sampled_data[0]['Label'].value_counts(normalize=True)

Label
DrDoS_LDAP    0.814694
BENIGN        0.185306
Name: proportion, dtype: float64

In [348]:
# Write each combined dataset next to the raw data as sampled_<original name>.
# NOTE(review): to_csv also writes the index, which repeats labels here
# because the sampled frames are each re-indexed from 0 before concatenation.
# Consider index=False once downstream readers are confirmed not to rely on it.
for idx, frame in enumerate(sampled_data):
    out_path = f'data/CICDDos2019/sampled_data/sampled_{data_files[idx]}'
    frame.to_csv(out_path)