In [None]:
import numpy as np
import pandas as pd
import csv

In [None]:
import shutil


def append_csv_files(output_file: str, input_files: list) -> None:
    """Concatenate CSV files into ``output_file`` with raw byte copies.

    The first input is copied whole (header included). For every later input
    the first line is treated as a duplicate header and dropped; the rest is
    copied verbatim. No CSV parsing is done, so a header that contains a
    quoted embedded newline is not supported by this variant.

    Args:
        output_file: Path of the merged file; it is created or truncated.
        input_files: Paths of the CSV files to merge, in output order. An
            empty list produces an empty output file.

    Raises:
        OSError: If any file cannot be opened, read, or written.
    """
    with open(output_file, "wb") as outfile:
        # True while the last byte written so far is not a line terminator.
        needs_newline = False
        for file_idx, input_file in enumerate(input_files):
            with open(input_file, "rb") as infile:
                if file_idx > 0:
                    # readline() returns b"" on an empty file, whereas
                    # next(infile) would raise StopIteration.
                    infile.readline()
                body_start = infile.tell()
                body_end = infile.seek(0, 2)  # whence=2: seek to end of file
                if body_end == body_start:
                    # Nothing left after the header (or the file is empty).
                    continue
                if needs_newline:
                    # The previous file did not end with a newline; without
                    # this separator its last row and this file's first row
                    # would be fused into a single line.
                    outfile.write(b"\n")
                infile.seek(body_end - 1)
                needs_newline = infile.read(1) not in (b"\n", b"\r")
                infile.seek(body_start)
                # Stream the remaining bytes without loading the file in memory.
                shutil.copyfileobj(infile, outfile)

In [None]:
def append_csv_files(output_file: str, input_files: list) -> None:
    """Merge CSV files into ``output_file`` using the ``csv`` module.

    Rows are parsed and re-written one file at a time, so quoted fields
    (including ones that span several lines) are handled correctly. The
    header row is kept from the first input only.

    Args:
        output_file: Path of the merged file; it is created or truncated.
        input_files: Paths of the CSV files to merge, in output order.

    Raises:
        OSError: If any file cannot be opened, read, or written.
        csv.Error: If an input cannot be parsed as CSV.
    """
    with open(output_file, "w", newline="") as outfile:
        writer = csv.writer(outfile)

        for file_idx, input_file in enumerate(input_files):
            # newline="" hands the raw line endings to the csv module; without
            # it, "\r\n" inside quoted fields is silently rewritten to "\n".
            with open(input_file, "r", newline="") as infile:
                reader = csv.reader(infile)
                if file_idx > 0:
                    # Drop the repeated header; the default keeps an empty
                    # file from raising StopIteration.
                    next(reader, None)
                writer.writerows(reader)

In [None]:
import csv
from concurrent.futures import ThreadPoolExecutor


def process_file(input_file: str, output_file: str, skip_header: bool) -> None:
    """Append the rows of one CSV file to ``output_file``.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to append to (opened in "a" mode).
        skip_header: When true, the first row of ``input_file`` is dropped.

    Raises:
        OSError: If either file cannot be opened, read, or written.
        csv.Error: If ``input_file`` cannot be parsed as CSV.
    """
    # The input is opened first so a missing input never touches the output.
    # newline="" on both sides lets the csv module own line-ending handling.
    with open(input_file, "r", newline="") as infile, open(
        output_file, "a", newline=""
    ) as outfile:
        reader = csv.reader(infile)
        if skip_header:
            # The default keeps an empty file from raising StopIteration.
            next(reader, None)
        csv.writer(outfile).writerows(reader)


def append_csv_files(output_file: str, input_files: list) -> None:
    """Merge CSV files into ``output_file``, keeping only the first header.

    Files are appended strictly one after another. Appending from several
    threads at once to the same file gives no ordering guarantee: the header
    could end up in the middle of the output and rows from different inputs
    could interleave. The work is sequential writing to a single file, so the
    inputs are processed in order instead.

    Args:
        output_file: Path of the merged file; it is created or truncated.
        input_files: Paths of the CSV files to merge, in output order.

    Raises:
        OSError: If any file cannot be opened, read, or written.
        csv.Error: If an input cannot be parsed as CSV.
    """
    # Truncate once up front so re-running the cell never appends to stale data.
    with open(output_file, "w", newline=""):
        pass

    for idx, input_file in enumerate(input_files):
        process_file(input_file, output_file, idx > 0)

In [None]:
# Directory holding the raw CSV files. The trailing slash matters because the
# full paths below are built by plain string concatenation.
DATA_PATH = "../data/raw/NIDS_DATA/"

# Dataset file names. Order matters: other cells index COMB_DATA by position.
DATA_LIST = [
    "NF-BoT-IoT-v2.csv",
    "NF-CSE-CIC-IDS2018-v2.csv",
    "NF-ToN-IoT-v2.csv",
    "NF-UNSW-NB15-v2.csv",
]

# One relative path per dataset, in the same order as DATA_LIST.
COMB_DATA = [f"{DATA_PATH}{file_name}" for file_name in DATA_LIST]
COMB_DATA

In [None]:
# append_csv_files("test2.csv", COMB_DATA)

In [3]:
import polars as pl

# pl.scan_csv("test2.csv", ignore_errors=True)

In [4]:
import subprocess

# Count the data rows of test2.csv without spawning a shell or relying on an
# external `wc` binary. Newline bytes are counted in 1 MiB blocks, which is
# the same quantity `wc -l` reports, and the file is never held in memory.
with open("test2.csv", "rb") as merged_csv:
    newline_count = sum(
        block.count(b"\n") for block in iter(lambda: merged_csv.read(1 << 20), b"")
    )
num_lines = newline_count - 1  # the first line is the header, not a data row

In [None]:
import polars as pl
import random


def lazy_load_and_shuffle(
    input_files: list, output_file: str, seed: "int | None" = None
) -> None:
    """Concatenate CSV files, shuffle all rows, and write one CSV.

    Each input is scanned lazily, but the concatenated frame is materialized
    before shuffling, so the combined data must fit in memory.

    Args:
        input_files: Paths of the CSV files to combine. They are concatenated
            vertically, so they are expected to share the same columns.
        output_file: Path of the shuffled CSV to write.
        seed: Optional seed for the shuffle; pass a value for a reproducible
            row order. ``None`` gives a different order on each run.

    Raises:
        ValueError: If ``input_files`` is empty.
    """
    if not input_files:
        raise ValueError("input_files must contain at least one CSV path")

    # Build one lazy scan per file, then materialize the concatenation.
    lazy_frames = [pl.scan_csv(path) for path in input_files]
    combined_df = pl.concat(lazy_frames).collect()

    # fraction=1.0 keeps every row; shuffle=True is what randomizes the order
    # (without it polars does not promise a random ordering of the sample).
    shuffled_df = combined_df.sample(
        fraction=1.0, with_replacement=False, shuffle=True, seed=seed
    )

    # Stay in polars: write_csv is a polars method, and a pandas frame
    # (after to_pandas()) does not have it.
    shuffled_df.write_csv(output_file)


# Usage: shuffle the datasets listed in COMB_DATA into a single CSV.
# (The list of input paths is named COMB_DATA; `COMB` is not defined anywhere.)
output_file = "shuffled_output.csv"
lazy_load_and_shuffle(COMB_DATA, output_file)

In [16]:
# Read the second entry of COMB_DATA (NF-CSE-CIC-IDS2018-v2.csv) eagerly with
# polars; as the last expression of the cell, the frame is displayed.
pl.read_csv(COMB_DATA[1])

IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,DURATION_IN,DURATION_OUT,MIN_TTL,MAX_TTL,LONGEST_FLOW_PKT,SHORTEST_FLOW_PKT,MIN_IP_PKT_LEN,MAX_IP_PKT_LEN,SRC_TO_DST_SECOND_BYTES,DST_TO_SRC_SECOND_BYTES,RETRANSMITTED_IN_BYTES,RETRANSMITTED_IN_PKTS,RETRANSMITTED_OUT_BYTES,RETRANSMITTED_OUT_PKTS,SRC_TO_DST_AVG_THROUGHPUT,DST_TO_SRC_AVG_THROUGHPUT,NUM_PKTS_UP_TO_128_BYTES,NUM_PKTS_128_TO_256_BYTES,NUM_PKTS_256_TO_512_BYTES,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
str,i64,str,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str
"""13.58.98.64""",40894,"""172.31.69.25""",22,6,92.0,3164,23,3765,21,27,27,27,0,0,0,63,63,1028,52,52,1028,3164.0,3765.0,0,0,0,0,25312000,30120000,33,7,1,2,1,26883,26847,0,0,0,0,0,0,1,"""SSH-Bruteforce"""
"""213.202.230.143""",29622,"""172.31.66.103""",3389,6,0.0,1919,14,2031,11,223,219,30,0,0,0,101,101,1195,40,40,1195,1919.0,2031.0,0,0,0,0,15352000,16248000,17,6,0,1,1,8192,64000,0,0,0,0,0,0,0,"""Benign"""
"""172.31.66.5""",65456,"""172.31.0.2""",53,17,0.0,116,2,148,2,0,0,0,0,0,0,128,128,74,58,58,74,116.0,148.0,0,0,0,0,928000,1184000,4,0,0,0,0,0,0,0,0,2511,1,5,0,0,"""Benign"""
"""172.31.64.92""",57918,"""172.31.0.2""",53,17,0.0,70,1,130,1,0,0,0,0,0,0,0,0,130,70,70,130,70.0,130.0,0,0,0,0,560000,1040000,1,1,0,0,0,0,0,0,0,3371,1,60,0,0,"""Benign"""
"""18.219.32.43""",63269,"""172.31.69.25""",80,6,7.0,232,5,1136,4,223,222,27,4294827,140,0,127,127,1004,40,40,1004,232.0,1136.0,0,0,0,0,8000,9088000,8,0,0,1,0,8192,26883,0,0,0,0,0,0,1,"""DDoS attacks-LOIC-HTTP"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""172.31.66.111""",59578,"""172.31.0.2""",53,17,0.0,77,1,138,1,0,0,0,0,0,0,0,0,138,77,77,138,77.0,138.0,0,0,0,0,616000,1104000,1,1,0,0,0,0,0,0,0,14428,1,60,0,0,"""Benign"""
"""172.31.67.86""",52954,"""104.16.84.55""",443,6,91.22,1729,20,6719,16,27,27,27,0,0,0,128,128,1500,40,40,1500,1729.0,6719.0,0,0,0,0,13832000,53752000,26,3,3,1,3,8192,29200,0,0,0,0,0,0,0,"""Benign"""
"""18.218.115.60""",59664,"""172.31.69.28""",80,6,7.0,561,5,1147,5,219,219,27,4294937,29,29,127,127,975,40,40,975,561.0,1147.0,0,0,0,0,144000,304000,8,0,1,1,0,65535,26883,0,0,0,0,0,0,1,"""DDOS attack-HOIC"""
"""23.246.192.59""",1780,"""172.31.66.15""",445,6,41.0,120,3,124,3,23,19,22,0,0,0,106,106,44,40,40,44,120.0,124.0,0,0,0,0,960000,992000,6,0,0,0,0,65392,65392,0,0,0,0,0,0,0,"""Benign"""


In [None]:
CHUNK_SIZE = 5000
csv_file_list = ["file1.csv", "file2.csv", "file3.csv"]
output_file = "./result_merge/output.csv"

# Stream every input through pandas in CHUNK_SIZE-row pieces so that no file
# has to fit in memory. Only the very first chunk creates the output file and
# writes the header; every later chunk is appended without one.
# NOTE(review): assumes all inputs share the same columns in the same order,
# and that the ./result_merge/ directory already exists.
is_first_chunk = True
for csv_file_name in csv_file_list:
    # Let pandas consume each file's own header row. Passing skiprows=[0]
    # would instead promote the first data row to a header and drop it.
    chunk_container = pd.read_csv(csv_file_name, chunksize=CHUNK_SIZE)
    for chunk in chunk_container:
        chunk.to_csv(
            output_file,
            # "w" on the first chunk so a re-run replaces stale output.
            mode="w" if is_first_chunk else "a",
            header=is_first_chunk,
            index=False,
        )
        is_first_chunk = False

In [None]:
# Read the second entry of COMB_DATA (NF-CSE-CIC-IDS2018-v2.csv) fully into a
# pandas DataFrame; as the last expression of the cell, it is displayed.
pd.read_csv(COMB_DATA[1])

In [None]:
# Read the fourth entry of COMB_DATA (NF-UNSW-NB15-v2.csv) fully into a
# pandas DataFrame; as the last expression of the cell, it is displayed.
pd.read_csv(COMB_DATA[3])