In [1]:
!which python

/mnt/netapp2/Store_uni/home/ulc/co/rpj/projects/CP-HOSfing/venv/bin/python


In [2]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import glob

from exps.utils.io_utils import *
from exps.utils.os_tree_func import graph_tree, text_tree, print_text_tree, circle_plot, square_plot, stacked_plot


# Apply seaborn's global plot styling for every figure drawn in this notebook.
# NOTE(review): color_codes=True presumably remaps matplotlib's shorthand color
# codes to the seaborn palette -- confirm against the installed seaborn version.
sns.set(color_codes=True)
%matplotlib inline 

ModuleNotFoundError: No module named 'exps'

In [None]:
# Seed Python's and NumPy's global random generators with the same value.
seed = 42
for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)

### Dataset Load

In [None]:
import os

# Locate the raw input files under $STORE (raises KeyError if STORE is unset).
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# order in which the files are later read and concatenated reproducible.
filenames = sorted(
    glob.glob(os.path.join(os.environ['STORE'], 'projects/CP-HOSfing/data/lastovicka2023/*ORIGINAL.csv'))
)

print("Summary of CSV files found:")
total_files = len(filenames)
total_size = 0

for f in filenames:
    file_name = os.path.basename(f)
    try:
        size = os.path.getsize(f)
        # Iterate line by line so the file is never held in memory as a whole.
        with open(f, 'r', encoding='utf-8', errors='ignore') as file:
            num_lines = sum(1 for _ in file)
    except OSError as e:
        # Best effort: keep summarising the remaining files, but say what went
        # wrong instead of printing sentinel values (-1 lines, negative size).
        print(f"File: {file_name:40}  unreadable ({e})")
        continue
    total_size += size
    print(f"File: {file_name:40}  Lines: {num_lines:8,}  Size: {size/1024/1024:8.2f} MB")

print(f"\nTotal number of files: {total_files}")
print(f"Total size: {total_size/1024/1024:.2f} MB")


In [None]:
def _read_raw_csv(csv_path):
    """Parse one semicolon-separated input file into a DataFrame.

    Malformed lines only produce a warning; the listed tokens are read as
    missing values. Thousands/decimal separators are left at pandas defaults.
    """
    missing_tokens = ["", " ", "-", "NA", "NaN", "null", "NULL", "n/a"]
    return pd.read_csv(
        csv_path,
        sep=";",
        on_bad_lines="warn",
        na_values=missing_tokens,
        low_memory=False,
    )


# Stack every input file into one frame with a fresh 0..n-1 index.
df = pd.concat(map(_read_raw_csv, filenames), ignore_index=True)

In [None]:
df

### Columns Selection

In [None]:
# COLUMNS SELECTION
    
# Target columns, ordered from the coarsest to the finest level.
labels = [
    'UA OS family',
    'UA OS major',
    'UA OS minor',
    'UA OS patch',
]

# Features treated as categorical.
categorical_f = [
    "TCP flags A",
    "TLS_CONTENT_TYPE",
    "TLS_HANDSHAKE_TYPE",
    "TLS_CIPHER_SUITE",
    "TLS_CLIENT_VERSION",
    "TLS_CIPHER_SUITES",
    "TLS_CLIENT_SESSION_ID",
    "TLS_EXTENSION_TYPES",
    "TLS_CLIENT_KEY_LENGTH",
    "TLS_EXTENSION_LENGTHS",
    "TLS_ELLIPTIC_CURVES",
    "TLS_EC_POINT_FORMATS",
    "IPv4DontFragmentforward",
    "tcpOptionWindowScaleforward",
    "tcpOptionSelectiveAckPermittedforward",
    "tcpOptionNoOperationforward",
    "flowEndReason",
    "TLS_JA3_FINGERPRINT",
    "IP ToS",
]

# Features treated as numerical.
numerical_f = [
    "SRC port",
    "TCP SYN Size",
    "TCP Win Size",
    "TCP SYN TTL",
    "NPM_CLIENT_NETWORK_TIME",
    "NPM_ROUND_TRIP_TIME",
    "NPM_RESPONSE_TIMEOUTS_A",
    "NPM_TCP_RETRANSMISSION_A",
    "NPM_TCP_OUT_OF_ORDER_A",
    "NPM_JITTER_DEV_A",
    "NPM_JITTER_AVG_A",
    "NPM_JITTER_MIN_A",
    "NPM_JITTER_MAX_A",
    "NPM_DELAY_DEV_A",
    "NPM_DELAY_AVG_A",
    "NPM_DELAY_MIN_A",
    "NPM_DELAY_MAX_A",
    "NPM_DELAY_HISTOGRAM_1_A",
    "TLS_SETUP_TIME",
    "tcpOptionMaximumSegmentSizeforward",
]

# All features: numerical ones first, then the categorical ones.
features_list = [*numerical_f, *categorical_f]

# Keep only the selected feature and label columns; everything else is removed.
columns_to_keep = set(features_list) | set(labels)
df.drop(
    columns=[name for name in df.columns if name not in columns_to_keep],
    inplace=True,
)

# Remove float/int columns whose variance is not positive (constant columns).
# Columns whose variance is NaN do not satisfy the comparison and are kept.
numeric_columns = df.select_dtypes(include=['float', 'int'])
column_variances = numeric_columns.var()
low_variance_columns = column_variances.index[column_variances <= 0]
df.drop(columns=low_variance_columns, inplace=True)

### Rows Selection

In [None]:
# ROWS SELECTION

# Rows lacking either of these values are discarded.
drop_nans_rows = ["TCP SYN Size", "TCP Win Size"]
df.dropna(subset=drop_nans_rows, inplace=True)


def _round_up_to_power_of_two(ttl):
    """Return the smallest power of two that is >= ttl, as a float."""
    exponent = np.ceil(np.log2(ttl))
    return 2 ** exponent


# Replace every "TCP SYN TTL" value by the next power of two (exact powers stay put).
df["TCP SYN TTL"] = df["TCP SYN TTL"].map(_round_up_to_power_of_two)

### Labelling Adjustment

#### Column Renaming

In [None]:
# LABELLING ADJUSTMENT

# Strip the "UA " prefix from the four label columns and keep `labels`
# in sync with the new column names.
labels = ['OS family', 'OS major', 'OS minor', 'OS patch']
df.rename(columns={f"UA {new_name}": new_name for new_name in labels}, inplace=True)

In [None]:
# The patch level is not used as a label and is removed.
# An alternative that was tried and abandoned: merging "OS minor" and "OS patch"
# into a single "<minor>_<patch>" value, which artificially inflated the number
# of classes at the minor level.
df.drop(columns='OS patch', inplace=True)

# Remaining hierarchy levels plus a short alias for each of them.
labels = ['OS family', 'OS major', 'OS minor']
l_family, l_major, l_minor = labels

#### OS Classes Adjustment 

In [None]:
# Cast every label column to str (missing values become the text 'nan').
df[labels] = df[labels].astype(str)

In [None]:
# Placeholders meaning "unknown" are unified into one marker per level:
# '<MUnk>' for the major level and '<mUnk>' for the minor level.
unknown_placeholders = ['*', 'nan', None, np.nan]
for level_column, unknown_marker in ((l_major, '<MUnk>'), (l_minor, '<mUnk>')):
    df[level_column] = df[level_column].replace(
        dict.fromkeys(unknown_placeholders, unknown_marker)
    )

##### Linux

In [None]:
# Families that are relabelled as plain "Linux".
linux_aliases = ['Symbian OS', 'Slackware', 'Linux Mint']
df.loc[df[l_family].isin(linux_aliases), l_family] = 'Linux'

In [None]:
# Fold the Linux distributions into one family and push the hierarchy one level
# down: old family -> major, old major -> minor (the old minor is overwritten).
linux_distros = {'Linux', 'Ubuntu', 'Fedora'}
is_linux_like = df[l_family].isin(linux_distros)

# Snapshot both source columns before any of them is overwritten.
shifted_major = df.loc[is_linux_like, l_family].copy().values
shifted_minor = df.loc[is_linux_like, l_major].copy().values

df.loc[is_linux_like, l_family] = 'Linux'
df.loc[is_linux_like, l_major] = shifted_major
df.loc[is_linux_like, l_minor] = shifted_minor

# The family column is final from here on; the major column is re-read after
# each write because the next rule depends on the previous one.
is_linux = df[l_family].eq('Linux')

# Generic "Linux" rows whose (shifted) minor is 9 or 10 are labelled Debian.
mask_linux_debian = is_linux & df[l_major].eq('Linux') & df[l_minor].isin(['9', '10'])
df.loc[mask_linux_debian, l_major] = 'Debian'

# Any remaining generic "Linux" major carries no distribution information.
linux_MUnk_mask = is_linux & df[l_major].eq('Linux')
df.loc[linux_MUnk_mask, l_major] = '<MUnk>'

# A major-level unknown marker that was shifted into the minor column is
# replaced by the minor-level marker.
linux_minor_MUnk_mask = is_linux & df[l_minor].eq('<MUnk>')
df.loc[linux_minor_MUnk_mask, l_minor] = '<mUnk>'


##### Windows

In [None]:
# Coarse Windows major classes: several releases collapse into 'Legacy';
# releases that are not listed keep their current value.
WIN_MAP = {
    '3.1':'Legacy','95':'Legacy','98':'Legacy','ME':'Legacy','2000':'Legacy',
    'XP':'XP','Vista':'Vista','7':'7','8':'8','10':'10','NT':'Legacy','CE':'Legacy'
}

df[l_major] = df[l_major].astype(str)

# Map the major column element-wise instead of applying a function to whole rows:
# DataFrame.apply(axis=1) materialises a Series per row just to read one field,
# and it does not return a Series when the Windows selection is empty.
mask_windows = df[l_family] == 'Windows'
df.loc[mask_windows, l_major] = df.loc[mask_windows, l_major].map(
    lambda major: WIN_MAP.get(major, major)
)

##### macOS

In [None]:
# Use the current product name for the family.
df.loc[df[l_family].eq('Mac OS X'), l_family] = 'macOS'

# Rows recorded as macOS 10 / 16.0 are relabelled as major 11 with an unknown minor.
mask_macos_10_16 = (
    df[l_family].eq('macOS')
    & df[l_major].eq('10')
    & df[l_minor].eq('16.0')
)
for level_column, new_value in ((l_major, '11'), (l_minor, '<mUnk>')):
    df.loc[mask_macos_10_16, level_column] = new_value

##### ChromeOS

In [None]:
# Use the single-word spelling for the Chrome OS family label.
is_chrome_os = df[l_family].eq('Chrome OS')
df.loc[is_chrome_os, l_family] = 'ChromeOS'

def norm_chromeos_major(maj):
    """Shorten a ChromeOS major value to its first three digits.

    The value is converted with ``str``. If the text consists only of digits
    and is at least three characters long, its first three characters are
    returned (13020 -> '130', 12239 -> '122'); anything else yields '<MUnk>'.
    """
    text = str(maj)
    if len(text) < 3 or not text.isdigit():
        return '<MUnk>'
    return text[:3]

# ChromeOS rows: shorten the major value and discard the minor level.
mask_chrome = df[l_family].eq('ChromeOS')
chrome_majors = df.loc[mask_chrome, l_major]
df.loc[mask_chrome, l_major] = chrome_majors.map(norm_chromeos_major)
# The minor level is always set to the unknown marker for ChromeOS.
df.loc[mask_chrome, l_minor] = '<mUnk>'

##### Others

In [None]:
# Upper-case spelling for the catch-all family.
df.loc[df[l_family].eq('Other'), l_family] = 'OTHER'

# Outside the catch-all family, an unknown major becomes 'OTHER'.
mask_major_munk = df[l_major].eq('<MUnk>') & df[l_family].ne('OTHER')
df.loc[mask_major_munk, l_major] = 'OTHER'

# For these (family, major) pairs an unknown minor becomes 'OTHER'.
minor_other_pairs = (
    ('Windows', '8'),
    ('iOS', '12'),
    ('Linux', 'Ubuntu'),
)
for family_name, major_name in minor_other_pairs:
    unknown_minor = (
        df[l_family].eq(family_name)
        & df[l_major].eq(major_name)
        & df[l_minor].eq('<mUnk>')
    )
    df.loc[unknown_minor, l_minor] = 'OTHER'

#### Plotting

##### Circle Plotting

In [None]:
# circle_plot(df, labels).show()

##### Square Plotting

In [None]:
# square_plot(df, labels).show()

##### Stacked Plotting

In [None]:
# stacked_plot(df, labels).show()

##### Text Tree

In [None]:
# Build the text tree for the label columns, then print it.
os_text_tree = text_tree(df, labels)
print_text_tree(os_text_tree)

##### Graph Tree

###### (without unknown cut)

In [None]:
# Call the project helper graph_tree on the label columns with
# max_children_per_node=25; its return value is the cell's displayed output.
# NOTE(review): presumably at most 25 children are drawn per node -- confirm
# in exps.utils.os_tree_func.
graph_tree(df, labels, max_children_per_node=25)

###### (with unknown cut)

In [None]:
# Same graph_tree call with ignore_unknown=True added.
# NOTE(review): presumably this leaves the '<MUnk>'/'<mUnk>' nodes out of the
# drawing -- confirm in exps.utils.os_tree_func.
graph_tree(df, labels, max_children_per_node=25,ignore_unknown=True)

##### Statistics per level (`value_counts`)

In [None]:
# Print the class frequencies for each prefix of the label hierarchy:
# family, then family+major, then family+major+minor.
for depth, _ in enumerate(labels, start=1):
    level_columns = labels[:depth]
    print(f"Value counts for: {level_columns}")
    print(df[level_columns].value_counts())
    print()

In [None]:
# Overview of the prepared frame: size, schema, missing values and distributions.
n_rows, n_columns = len(df), df.shape[1]
print("Number of rows:", n_rows)
print("Number of columns:", n_columns)

# Each entry is printed as a heading line followed by the corresponding report.
summary_sections = (
    ("Column names:", df.columns.tolist()),
    ("\nData types:", df.dtypes),
    ("\nNumber of missing values per column:", df.isnull().sum()),
    ("\nBasic stats for numeric columns:", df.describe()),
)
for heading, report in summary_sections:
    print(heading)
    print(report)

print("\nBasic stats for categorical columns:")
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    counts = df[col].value_counts(dropna=False)
    print(f"\nValue counts for '{col}':")
    print(counts)


### Save to disk

In [None]:
# Write the prepared frame next to the raw inputs, one "*PREPARED.csv" per
# "*ORIGINAL.csv" file.
# Only the raw inputs are enumerated: a broader "*.csv" pattern also matches
# files without the ORIGINAL suffix (including earlier PREPARED outputs), whose
# names would stay unchanged and which would therefore be overwritten in place.
source_suffix = 'ORIGINAL.csv'
target_suffix = 'PREPARED.csv'
csv_files = sorted(
    glob.glob(os.path.join(os.environ['STORE'], f'projects/CP-HOSfing/data/lastovicka2023/*{source_suffix}'))
)

for csv_file in csv_files:
    # Swap only the trailing suffix so no other part of the path can be altered.
    prepared_path = csv_file[:-len(source_suffix)] + target_suffix
    # NOTE(review): df holds the rows of all input files combined, so every
    # output file receives the complete frame -- confirm that one full copy per
    # input file is intended.
    df.to_csv(
        prepared_path,
        sep=";",
        index=False,
        na_rep="",
    )