### Preparing Dataset with same distribution as the complete dataset

In [None]:
import pandas as pd
import os
from glob import glob

# Define the folder path where your CSV files are stored
data_folder = "/content/drive/MyDrive/Group5/Data"

# Get a list of CSV files matching the pattern (adjust the pattern if needed)
csv_files = glob(os.path.join(data_folder, "Network_dataset_*.csv"))

# Dictionary to hold the total counts of each label
label_counts_total = {}

# Loop over each CSV file
for file in csv_files:
    print(f"Processing {file} ...")
    try:
        # Read only the 'label' column; change 'label' if your column has a different name
        df = pd.read_csv(file, usecols=["label"])
    except Exception as e:
        print(f"Error reading {file}: {e}")
        continue

    # Count label occurrences in the current file
    counts = df["label"].value_counts().to_dict()

    # Update the overall counts
    for label, count in counts.items():
        label_counts_total[label] = label_counts_total.get(label, 0) + count

# Display the overall label distribution
print("Overall label distribution:")
print(label_counts_total)


Processing /content/drive/MyDrive/Group5/Data/Network_dataset_1.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_10.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_11.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_13.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_14.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_12.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_16.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_17.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_15.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_18.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_19.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_20.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_2.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network

In [None]:
import pandas as pd
import os
from glob import glob

# Define the folder path where your CSV files are stored
data_folder = "/content/drive/MyDrive/Group5/Data"

# Get a list of CSV files matching the pattern
csv_files = glob(os.path.join(data_folder, "Network_dataset_*.csv"))

# Dictionary to hold the total counts of each type
type_counts_total = {}

# Loop over each CSV file
for file in csv_files:
    print(f"Processing {file} ...")
    try:
        # Read only the 'type' column; adjust the column name if necessary
        df = pd.read_csv(file, usecols=["type"])
    except Exception as e:
        print(f"Error reading {file}: {e}")
        continue

    # Count type occurrences in the current file
    counts = df["type"].value_counts().to_dict()

    # Update the overall counts
    for t, count in counts.items():
        type_counts_total[t] = type_counts_total.get(t, 0) + count

# Display the overall type distribution
print("Overall 'type' distribution:")
print(type_counts_total)


Processing /content/drive/MyDrive/Group5/Data/Network_dataset_1.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_10.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_11.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_13.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_14.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_12.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_16.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_17.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_15.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_18.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_19.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_20.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_2.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network

In [None]:
import pandas as pd
import os
from glob import glob
from sklearn.model_selection import StratifiedShuffleSplit

# Define the folder path where your CSV files are stored
data_folder = "/content/drive/MyDrive/Group5/Data"
csv_files = glob(os.path.join(data_folder, "Network_dataset_*.csv"))

# --- Step 1: Find the common columns across all CSV files ---
common_columns = None
for file in csv_files:
    try:
        # Read only header of the CSV file
        df_header = pd.read_csv(file, nrows=0)
        file_columns = set(df_header.columns)
        if common_columns is None:
            common_columns = file_columns
        else:
            common_columns = common_columns.intersection(file_columns)
    except Exception as e:
        print(f"Error reading header from {file}: {e}")

common_columns = list(common_columns)
print("Common columns across all CSV files:")
print(common_columns)

# --- Step 2: Sample rows from each file using only the common columns ---
sample_list = []
chunksize = 100000
target_rows = 1000000
# Calculate sampling fraction based on approximate total rows (adjust as needed)
# Here, using an estimate of 22,339,021 rows in the full dataset.
sampling_frac = target_rows / 22339021
print(f"Using a sampling fraction of {sampling_frac:.4f} per chunk.")

for file in csv_files:
    print(f"Processing {file} ...")
    try:
        for chunk in pd.read_csv(file, usecols=common_columns, chunksize=chunksize):
            # Sample a fraction of rows from the chunk with a fixed seed for reproducibility
            sample_chunk = chunk.sample(frac=sampling_frac, random_state=42)
            sample_list.append(sample_chunk)
    except Exception as e:
        print(f"Error processing {file}: {e}")

# Combine all sampled chunks
sample_df = pd.concat(sample_list, ignore_index=True)
print(f"Initial sampled rows: {len(sample_df)}")

# If the sampled rows are less than the target, adjust target_rows
if len(sample_df) < target_rows:
    print(f"Sampled rows ({len(sample_df)}) are less than target ({target_rows}). Using all sampled rows.")
    target_rows = len(sample_df)

# --- Step 3: Stratified Sampling to get exactly target_rows if needed ---
if len(sample_df) == target_rows:
    subset_df = sample_df.copy()
else:
    # We assume that the common columns include 'type' and 'label' needed for stratification.
    # Create a helper column combining 'type' and 'label'
    sample_df["stratify_group"] = sample_df["type"].astype(str) + "_" + sample_df["label"].astype(str)

    sss = StratifiedShuffleSplit(n_splits=1, train_size=target_rows, random_state=42)
    for train_index, _ in sss.split(sample_df, sample_df["stratify_group"]):
        subset_df = sample_df.iloc[train_index].copy()
    # Drop the helper column after sampling
    subset_df = subset_df.drop(columns=["stratify_group"])

print(f"Final subset rows: {len(subset_df)}")

# --- Step 4: Save the subset without extra index or columns ---
subset_df.to_csv("subset_1M.csv", index=False)
print("Subset saved to 'subset_1M.csv'.")


Common columns across all CSV files:
['conn_state', 'weird_addl', 'http_orig_mime_types', 'dst_bytes', 'http_trans_depth', 'http_status_code', 'duration', 'src_pkts', 'dns_qclass', 'dst_ip', 'http_referrer', 'http_user_agent', 'dns_AA', 'dns_RA', 'type', 'dst_ip_bytes', 'src_ip', 'http_uri', 'http_method', 'dst_port', 'ssl_cipher', 'src_bytes', 'missed_bytes', 'ssl_established', 'service', 'http_response_body_len', 'dns_RD', 'http_version', 'label', 'http_request_body_len', 'ssl_subject', 'dst_pkts', 'dns_rejected', 'src_port', 'dns_rcode', 'ts', 'dns_query', 'dns_qtype', 'ssl_version', 'http_resp_mime_types', 'ssl_resumed', 'weird_notice', 'weird_name', 'ssl_issuer', 'proto', 'src_ip_bytes']
Using a sampling fraction of 0.0448 per chunk.
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_1.csv ...


  for chunk in pd.read_csv(file, usecols=common_columns, chunksize=chunksize):
  for chunk in pd.read_csv(file, usecols=common_columns, chunksize=chunksize):


Processing /content/drive/MyDrive/Group5/Data/Network_dataset_10.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_11.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_13.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_14.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_12.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_16.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_17.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_15.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_18.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_19.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_20.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_2.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_22.csv ...


  for chunk in pd.read_csv(file, usecols=common_columns, chunksize=chunksize):
  for chunk in pd.read_csv(file, usecols=common_columns, chunksize=chunksize):
  for chunk in pd.read_csv(file, usecols=common_columns, chunksize=chunksize):


Processing /content/drive/MyDrive/Group5/Data/Network_dataset_23.csv ...


  for chunk in pd.read_csv(file, usecols=common_columns, chunksize=chunksize):


Processing /content/drive/MyDrive/Group5/Data/Network_dataset_21.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_3.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_5.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_4.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_6.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_7.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_8.csv ...
Processing /content/drive/MyDrive/Group5/Data/Network_dataset_9.csv ...
Initial sampled rows: 999895
Sampled rows (999895) are less than target (1000000). Using all sampled rows.
Final subset rows: 999895
Subset saved to 'subset_1M.csv'.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the subset from the CSV file
subset_df = pd.read_csv("subset_1M.csv")

# Create a helper column for stratification that combines 'type' and 'label'
subset_df["stratify_group"] = subset_df["type"].astype(str) + "_" + subset_df["label"].astype(str)

# Split into 20% labeled and 80% unlabeled portions using stratification
labeled_df, unlabeled_df = train_test_split(
    subset_df,
    test_size=0.8,   # 80% of the data will be unlabeled
    stratify=subset_df["stratify_group"],
    random_state=42
)

# Drop the helper column as it's no longer needed
labeled_df = labeled_df.drop(columns=["stratify_group"])
unlabeled_df = unlabeled_df.drop(columns=["stratify_group"])

# Function to print counts with percentages
def print_distribution(df, column_name):
    counts = df[column_name].value_counts()
    percentages = df[column_name].value_counts(normalize=True) * 100
    for category in counts.index:
        print(f"{category}: {counts[category]} ({percentages[category]:.2f}%)")

# Display information for the labeled portion
print("Labeled portion shape:", labeled_df.shape)
print("\nLabeled portion distribution (label counts):")
print_distribution(labeled_df, "label")
print("\nLabeled portion distribution (type counts):")
print_distribution(labeled_df, "type")

# Display information for the unlabeled portion
print("\nUnlabeled portion shape:", unlabeled_df.shape)
print("\nUnlabeled portion distribution (label counts):")
print_distribution(unlabeled_df, "label")
print("\nUnlabeled portion distribution (type counts):")
print_distribution(unlabeled_df, "type")

# Optionally, save the splits for further processing
labeled_df.to_csv("labeled_20pct.csv", index=False)
unlabeled_df.to_csv("unlabeled_80pct.csv", index=False)
print("\nSplits saved as 'labeled_20pct.csv' and 'unlabeled_80pct.csv'.")


  subset_df = pd.read_csv("subset_1M.csv")


Labeled portion shape: (199979, 46)

Labeled portion distribution (label counts):
1: 192786 (96.40%)
0: 7193 (3.60%)

Labeled portion distribution (type counts):
scanning: 63915 (31.96%)
ddos: 55159 (27.58%)
dos: 30188 (15.10%)
xss: 18865 (9.43%)
password: 15388 (7.69%)
normal: 7193 (3.60%)
backdoor: 4545 (2.27%)
injection: 4058 (2.03%)
ransomware: 658 (0.33%)
mitm: 10 (0.01%)

Unlabeled portion shape: (799916, 46)

Unlabeled portion distribution (label counts):
1: 771145 (96.40%)
0: 28771 (3.60%)

Unlabeled portion distribution (type counts):
scanning: 255661 (31.96%)
ddos: 220637 (27.58%)
dos: 120754 (15.10%)
xss: 75459 (9.43%)
password: 61550 (7.69%)
normal: 28771 (3.60%)
backdoor: 18180 (2.27%)
injection: 16234 (2.03%)
ransomware: 2632 (0.33%)
mitm: 38 (0.00%)

Splits saved as 'labeled_20pct.csv' and 'unlabeled_80pct.csv'.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the labeled portion from the CSV file
labeled_df = pd.read_csv("labeled_20pct.csv")

# Create a helper column for stratification combining 'type' and 'label'
labeled_df["stratify_group"] = labeled_df["type"].astype(str) + "_" + labeled_df["label"].astype(str)

# Define the proportions
# We'll reserve 15% of the labeled data for the test set.
test_frac = 0.15

# Split off the test set
labeled_remaining, test_df = train_test_split(
    labeled_df,
    test_size=test_frac,
    stratify=labeled_df["stratify_group"],
    random_state=42
)

# Now, we want the training set to be 70% of the labeled data and validation 15%.
# Since we already took out 15% for test, the remaining 85% should be split into:
# Training: 70/85 ≈ 0.8235 of the remaining data
# Validation: 15/85 ≈ 0.1765 of the remaining data

train_frac = 0.8235  # approximately 70% overall
labeled_remaining["stratify_group"] = labeled_remaining["type"].astype(str) + "_" + labeled_remaining["label"].astype(str)
train_df, val_df = train_test_split(
    labeled_remaining,
    test_size=(1 - train_frac),  # remaining fraction for validation
    stratify=labeled_remaining["stratify_group"],
    random_state=42
)

# Drop the helper stratification column from all splits
for df in [train_df, val_df, test_df]:
    df.drop(columns=["stratify_group"], inplace=True)

# Function to print counts with percentages
def print_distribution(df, column_name, set_name):
    print(f"\n{set_name} distribution for {column_name}:")
    counts = df[column_name].value_counts()
    percentages = df[column_name].value_counts(normalize=True) * 100
    for category in counts.index:
        print(f"  {category}: {counts[category]} ({percentages[category]:.2f}%)")

# Print shapes and distributions for each split
print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

print_distribution(train_df, "label", "Training")
print_distribution(train_df, "type", "Training")

print_distribution(val_df, "label", "Validation")
print_distribution(val_df, "type", "Validation")

print_distribution(test_df, "label", "Test")
print_distribution(test_df, "type", "Test")

# Optionally, save the splits for further processing
train_df.to_csv("train_70pct.csv", index=False)
val_df.to_csv("validation_15pct.csv", index=False)
test_df.to_csv("test_15pct.csv", index=False)
print("\nSplits saved as 'train_70pct.csv', 'validation_15pct.csv', and 'test_15pct.csv'.")


  labeled_df = pd.read_csv("labeled_20pct.csv")


Training set shape: (139980, 46)
Validation set shape: (30002, 46)
Test set shape: (29997, 46)

Training distribution for label:
  1: 134945 (96.40%)
  0: 5035 (3.60%)

Training distribution for type:
  scanning: 44739 (31.96%)
  ddos: 38610 (27.58%)
  dos: 21131 (15.10%)
  xss: 13205 (9.43%)
  password: 10771 (7.69%)
  normal: 5035 (3.60%)
  backdoor: 3181 (2.27%)
  injection: 2840 (2.03%)
  ransomware: 460 (0.33%)
  mitm: 8 (0.01%)

Validation distribution for label:
  1: 28923 (96.40%)
  0: 1079 (3.60%)

Validation distribution for type:
  scanning: 9589 (31.96%)
  ddos: 8275 (27.58%)
  dos: 4529 (15.10%)
  xss: 2830 (9.43%)
  password: 2309 (7.70%)
  normal: 1079 (3.60%)
  backdoor: 682 (2.27%)
  injection: 609 (2.03%)
  ransomware: 99 (0.33%)
  mitm: 1 (0.00%)

Test distribution for label:
  1: 28918 (96.40%)
  0: 1079 (3.60%)

Test distribution for type:
  scanning: 9587 (31.96%)
  ddos: 8274 (27.58%)
  dos: 4528 (15.09%)
  xss: 2830 (9.43%)
  password: 2308 (7.69%)
  normal: 107