# Data prep

This notebook prepares the data and splits it into the train, validation, and benchmark sets.

In [1]:
# imports
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Read the raw URL dataset from disk and preview its first rows
raw_csv_path = "datasets/malicious_phish.csv"
dataset = pd.read_csv(raw_csv_path)
dataset.head()

In [None]:
# Count how many rows carry each distinct value of the "type" column
dataset["type"].value_counts()

In [None]:
# Collapse the raw URL categories into a binary target:
# 0 = benign, 1 = any malicious category (phishing, defacement, malware).
labels = dataset["type"].unique().tolist()
label_array = {
    "benign": 0,
    "phishing": 1,
    "defacement": 1,
    "malware": 1,
}

# Series.map turns every value that is missing from the mapping into NaN, and
# the per-class filters used for splitting further down would then drop those
# rows without any warning. Fail loudly instead; this also catches an
# accidental re-run of this cell after the column has already been encoded
# (the values are then 0/1, which are not keys of the mapping).
unmapped_labels = [value for value in labels if value not in label_array]
if unmapped_labels:
    raise ValueError(
        f"Unexpected values in the 'type' column (no binary mapping): {unmapped_labels}"
    )

dataset["type"] = dataset["type"].map(label_array)
dataset.head()

In [None]:
# Count rows per value of "type" again, now that it has been mapped to binary codes
dataset["type"].value_counts()

In [None]:
# The encoded "type" column is the target, so call it "label" from here on
column_renames = {"type": "label"}
dataset.rename(columns=column_renames, inplace=True)
dataset.head()

In [7]:
fraction = 0.03  # share of each class kept for the experiments
seed = 42        # single seed shared by sampling and splitting


def split_train_val_benchmark(frame, holdout_size=0.2, random_state=42):
    """Split ``frame`` into train / validation / benchmark parts.

    ``holdout_size`` of the rows is held out of the training part, and that
    hold-out is then divided in half between validation and benchmark
    (roughly 80% / 10% / 10% with the default).

    Parameters:
        frame: DataFrame to split.
        holdout_size: fraction of rows kept out of the training part.
        random_state: seed passed to both ``train_test_split`` calls.

    Returns:
        A ``(train, validation, benchmark)`` tuple of DataFrames.
    """
    train_part, holdout = train_test_split(
        frame, test_size=holdout_size, random_state=random_state
    )
    val_part, benchmark_part = train_test_split(
        holdout, test_size=0.5, random_state=random_state
    )
    return train_part, val_part, benchmark_part


# Down-sample each class separately so both keep the same sampling fraction
benign = dataset[dataset["label"] == 0].sample(frac=fraction, random_state=seed)
malicious = dataset[dataset["label"] == 1].sample(frac=fraction, random_state=seed)

# Split each class on its own so every split keeps the class proportions
benign_train, benign_val, benign_benchmark = split_train_val_benchmark(
    benign, random_state=seed
)
malicious_train, malicious_val, malicious_benchmark = split_train_val_benchmark(
    malicious, random_state=seed
)

# Combine the per-class parts; ignore_index=True gives each result a fresh
# 0..n-1 index. NOTE: rows are not shuffled here, so every split lists all
# benign rows first, followed by all malicious rows.
train = pd.concat([benign_train, malicious_train], ignore_index=True)
validation = pd.concat([benign_val, malicious_val], ignore_index=True)
benchmark = pd.concat([benign_benchmark, malicious_benchmark], ignore_index=True)

# Sanity check: every sampled row ended up in exactly one of the three splits
assert len(train) + len(validation) + len(benchmark) == len(benign) + len(malicious)

In [None]:
# Preview the first rows of the training split
train.head()

In [None]:
# Class balance of the training split (rows per label value)
train["label"].value_counts()

In [None]:
# Preview the first rows of the validation split
validation.head()

In [None]:
# Class balance of the validation split (rows per label value)
validation["label"].value_counts()

In [None]:
# Preview the first rows of the benchmark split
benchmark.head()

In [None]:
# Class balance of the benchmark split (rows per label value)
benchmark["label"].value_counts()

In [14]:
# Persist each split as a CSV file, leaving the row index out of the output
output_files = {
    "datasets/train.csv": train,
    "datasets/validation.csv": validation,
    "datasets/benchmark.csv": benchmark,
}
for csv_path, split_frame in output_files.items():
    split_frame.to_csv(csv_path, index=False)