# Data prep

This notebook prepares the data and splits it into train, validation, and benchmark sets.

In [1]:
# imports
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# setting up the import paths
# Forward slashes are understood on Windows as well as on Linux/macOS,
# so the notebook runs unchanged on any platform.
phiusiil = "datasets/PhiUSIIL_Phishing_URL_Dataset.csv"
malicious_phish = "datasets/malicious_phish.csv"
url_dataset = "datasets/URL dataset.csv"

# choose which dataset to import here
to_import = "url_dataset"

# choose the fraction of data that will be used for train/val/test
# malicious_phish: 0.03
# phiusiil: 0.1
# url_dataset: 0.04
fraction = 0.04

# import dataset
# Map every supported `to_import` value to its CSV path so that a typo is
# reported immediately instead of surfacing later as an undefined `dataset`.
dataset_paths = {
    "phiusiil": phiusiil,
    "malicious_phish": malicious_phish,
    "url_dataset": url_dataset,
}
if to_import not in dataset_paths:
    raise ValueError(
        f"Unknown dataset {to_import!r}; expected one of {sorted(dataset_paths)}"
    )
dataset = pd.read_csv(dataset_paths[to_import])
dataset.head()

## malicious_phish

In [3]:
if to_import == "malicious_phish":
    # Collapse the values of the "type" column into a binary label:
    # 0 for "benign", 1 for every malicious category.
    label_array = {
        "benign": 0,
        "phishing": 1,
        "defacement": 1,
        "malware": 1
    }
    labels = dataset["type"].unique().tolist()
    # Series.map turns every value that is missing from the mapping into NaN.
    # The `label == 0` / `label == 1` filters in the final processing step
    # would then drop those rows without any notice, so fail loudly instead.
    unmapped = [value for value in labels if value not in label_array]
    if unmapped:
        raise ValueError(f"Unmapped values in 'type' column: {unmapped}")
    dataset["type"] = dataset["type"].map(label_array)
    dataset = dataset.rename(columns={"type": "label"})

## phiusiil

In [4]:
if to_import == "phiusiil":
    # Keep only the URL and label columns and rename "URL" to "url".
    # The result is built as a fresh frame through a method chain instead of
    # mutating a column subset in place, which avoids pandas'
    # SettingWithCopyWarning.
    #
    # The label is inverted: rows whose source label equals 1 become 0 and
    # every other row becomes 1.
    # NOTE(review): presumably the source file uses 1 for legitimate URLs --
    # confirm against the dataset documentation.
    dataset = (
        dataset[["URL", "label"]]
        .rename(columns={"URL": "url"})
        .assign(label=lambda frame: (frame["label"] != 1).astype("int64"))
    )

## url_dataset

In [5]:
if to_import == "url_dataset":
    # Convert the values of the "type" column into a binary label:
    # 0 for "legitimate", 1 for "phishing".
    label_array = {
        "legitimate": 0,
        "phishing": 1,
    }
    labels = dataset["type"].unique().tolist()
    # Series.map turns every value that is missing from the mapping into NaN.
    # The `label == 0` / `label == 1` filters in the final processing step
    # would then drop those rows without any notice, so fail loudly instead.
    unmapped = [value for value in labels if value not in label_array]
    if unmapped:
        raise ValueError(f"Unmapped values in 'type' column: {unmapped}")
    dataset["type"] = dataset["type"].map(label_array)
    dataset = dataset.rename(columns={"type": "label"})

## final processing

In [6]:
# Down-sample each class separately (same fraction, fixed seed) so the class
# ratio of the source dataset carries over into the sampled data.
is_benign = dataset["label"] == 0
is_malicious = dataset["label"] == 1
benign = dataset.loc[is_benign].sample(frac=fraction, random_state=42)
malicious = dataset.loc[is_malicious].sample(frac=fraction, random_state=42)


def _split_80_10_10(frame):
    """Split `frame` into train / validation / benchmark parts.

    20% of the rows are held out first; that hold-out is then halved into the
    validation and benchmark parts. Both splits use random_state=42.

    Returns:
        A (train, validation, benchmark) tuple of DataFrames.
    """
    train_part, holdout = train_test_split(frame, test_size=0.2, random_state=42)
    val_part, benchmark_part = train_test_split(holdout, test_size=0.5, random_state=42)
    return train_part, val_part, benchmark_part


# Split each class on its own, so every split contains both classes.
benign_train, benign_val, benign_benchmark = _split_80_10_10(benign)
malicious_train, malicious_val, malicious_benchmark = _split_80_10_10(malicious)

# Stack benign rows on top of malicious rows for each split; ignore_index
# discards the original row labels and renumbers the rows from 0.
train = pd.concat([benign_train, malicious_train], ignore_index=True)
validation = pd.concat([benign_val, malicious_val], ignore_index=True)
benchmark = pd.concat([benign_benchmark, malicious_benchmark], ignore_index=True)

## Preview the splits

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Number of rows per label value in the training split.
train["label"].value_counts()

In [None]:
# Preview the first rows of the validation split.
validation.head()

In [None]:
# Number of rows per label value in the validation split.
validation["label"].value_counts()

In [None]:
# Preview the first rows of the benchmark split.
benchmark.head()

In [None]:
# Number of rows per label value in the benchmark split.
benchmark["label"].value_counts()

In [13]:
# Write each split to datasets/<split name>.csv without the DataFrame index.
splits = {"train": train, "validation": validation, "benchmark": benchmark}
for split_name, split_frame in splits.items():
    split_frame.to_csv(f"datasets/{split_name}.csv", index=False)