# Data prep

This notebook prepares the data: it maps the URL classes to a binary label (benign vs. malicious) and splits a sample of the data into train, validation, and benchmark sets.

In [1]:
# imports
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Load the raw URL dataset from CSV and preview its first rows.
# The preview shows two data columns: `url` and `type` (the class of the URL).
dataset = pd.read_csv("datasets/malicious_phish.csv")
dataset.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
# Count how many rows fall under each original class in the `type` column.
dataset["type"].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [4]:
# Collapse the four source classes into a binary target:
# 0 = benign, 1 = malicious (phishing, defacement or malware).
labels = dataset["type"].unique().tolist()
label_array = {
    "benign": 0,
    "phishing": 1,
    "defacement": 1,
    "malware": 1
}

# Series.map() turns every value that is missing from the dict into NaN
# without any warning, so fail loudly instead. This also protects against
# re-running the cell: the column would then already hold 0/1 and the second
# mapping would wipe it to NaN.
unmapped = [value for value in labels if value not in label_array]
if unmapped:
    raise ValueError(f"Cannot map 'type' values to a binary label: {unmapped!r}")

dataset["type"] = dataset["type"].map(label_array)
dataset.head()

Unnamed: 0,url,type
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [5]:
# Class balance after the binary mapping (0 = benign, 1 = any malicious class).
dataset["type"].value_counts()

type
0    428103
1    223088
Name: count, dtype: int64

In [6]:
# The column now holds a binary target rather than a class name, so call it `label`.
column_renames = {"type": "label"}
dataset = dataset.rename(columns=column_renames)
dataset.head()

Unnamed: 0,url,label
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [7]:
# --- Sampling / splitting configuration ---
fraction = 0.005     # share of each class that is kept
SEED = 42            # one seed for every random step, so the split is reproducible
HOLDOUT_SIZE = 0.2   # share of each class that is held out of the train split


def _three_way_split(frame, holdout_size=HOLDOUT_SIZE, seed=SEED):
    """Split ``frame`` into (train, validation, benchmark) parts.

    ``holdout_size`` of the rows is taken out of the train part, and that
    holdout is then halved into the validation and benchmark parts
    (80 / 10 / 10 with the default ``holdout_size``).
    """
    train_part, holdout = train_test_split(frame, test_size=holdout_size, random_state=seed)
    validation_part, benchmark_part = train_test_split(holdout, test_size=0.5, random_state=seed)
    return train_part, validation_part, benchmark_part


# Sample and split each class on its own, so every split keeps the class
# ratio of the full dataset.
# NOTE(review): duplicate URLs are not removed before sampling, so the same
# URL could end up in two different splits - confirm whether the raw data
# contains duplicates.
benign = dataset[dataset["label"] == 0].sample(frac=fraction, random_state=SEED)
malicious = dataset[dataset["label"] == 1].sample(frac=fraction, random_state=SEED)

benign_train, benign_val, benign_benchmark = _three_way_split(benign)
malicious_train, malicious_val, malicious_benchmark = _three_way_split(malicious)

# Combine both classes per split and renumber the rows from 0.
# The rows are not shuffled here: every split lists all benign rows first,
# followed by the malicious rows.
train = pd.concat([benign_train, malicious_train], ignore_index=True)
validation = pd.concat([benign_val, malicious_val], ignore_index=True)
benchmark = pd.concat([benign_benchmark, malicious_benchmark], ignore_index=True)

# Every sampled row must end up in exactly one of the three splits.
assert len(train) + len(validation) + len(benchmark) == len(benign) + len(malicious)

In [8]:
# Preview the first rows of the train split.
train.head()

Unnamed: 0,url,label
0,en.wikipedia.org/wiki/Blaqk_Audio,0
1,davidfongs.com/,0
2,drpeppersnapplegroup.com/brands/7up/,0
3,dallasnews.com/news/community-news/dallas/head...,0
4,192.com/atoz/people/davis/bernard/,0


In [9]:
# Class balance of the train split (0 = benign, 1 = malicious).
train["label"].value_counts()

label
0    1712
1     892
Name: count, dtype: int64

In [10]:
# Preview the first rows of the validation split.
validation.head()

Unnamed: 0,url,label
0,diva.sfsu.edu/collections/sfbatv/bundles/189349,0
1,blogs.wsj.com/speakeasy/2010/05/05/babies-can-...,0
2,123people.com/b/john+o'donoghue,0
3,adf.ly/2sLad,0
4,last.fm/music/A+Perfect+Murder,0


In [11]:
# Class balance of the validation split (0 = benign, 1 = malicious).
validation["label"].value_counts()

label
0    214
1    111
Name: count, dtype: int64

In [12]:
# Preview the first rows of the benchmark split.
benchmark.head()

Unnamed: 0,url,label
0,http://elcomercio.pe/mundo/latinoamerica/mexic...,0
1,we3d.net/ethan-hawke-welcomes-his-fourth-child/,0
2,isthisyour.name/directory/fullnames/ma~cropper...,0
3,voiceplaces.com/yoshis-at-jack-london-square-s...,0
4,dewayneinsd.com/rick-mccoy-and-parker-d-cocksu...,0


In [13]:
# Class balance of the benchmark split (0 = benign, 1 = malicious).
benchmark["label"].value_counts()

label
0    215
1    112
Name: count, dtype: int64

In [14]:
# Write each split to its own CSV file; the row index is left out of the files.
split_outputs = {
    "datasets/train.csv": train,
    "datasets/validation.csv": validation,
    "datasets/benchmark.csv": benchmark,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)