# Data prep

This notebook prepares the data and splits it into the train, validation, and benchmark sets.

In [1]:
# imports
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Load the raw URL dataset from disk and preview its first rows
raw_csv_path = "datasets/malicious_phish.csv"
dataset = pd.read_csv(raw_csv_path)
dataset.head(5)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [3]:
# Rows per URL category, before any relabelling
dataset.loc[:, "type"].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [4]:
# Collapse the URL categories into a binary target:
# 0 = benign, 1 = any malicious category (phishing, defacement, malware).
labels = dataset["type"].unique().tolist()
label_array = {
    "benign": 0,
    "phishing": 1,
    "defacement": 1,
    "malware": 1
}

# Series.map turns every value that is missing from the dict into NaN, so
# check the categories first and fail loudly instead of silently corrupting
# the target (this also catches re-running the cell on already-mapped data).
unmapped = [value for value in labels if value not in label_array]
if unmapped:
    raise ValueError(f"Unmapped values in 'type' column: {unmapped}")

dataset["type"] = dataset["type"].map(label_array)
dataset.head()

Unnamed: 0,url,type
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [5]:
# Rows per class after collapsing the categories to 0 / 1
dataset.loc[:, "type"].value_counts()

type
0    428103
1    223088
Name: count, dtype: int64

In [6]:
# The "type" column now holds the binary target, so call it "label"
dataset = dataset.rename(columns={"type": "label"})
dataset.head(5)

Unnamed: 0,url,label
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,1
4,http://adventure-nicaragua.net/index.php?optio...,1


In [7]:
fraction = 0.01

# Sample the same fraction from each class separately
is_benign = dataset["label"].eq(0)
is_malicious = dataset["label"].eq(1)
benign = dataset.loc[is_benign].sample(frac=fraction, random_state=42)
malicious = dataset.loc[is_malicious].sample(frac=fraction, random_state=42)


def _split_three_ways(frame):
    """Split ``frame`` into (train, validation, benchmark) parts.

    20% of the rows are held out first, then that hold-out is cut in half,
    giving roughly an 80 / 10 / 10 split. Both steps use random_state=42.
    """
    train_part, holdout = train_test_split(frame, test_size=0.2, random_state=42)
    val_part, benchmark_part = train_test_split(holdout, test_size=0.5, random_state=42)
    return train_part, val_part, benchmark_part


# Split each class on its own
benign_train, benign_val, benign_benchmark = _split_three_ways(benign)
malicious_train, malicious_val, malicious_benchmark = _split_three_ways(malicious)

# Stack benign rows first, then malicious rows, with a fresh 0..n-1 index
train = pd.concat([benign_train, malicious_train], ignore_index=True)
validation = pd.concat([benign_val, malicious_val], ignore_index=True)
benchmark = pd.concat([benign_benchmark, malicious_benchmark], ignore_index=True)

In [8]:
# Preview the first rows of the training split
train.head(5)

Unnamed: 0,url,label
0,new-haven-connecticut-ct-dui-dwi-drunk-driving...,0
1,youtube.com/watch?v=lcHzE_8DJYk,0
2,hotcouponworld.com/forums/usercp.php,0
3,manta.com/c/mtr85l8/k101,0
4,jamescampbell.remax.com/,0


In [9]:
# Class balance of the training split
train.loc[:, "label"].value_counts()

label
0    3424
1    1784
Name: count, dtype: int64

In [10]:
# Preview the first rows of the validation split
validation.head(5)

Unnamed: 0,url,label
0,youtube.com/watch?v=HxbTgIhuwdw,0
1,kb32.com/stream-online-shimizu-s-pulse-v-urawa...,0
2,users.ipfw.edu/cooperw/,0
3,http://steamcommunity.com/sharedfiles/filedeta...,0
4,zoominfo.com/people/de%20GrandprÃ©_Jean_344774...,0


In [11]:
# Class balance of the validation split
validation.loc[:, "label"].value_counts()

label
0    428
1    223
Name: count, dtype: int64

In [12]:
# Preview the first rows of the benchmark split
benchmark.head(5)

Unnamed: 0,url,label
0,gothamist.com/2011/02/17/rip_len_uncle_leo_les...,0
1,en.wikipedia.org/wiki/Category:Icebreakers_of_...,0
2,sangeetdhwani.net/searchresult?search=A%20typi...,0
3,facebook.com/pages/Nikki-Yanofsky/335188996411,0
4,www.d4j.net/daz/www.paypal.fr/fr/cgibin/,0


In [13]:
# Class balance of the benchmark split
benchmark.loc[:, "label"].value_counts()

label
0    429
1    224
Name: count, dtype: int64

In [14]:
# Write each split to its own CSV file, without the index column
split_outputs = {
    "datasets/train.csv": train,
    "datasets/validation.csv": validation,
    "datasets/benchmark.csv": benchmark,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)