In [1]:
# Filter out the unnecessary warnings so they do not clutter the cell outputs.
# NOTE(review): this is a blanket filter - every warning category is ignored,
# including deprecation warnings that may be worth seeing while debugging.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Fix the seed of NumPy's global random number generator so that any later
# step drawing from it gives the same result on every run
np.random.seed(7)

In [3]:
# Read the phishing dataset from the CSV file into a DataFrame
data = pd.read_csv("Phishing.csv")

# Preview the first ten records, transposed so that each feature is a row
data.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
having_IP_Address,-1,1,1,1,1,-1,1,1,1,1
URL_Length,1,1,0,0,0,0,0,0,0,1
Shortining_Service,1,1,1,1,-1,-1,-1,1,-1,-1
having_At_Symbol,1,1,1,1,1,1,1,1,1,1
double_slash_redirecting,-1,1,1,1,1,-1,1,1,1,1
Prefix_Suffix,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
having_Sub_Domain,-1,0,-1,-1,1,1,-1,-1,1,-1
SSLfinal_State,-1,1,-1,-1,1,1,-1,-1,1,1
Domain_registeration_length,-1,-1,-1,1,-1,-1,1,1,-1,-1
Favicon,1,1,1,1,1,1,1,1,1,1


It is generally not good practice to build machine learning models with class labels encoded as negative values: many loss functions and metrics assume binary labels in {0, 1}, so -1 labels can hurt the performance of the models. So, we change the -1 values to 0. 

In [4]:
# Rebase the values
# Rename the target column by rebinding `data` rather than mutating the frame
# in place; this is a no-op if the column has already been renamed.
data = data.rename(columns={"Result": "Class"})

# Recode only the -1 labels to 0. A dict-based .map({-1: 0, 1: 1}) would turn
# the already-recoded 0 labels into NaN if this cell were executed a second
# time; .replace leaves every other value untouched, so re-running is safe.
data["Class"] = data["Class"].replace({-1: 0})
data["Class"].unique()

array([0, 1])

We can check for missing values in the dataset like so:

In [5]:
# Count the missing (NaN) entries in each column
data.isna().sum()

having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       0
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistical_report             0
Class                          0
dtype: int64

As I mentioned earlier, missing values can be present in many different forms. Refer to the article I mentioned in the previous task's solution to know more about this. 

Let's now split the dataset in an 80:20 ratio. 

In [1]:
from sklearn.model_selection import train_test_split

In [6]:
# Separate the 30 feature columns from the target, which sits at column index 30
X = data.iloc[:, 0:30].values.astype(int)
y = data.iloc[:, 30].values.astype(int)

# Hold out 20% of the rows for testing. The integer seed is passed directly:
# np.random.seed(7) returns None, so passing its result as random_state meant
# random_state=None, and the split was repeatable only through the side effect
# of re-seeding NumPy's global generator.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

Let's serialize the splits as well. Remember that our splits are now nothing but `numpy` arrays and those can be easily serialized like the following. 

In [7]:
# Serialize the numpy arrays
np.save("X_train.npy", X_train), np.save("y_train.npy", y_train)
np.save("X_test.npy", X_train), np.save("y_test.npy", y_train)

(None, None)

You can safely ignore the output. 