In [1]:
import numpy as np
import pandas as pd 
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
import pickle
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("dataset_phishing.csv")

In [4]:
df.head()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [5]:
df.shape

(11430, 89)

In [7]:
df.drop("url",axis="columns",inplace=True)

In [9]:
df["status"] = df["status"].apply(lambda x: 0 if x=="phishing" else 1)

In [10]:
df.head()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,37,19,0,3,0,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,1
1,77,23,1,1,0,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,0
2,126,50,1,4,1,0,1,2,0,3,...,1,0,0,14,4004,5828815,0,1,0,0
3,18,11,0,2,0,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,1
4,55,15,0,2,2,0,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,1


In [11]:
df.describe()

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
count,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,...,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0,11430.0
mean,61.126684,21.090289,0.150569,2.480752,0.99755,0.022222,0.141207,0.162292,0.0,0.293176,...,0.775853,0.439545,0.072878,492.532196,4062.543745,856756.6,0.020122,0.533946,3.185739,0.5
std,55.297318,10.777171,0.357644,1.369686,2.087087,0.1555,0.364456,0.821337,0.0,0.998317,...,0.417038,0.496353,0.259948,814.769415,3107.7846,1995606.0,0.140425,0.498868,2.536955,0.500022
min,12.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,-12.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,15.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,84.0,972.25,0.0,0.0,0.0,1.0,0.0
50%,47.0,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,242.0,3993.0,1651.0,0.0,1.0,3.0,0.5
75%,71.0,24.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,449.0,7026.75,373845.5,0.0,1.0,5.0,1.0
max,1641.0,214.0,1.0,24.0,43.0,4.0,3.0,19.0,0.0,19.0,...,1.0,1.0,1.0,29829.0,12874.0,10767990.0,1.0,1.0,10.0,1.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11430 entries, 0 to 11429
Data columns (total 88 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   length_url                  11430 non-null  int64  
 1   length_hostname             11430 non-null  int64  
 2   ip                          11430 non-null  int64  
 3   nb_dots                     11430 non-null  int64  
 4   nb_hyphens                  11430 non-null  int64  
 5   nb_at                       11430 non-null  int64  
 6   nb_qm                       11430 non-null  int64  
 7   nb_and                      11430 non-null  int64  
 8   nb_or                       11430 non-null  int64  
 9   nb_eq                       11430 non-null  int64  
 10  nb_underscore               11430 non-null  int64  
 11  nb_tilde                    11430 non-null  int64  
 12  nb_percent                  11430 non-null  int64  
 13  nb_slash                    114

In [13]:
Y = df[["status"]].values
X = df.drop("status", axis="columns")

In [14]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)

In [15]:
pca = PCA(n_components=10)
x_pca = pca.fit_transform(x_scaled)

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(x_pca,Y,test_size=0.2)

In [29]:
model = RandomForestClassifier(n_estimators=20)
model.fit(X_train,Y_train)

RandomForestClassifier(n_estimators=20)

In [30]:
model.score(X_test,Y_test)

0.9238845144356955

In [31]:
cross_val_score(RandomForestClassifier(n_estimators=10), x_pca,Y,cv=10)

array([0.94138233, 0.92125984, 0.9343832 , 0.92650919, 0.92038495,
       0.93000875, 0.9256343 , 0.92738408, 0.91426072, 0.9063867 ])

In [32]:
with open("model", "wb") as file:
    pickle.dump(model, file)