In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load the dataset from a CSV file

df = pd.read_csv("phishing.csv")
X = df.drop(["class","Index"], axis=1) # features
y = df["class"]              # target

In [4]:
# dataset sample view
df.head()

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [5]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision tree model creation

In [4]:
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [5]:
# export your model

import joblib

dtree_model = 'dtree_model.pkl'
joblib.dump(dt_classifier, dtree_model)

['dtree_model.pkl']

In [6]:
# making predictions
y_pred = dt_classifier.predict(X_test)

In [8]:
# model evaluate

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Additional evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 95.79375848032564
              precision    recall  f1-score   support

          -1       0.95      0.95      0.95       976
           1       0.96      0.96      0.96      1235

    accuracy                           0.96      2211
   macro avg       0.96      0.96      0.96      2211
weighted avg       0.96      0.96      0.96      2211

[[ 928   48]
 [  45 1190]]


# Implementing Random forest classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
# model creation
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [16]:
import pickle

# Specify the path where the model is saved
model_filename = "model.pkl"

# Open the file in binary read mode and load the model
with open(model_filename, 'wb') as model_file:
    loaded_model = pickle.dump(rf_classifier,model_file)

In [9]:
# export your model
import joblib

rf_model = 'new_rf_model.pkl'
joblib.dump(rf_classifier, rf_model)

['new_rf_model.pkl']

In [10]:
# making prediction
y_pred = rf_classifier.predict(X_test)

In [11]:
# model evaluate

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Additional evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 96.92446856625962
              precision    recall  f1-score   support

          -1       0.97      0.96      0.96       976
           1       0.97      0.98      0.97      1235

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211

[[ 937   39]
 [  29 1206]]


# Implementing support vector machine classifier

In [9]:
from sklearn.svm import SVC

In [10]:
# svm model creation
svm_classifier = SVC(kernel='rbf', C=1.0, random_state=42)
svm_classifier.fit(X_train, y_train)

SVC(random_state=42)

In [11]:
# export your model

svm_model = 'svm_model.pkl'
joblib.dump(svm_classifier, svm_model)

['svm_model.pkl']

In [7]:
# making prediction
y_pred = svm_classifier.predict(X_test)

In [8]:
# model evaluate

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Additional evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 55.85707824513795
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       976
           1       0.56      1.00      0.72      1235

    accuracy                           0.56      2211
   macro avg       0.28      0.50      0.36      2211
weighted avg       0.31      0.56      0.40      2211

[[   0  976]
 [   0 1235]]


  _warn_prf(average, modifier, msg_start, len(result))


# xgboost classifier implementation

In [12]:
import xgboost as xgb

In [13]:
# model creation

xgb_model = xgb.XGBClassifier(
    n_estimators=100,  # Number of boosting rounds (trees)
    max_depth=3,       # Maximum depth of each tree
    learning_rate=0.1, # Learning rate
    subsample=0.8,     # Fraction of samples used for training each tree
    colsample_bytree=0.8, # Fraction of features used for training each tree
)

In [14]:
df['class'] = df['class'].replace(-1, 0)

In [15]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [17]:
# export your model

xgboost_model = 'xgboost_model.pkl'
joblib.dump(xgb_model, xgboost_model)

['xgboost_model.pkl']

In [14]:
# making predictions
y_pred = xgb_model.predict(X_test)

In [15]:
# model evaluate

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

# Additional evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 95.20578923563998
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       976
           1       0.95      0.96      0.96      1235

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211

[[ 914   62]
 [  44 1191]]
