In [303]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [304]:
#import pandas and read the dataset

import pandas as pd
data = pd.read_csv('/content/drive/MyDrive/PhishingDetectionModel/Phishing Detection Model/dataset.csv')

In [305]:
#filter rows with missing values
data = data.dropna(axis=0)

In [306]:
data.columns

Index(['id', 'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',
       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',
       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',
       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',
       'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle',
       'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT',
       'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
       'PctExtNullSelfRedirectHyperl

In [307]:
#select features to work with
features = ['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',
       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',
       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',
       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',
       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',
       'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle',
       'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT',
       'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
       'PctExtNullSelfRedirectHyperlinksRT']

X = data[features]

#Choosing target label
y = data.CLASS_LABEL

In [308]:
#importing required libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#shuffling the data

from sklearn.utils import shuffle

X, y = shuffle(X, y, random_state=42)

#Scaling the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

#splitting the data into train and test sets
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42)

**Importing Models**

In [309]:
import keras
import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

In [310]:
# NN Model
base_model_1 = keras.Sequential()

#Random Forest Model
base_model_2 = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

#Gradient Boosting Model
base_model_3 = GradientBoostingClassifier(n_estimators=70, learning_rate=0.1, max_depth=5, random_state=12)

#SVC Model
base_model_4 = SVC(probability=True, random_state=10, C=1.0, kernel='rbf')  # SVC with probability=True

#Logistic Regression
base_model_5 = LogisticRegression(random_state=52, C=6, max_iter=500)

In [311]:
# Add input layer
base_model_1.add(keras.layers.Input(shape=(train_X.shape[1],)))

# Add hidden layers
base_model_1.add(keras.layers.Dense(64, activation='relu'))
base_model_1.add(keras.layers.Dense(32, activation='relu'))

# Add output layer
base_model_1.add(keras.layers.Dense(1, activation='sigmoid'))

# Compile the model
base_model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the NN Model
base_model_1.fit(train_X, train_y, epochs=9, batch_size=32, validation_data=(val_X, val_y))

# Fit the Random Forest Model
base_model_2.fit(train_X, train_y)

# Fit the Gradient Boosting Model
base_model_3.fit(train_X, train_y)

# Fit the SVC Model
base_model_4.fit(train_X, train_y)

# Fit the Logistic Regression Model
base_model_5.fit(train_X, train_y)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [312]:
# Make predictions using your base classifiers
pred1 = base_model_1.predict(val_X)
pred2 = base_model_2.predict(val_X)
pred3 = base_model_3.predict(val_X)
pred4 = base_model_4.predict(val_X)
pred5 = base_model_5.predict(val_X)



In [313]:
from sklearn.metrics import accuracy_score

# Define a threshold to convert probabilities to binary predictions
threshold = 0.5

# Calculate and display the accuracy score for each base model
accuracy_scores = []

models = ["Base Model 1", "Base Model 2", "Base Model 3", "Base Model 4", "Base Model 5"]
predictions = [pred1, pred2, pred3, pred4, pred5]

for model_name, predictions in zip(models, predictions):
    binary_predictions = (predictions > threshold).astype(int)
    accuracy = accuracy_score(val_y, binary_predictions)
    accuracy_scores.append((model_name, accuracy))
    print(f'{model_name} Accuracy: {accuracy * 100:.2f}%')

# Display the accuracy scores for each model
print("\nAccuracy Scores:")
for model_name, accuracy in accuracy_scores:
    print(f'{model_name}: {accuracy * 100:.2f}%')


Base Model 1 Accuracy: 96.70%
Base Model 2 Accuracy: 97.40%
Base Model 3 Accuracy: 98.30%
Base Model 4 Accuracy: 96.15%
Base Model 5 Accuracy: 94.05%

Accuracy Scores:
Base Model 1: 96.70%
Base Model 2: 97.40%
Base Model 3: 98.30%
Base Model 4: 96.15%
Base Model 5: 94.05%


In [314]:
# Define the weights for each base classifier
weights = [0.3, 0.4, 0.3, 0.2, 0.5]

# Calculate the weighted sum of predictions
weighted_predictions = (weights[0] * pred1 + weights[1] * pred2 + weights[2] * pred3 + weights[3] * pred4 + weights[4] * pred5)

In [315]:
# Calculate the probability of the positive class for each base model
probs1 = base_model_1.predict(val_X).flatten()  # For neural network
probs2 = base_model_2.predict_proba(val_X)[:, 1]  # For Random Forest
probs3 = base_model_3.predict_proba(val_X)[:, 1]  # For Gradient Boosting
probs4 = base_model_4.predict_proba(val_X)[:, 1]  # For SVC
probs5 = base_model_5.predict_proba(val_X)[:, 1]  # For Logistic Regression

# Calculate the weighted sum of probabilities
weighted_probabilities = (weights[0] * probs1 + weights[1] * probs2 + weights[2] * probs3 + weights[3] * probs4 + weights[4] * probs5)

# Choose a threshold to convert the weighted sum into binary predictions
threshold = 0.5
final_binary_predictions = (weighted_probabilities > threshold).astype(int)




In [316]:
print("Shape of weighted_predictions:", weighted_predictions.shape)


Shape of weighted_predictions: (2000, 2000)


In [317]:
import numpy as np

print("val_y unique values:", np.unique(val_y))
print("val_y data type:", type(val_y))
print("final_binary_predictions unique values:", np.unique(final_binary_predictions))
print("final_binary_predictions data type:", type(final_binary_predictions))


val_y unique values: [0 1]
val_y data type: <class 'pandas.core.series.Series'>
final_binary_predictions unique values: [0 1]
final_binary_predictions data type: <class 'numpy.ndarray'>


In [318]:
# Check the shape of val_y and final_binary_predictions
if val_y.shape == final_binary_predictions.shape:
    accuracy = accuracy_score(val_y, final_binary_predictions)
    print(f'Ensemble Accuracy: {accuracy * 100:.2f}%')
else:
    print("Shape mismatch between val_y and final_binary_predictions.")



Ensemble Accuracy: 96.55%


In [319]:
print("val_X shape:", val_X.shape)
print("val_y shape:", val_y.shape)


val_X shape: (2000, 48)
val_y shape: (2000,)


In [320]:
print(len(val_y))
print(len(final_binary_predictions))

2000
2000


In [321]:
print("val_y shape:", val_y.shape)
print("val_y data type:", type(val_y))
print("final_binary_predictions shape:", final_binary_predictions.shape)
print("final_binary_predictions data type:", type(final_binary_predictions))


val_y shape: (2000,)
val_y data type: <class 'pandas.core.series.Series'>
final_binary_predictions shape: (2000,)
final_binary_predictions data type: <class 'numpy.ndarray'>


In [322]:
from sklearn.metrics import accuracy_score

# Calculate the accuracy of the ensemble model
accuracy = accuracy_score(val_y, final_binary_predictions)

print(f'Ensemble Accuracy: {accuracy * 100:.2f}%')

Ensemble Accuracy: 96.55%


In [323]:
# Include probability results in the predictions
final_probabilities = weighted_probabilities

# Display binary predictions and probabilities for a few examples
for i in range(10):
    print(f'Example {i + 1}: Binary Prediction={final_binary_predictions[i]}, Probability={final_probabilities[i].item():.4f}')

Example 1: Binary Prediction=0, Probability=0.0405
Example 2: Binary Prediction=0, Probability=0.0114
Example 3: Binary Prediction=1, Probability=1.6926
Example 4: Binary Prediction=0, Probability=0.0031
Example 5: Binary Prediction=1, Probability=1.6933
Example 6: Binary Prediction=0, Probability=0.0193
Example 7: Binary Prediction=0, Probability=0.0313
Example 8: Binary Prediction=0, Probability=0.2355
Example 9: Binary Prediction=0, Probability=0.0578
Example 10: Binary Prediction=0, Probability=0.0848


In [324]:
from sklearn.metrics import classification_report

# Generate a classification report
classification_rep = classification_report(val_y, final_binary_predictions)

# Print the classification report
print("Classification Report:\n", classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96       987
           1       0.95      0.99      0.97      1013

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



In [325]:
from sklearn.ensemble import VotingClassifier

# Define the models for ensemble
models = [
    ('Random Forest', base_model_2),
    ('Gradient Boosting', base_model_3),
    ('SVC', base_model_4),
    ('Logistic Regression', base_model_5)
]

# Create a VotingClassifier with weights
ensemble_model = VotingClassifier(estimators=models, voting='soft', weights=[0.3, 0.3, 0.2, 0.5])

# Fit the ensemble model on your training data
ensemble_model.fit(train_X, train_y)

# Make predictions using the ensemble model
ensemble_predictions = ensemble_model.predict(val_X)

# Calculate the accuracy of the ensemble model
ensemble_accuracy = accuracy_score(val_y, ensemble_predictions)
print(f'Ensemble Accuracy: {ensemble_accuracy * 100:.2f}%')


Ensemble Accuracy: 97.20%


In [326]:
import joblib

# Save the trained model to a file
joblib.dump(final_model, 'final_model.pkl')

NameError: ignored