In [1]:
# Step 1: Install required libraries
!pip install pandas scikit-learn tldextract joblib

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 104.9/104.9 kB 2.6 MB/s eta 0:00:00
Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [2]:
import pandas as pd
import numpy as np
import joblib
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from google.colab import files
import ipywidgets as widgets
from IPython.display import display

In [3]:
# Upload dataset from local machine using the google.colab `files` helper
# imported above. The next cell reads "dataset_phishing.csv", so the uploaded
# file is expected to carry that name.
uploaded = files.upload()

Saving dataset_phishing.csv to dataset_phishing.csv


In [4]:
# Read the uploaded CSV into a DataFrame and preview its first rows.
dataset_filename = "dataset_phishing.csv"
df = pd.read_csv(dataset_filename)
df.head(n=5)

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [5]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [6]:
# URLs the dataset labels as phishing. predict_url reports an exact match
# against this set as "Phishing" with 100% confidence, so legitimate URLs
# must not be included here.
url_list = set()
if 'url' in df.columns:
    url_list = set(df.loc[df['status'] == 'phishing', 'url'])

# Split features and target.
# The raw `url` text is excluded from the features without mutating `df`,
# so re-running this cell rebuilds `url_list` instead of emptying it.
X = df.drop(columns=['status']).drop(columns=['url'], errors='ignore')  # Features
y = (df['status'] == 'phishing').astype(int)  # Convert labels to 0 (legit) & 1 (phishing)

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit a 100-tree random forest on the training split (seeded for reproducibility).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [9]:
# Make predictions on the held-out test split
# (labels follow the encoding of y: 1 = phishing, 0 = legitimate)
y_pred = model.predict(X_test)

In [10]:
# Score the test-set predictions: overall accuracy first, then the
# per-class precision / recall / F1 table.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Model Accuracy:", accuracy)
print(classification_report(y_true=y_test, y_pred=y_pred))

Model Accuracy: 0.9693788276465442
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1157
           1       0.97      0.96      0.97      1129

    accuracy                           0.97      2286
   macro avg       0.97      0.97      0.97      2286
weighted avg       0.97      0.97      0.97      2286



In [11]:
# Persist the trained classifier to disk with joblib.
model_filename = "phishing_model.pkl"
joblib.dump(value=model, filename=model_filename)
print(f"Model saved as {model_filename}")

Model saved as phishing_model.pkl


In [12]:
# Reload the model from the file written above; `model` is rebound to the
# deserialized copy, which is what the prediction code below uses.
model = joblib.load(model_filename)

In [14]:
def extract_features_from_url(url: str) -> pd.DataFrame:
    """Return a one-row feature frame with the same columns as the training matrix ``X``.

    NOTE(review): this is a placeholder. The ``url`` argument is not used;
    every column is filled with a fresh uniform random number from
    ``np.random.rand``. Predictions made from this frame therefore do not
    depend on the URL and differ from call to call. Replace with real
    feature extraction before relying on the result.
    """
    return pd.DataFrame([np.random.rand(len(X.columns))], columns=X.columns)
# Function to make predictions from URL
def predict_url(url):
    """Classify a URL as "Phishing" or "Legitimate".

    Parameters
    ----------
    url : str
        URL to classify. Leading/trailing whitespace is ignored.

    Returns
    -------
    tuple
        ``(prediction, confidence)`` where ``prediction`` is ``"Phishing"``
        or ``"Legitimate"`` and ``confidence`` is a percentage (0-100).
    """
    # Pasted URLs often carry stray spaces or newlines, which would make the
    # exact-match lookup below miss.
    if isinstance(url, str):
        url = url.strip()

    # NOTE(review): a match is reported as phishing without consulting the
    # model. That is only correct if url_list holds phishing URLs exclusively;
    # confirm against the cell that builds it.
    if url in url_list:
        return "Phishing", 100.0

    features = extract_features_from_url(url)
    # A single inference pass yields both the winning class and its probability.
    pred_prob = model.predict_proba(features)[0]
    best = int(np.argmax(pred_prob))
    confidence = pred_prob[best] * 100
    prediction = "Phishing" if model.classes_[best] == 1 else "Legitimate"
    return prediction, confidence

# Create input field for URL and prediction button
url_input = widgets.Text(description="URL:")  # free-text box the user types the URL into
predict_button = widgets.Button(description="Predict")  # triggers the prediction callback
output_text = widgets.Output()  # area the callback prints its result into

# Callback function to run prediction
def on_predict_clicked(b):
    """Handle a click on the Predict button.

    Reads the URL from ``url_input``, runs ``predict_url`` on it and prints
    the result into ``output_text``, replacing any previous result.

    Parameters
    ----------
    b : widgets.Button
        The button that was clicked (unused; supplied by ``on_click``).
    """
    url = url_input.value.strip()
    # All work happens inside the output context so that whatever is printed
    # (or goes wrong) during the prediction is routed to the output area.
    with output_text:
        # wait=True postpones the clear until new output arrives, which
        # avoids the flicker of an empty output area.
        output_text.clear_output(wait=True)
        if not url:
            # Nothing to classify; do not send a blank string to the model.
            print("Please enter a URL.")
            return
        prediction, confidence = predict_url(url)
        print(f"Prediction: {prediction} (Confidence: {confidence:.2f}%)")

# Run on_predict_clicked each time the Predict button is pressed
predict_button.on_click(on_predict_clicked)

# Show the text box, the button and the output area under this cell
display(url_input, predict_button, output_text)

Text(value='', description='URL:')

Button(description='Predict', style=ButtonStyle())

Output()