In [12]:
from urllib.parse import urlparse
import socket
import tldextract
import re
from collections import Counter

def random_domain(url):
    """Flag URLs whose first hostname label looks like a long alphanumeric run.

    Args:
        url: The URL string to inspect.

    Returns:
        1 when the text before the first dot of the hostname consists solely
        of 7 or more ASCII letters/digits, otherwise 0. A URL without a
        hostname yields 0.
    """
    host = urlparse(url).hostname
    if not host:
        host = ""
    first_label = host.split('.')[0]
    if re.match(r'^[a-zA-Z0-9]{7,}$', first_label):
        return 1
    return 0

def shortening_service(url):
    """Report whether the URL is hosted on a known link-shortening service.

    The check is made against the hostname rather than the whole URL string,
    because a plain substring test misfires on unrelated hosts (for example
    "t.co" is a substring of "microsoft.com").

    Args:
        url: The URL string to inspect. Scheme-less input such as
            "bit.ly/abc" is accepted.

    Returns:
        1 if the hostname is one of the listed shorteners (or a subdomain of
        one), otherwise 0. Unparseable URLs yield 0.
    """
    shorteners = ["bit.ly", "t.co", "goo.gl", "tinyurl.com", "is.gd", "ow.ly"]
    try:
        # urlparse only recognises a host after "//"; add it for bare "host/path" input.
        parsed = urlparse(url if "//" in url else "//" + url)
        hostname = parsed.hostname or ""
    except ValueError:
        # urlparse rejects malformed bracketed hosts; such input is not a shortener link.
        return 0
    for service in shorteners:
        if hostname == service or hostname.endswith("." + service):
            return 1
    return 0

def path_extension(url):
    """Return the file extension of the last path segment of the URL.

    Only the final segment (the text after the last "/") is examined, so a dot
    inside a directory name (e.g. "/v1.2/file") is not mistaken for an
    extension.

    Args:
        url: The URL string to inspect.

    Returns:
        The text after the last "." of the final path segment, or "" when
        that segment contains no dot. Note the result is a string, not a
        numeric flag.
    """
    path = urlparse(url).path
    last_segment = path.rsplit("/", 1)[-1]
    return last_segment.rsplit(".", 1)[-1] if "." in last_segment else ""

def nb_redirection(url):
    """Count "//" occurrences in the URL beyond the one expected after the scheme.

    Args:
        url: The URL string to inspect.

    Returns:
        Number of "//" occurrences minus one, never below 0. The clamp keeps
        scheme-less input (which contains no "//" at all) from producing a
        negative count.
    """
    return max(url.count("//") - 1, 0)

def length_words_raw(url):
    """Return the length of the raw URL string.

    NOTE(review): despite the name, this measures characters, not words.

    Args:
        url: The URL string to measure.

    Returns:
        The number of characters in ``url``.
    """
    total_chars = len(url)
    return total_chars


def char_repeat(url):
    """Return how many times the most frequent character occurs in the URL.

    Args:
        url: The URL string to inspect.

    Returns:
        The highest per-character occurrence count, or 0 for an empty string
        (where ``max`` would otherwise raise on an empty sequence).
    """
    return max(Counter(url).values(), default=0)

def word_lengths(hostname):
    """Compute shortest, longest and mean word length of a hostname.

    Words are the non-empty pieces obtained by splitting the hostname on both
    "." and "-". Hyphens are real separators here, so "my-site" counts as two
    words rather than one 7-character word containing a space.

    Args:
        hostname: Hostname string (may be empty).

    Returns:
        A tuple ``(min_length, max_length, mean_length)``; ``(0, 0, 0)`` when
        the hostname contains no words.
    """
    lengths = [len(word) for word in hostname.replace("-", ".").split(".") if word]
    if not lengths:
        return 0, 0, 0
    return min(lengths), max(lengths), sum(lengths) / len(lengths)

def extract_word_features(url):
    """Compute hostname word-length statistics for a URL.

    Args:
        url: The URL string to inspect.

    Returns:
        Whatever ``word_lengths`` returns for the URL's hostname; an empty
        string is passed when the URL has no hostname.
    """
    host = urlparse(url).hostname
    if not host:
        host = ""
    stats = word_lengths(host)
    return stats

def phish_hints(url):
    """Report whether the URL contains any phishing-associated keyword.

    Matching is case-insensitive and substring-based over the whole URL.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if at least one keyword occurs in the lower-cased URL, otherwise 0.
    """
    suspicious_words = ["secure", "account", "bank", "login", "verify", "update"]
    lowered = url.lower()
    for token in suspicious_words:
        if token in lowered:
            return 1
    return 0

# Brand keywords looked for (as lower-case substrings) by the brand_in_* helpers.
known_brands = [
    "paypal",
    "google",
    "facebook",
    "bank",
]

def brand_in_domain(url):
    """Report whether a known brand keyword appears in the registered domain.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if any entry of ``known_brands`` is a substring of the lower-cased
        ``domain`` part reported by ``tldextract.extract``, otherwise 0.
    """
    registered = tldextract.extract(url).domain
    for brand in known_brands:
        if brand in registered.lower():
            return 1
    return 0

def brand_in_subdomain(url):
    """Report whether a known brand keyword appears in the subdomain.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if any entry of ``known_brands`` is a substring of the lower-cased
        ``subdomain`` part reported by ``tldextract.extract``, otherwise 0.
    """
    sub = tldextract.extract(url).subdomain
    for brand in known_brands:
        if brand in sub.lower():
            return 1
    return 0

def brand_in_path(url):
    """Report whether a known brand keyword appears in the URL path.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if any entry of ``known_brands`` is a substring of the lower-cased
        path component, otherwise 0.
    """
    url_path = urlparse(url).path
    for brand in known_brands:
        if brand in url_path.lower():
            return 1
    return 0

import whois

def domain_registration_length(url):
    """Return the number of days between a domain's creation and expiration.

    Performs a WHOIS lookup for the URL's hostname via ``whois.whois``.

    Args:
        url: The URL string whose hostname is looked up.

    Returns:
        ``(expiration_date - creation_date).days``, or 0 when the URL has no
        hostname, either date is missing, or the lookup/arithmetic fails.
    """
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            return 0
        domain_info = whois.whois(hostname)
        expiration_date = domain_info.expiration_date
        creation_date = domain_info.creation_date
        # The record may carry a list of dates; the first entry is used.
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        if not (expiration_date and creation_date):
            return 0
        return (expiration_date - creation_date).days
    except Exception:
        # Best-effort feature: any lookup or parsing failure maps to 0.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return 0

from datetime import datetime

def domain_age(url):
    """Return the age of the URL's domain in days.

    Performs a WHOIS lookup for the URL's hostname via ``whois.whois``.

    Args:
        url: The URL string whose hostname is looked up.

    Returns:
        Days elapsed since the creation date, or 0 when the URL has no
        hostname, the creation date is missing, or the lookup fails.
    """
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            return 0
        domain_info = whois.whois(hostname)
        creation_date = domain_info.creation_date
        # The record may carry a list of dates; the first entry is used.
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        if not creation_date:
            return 0
        # Take "now" in the creation date's own timezone (None for naive values):
        # subtracting naive and aware datetimes raises TypeError.
        now = datetime.now(creation_date.tzinfo)
        return (now - creation_date).days
    except Exception:
        # Best-effort feature: any lookup or parsing failure maps to 0.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return 0

import requests

def web_traffic(url, timeout=10):
    """Look up a popularity rank for the URL's hostname.

    Queries the ``data.alexa.com`` endpoint and extracts the numeric ``TEXT``
    attribute of the ``<POPULARITY URL=...>`` element from the response.
    NOTE(review): this endpoint is believed to be discontinued; confirm it
    still answers before relying on this feature.

    Args:
        url: The URL string whose hostname is looked up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The rank as an int, or -1 when the rank is absent from the response
        or the request fails.
    """
    try:
        domain = urlparse(url).hostname
        alexa_url = f"https://data.alexa.com/data?cli=10&dat=s&url={domain}"
        response = requests.get(alexa_url, timeout=timeout)
        # Pull only the digits of TEXT; the attribute value may be quoted and
        # followed by further attributes, which int() cannot parse.
        match = re.search(r'<POPULARITY URL=[^>]*?TEXT="?(\d+)', response.text)
        return int(match.group(1)) if match else -1
    except Exception:
        # Best-effort feature: network or parsing failures map to -1.
        return -1

def google_index(url, timeout=10):
    """Check whether a Google ``site:`` search for the hostname returns results.

    Args:
        url: The URL string whose hostname is searched for.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        1 if the request succeeds with status 200 and the response text does
        not contain "did not match any documents"; 0 otherwise (including a
        missing hostname, a non-200 response, or a failed request).
    """
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            # Avoid querying the literal text "site:None".
            return 0
        query = f"site:{hostname}"
        response = requests.get(f"https://www.google.com/search?q={query}", timeout=timeout)
        if response.status_code != 200:
            # An error/blocked page lacks the "no results" text but proves nothing.
            return 0
        return 1 if "did not match any documents" not in response.text else 0
    except Exception:
        # Best-effort feature: network failures map to 0.
        return 0

def page_rank(url, timeout=10):
    """Fetch a page-rank value for the URL's hostname from a rank API.

    NOTE(review): the endpoint is ``api.example.com`` — presumably a
    placeholder to be replaced with a real service.

    Args:
        url: The URL string whose hostname is looked up.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The response body parsed as an int when the status code is 200;
        0 otherwise (missing hostname, non-200 status, non-integer body, or a
        failed request).
    """
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            return 0
        rank_api = f"https://api.example.com/pagerank?domain={hostname}"
        response = requests.get(rank_api, timeout=timeout)
        return int(response.text) if response.status_code == 200 else 0
    except Exception:
        # Best-effort feature: network or parsing failures map to 0.
        return 0

def http_in_path(url):
    """Report whether the URL's path component contains the text "http".

    Args:
        url: The URL string to inspect.

    Returns:
        1 if "http" occurs in the path, otherwise 0.
    """
    url_path = urlparse(url).path
    if "http" in url_path:
        return 1
    return 0

def https_token(url):
    """Report whether the URL's path component contains the text "https".

    Args:
        url: The URL string to inspect.

    Returns:
        1 if "https" occurs in the path, otherwise 0. The scheme itself is
        not part of the path and does not count.
    """
    url_path = urlparse(url).path
    if "https" in url_path:
        return 1
    return 0

def ratio_digits_url(url):
    """Return the fraction of characters in the URL that are digits.

    Args:
        url: The URL string to inspect.

    Returns:
        Digit count divided by URL length, or 0 for an empty URL (avoiding a
        division by zero).
    """
    if not url:
        return 0
    return sum(c.isdigit() for c in url) / len(url)

def ratio_digits_host(url):
    """Return the fraction of hostname characters that are digits.

    Args:
        url: The URL string to inspect.

    Returns:
        Digit count divided by hostname length, or 0 when the URL has no
        hostname.
    """
    host = urlparse(url).hostname
    if not host:
        return 0
    digit_total = 0
    for ch in host:
        if ch.isdigit():
            digit_total += 1
    return digit_total / len(host)

def punycode(url):
    """Report whether the hostname contains the punycode marker "xn--".

    Args:
        url: The URL string to inspect.

    Returns:
        1 if the hostname exists and contains "xn--", otherwise 0.
    """
    host = urlparse(url).hostname
    if not host:
        return 0
    if "xn--" in host:
        return 1
    return 0

def port(url):
    """Return the URL's explicit port, or a scheme-based fallback value.

    NOTE(review): the fallback is -1 for "http" and 443 for every other
    scheme; confirm -1 (rather than 80) is intended for plain http.

    Args:
        url: The URL string to inspect.

    Returns:
        The explicit port when one is present and non-zero; otherwise -1 if
        the scheme is "http", else 443.
    """
    parts = urlparse(url)
    explicit = parts.port
    if explicit:
        return explicit
    if parts.scheme == "http":
        return -1
    return 443

def tld_in_path(url):
    """Report whether the URL's public suffix also appears in its path.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if the ``suffix`` reported by ``tldextract.extract`` is non-empty
        and occurs in the path, otherwise 0. The non-empty check matters
        because an empty string is a substring of every path.
    """
    tld = tldextract.extract(url).suffix
    return 1 if tld and tld in urlparse(url).path else 0

def tld_in_subdomain(url):
    """Report whether the URL's public suffix also appears in its subdomain.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if the ``suffix`` reported by ``tldextract.extract`` is non-empty
        and occurs in the ``subdomain`` part, otherwise 0. The non-empty
        check matters because an empty string is a substring of everything.
    """
    extracted = tldextract.extract(url)
    suffix = extracted.suffix
    return 1 if suffix and suffix in extracted.subdomain else 0

def abnormal_subdomain(url):
    """Flag hostnames that contain more than two dots.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if the hostname has more than two "." characters, otherwise 0.
        A URL without a hostname yields 0 instead of raising.
    """
    hostname = urlparse(url).hostname or ""
    return 1 if hostname.count('.') > 2 else 0

def nb_subdomains(url):
    """Return the number of dots in the URL's hostname.

    Args:
        url: The URL string to inspect.

    Returns:
        The count of "." characters in the hostname; 0 when the URL has no
        hostname (instead of raising).
    """
    hostname = urlparse(url).hostname or ""
    return hostname.count('.')

def prefix_suffix(url):
    """Report whether the URL's hostname contains a hyphen.

    Args:
        url: The URL string to inspect.

    Returns:
        1 if "-" occurs in the hostname, otherwise 0. A URL without a
        hostname yields 0 instead of raising.
    """
    hostname = urlparse(url).hostname or ""
    return 1 if "-" in hostname else 0

def url_preprocessing(url):
    """Build the feature vector for a URL.

    The vector has 48 entries, in this order: URL length, hostname length,
    length of the resolved IP string, 20 substring counts over the raw URL,
    then the outputs of the individual feature helpers. Two quirks of the
    layout are kept as they are so positions stay stable:

    * the ``path_extension`` entry is a string, not a number;
    * ``nb_redirection`` is appended twice in a row.

    Side effects: prints a progress message and performs a DNS lookup
    (``socket.gethostbyname``).

    Args:
        url: The URL string to featurise.

    Returns:
        A list of feature values as described above. The hostname length is 0
        when the URL has no hostname, and the IP-length entry is 0 when the
        hostname is missing or cannot be resolved.
    """
    print("preprocessing started!")

    hostname = urlparse(url).hostname or ""

    # DNS can fail for dead or bogus hosts; record 0 rather than aborting the
    # whole vector. socket.gaierror is an OSError; invalid IDNA labels raise
    # UnicodeError.
    try:
        ip_length = len(socket.gethostbyname(hostname)) if hostname else 0
    except (OSError, UnicodeError):
        ip_length = 0

    parameters = [
        len(url),       # url
        len(hostname),  # hostname
        ip_length,      # ip
    ]

    # Raw substring counts, in vector order.
    counted_tokens = (
        '.', '-', '@', '?', '&', '|', '=', '_', '~', '%', '/', '*',
        ':', ',', ';', '$', ' ', 'www', 'com', '//',
    )
    parameters.extend(url.count(token) for token in counted_tokens)

    parameters.extend([
        http_in_path(url),
        https_token(url),
        ratio_digits_url(url),
        ratio_digits_host(url),
        punycode(url),
        port(url),
        tld_in_path(url),
        tld_in_subdomain(url),
        abnormal_subdomain(url),
        nb_subdomains(url),
        prefix_suffix(url),
        random_domain(url),
        shortening_service(url),
        path_extension(url),
        nb_redirection(url),
        nb_redirection(url),  # intentionally repeated to keep the vector layout
        length_words_raw(url),
        char_repeat(url),
    ])

    shortest_word_host, longest_word_host, avg_word_host = extract_word_features(url)
    parameters.extend([shortest_word_host, longest_word_host, avg_word_host])

    # check once again
    parameters.extend([
        phish_hints(url),
        brand_in_domain(url),
        brand_in_subdomain(url),
        brand_in_path(url),
    ])

    return parameters

# Run the extractor on a sample URL and show the resulting feature list.
# Note: url_preprocessing resolves the hostname via socket.gethostbyname.
print(url_preprocessing("https://console.cloud.google.com/apis/dashboard"))

preprocessing started!
preprocessing complete!
[47, 24, 13, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0.0, 0.0, 0, 443, 0, 0, 1, 3, 0, 1, 0, '', 0, 0, 47, 7, 3, 7, 5.25, 0, 1, 0, 0]


In [3]:
import joblib
import numpy as np

# Load the trained classifier from disk.
# NOTE(review): joblib.load unpickles the file, so only load model files you trust.
rf_model = joblib.load('phishing_detector.pkl')

# Example feature vector kept for reference (commented out):
# parameters = [33,26,0,2,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,1,0.03030303,0.038461538,0,0,0,0,0,2,0,0,1,0,1,0,2,0,8,8,0,13,13,0,10.5,10.5,0.0,0,1,0,0,0,0,10,0.1,0.9,0,0,0,0.7777777779999999,0,0.5555555560000001,0,1,0.0,0,0.0,0.0,0,0,0,33.33333333,0,0,0,1,0,0,374,7295,0,0,1,5]
# NOTE(review): `parameters` is left empty here, so the array below has shape (1, 0);
# fill it with a real feature vector before predicting — TODO confirm expected feature count.
parameters = []

# Shape the features as a single-row 2-D array (one sample).
parameters = np.array(parameters).reshape(1, -1)

y_pred = rf_model.predict(parameters)

# A predicted label of 1 is reported as phishing; anything else as legitimate.
print("Phishing" if y_pred[0] == 1 else "Legitimate")

Phishing


