In [12]:
from urllib.parse import urlparse, urljoin
import os
import re
import requests
import tldextract
from bs4 import BeautifulSoup

url = "https://www.slant.co/topics/2404/~file-managers-for-windows"

def length_of_hostname(url):
    length = len(urlparse(url).hostname)
    return length

def length_of_ip(url):
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname
    ip_pattern = r"^\d{1,3}(\.\d{1,3}){3}$"
    if hostname and re.match(ip_pattern, hostname):
        return len(hostname)
    else:
        return 0

def http_in_path(url):
    path = urlparse(url).path
    return 1 if "http" in path else 0

def https_token(url):
    path = urlparse(url).path
    return 1 if "https" in path else 0

def ratio_digits_url(url):
    return sum(c.isdigit() for c in url) / len(url)

def ratio_digits_host(url):
    hostname = urlparse(url).hostname
    return sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0

def punycode(url):
    hostname = urlparse(url).hostname
    return 1 if hostname and "xn--" in hostname else 0

def get_extension_length(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    _, ext = os.path.splitext(path)
    return len(ext)

def nb_redirection(url):
    return url.count("//") - 1

def get_nb_external_redirections(url):
    try:
        session = requests.Session()
        response = session.get(url, allow_redirects=True)
        original_domain = urlparse(url).netloc
        nb_external_redirections = 0
        for resp in response.history:
            redirected_domain = urlparse(resp.url).netloc
            if redirected_domain != original_domain:
                nb_external_redirections += 1
                original_domain = redirected_domain  
        return nb_external_redirections
    except requests.exceptions.RequestException:
        return 0

def get_length_words_raw(url):
    parsed_url = urlparse(url)
    raw_text = parsed_url.netloc + parsed_url.path + parsed_url.query
    words = re.split(r'[\W_]+', raw_text)
    return len([word for word in words if word])-1

def check_port_flag(url):
    parsed = urlparse(url)
    return 1 if parsed.port else 0

def tld_in_path(url):
    tld = tldextract.extract(url).suffix
    return 1 if tld in urlparse(url).path else 0

def tld_in_subdomain(url):
    extracted = tldextract.extract(url)
    return 1 if extracted.suffix in extracted.subdomain else 0

def abnormal_subdomain(url):
    return 1 if urlparse(url).hostname.count('.') > 2 else 0

def nb_subdomains(url):
    return urlparse(url).hostname.count('.')

def prefix_suffix(url):
    return 1 if "-" in urlparse(url).hostname else 0

def random_domain(url):
    hostname = urlparse(url).hostname or ""
    return 1 if re.match(r'^[a-zA-Z0-9]{7,}$', hostname.split('.')[0]) else 0

def shortening_service(url):
    shorteners = ["bit.ly", "t.co", "goo.gl", "tinyurl.com", "is.gd", "ow.ly"]
    return 1 if any(service in url for service in shorteners) else 0

def get_char_repeat(url):
    parsed_url = urlparse(url)
    raw_text = parsed_url.netloc + parsed_url.path + parsed_url.query
    max_repeat = max((len(m.group()) for m in re.finditer(r'(.)\1*', raw_text)), default=1)
    return max_repeat

def get_shortest_words_raw(url):
    parsed_url = urlparse(url)
    raw_text = parsed_url.netloc + parsed_url.path + parsed_url.query
    words = re.split(r'[\W_]+', raw_text)
    shortest_word_length = min((len(word) for word in words if word), default=0)
    return shortest_word_length+1

def get_shortest_word_host(url):
    parsed_url = urlparse(url)
    host = parsed_url.netloc
    words = re.split(r'[.-]', host)
    shortest_word_length = min((len(word) for word in words if word), default=0)
    return shortest_word_length

def extract_url_features(url):
    parsed_url = urlparse(url)
    host = parsed_url.netloc
    path = parsed_url.path
    raw_text = host + path + parsed_url.query
    words_raw = re.split(r'[\W_]+', raw_text)  
    words_host = re.split(r'[.-]', host)       
    words_path = re.split(r'[\W_]+', path)     
    shortest_word_path = min((len(word) for word in words_path if word), default=0)
    longest_words_raw = max((len(word) for word in words_raw if word), default=0)
    longest_word_host = max((len(word) for word in words_host if word), default=0)
    longest_word_path = max((len(word) for word in words_path if word), default=0)
    return shortest_word_path, longest_words_raw, longest_word_host, longest_word_path

PHISHING_KEYWORDS = ["secure", "account", "login", "bank", "verify", "password", "update", "confirm", "webscr", "signin", "ebayisapi", "paypal", "click", "auth", "identity"]
def extract_url_features(url):
    parsed_url = urlparse(url)
    host = parsed_url.netloc
    path = parsed_url.path
    raw_text = host + path + parsed_url.query
    words_raw = re.split(r'[\W_]+', raw_text)  
    words_host = re.split(r'[.-]', host)       
    words_path = re.split(r'[\W_]+', path)     
    avg_words_raw = sum(len(word) for word in words_raw if word) / len(words_raw) if words_raw else 0
    avg_word_host = sum(len(word) for word in words_host if word) / len(words_host) if words_host else 0
    avg_word_path = sum(len(word) for word in words_path if word) / len(words_path) if words_path else 0
    phish_hints = sum(1 for word in words_raw if word.lower() in PHISHING_KEYWORDS)
    return avg_words_raw, avg_word_host, avg_word_path, phish_hints

BRAND_NAMES = ["paypal", "google", "facebook", "amazon", "bank", "microsoft", "apple", "netflix", "ebay"]
SUSPICIOUS_TLDS = ["tk", "ml", "ga", "cf", "gq", "xyz", "top", "biz", "club", "pw", "info"]

def extract_url_features(url):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    path = parsed_url.path
    parts = domain.split(".")
    main_domain = parts[-2] if len(parts) >= 2 else ""
    subdomain = ".".join(parts[:-2]) if len(parts) > 2 else ""
    domain_in_brand = any(brand in main_domain.lower() for brand in BRAND_NAMES)
    brand_in_subdomain = any(brand in subdomain.lower() for brand in BRAND_NAMES)
    brand_in_path = any(brand in path.lower() for brand in BRAND_NAMES)
    tld = parts[-1] if len(parts) > 1 else ""
    suspecious_tld = tld in SUSPICIOUS_TLDS
    return domain_in_brand, brand_in_subdomain, brand_in_path, suspecious_tld

BLACKLISTED_DOMAINS = {"phishingsite.com", "malicious-login.tk", "scam-bank.xyz"}
def extract_url_features(url):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    statistical_report = domain in BLACKLISTED_DOMAINS
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = [a.get("href") for a in soup.find_all("a", href=True)]
        nb_hyperlinks = len(all_links)
        internal_links = []
        external_links = []
        for link in all_links:
            absolute_link = urljoin(url, link) 
            link_domain = urlparse(absolute_link).netloc
            if link_domain == domain:
                internal_links.append(link)
            else:
                external_links.append(link)
        ratio_intHyperlinks = len(internal_links) / nb_hyperlinks if nb_hyperlinks else 0
        ratio_extHyperlinks = len(external_links) / nb_hyperlinks if nb_hyperlinks else 0
    except requests.RequestException:
        nb_hyperlinks = 0
        ratio_intHyperlinks = 0
        ratio_extHyperlinks = 0
    return statistical_report, nb_hyperlinks, ratio_intHyperlinks, ratio_extHyperlinks

def URL_Feature_check(url):
    parameter = []
    parameter.append(len(url))
    parameter.append(length_of_hostname(url))
    parameter.append(length_of_ip(url))
    parameter.append(url.count('.')) #dots
    parameter.append(url.count('-')) #hyphens
    parameter.append(url.count('@')) #@
    parameter.append(url.count('?')) #?
    parameter.append(url.count('&')) #&
    parameter.append(url.count('|')) #pipeline
    parameter.append(url.count('=')) #eq
    parameter.append(url.count('_')) #underscore
    parameter.append(url.count('~')) #tilde
    parameter.append(url.count('%')) #percentage
    parameter.append(url.count('/')) #slash
    parameter.append(url.count('*')) #star 
    parameter.append(url.count(':'))    
    parameter.append(url.count(','))
    parameter.append(url.count(';'))
    parameter.append(url.count('$'))
    parameter.append(url.count(' '))
    parameter.append(url.count('www'))
    parameter.append(url.count('com'))
    parameter.append(url.count('//'))
    parameter.append(http_in_path(url))
    parameter.append(https_token(url))
    parameter.append(ratio_digits_url(url))
    parameter.append(ratio_digits_host(url))
    parameter.append(punycode(url))
    parameter.append(check_port_flag(url)) 
    parameter.append(tld_in_path(url))
    parameter.append(tld_in_subdomain(url))
    parameter.append(abnormal_subdomain(url))
    parameter.append(nb_subdomains(url))
    parameter.append(prefix_suffix(url))
    parameter.append(random_domain(url))
    parameter.append(shortening_service(url))
    parameter.append(get_extension_length(url)) # --- path extension
    parameter.append(nb_redirection(url))
    parameter.append(get_nb_external_redirections(url))
    parameter.append(get_length_words_raw(url)) 
    parameter.append(get_char_repeat(url))
    parameter.append(get_shortest_words_raw(url))
    parameter.append(get_shortest_word_host(url))
    shortest_word_path, longest_words_raw, longest_word_host, longest_word_path = extract_url_features(url)
    parameter.append(shortest_word_path)
    parameter.append(longest_words_raw)
    parameter.append(longest_word_host)
    parameter.append(longest_word_path)
    avg_words_raw, avg_word_host, avg_word_path, phish_hints = extract_url_features(url)
    parameter.append(avg_words_raw)
    parameter.append(avg_word_host)
    parameter.append(avg_word_path)
    parameter.append(phish_hints)
    domain_in_brand, brand_in_subdomain, brand_in_path, suspecious_tld = extract_url_features(url)
    parameter.append(domain_in_brand)
    parameter.append(brand_in_subdomain)
    parameter.append(brand_in_path)
    parameter.append(suspecious_tld)
    statistical_report, nb_hyperlinks, ratio_intHyperlinks, ratio_extHyperlinks = extract_url_features(url)
    parameter.append(statistical_report)
    parameter.append(nb_hyperlinks)
    parameter.append(ratio_intHyperlinks)
    parameter.append(ratio_extHyperlinks)
    print(len(parameter))
    return parameter

print(URL_Feature_check(url))

59
[59, 12, 0, 2, 3, 0, 0, 0, 0, 0, 0, 1, 0, 5, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0.06779661016949153, 0.0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 8, 3, 3, 2, False, 0, 0, 0, False, 0, 0, 0, False, 1, 0.0, 1.0, False, 1, 0.0, 1.0]
