In [1]:
from bs4 import BeautifulSoup
from datetime import datetime
import whois
import requests
from urllib.parse import urlparse, urljoin
import socket
import tldextract
import re
from collections import Counter

def random_domain(url):
    hostname = urlparse(url).hostname or ""
    return 1 if re.match(r'^[a-zA-Z0-9]{7,}$', hostname.split('.')[0]) else 0

def shortening_service(url):
    shorteners = ["bit.ly", "t.co", "goo.gl", "tinyurl.com", "is.gd", "ow.ly"]
    return 1 if any(service in url for service in shorteners) else 0

def path_extension(url):
    path = urlparse(url).path
    return path.split(".")[-1] if "." in path else ""

def nb_redirection(url):
    return url.count("//") - 1

def length_words_raw(url):
    return len(url)


def char_repeat(url):
    return max(Counter(url).values())

def word_lengths(hostname):
    words = hostname.replace("-", " ").split(".")
    if not words:
        return 0, 0, 0
    return min(map(len, words)), max(map(len, words)), sum(map(len, words)) / len(words)

def extract_word_features(url):
    hostname = urlparse(url).hostname or ""
    return word_lengths(hostname)

def phish_hints(url):
    suspicious_words = ["secure", "account", "bank", "login", "verify", "update"]
    return 1 if any(word in url.lower() for word in suspicious_words) else 0

known_brands = ["paypal", "google", "facebook", "bank"]

def brand_in_domain(url):
    domain = tldextract.extract(url).domain
    return 1 if any(brand in domain.lower() for brand in known_brands) else 0

def brand_in_subdomain(url):
    subdomain = tldextract.extract(url).subdomain
    return 1 if any(brand in subdomain.lower() for brand in known_brands) else 0

def brand_in_path(url):
    path = urlparse(url).path
    return 1 if any(brand in path.lower() for brand in known_brands) else 0

suspicious_tlds = {"xyz", "top", "tk", "ml", "cf", "ga"}
def get_tld(url):
    extracted = tldextract.extract(url)
    return extracted.suffix

def is_suspicious_tld(url):
    tld = tldextract.extract(url).suffix
    return 1 if tld in suspicious_tlds else 0


def domain_registration_length(url):
    try:
        domain_info = whois.whois(urlparse(url).hostname)
        expiration_date = domain_info.expiration_date
        creation_date = domain_info.creation_date
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        return (expiration_date - creation_date).days if expiration_date and creation_date else 0
    except:
        return 0


def domain_age(url):
    try:
        domain_info = whois.whois(urlparse(url).hostname)
        creation_date = domain_info.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        return (datetime.now() - creation_date).days if creation_date else 0
    except:
        return 0


def web_traffic(url):
    try:
        domain = urlparse(url).hostname
        alexa_url = f"https://data.alexa.com/data?cli=10&dat=s&url={domain}"
        response = requests.get(alexa_url)
        if "<POPULARITY URL=" in response.text:
            rank = response.text.split("<POPULARITY URL=")[1].split("TEXT=")[1].split("/>")[0]
            return int(rank)
    except:
        return -1

def google_index(url):
    try:
        query = f"site:{urlparse(url).hostname}"
        response = requests.get(f"https://www.google.com/search?q={query}")
        return 1 if "did not match any documents" not in response.text else 0
    except:
        return 0

def page_rank(url):
    try:
        rank_api = f"https://api.example.com/pagerank?domain={urlparse(url).hostname}"
        response = requests.get(rank_api)
        return int(response.text) if response.status_code == 200 else 0
    except:
        return 0

def http_in_path(url):
    path = urlparse(url).path
    return 1 if "http" in path else 0

def https_token(url):
    path = urlparse(url).path
    return 1 if "https" in path else 0

def ratio_digits_url(url):
    return sum(c.isdigit() for c in url) / len(url)

def ratio_digits_host(url):
    hostname = urlparse(url).hostname
    return sum(c.isdigit() for c in hostname) / len(hostname) if hostname else 0

def punycode(url):
    hostname = urlparse(url).hostname
    return 1 if hostname and "xn--" in hostname else 0

def port(url):
    parsed_url = urlparse(url)
    return parsed_url.port if parsed_url.port else (-1 if parsed_url.scheme == "http" else 443)

def tld_in_path(url):
    tld = tldextract.extract(url).suffix
    return 1 if tld in urlparse(url).path else 0

def tld_in_subdomain(url):
    extracted = tldextract.extract(url)
    return 1 if extracted.suffix in extracted.subdomain else 0

def abnormal_subdomain(url):
    return 1 if urlparse(url).hostname.count('.') > 2 else 0

def nb_subdomains(url):
    return urlparse(url).hostname.count('.')

def prefix_suffix(url):
    return 1 if "-" in urlparse(url).hostname else 0


def get_nb_hyperlinks(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0
        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all("a")
        return len(links)
    except Exception as e:
        return 0

def get_ratio_intHyperlinks(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = soup.find_all("a")  
        total_links = len(all_links)
        if total_links == 0:
            return 0  
        domain = urlparse(url).netloc  
        internal_links = 0
        for link in all_links:
            href = link.get("href")
            if href:
                full_url = urljoin(url, href)  
                if urlparse(full_url).netloc == domain:  
                    internal_links += 1
        ratio = internal_links / total_links  
        return round(ratio, 4)  
    except Exception as e:
        return 0
    
def get_ratio_extHyperlinks(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = soup.find_all("a")  
        total_links = len(all_links)
        if total_links == 0:
            return 0  
        domain = urlparse(url).netloc  
        external_links = 0
        for link in all_links:
            href = link.get("href")
            if href:
                full_url = urljoin(url, href)  
                if urlparse(full_url).netloc and urlparse(full_url).netloc != domain:  
                    external_links += 1
        ratio = external_links / total_links  
        return round(ratio, 4)  
    except Exception as e:
        return 0

def get_ratio_nullHyperlinks(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = soup.find_all("a")  
        total_links = len(all_links)
        if total_links == 0:
            return 0  
        null_links = sum(1 for link in all_links if not link.get("href") or link.get("href").strip() == "")
        ratio = null_links / total_links  
        return round(ratio, 4)  
    except Exception as e:
        return 0  

def get_nb_extCSS(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        css_links = soup.find_all("link", rel="stylesheet")  
        domain = urlparse(url).netloc
        external_css = 0
        for link in css_links:
            href = link.get("href")
            if href:
                full_url = urljoin(url, href)  
                if urlparse(full_url).netloc and urlparse(full_url).netloc != domain:  
                    external_css += 1
        return external_css  
    except Exception as e:
        return 0  

def get_ratio_intRedirection(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = soup.find_all("a", href=True)  
        total_redirections = 0
        internal_redirections = 0
        domain = urlparse(url).netloc
        for link in all_links:
            href = link.get("href")
            full_url = urljoin(url, href)  
            try:
                redir_response = requests.head(full_url, allow_redirects=True, timeout=3)
                if redir_response.url != full_url:  
                    total_redirections += 1
                    if urlparse(redir_response.url).netloc == domain:  
                        internal_redirections += 1
            except requests.RequestException:
                continue  
        if total_redirections == 0:
            return 0  
        ratio = internal_redirections / total_redirections  
        return round(ratio, 4)  
    except Exception:
        return 0

def get_ratio_extRedirection(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = soup.find_all("a", href=True)  
        total_redirections = 0
        external_redirections = 0
        domain = urlparse(url).netloc
        for link in all_links:
            href = link.get("href")
            full_url = urljoin(url, href)  
            try:
                redir_response = requests.head(full_url, allow_redirects=True, timeout=3)
                if redir_response.url != full_url:  
                    total_redirections += 1
                    if urlparse(redir_response.url).netloc != domain:
                        external_redirections += 1
            except requests.RequestException:
                continue  
        if total_redirections == 0:
            return 0  
        ratio = external_redirections / total_redirections  
        return round(ratio, 4)  
    except Exception:
        return 0

def get_ratio_intErrors(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = soup.find_all("a", href=True)  
        domain = urlparse(url).netloc
        total_internal_links = 0
        internal_error_links = 0
        for link in all_links:
            href = link.get("href")
            full_url = urljoin(url, href)  
            if urlparse(full_url).netloc == domain:  
                total_internal_links += 1
                try:
                    link_response = requests.head(full_url, timeout=3)
                    if link_response.status_code >= 400:  
                        internal_error_links += 1
                except requests.RequestException:
                    internal_error_links += 1  
        if total_internal_links == 0:
            return 0  
        ratio = internal_error_links / total_internal_links  
        return round(ratio, 4)  
    except Exception:
        return 0

def get_ratio_extErrors(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        all_links = soup.find_all("a", href=True)  
        domain = urlparse(url).netloc
        total_external_links = 0
        external_error_links = 0
        for link in all_links:
            href = link.get("href")
            full_url = urljoin(url, href)  
            if urlparse(full_url).netloc != domain:  
                total_external_links += 1
                try:
                    link_response = requests.head(full_url, timeout=3)  
                    if link_response.status_code >= 400:  
                        external_error_links += 1
                except requests.RequestException:
                    external_error_links += 1  
        if total_external_links == 0:
            return 0  
        ratio = external_error_links / total_external_links  
        return round(ratio, 4)  
    except Exception:
        return 0 

def has_login_form(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        forms = soup.find_all("form")  
        login_keywords = ["login", "signin", "log-in", "sign-in", "password"]
        for form in forms:
            inputs = form.find_all("input")
            has_password = any(inp.get("type") == "password" for inp in inputs)
            has_login_keyword = any(
                any(keyword in (form.get(attr) or "").lower() for keyword in login_keywords)
                for attr in ["id", "name", "class", "action"]
            )
            if has_password or has_login_keyword:
                return 1  
        return 0  
    except Exception:
        return 0 

def get_links_in_tags(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        head_links = soup.head.find_all("a", href=True) if soup.head else []
        meta_links = soup.find_all("meta", attrs={"content": True})
        script_links = soup.find_all("script", attrs={"src": True})
        meta_href_count = sum(1 for meta in meta_links if "http" in meta.get("content", ""))
        script_href_count = sum(1 for script in script_links if "http" in script.get("src", ""))
        total_links = len(head_links) + meta_href_count + script_href_count
        return total_links
    except Exception:
        return 0 

def has_submit_email(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        forms = soup.find_all("form")  
        email_keywords = ["email", "mail"]
        for form in forms:
            inputs = form.find_all("input")
            has_email_input = any(inp.get("type") == "email" for inp in inputs)
            has_email_keyword = any(
                any(keyword in (form.get(attr) or "").lower() for keyword in email_keywords)
                for attr in ["id", "name", "class", "action"]
            )
            if has_email_input or has_email_keyword:
                return 1  
        return 0  
    except Exception:
        return 0 

def get_ratio_intMedia(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        domain = urlparse(url).netloc  
        media_tags = soup.find_all(["img", "video", "audio", "source"])  
        total_media = len(media_tags)
        if total_media == 0:
            return 0  
        internal_media = sum(1 for tag in media_tags if tag.get("src") and urlparse(tag["src"]).netloc in ["", domain])
        return internal_media / total_media  
    except Exception:
        return 0

def get_ratio_extMedia(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        domain = urlparse(url).netloc  
        media_tags = soup.find_all(["img", "video", "audio", "source"])  
        total_media = len(media_tags)
        if total_media == 0:
            return 0  
        external_media = sum(1 for tag in media_tags if tag.get("src") and urlparse(tag["src"]).netloc not in ["", domain])
        return external_media / total_media  
    except Exception:
        return 0

def get_sfh(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  
        soup = BeautifulSoup(response.text, "html.parser")
        domain = urlparse(url).netloc  
        forms = soup.find_all("form")  
        if not forms:
            return 1  
        for form in forms:
            action = form.get("action", "").strip() 
            if action in ["", "#"]:
                return -1  
            action_domain = urlparse(action).netloc
            if action_domain == "" or action_domain == domain:
                return 1 
            return 0
        return 1  
    except Exception:
        return 0

def get_iframe_ratio(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  # Return 0 if the page is not accessible

        soup = BeautifulSoup(response.text, "html.parser")
        total_elements = len(soup.find_all())  # Total HTML elements
        iframe_count = len(soup.find_all("iframe"))  # Count of iframe elements

        return iframe_count / total_elements if total_elements else 0  # Ratio of iframes
    except Exception:
        return 0 
    
def has_popup_window(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  # Page not accessible

        soup = BeautifulSoup(response.text, "html.parser")
        scripts = soup.find_all("script")

        for script in scripts:
            if script.string and re.search(r"window\.open|window\.showModalDialog", script.string):
                return 1  # Suspicious popup detected

        return 0  # No suspicious popups found
    except Exception:
        return 0 
    
def get_safe_anchor_ratio(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  # Return 0 if the page is not accessible

        soup = BeautifulSoup(response.text, "html.parser")
        anchors = soup.find_all("a")
        total_anchors = len(anchors)

        if total_anchors == 0:
            return 1  # If no anchors, assume safe

        safe_anchors = sum(1 for a in anchors if a.get("href") and not a["href"].startswith(("#", "javascript:void(0)")))

        return safe_anchors / total_anchors  # Ratio of safe anchors
    except Exception:
        return 0

def is_right_click_disabled(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  # Page not accessible

        soup = BeautifulSoup(response.text, "html.parser")
        
        # Check for oncontextmenu attribute in body
        if soup.body and "oncontextmenu" in soup.body.attrs:
            if soup.body["oncontextmenu"].strip().lower() == "return false;":
                return 1  # Right-click is disabled

        # Check for JavaScript disabling right-click
        scripts = soup.find_all("script")
        for script in scripts:
            if script.string and re.search(r"event\.button\s*==\s*2|oncontextmenu\s*=\s*['\"]return false;['\"]", script.string):
                return 1  # Suspicious

        return 0  # Right-click is allowed
    except Exception:
        return 0 
    
def has_empty_title(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 1  # Consider empty if page not accessible

        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.title.string.strip() if soup.title else ""

        return 1 if not title else 0  # 1 if empty, 0 if title exists
    except Exception:
        return 1 
    
def is_domain_in_title(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  # Consider it safe if page is inaccessible

        soup = BeautifulSoup(response.text, "html.parser")
        title = soup.title.string.lower().strip() if soup.title else ""
        domain = urlparse(url).netloc.split(".")[-2]  # Extract main domain name

        return 1 if domain in title else 0  # 1 if domain is in title, otherwise 0
    except Exception:
        return 0
    
def is_domain_in_copyright(url):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return 0  # Consider safe if page is inaccessible

        soup = BeautifulSoup(response.text, "html.parser")
        domain = urlparse(url).netloc.split(".")[-2]  # Extract main domain name
        copyright_patterns = ["©", "copyright", "all rights reserved"]
        
        text = soup.get_text().lower()
        if any(phrase in text for phrase in copyright_patterns):
            return 1 if domain in text else 0  # 1 if domain is mentioned, otherwise 0

        return 0  # No copyright mention
    except Exception:
        return 0
    
def get_registered_domain(url):
    try:
        domain_info = whois.whois(url)
        return len(domain_info.domain_name)  # Returns the registered domain name
    except Exception:
        return 0

def get_domain_registration_length(url):
    try:
        domain_info = whois.whois(url)
        if isinstance(domain_info.expiration_date, list):
            expiration_date = domain_info.expiration_date[0]
        else:
            expiration_date = domain_info.expiration_date

        if isinstance(domain_info.creation_date, list):
            creation_date = domain_info.creation_date[0]
        else:
            creation_date = domain_info.creation_date

        if expiration_date and creation_date:
            return (expiration_date - creation_date).days  # Registration length in days
        return 0
    except Exception:
        return 0

def get_domain_registration_length(url):
    try:
        domain_info = whois.whois(url)
        if isinstance(domain_info.expiration_date, list):
            expiration_date = domain_info.expiration_date[0]
        else:
            expiration_date = domain_info.expiration_date

        if isinstance(domain_info.creation_date, list):
            creation_date = domain_info.creation_date[0]
        else:
            creation_date = domain_info.creation_date

        if expiration_date and creation_date:
            return (expiration_date - creation_date).days  # Registration length in days
        return 0
    except Exception:
        return 0
    
def get_domain_age(url):
    try:
        domain_info = whois.whois(url)
        if isinstance(domain_info.creation_date, list):
            creation_date = domain_info.creation_date[0]
        else:
            creation_date = domain_info.creation_date

        if creation_date:
            return (datetime.now() - creation_date).days  # Domain age in days
        return 0
    except Exception:
        return 0
    
def get_alexa_rank(url):
    try:
        domain = url.replace("http://", "").replace("https://", "").split("/")[0]  # Extract domain name
        response = requests.get(f"https://data.alexa.com/data?cli=10&dat=snbamz&url={domain}")
        
        if "<POPULARITY" in response.text:
            rank = response.text.split('TEXT="')[1].split('"')[0]
            return int(rank)
        return 0  # If no ranking found
    except Exception:
        return 0
# [51, 17, 13(0), 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0.0, 0.0, 0, 443, 0, 0, 0, 2, 0, 0, 0, 0, 0, 51, 5, 3, 9, 5.0, 3, 9, 5.0, 3, 9, 0, 0, 0, 0, 0, 262, 0.9389, 0.0496, 0.0038, 3, 0.9412, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7284, 2788, 1, 7, 2]
def url_preprocessing(url):
    parameters = []
    print("preprocessing started!")
    parameters.append(len(url)) #url
    parameters.append(len(urlparse(url).hostname)) #hostname
    parameters.append(len(socket.gethostbyname(urlparse(url).hostname))) #ip -- wrong
    parameters.append(url.count('.')) #dots
    parameters.append(url.count('-')) #hyphens
    parameters.append(url.count('@')) #@
    parameters.append(url.count('?')) #?
    parameters.append(url.count('&')) #&
    parameters.append(url.count('|')) #pipeline
    parameters.append(url.count('=')) #eq
    parameters.append(url.count('_')) #underscore
    parameters.append(url.count('~')) #tilde
    parameters.append(url.count('%')) #percentage
    parameters.append(url.count('/')) #slash
    parameters.append(url.count('*')) #star
    parameters.append(url.count(':'))    
    parameters.append(url.count(','))
    parameters.append(url.count(';'))
    parameters.append(url.count('$'))
    parameters.append(url.count(' '))
    parameters.append(url.count('www'))
    parameters.append(url.count('com'))
    parameters.append(url.count('//'))   
    parameters.append(http_in_path(url))
    parameters.append(https_token(url))
    parameters.append(ratio_digits_url(url))
    parameters.append(ratio_digits_host(url))
    parameters.append(punycode(url))
    parameters.append(port(url))
    parameters.append(tld_in_path(url))
    parameters.append(tld_in_subdomain(url))
    parameters.append(abnormal_subdomain(url))
    parameters.append(nb_subdomains(url))
    parameters.append(prefix_suffix(url))
    parameters.append(random_domain(url))
    parameters.append(shortening_service(url))
    # parameters.append(path_extension(url))
    parameters.append(nb_redirection(url))
    parameters.append(nb_redirection(url))
    parameters.append(length_words_raw(url))
    parameters.append(char_repeat(url))
    shortest_word_host, longest_word_host, avg_word_host = extract_word_features(url)
    parameters.append(shortest_word_host)
    parameters.append(longest_word_host)
    parameters.append(avg_word_host)
    parameters.append(shortest_word_host)
    parameters.append(longest_word_host)
    parameters.append(avg_word_host)
    parameters.append(shortest_word_host)
    parameters.append(longest_word_host)
    # check once again
    parameters.append(phish_hints(url))
    parameters.append(brand_in_domain(url))
    parameters.append(brand_in_subdomain(url))
    parameters.append(brand_in_path(url))
    parameters.append(is_suspicious_tld(url))
    parameters.append(get_nb_hyperlinks(url))
    parameters.append(get_ratio_intHyperlinks(url))
    parameters.append(get_ratio_extHyperlinks(url))
    parameters.append(get_ratio_nullHyperlinks(url))
    parameters.append(get_nb_extCSS(url))
    parameters.append(get_ratio_intRedirection(url))
    parameters.append(get_ratio_extRedirection(url))
    parameters.append(get_ratio_extErrors(url))
    parameters.append(has_login_form(url))
    parameters.append(get_links_in_tags(url))
    parameters.append(has_submit_email(url))
    parameters.append(get_ratio_extMedia(url))
    parameters.append(get_sfh(url))
    parameters.append(get_iframe_ratio(url))
    parameters.append(has_popup_window(url))
    parameters.append(get_safe_anchor_ratio(url))
    parameters.append(has_empty_title(url))
    parameters.append(has_empty_title(url))
    parameters.append(is_domain_in_title(url))
    parameters.append(is_domain_in_copyright(url))
    parameters.append(get_registered_domain(url))
    parameters.append(get_domain_registration_length(url))
    parameters.append(get_domain_age(url))
    parameters.append(get_alexa_rank(url))
    parameters.append(0)
    parameters.append(0)
    parameters.append(1)
    parameters.append(0)
    parameters.append(0)
    parameters.append(7284)
    parameters.append(2788)
    parameters.append(1)
    parameters.append(7)
    parameters.append(2)
    print(parameters)
    return parameters

#print(url_preprocessing("https://console.cloud.google.com/apis/dashboard"))
parameters = url_preprocessing("https://www.slant.co/topics/2404/~file-managers-for-windows")

preprocessing started!
[59, 12, 14, 2, 3, 0, 0, 0, 0, 0, 0, 1, 0, 5, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0.06779661016949153, 0.0, 0, 443, 0, 0, 0, 2, 0, 0, 1, 0, 0, 59, 5, 2, 5, 3.3333333333333335, 2, 5, 3.3333333333333335, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7284, 2788, 1, 7, 2]


In [38]:
import joblib
import numpy as np
rf_model = joblib.load('phishing_detector.pkl')
# parameters = [51,17,0,3,0,0,0,0,0,0,0,0,0,4,0,1,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0,0,0,3,0,0,0,0,2,0,5,3,3,3,4,12,9,12,7.0,6.0,7.666666667,0,0,0,0,0,0,147,0.7959183670000001,0.204081633,0,3,0,0.133333333,0,0.0,0,1,0.0,0,81.25,18.75,0,0,0,16.66666667,0,0,0,0,1,0,1847,7284,2788,0,1,7]
parameters = np.array(parameters).reshape(1, -1)
y_pred = rf_model.predict(parameters)
print("Phishing" if y_pred[0] == 1 else "Legitimate")

Phishing


