In [1]:
from urllib.parse import urlparse

def NumDots(url):
    """
    Count the '.' characters in the host and path portions of a URL.

    Only the network location and the path are inspected; dots in the
    query string or fragment are not counted. When the raw URL string ends
    with '.', the total is reduced by one.

    Returns the resulting count as an int.
    """
    parts = urlparse(url)
    total = parts.netloc.count('.') + parts.path.count('.')
    # A trailing dot on the raw URL is discounted from the total.
    return total - 1 if url.endswith('.') else total

In [2]:
from urllib.parse import urlparse

def SubdomainLevel(url):
    """
    Computes the number of subdomain levels in the URL.

    The level is the number of dot-separated labels in the hostname minus
    one (e.g. "www.example.com" -> 2).

    Args:
    url (str): The URL to inspect.

    Returns:
    int: The subdomain level, or 0 when the URL has no hostname
    (for example a URL without a scheme, or an empty string).
    """
    hostname = urlparse(url).hostname
    # urlparse reports hostname=None when there is no network location;
    # treat that as "no subdomains" instead of raising AttributeError.
    if not hostname:
        return 0
    return len(hostname.split('.')) - 1

In [3]:
from urllib.parse import urlparse

def PathLevel(url):
    """
    Computes the number of path levels in the URL.

    The level is the number of '/' characters in the path component minus
    one, floored at zero so that a URL with an empty path does not yield a
    negative level.

    Args:
    url (str): The URL to inspect.

    Returns:
    int: The non-negative path level.
    """
    path = urlparse(url).path
    # An empty path contains no '/', which previously produced -1.
    return max(0, path.count('/') - 1)

In [4]:
from urllib.parse import urlparse

def UrlLength(url):
    """
    Return the character length of the URL after a parse/re-assemble round trip.

    The URL is parsed with urlparse and rebuilt with geturl(); the length of
    that rebuilt string is returned.
    """
    return len(urlparse(url).geturl())


In [None]:
def NumDash(url):
    """
    Count the '-' characters in the host and path portions of a URL.

    Dashes in the query string or fragment are not counted. When the raw
    URL string ends with '-', the total is reduced by one.

    Returns the resulting count as an int.
    """
    parts = urlparse(url)
    total = parts.netloc.count('-') + parts.path.count('-')
    # A trailing dash on the raw URL is discounted from the total.
    return total - 1 if url.endswith('-') else total

In [5]:
def NumDashInHostName(url):
    """
    Counts the number of dashes in the hostname component of a URL.

    Args:
    url (str): The URL to inspect.

    Returns:
    int: The number of '-' characters in the hostname, or 0 when the URL
    has no hostname (e.g. it lacks a scheme).
    """
    # hostname is None when the URL has no network location; fall back to an
    # empty string so the count is 0 instead of raising AttributeError.
    hostname = urlparse(url).hostname or ''
    return hostname.count('-')

In [6]:
from urllib.parse import urlparse

def AtSymbol(url):
    """
    Report whether '@' appears in the network-location part of the URL.

    Returns 1 when the netloc contains '@', otherwise 0. An '@' elsewhere
    in the URL (path, query, fragment) does not count.
    """
    return int('@' in urlparse(url).netloc)

In [7]:
from urllib.parse import urlparse

def TildeSymbol(url):
    """
    Report whether '~' appears in the path component of the URL.

    Returns 1 when the path contains '~', otherwise 0. A '~' in the host,
    query or fragment does not count.
    """
    return int('~' in urlparse(url).path)

In [8]:
def NumUnderscore(url):
    """
    Count the '_' characters in the host and path portions of a URL.

    Underscores in the query string or fragment are not counted. When the
    raw URL string ends with '_', the total is reduced by one.

    Returns the resulting count as an int.
    """
    parts = urlparse(url)
    total = parts.netloc.count('_') + parts.path.count('_')
    # A trailing underscore on the raw URL is discounted from the total.
    return total - 1 if url.endswith('_') else total

In [9]:
from urllib.parse import urlparse

def NumPercent(url):
    """
    Count the '%' characters in the path and query string of the URL.

    The host and the fragment are not inspected.
    Returns the combined count as an int.
    """
    parts = urlparse(url)
    return sum(section.count('%') for section in (parts.path, parts.query))


In [10]:
from urllib.parse import urlparse

def NumQueryComponents(url):
    """
    Counts the number of query components in the URL.

    Components are the non-empty '&'-separated pieces of the query string.

    Args:
    url (str): The URL to inspect.

    Returns:
    int: The number of query components; 0 when the URL has no query string.
    """
    query = urlparse(url).query
    # ''.split('&') yields [''], so empty pieces must be skipped; otherwise a
    # URL without any query string would be reported as having one component.
    return sum(1 for component in query.split('&') if component)
    

In [11]:
from urllib.parse import urlparse

def NumAmpersand(url):
    """
    Count the '&' characters in the query string of the URL.

    Ampersands outside the query string (e.g. in the path) are not counted.
    """
    return urlparse(url).query.count('&')

In [12]:
from urllib.parse import urlparse

def NumHash(url):
    """
    Counts the number of # in the URL.

    Args:
    url (str): The URL to inspect.

    Returns:
    int: The number of '#' characters in the raw URL string.
    """
    # The parsed fragment excludes the '#' delimiter itself, so counting
    # inside it misses the first '#' (e.g. "page#top" gave 0). Count on the
    # raw URL instead.
    return url.count('#')

In [13]:
from urllib.parse import urlparse

def NumNumericChars(url):
    """
    Count the digit characters in the path and query string of the URL.

    Digits in the host or the fragment are not counted.
    """
    parts = urlparse(url)
    searched_text = parts.path + parts.query
    return len([ch for ch in searched_text if ch.isdigit()])

In [14]:
from urllib.parse import urlparse

def NoHttps(url):
    """
    Flag URLs whose scheme is not https.

    Returns 0 when the parsed scheme equals 'https', and 1 for every other
    scheme (including a missing one).
    """
    return int(urlparse(url).scheme != 'https')


In [15]:
import re

def RandomString(url):
    """
    Flag URLs that contain a run of four or more consecutive digits.

    Such a digit run is what this function treats as a "random string".
    Returns 1 when one is found anywhere in the URL, otherwise 0.
    """
    digit_run = re.search(r"\d{4,}", url)
    return 0 if digit_run is None else 1

In [1]:
import re

def IpAddress(url):
    """
    Flag URLs that look like they embed an IP address.

    The URL is tested against five regular expressions: four anchored at an
    http/https/ftp scheme that accept a four-part dotted host whose trailing
    one to four parts start with digits, and one that looks for a dotted
    quad immediately after a '?'.

    Returns 1 when any pattern matches, otherwise 0.
    """
    ip_like_patterns = (
        r"^(http|https|ftp)://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
        r"^(http|https|ftp)://\w+\.\d{1,3}\.\d{1,3}\.\d{1,3}",
        r"^(http|https|ftp)://\w+\.\w+\.\d{1,3}\.\d{1,3}",
        r"^(http|https|ftp)://\w+\.\w+\.\w+\.\d{1,3}",
        r"\?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
    )
    matched = any(re.search(pattern, url) for pattern in ip_like_patterns)
    return 1 if matched else 0

In [17]:
def DomainInSubdomains(url):
    """
    Checks if domain is present in the subdomains of the URL.

    The "domain" is the last two labels of the hostname (e.g. "example.com");
    the "subdomains" are all labels before them. The function reports whether
    that two-label domain also occurs as two consecutive labels inside the
    subdomain part, as in "example.com.example.com".

    Args:
    url (str): The URL to inspect.

    Returns:
    int: 1 if the domain re-appears in the subdomains, 0 otherwise
    (including when the URL has no hostname).
    """
    hostname = urlparse(url).hostname
    if not hostname:
        # No network location (e.g. missing scheme): nothing to compare.
        return 0

    labels = hostname.split('.')
    domain_labels = labels[-2:]
    subdomain_labels = labels[:-2]

    # The previous code compared the dotted "domain.tld" string against single
    # labels, which can never be equal; compare consecutive label pairs instead.
    adjacent_pairs = zip(subdomain_labels, subdomain_labels[1:])
    if any([first, second] == domain_labels for first, second in adjacent_pairs):
        return 1
    return 0

In [18]:
def DomainInPaths(url):
    """
    Checks if domain is present in the paths of the URL.

    The network location (netloc) of the URL is searched for in the path,
    query string and fragment.

    Args:
    url (str): The URL to inspect.

    Returns:
    int: 1 if the netloc occurs in the path, query or fragment, 0 otherwise
    (including when the URL has no netloc).
    """
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    # An empty netloc would "occur" in every string (str.count('') >= 1),
    # which previously flagged every scheme-less URL.
    if not netloc:
        return 0

    searched_parts = (parsed_url.path, parsed_url.query, parsed_url.fragment)
    return 1 if any(netloc in part for part in searched_parts) else 0

In [19]:
def HttpsInHostname(url):
    """
    Flag URLs whose network location contains an http-like token.

    The netloc is searched for "http", "https", or the same preceded by
    "xx" instead of "h" (e.g. "xxttp").

    Returns 1 when such a token is found, otherwise 0.
    """
    netloc = urlparse(url).netloc
    return int(re.search(r"(h|x{2})ttps?", netloc) is not None)

In [20]:
from urllib.parse import urlsplit

def get_hostname_length(url):
    """
    Returns the length of the hostname component of the URL.
    
    Args:
    url (str): The URL to extract the hostname from.
    
    Returns:
    int: The length of the hostname component of the URL, or 0 when the
    URL has no hostname (e.g. it lacks a scheme).
    """
    # urlsplit reports hostname=None when there is no network location;
    # len(None) would raise TypeError, so fall back to an empty string.
    hostname = urlsplit(url).hostname or ''
    return len(hostname)

In [21]:
from urllib.parse import urlparse

def PathLength(url):
    """
    Measure the path component of a URL.

    Args:
    url (str): The URL whose path is measured.

    Returns:
    int: Number of characters in the parsed path (0 when there is no path).
    """
    parts = urlparse(url)
    return len(parts.path)

In [22]:
def QueryLength(url):
    """
    Measure the query-string component of a URL.

    Args:
    url (str): The URL whose query string is measured.

    Returns:
    int: Number of characters in the parsed query string, not counting the
    leading '?' (0 when there is no query).
    """
    parts = urlparse(url)
    return len(parts.query)

In [23]:
def DoubleSlashInPath(url):
    """
    Report whether the path component of the URL contains '//'.

    Returns True when a double slash occurs in the parsed path,
    otherwise False.
    """
    return '//' in urlparse(url).path


In [24]:
import re
import requests
from urllib.parse import urlparse
from tldextract import extract

def NumSensitiveWords(url, timeout=10):
    """
    Counts the number of sensitive words in the URL and the website content.
    If the number of sensitive words is less than or equal to 1 then it is benign hence return 0,
    and if the number of sensitive words is more than 1 then it is phishing hence the function returns 1.

    The tally is built from: +1 if the extracted domain occurs in the URL,
    +1 for every list entry found in the URL, and, when the page is served
    as HTML, the number of occurrences of every list entry in the lowercased
    page body.

    Args:
    url (str): The URL to analyze (also fetched over the network).
    timeout (float): Seconds to wait for each HTTP request.

    Returns:
    int: 1 if the tally is greater than 1, 0 otherwise. Any error while
    parsing or fetching leaves the tally at its value so far.
    """
    sensitive_words = ['confirm', 'account', 'banking', 'secure', 'login', 'ebayisapi', 'webscr', 'signin', 'submit', 'password', 'authenticate', 'lucky', 'bonus', 'ssl', 'banking', 'bank', 'secure', 'update', 'money', 'ebay']
    num_sensitive = 0
    try:
        # Extract the domain name from the URL
        domain = extract(url).domain

        # NOTE(review): the extracted domain is normally a substring of the URL,
        # so this usually adds 1 — confirm that is intended.
        if domain in url:
            num_sensitive += 1

        # One point per list entry present in the URL
        num_sensitive += sum(1 for word in sensitive_words if word in url)

        # HEAD first so non-HTML resources are never downloaded
        response = requests.head(url, timeout=timeout)
        # A missing header yields None; normalise so the membership test is safe.
        content_type = response.headers.get('content-type') or ''

        if 'text/html' in content_type:
            content = requests.get(url, timeout=timeout).text.lower()
            num_sensitive += sum(content.count(word) for word in sensitive_words)
    except Exception:
        # Best effort: keep whatever was counted before the failure. Unlike a
        # bare "except:", this lets KeyboardInterrupt/SystemExit propagate.
        pass

    return 0 if num_sensitive <= 1 else 1

In [2]:
import requests
from urllib.parse import urlparse
from tldextract import extract
import spacy

def EmbeddedBrandName(url, timeout=10):
    """
    Checks if brand name is present in the URL but is not the same as the domain name.
    If brand name is present and is not the same as the domain name, then it is phishing hence the function returns 1,
    and if brand name is not present or is the same as the domain name, then it is benign hence return 0.

    Brand names are the ORG entities that spaCy's 'en_core_web_sm' model
    finds in the fetched page body.

    Args:
    url (str): The URL to analyze (also fetched over the network).
    timeout (float): Seconds to wait for each HTTP request.

    Returns:
    int: 1 if a lowercased ORG entity occurs in the lowercased URL and differs
    from the extracted domain, 0 otherwise (including on any error).
    """
    try:
        # Extract the domain name from the URL
        domain = extract(url).domain

        # HEAD first so non-HTML resources are never downloaded
        response = requests.head(url, timeout=timeout)
        # A missing header yields None; normalise so the membership test is safe.
        content_type = response.headers.get('content-type') or ''
        if 'text/html' not in content_type:
            return 0

        content = requests.get(url, timeout=timeout).text

        # NOTE(review): the model is loaded on every call and is fed the raw
        # HTML response body — consider caching it and stripping tags.
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(content)

        # Distinct organisation names mentioned on the page
        org_names = {entity.text.lower() for entity in doc.ents if entity.label_ == 'ORG'}

        lowered_url = url.lower()
        if any(name in lowered_url and name != domain for name in org_names):
            return 1
    except Exception:
        # Best effort: treat any failure as "no embedded brand". Unlike a bare
        # "except:", this lets KeyboardInterrupt/SystemExit propagate.
        return 0

    return 0

In [26]:
import requests
from bs4 import BeautifulSoup

def get_pct_ext_hyperlinks(url, timeout=10):
    """
    Calculates the fraction of hyperlinks on a web page that point to another host.

    A hyperlink is external when its href is an absolute http(s) URL whose
    network location differs (case-insensitively) from that of `url`.
    Previously every absolute link was counted, including links back to
    the page's own host.

    Args:
    url (str): The URL of the web page to analyze.
    timeout (float): Seconds to wait for the HTTP request.

    Returns:
    float: External links divided by all <a> elements (between 0 and 1);
    0 when the response status is not 200 or the page has no <a> elements.
    """
    from urllib.parse import urlparse

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')
    all_links = soup.find_all('a')
    if not all_links:
        return 0

    page_host = urlparse(url).netloc.lower()
    external_links = 0
    for link in all_links:
        href = link.get('href', '')
        # Relative links stay on the current host, so only absolute URLs
        # can be external.
        if href.startswith(('http://', 'https://')) and urlparse(href).netloc.lower() != page_host:
            external_links += 1

    return external_links / len(all_links)

In [27]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def PctExtResourceUrls(url, timeout=10):
    """
    Calculates the percentage of external resource URLs on a given web page.

    Resources are the <img>, <script> and <link> elements. A resource is
    external when its href or src is an absolute http(s) URL whose network
    location differs (case-insensitively) from that of `url`.
    
    Args:
    url (str): The URL of the web page to analyze.
    timeout (float): Seconds to wait for the HTTP request.
    
    Returns:
    float: The percentage of external resource URLs on the page, as a floating-point number between 0 and 1.
    0 is returned when the response status is not 200 or no resource elements exist.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')
    resource_tags = soup.find_all(['img', 'script', 'link'])
    if not resource_tags:
        return 0

    page_host = urlparse(url).netloc.lower()
    external_urls = 0
    # The loop variable is deliberately not called `url`, so the function
    # argument is not shadowed.
    for tag in resource_tags:
        # Check both attributes; the first external candidate is enough.
        for candidate in (tag.get('href', ''), tag.get('src', '')):
            if candidate.startswith(('http://', 'https://')) and urlparse(candidate).netloc.lower() != page_host:
                external_urls += 1
                break

    return external_urls / len(resource_tags)

In [28]:
import requests
from bs4 import BeautifulSoup

def ExtFavicon(url, timeout=10):
    """
    Checks whether the page's favicon is loaded from a different host.

    Previously any favicon <link> produced 1, regardless of where it was
    hosted; the favicon's href is now resolved against the final page URL
    and its host compared with the page's host.

    Args:
    url (str): The URL of the web page to check.
    timeout (float): Seconds to wait for each HTTP request.

    Returns:
    int: 1 if a favicon link exists and resolves to a host other than the
    page's host, 0 otherwise (no favicon, same host, non-HTML response,
    unexpected status code, or any error).
    """
    from urllib.parse import urljoin, urlparse

    try:
        # HEAD first to avoid downloading non-HTML resources
        response = requests.head(url, timeout=timeout)
        if response.status_code not in (200, 301, 302):
            return 0

        content_type = response.headers.get('Content-Type', '').lower()
        if not content_type.startswith('text/html'):
            # Not an HTML page, so no favicon link can be present
            return 0

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for any HTTP error

        soup = BeautifulSoup(response.text, 'html.parser')
        favicon_link = soup.find('link', rel='shortcut icon') or soup.find('link', rel='icon')
        if favicon_link is None:
            return 0

        href = favicon_link.get('href', '')
        if not href:
            return 0

        # response.url is the URL after redirects; relative and
        # protocol-relative hrefs are resolved against it.
        page_host = urlparse(response.url).netloc.lower()
        favicon_host = urlparse(urljoin(response.url, href)).netloc.lower()
        # Schemes without a host (e.g. data:) are not treated as external.
        return 1 if favicon_host and favicon_host != page_host else 0
    except Exception:
        # Best effort: any network or parsing failure counts as "not external".
        return 0

In [29]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def InsecureForms(url):
    """
    Detect forms on a page that submit to a plain-http URL.

    The page is fetched with a GET request; every <form> with a non-empty
    action attribute is inspected.

    Args:
    url (str): The URL of the web page to check.

    Returns:
    int: 1 if some form's action has the scheme 'http'; 0 otherwise, and
    also 0 when the status code is not 200 or an exception occurs (the
    exception message is printed).
    """
    try:
        page = requests.get(url)
        if page.status_code != 200:
            # Page could not be retrieved successfully
            return 0

        document = BeautifulSoup(page.text, 'html.parser')
        for form_tag in document.find_all('form'):
            target = form_tag.get('action', '')
            if not target:
                continue
            scheme = urlparse(target).scheme
            if scheme and scheme.lower() == 'http':
                return 1

        # No form posts to an http:// target
        return 0
    except Exception as e:
        # Report the failure and fall back to "no insecure form"
        print(f"Error: {str(e)}")
        return 0


In [30]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def RelativeFormAction(url):
    """
    Detect forms whose action is not an absolute http(s) URL.

    The page is fetched with a GET request; a form qualifies when its action
    attribute is non-empty and does not start with 'http://' or 'https://'.

    Args:
    url (str): The URL of the web page to check.

    Returns:
    int: 1 if at least one such form exists; 0 otherwise, and also 0 when
    the status code is not 200 or any exception occurs.
    """
    try:
        page = requests.get(url)
        if page.status_code != 200:
            # Page could not be retrieved successfully
            return 0

        form_tags = BeautifulSoup(page.text, 'html.parser').find_all('form')
        absolute_prefixes = ('http://', 'https://')
        for form_tag in form_tags:
            target = form_tag.get('action', '')
            if not target:
                continue
            if not target.startswith(absolute_prefixes):
                return 1

        return 0
    except Exception:
        # Any failure is reported as "no relative form action"
        return 0


In [31]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def ExtFormAction(url, timeout=10):
    """
    Checks if external form action is present in the URL.
    If external form action is present then it is phishing hence the function returns 1,
    and if external form action is not present then it is benign hence return 0.

    A form action is external when it is an absolute http(s) URL whose
    network location differs (case-insensitively) from that of `url`.
    Previously every absolute action was flagged, including actions that
    post back to the page's own host.
    
    Args:
    url (str): The URL of the web page to check.
    timeout (float): Seconds to wait for the HTTP request.
    
    Returns:
    int: 1 if the web page is a phishing attempt, 0 otherwise.
    0 is also returned when the status code is not 200 or any error occurs.
    """
    from urllib.parse import urlparse

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return 0

        soup = BeautifulSoup(response.text, 'html.parser')
        page_host = urlparse(url).netloc.lower()

        for form in soup.find_all('form'):
            action = form.get('action', '')
            # Relative actions stay on the current host, so only absolute
            # URLs can be external.
            if action.startswith(('http://', 'https://')) and urlparse(action).netloc.lower() != page_host:
                return 1  # External form action exists

        return 0
    except Exception:
        # Best effort: any network or parsing failure counts as "not external".
        return 0

In [32]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def AbnormalFormAction(url):
    """
    Detect forms whose action is neither absolute http(s) nor root-relative.

    The page is fetched with a GET request; a form is considered abnormal
    when its action attribute is non-empty and does not start with
    'http://', 'https://' or '/'.

    Args:
    url (str): The URL of the web page to check.

    Returns:
    int: 1 if at least one abnormal form action is found; 0 otherwise, and
    also 0 when the status code is not 200 or any exception occurs.
    """
    try:
        page = requests.get(url)
        if page.status_code != 200:
            # Page could not be retrieved successfully
            return 0

        accepted_prefixes = ('http://', 'https://', '/')
        form_tags = BeautifulSoup(page.text, 'html.parser').find_all('form')
        for form_tag in form_tags:
            target = form_tag.get('action', '')
            if not target:
                continue
            if not target.startswith(accepted_prefixes):
                return 1

        return 0
    except Exception:
        # Any failure is reported as "no abnormal form action"
        return 0

In [33]:
import requests
from bs4 import BeautifulSoup

def PctNullSelfRedirectHyperlinks(url, timeout=10):
    """
    Calculates the fraction of hyperlinks that are null or redirect to the page itself.

    A hyperlink counts as null/self-redirect when its href is missing or
    blank, starts with '#', or equals the page URL (ignoring a trailing
    '/'). Previously the function returned the complementary fraction (the
    links that were NOT self-redirects) and did not count blank or '#' links.

    Args:
    url (str): The URL of the web page to analyze.
    timeout (float): Seconds to wait for the HTTP request.

    Returns:
    float: Null/self-redirect links divided by all <a> elements (between
    0 and 1); 0 when the status code is not 200, the page has no <a>
    elements, or any error occurs.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return 0

        hyperlinks = BeautifulSoup(response.text, 'html.parser').find_all('a')
        if not hyperlinks:
            # Explicit guard instead of relying on a swallowed ZeroDivisionError
            return 0

        page_url = url.rstrip('/')
        null_or_self_count = 0
        for link in hyperlinks:
            href = (link.get('href') or '').strip()
            if not href or href.startswith('#') or href.rstrip('/') == page_url:
                null_or_self_count += 1

        return null_or_self_count / len(hyperlinks)
    except Exception:
        # Best effort: any network or parsing failure yields 0.
        return 0


In [34]:
from urllib.parse import urlparse

def FrequentDomainNameMismatch(url):
    """
    Flag URLs whose network location differs from one of its own dot-suffixes.

    The netloc is split on '.', and every suffix starting at label index
    0 .. len-2 is re-joined and compared with the full netloc. The suffix at
    index 0 always equals the netloc and every later suffix is shorter, so
    a mismatch is found exactly when the netloc has three or more
    dot-separated labels.

    Args:
    url (str): The URL to inspect.

    Returns:
    int: 1 if any compared suffix differs from the netloc, 0 otherwise;
    0 is also returned if an exception occurs.
    """
    try:
        netloc = urlparse(url).netloc
        labels = netloc.split('.')
        mismatch_found = any(
            '.'.join(labels[start:]) != netloc
            for start in range(len(labels) - 1)
        )
        return 1 if mismatch_found else 0
    except Exception:
        # Any failure is reported as "no mismatch"
        return 0


In [35]:
import requests
from bs4 import BeautifulSoup
import re

def DetectFakeLinkInStatusBar(url):
    """
    Flag pages containing an <a> element with a suspicious target or href.

    The page is fetched with a GET request. An anchor is suspicious when it
    opens in a new window (target="_blank"), carries an onclick attribute,
    has a javascript: or data: href, or has an empty or '#' href.

    Args:
    url (str): The URL of the web page to check.

    Returns:
    int: 1 as soon as one suspicious anchor is found; 0 otherwise, and also
    0 when the status code is not 200 or any exception occurs.
    """
    try:
        page = requests.get(url)
        if page.status_code != 200:
            return 0  # Unable to access the URL

        anchors = BeautifulSoup(page.text, 'html.parser').find_all('a')
        for anchor in anchors:
            destination = anchor.get('href', '')

            # The individual checks are evaluated in their original order.
            suspicious = (
                anchor.get('target') == '_blank'
                or anchor.has_attr('onclick')
                or 'javascript:' in destination
                or destination.startswith('data:')
                or re.search(r'javascript:', destination, re.I) is not None
                or not destination
                or destination.strip() == '#'
                or destination.strip().lower().startswith('javascript:')
            )
            if suspicious:
                return 1  # Fake link detected

        return 0  # No fake link detected
    except Exception:
        # Any failure is reported as "nothing detected"
        return 0


In [36]:
import requests
from bs4 import BeautifulSoup
import re

def RightClickDisabled(url):
    """
    Flag pages that appear to interfere with the context menu.

    The page is fetched with a GET request. It is flagged when the text of
    any <script> element matches one of three patterns
    (event.preventDefault(), document.oncontextmenu, or a case-insensitive
    contextmenu / context-menu), or when any element carries an
    oncontextmenu attribute.

    Args:
    url (str): The URL of the web page to check.

    Returns:
    int: 1 if any indicator is present; 0 otherwise or when the status code
    is not 200. Request errors are not caught here.
    """
    page = requests.get(url)
    if page.status_code != 200:
        return 0  # Unable to access the URL

    soup = BeautifulSoup(page.text, 'html.parser')

    # (pattern, regex flags) pairs applied to each script body
    script_checks = (
        (r'event\.preventDefault\(\)', 0),
        (r'document\.oncontextmenu', 0),
        (r'contextmenu|context-menu', re.I),
    )

    flagged = False
    for script in soup.find_all('script'):
        body = script.get_text()
        if any(re.search(pattern, body, flags) for pattern, flags in script_checks):
            flagged = True

    # Inline handlers on any element
    if any(tag.has_attr('oncontextmenu') for tag in soup.find_all(True)):
        flagged = True

    return 1 if flagged else 0

In [37]:
import requests
from bs4 import BeautifulSoup

def DetectPopUpWindow(url):
    """
    Flag pages with clickable elements that have both href and onclick.

    The page is fetched with a GET request and searched for <a>, <button>,
    <input> or <div> elements that carry an href attribute and an onclick
    attribute at the same time.

    Args:
    url (str): The URL of the web page to check.

    Returns:
    int: 1 if at least one such element exists; 0 otherwise or when the
    status code is not 200. Request errors are not caught here.
    """
    page = requests.get(url)
    if page.status_code == 200:
        soup = BeautifulSoup(page.text, 'html.parser')
        candidate_tags = ['a', 'button', 'input', 'div']
        triggers = soup.find_all(candidate_tags, href=True, onclick=True)
        return 1 if triggers else 0

    return 0  # Unable to access the URL

In [38]:
def check_for_email_submission(url):
    """
    Flag pages that submit to, or mention, an e-mail address.

    The page is fetched with a GET request. It is flagged when a <form>
    action contains 'mailto:' or when the page's visible text contains
    something shaped like an e-mail address (word characters, dots or
    dashes around an '@').

    Args:
    url (str): The URL of the web page to check.

    Returns:
    int: 1 if either indicator is present; 0 otherwise or when the status
    code is not 200. Request errors are not caught here.
    """
    page = requests.get(url)
    if page.status_code != 200:
        return 0  # Unable to access the URL

    soup = BeautifulSoup(page.text, 'html.parser')

    # Forms that post straight to a mail client
    mailto_form = any(
        'mailto:' in form.get('action', '') for form in soup.find_all('form')
    )

    # E-mail-like token anywhere in the rendered text
    address_in_text = re.search(r"[\w\.-]+@[\w\.-]+", soup.get_text()) is not None

    return 1 if (mailto_form or address_in_text) else 0

In [39]:
import requests
from bs4 import BeautifulSoup

def IframeOrFrame(url):
    """
    Flag pages that contain an <iframe> or <frame> element.

    The page is fetched with a GET request and parsed regardless of the
    status code.

    Returns 1 if at least one iframe or frame element is present, otherwise 0.
    Request errors are not caught here.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    # find() with a list of names returns the first matching tag, or None
    first_frame = soup.find(['iframe', 'frame'])
    return 0 if first_frame is None else 1


In [40]:
def MissingTitle(url):
    """
    Flag pages whose HTML has no <title> element.

    The page is fetched with a GET request and parsed regardless of the
    status code.

    Returns 1 when no title element is found, 0 when one exists.
    Request errors are not caught here.
    """
    # Imports are kept local to the function
    from bs4 import BeautifulSoup
    import requests

    markup = requests.get(url).text
    title_tag = BeautifulSoup(markup, "html.parser").find("title")
    return 0 if title_tag else 1


In [41]:
def ImagesOnlyInForm(url, timeout=10):
    """
    Flags pages where most images sit directly inside a <form> element.

    The page is fetched with a GET request; an image counts when its
    immediate parent tag is <form>.

    Args:
    url (str): The URL of the web page to check.
    timeout (float): Seconds to wait for the HTTP request.

    Returns:
    int: 1 if more than 50% of the <img> elements are direct children of a
    form, 0 otherwise. A page without any <img> element returns 0 instead
    of raising ZeroDivisionError.
    """
    # Imports are kept local to the function
    from bs4 import BeautifulSoup
    import requests

    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, "html.parser")
    images = soup.find_all("img")

    # No images means no percentage can be computed; treat as not flagged.
    if not images:
        return 0

    img_in_form = sum(1 for image in images if image.parent.name == "form")
    pct = img_in_form / len(images) * 100
    return 1 if pct > 50 else 0
q

In [42]:
def SubdomainLevelRT(url):
    """
    Bucket a URL by the share of its host labels that are subdomains.

    The host is the netloc up to the first ':'. With n dot-separated labels,
    the ratio is (n - 2) / n, i.e. every label except the last two is
    treated as a subdomain.

    Returns:
    int: -1 if the ratio is below 0.2, 0 if it is between 0.2 and 0.6
    inclusive, and 1 if it is above 0.6.
    """
    from urllib.parse import urlparse

    # Drop everything from the first ':' onward (e.g. a port number)
    host = urlparse(url).netloc.partition(":")[0]
    label_count = len(host.split("."))
    ratio = (label_count - 2) / label_count

    if ratio > 0.6:
        return 1
    if ratio >= 0.2:
        return 0
    return -1

In [43]:
def PctExtResourceUrlsRT(url):
    """
    Categorise a page by the percentage of its resources hosted externally.

    Downloads ``url`` and inspects every <img>, <script>, <link>, <style>
    and <font> tag. A tag counts as external when its ``href`` (or, failing
    that, ``src``) names a host different from the page's own host. Relative
    URLs stay on the page's host and are therefore internal. Tags without a
    usable ``href``/``src`` count toward the total but never as external.

    NOTE: a later cell defines another function with this same name; when
    both cells are run, the later definition is the one in effect.

    Args:
        url: Address of the page to download and inspect.

    Returns:
        -1 if fewer than 10% of the resource tags are external, 0 if fewer
        than 60% are, and 1 otherwise. A page without any resource tags
        yields -1.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the request times out).
    """
    from bs4 import BeautifulSoup
    import requests
    from urllib.parse import urlparse

    # A timeout keeps feature extraction from hanging on unresponsive hosts.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    resources = soup.find_all(["img", "script", "link", "style", "font"])
    if not resources:
        # Nothing to measure; also avoids dividing by zero below.
        return -1

    # The page's own host does not change, so parse it once.
    web_domain = urlparse(url).netloc.lower()

    ext_resources = 0
    for resource in resources:
        res_url = resource.attrs.get('href') or resource.attrs.get('src')
        if not res_url:
            continue
        try:
            res_domain = urlparse(res_url).netloc.lower()
        except ValueError:
            # Malformed authority (e.g. unbalanced IPv6 brackets): it cannot
            # be the page's own host, so treat it as external.
            ext_resources += 1
            continue
        # An empty netloc means a relative (same-host) URL.
        if res_domain and res_domain != web_domain:
            ext_resources += 1

    pct = ext_resources / len(resources) * 100
    if pct < 10:
        return -1
    if pct < 60:
        return 0
    return 1


In [44]:
def PctExtResourceUrlsRT(url):
    """
    Categorise a page by the percentage of its resources hosted externally.

    This redefines the function of the same name from the previous cell.

    Downloads ``url`` and inspects every <img>, <script>, <link>, <style>
    and <font> tag. A tag counts as external when its ``href`` (or, failing
    that, ``src``) names a host different from the page's own host. Relative
    URLs stay on the page's host and are therefore internal. Tags without a
    usable ``href``/``src`` count toward the total but never as external.

    Args:
        url: Address of the page to download and inspect.

    Returns:
        -1 if fewer than 10% of the resource tags are external, 0 if fewer
        than 60% are, and 1 otherwise. A page without any resource tags
        yields -1.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the request times out).
    """
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urlparse

    # A timeout keeps feature extraction from hanging on unresponsive hosts.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    resources = soup.find_all(["img", "script", "link", "style", "font"])
    if not resources:
        # Nothing to measure; also avoids dividing by zero below.
        return -1

    # The page's own host does not change, so parse it once.
    web_domain = urlparse(url).netloc.lower()

    ext_resources = 0
    for resource in resources:
        res_url = resource.attrs.get('href') or resource.attrs.get('src')
        if not res_url:
            continue
        try:
            # urlparse (rather than splitting on "//" and "/") yields an
            # empty netloc for relative URLs such as "/static/app.css",
            # which would otherwise be mistaken for foreign hosts.
            res_domain = urlparse(res_url).netloc.lower()
        except ValueError:
            # Malformed authority (e.g. unbalanced IPv6 brackets): it cannot
            # be the page's own host, so treat it as external.
            ext_resources += 1
            continue
        if res_domain and res_domain != web_domain:
            ext_resources += 1

    pct = ext_resources / len(resources) * 100
    if pct < 10:
        return -1
    if pct < 60:
        return 0
    return 1


In [45]:
def AbnormalExtFormActionR(url):
    """
    Categorise a page by the percentage of its forms that submit externally.

    Downloads ``url`` and inspects the ``action`` attribute of every <form>.
    A form counts as external when its action names a host different from
    the host of ``url``. Relative actions stay on the page's host and are
    internal; forms without an action count toward the total but never as
    external.

    Args:
        url: Address of the page to download and inspect.

    Returns:
        -1 if fewer than 10% of the forms are external, 0 if fewer than 60%
        are, and 1 otherwise. A page without any forms yields -1.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the request times out).
    """
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urlparse

    # A timeout keeps feature extraction from hanging on unresponsive hosts.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    forms = soup.find_all("form")
    if not forms:
        # Nothing to measure; also avoids dividing by zero below.
        return -1

    # The reference host comes from the page's URL, not from the HTML text:
    # splitting the markup on "//" only yields whatever follows the last
    # "//" in the document.
    page_domain = urlparse(url).netloc.lower()

    ext_forms = 0
    for form in forms:
        action = form.get("action")
        if not action:
            continue
        try:
            domain = urlparse(action).netloc.lower()
        except ValueError:
            # Malformed authority: it cannot be the page's own host, so
            # treat it as external.
            ext_forms += 1
            continue
        # An empty netloc means a relative (same-host) action.
        if domain and domain != page_domain:
            ext_forms += 1

    pct = ext_forms / len(forms) * 100
    if pct < 10:
        return -1
    if pct < 60:
        return 0
    return 1


In [46]:
def ExtMetaScriptLinkRT(url):
    """
    Categorise a page by the share of meta/script/link tags loaded externally.

    Downloads ``url`` and inspects every <meta>, <script> and <link> tag. A
    tag counts as external when its ``href`` (or, failing that, ``src``)
    names a host different from the host of ``url``. Relative URLs stay on
    the page's host and are internal. Tags without a usable ``href``/``src``
    count toward the total but never as external.

    Args:
        url: Address of the page to download and inspect.

    Returns:
        -1 if fewer than 10% of the tags are external, 0 if fewer than 60%
        are, and 1 otherwise. A page without any such tags yields -1.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the request times out).
    """
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urlparse

    # A timeout keeps feature extraction from hanging on unresponsive hosts.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    tags = soup.find_all(["meta", "script", "link"])
    if not tags:
        # Nothing to measure; also avoids dividing by zero below.
        return -1

    # The reference host comes from the page's URL, not from the HTML text:
    # splitting the markup on "//" only yields whatever follows the last
    # "//" in the document.
    page_domain = urlparse(url).netloc.lower()

    ext_tags = 0
    for tag in tags:
        # Kept in its own name so the ``url`` parameter is not overwritten.
        tag_url = tag.attrs.get('href') or tag.attrs.get('src')
        if not tag_url:
            continue
        try:
            domain = urlparse(tag_url).netloc.lower()
        except ValueError:
            # Malformed authority: it cannot be the page's own host, so
            # treat it as external.
            ext_tags += 1
            continue
        # An empty netloc means a relative (same-host) URL.
        if domain and domain != page_domain:
            ext_tags += 1

    pct = ext_tags / len(tags) * 100
    if pct < 10:
        return -1
    if pct < 60:
        return 0
    return 1


In [47]:
def PctExtNullSelfRedirectHyperlinksRT(html, url=None):
    """
    Categorise a page by the percentage of external or null hyperlinks.

    Every <a> tag is inspected. A link is counted when it is null (no
    ``href``, an empty ``href``, one starting with "#", or one containing
    "javascript:void(0)") or when its ``href`` names a host different from
    the page's host. Relative links stay on the page's host and are not
    counted.

    Args:
        html: HTML source of the page. For compatibility with callers that
            pass the page address instead (as ``extract_features`` does), a
            string starting with "http://" or "https://" is treated as the
            page URL and the page is downloaded.
        url: Optional address the HTML was loaded from; its host is the one
            links are compared against. When neither this nor a URL in
            ``html`` is available, every link that names a host is counted
            as external.

    Returns:
        -1 if fewer than 10% of the links are counted, 0 if fewer than 60%
        are, and 1 otherwise. A page without any links yields -1.

    Raises:
        requests.exceptions.RequestException: If a URL was passed and the
            page cannot be fetched (including when the request times out).
    """
    from bs4 import BeautifulSoup
    from urllib.parse import urlparse

    if url is None and html.strip().lower().startswith(("http://", "https://")):
        # The caller handed over the page address rather than its markup.
        import requests

        url = html.strip()
        # A timeout keeps feature extraction from hanging on dead hosts.
        html = requests.get(url, timeout=10).text

    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a")
    if not links:
        # Nothing to measure; also avoids dividing by zero below.
        return -1

    page_domain = urlparse(url).netloc.lower() if url else ""

    ext_null_links = 0
    for link in links:
        href = link.get("href")
        if (
            href is None
            or not href.strip()
            or href.startswith("#")
            or "javascript:void(0)" in href
        ):
            ext_null_links += 1
            continue
        try:
            domain = urlparse(href).netloc.lower()
        except ValueError:
            # Malformed authority: it cannot be the page's own host, so
            # treat it as external.
            ext_null_links += 1
            continue
        # An empty netloc means a relative (same-host) link.
        if domain and domain != page_domain:
            ext_null_links += 1

    pct = ext_null_links / len(links) * 100
    if pct < 10:
        return -1
    if pct < 60:
        return 0
    return 1


In [48]:
def extract_features(url):
    """
    Build the feature vector for a URL.

    Each extractor below is called once with ``url``, top to bottom, and the
    results are returned as a list in that same order.
    """
    return [
        NumDots(url),
        SubdomainLevel(url),
        PathLevel(url),
        UrlLength(url),
        NumDashInHostName(url),
        AtSymbol(url),
        TildeSymbol(url),
        NumUnderscore(url),
        NumPercent(url),
        NumQueryComponents(url),
        NumAmpersand(url),
        NumHash(url),
        NumNumericChars(url),
        NoHttps(url),
        RandomString(url),
        IpAddress(url),
        DomainInSubdomains(url),
        DomainInPaths(url),
        HttpsInHostname(url),
        HostnameLength(url),
        PathLength(url),
        QueryLength(url),
        DoubleSlashInPath(url),
        NumSensitiveWords(url),
        EmbeddedBrandName(url),
        PctExtHyperlinks(url),
        PctExtResourceUrls(url),
        ExtFavicon(url),
        InsecureForms(url),
        RelativeFormAction(url),
        ExtFormAction(url),
        AbnormalFormAction(url),
        PctNullSelfRedirectHyperlinks(url),
        FrequentDomainNameMismatch(url),
        FakeLinkInStatusBar(url),
        RightClickDisabled(url),
        PopUpWindow(url),
        SubmitInfoToEmail(url),
        IframeOrFrame(url),
        MissingTitle(url),
        ImagesOnlyInForm(url),
        SubdomainLevelRT(url),
        UrlLengthRT(url),
        PctExtResourceUrlsRT(url),
        AbnormalExtFormActionR(url),
        ExtMetaScriptLinkRT(url),
        PctExtNullSelfRedirectHyperlinksRT(url),
    ]

