In [43]:
!pip install pandas beautifulsoup4 tldextract
#Pandas = For data manipulation and analysis + data structures and reading data from files (CSV, JSON, TXT)
#Beautifulsoup4 = Parsing HTML and XML documents to extract specific info from HTML content (Web scraping)
#tldetract = Extract the top-level domain (TLD) from a URL, so break down URL into components and useful for analyzing and processing URL



Defaulting to user installation because normal site-packages is not writeable


In [44]:
#Import the libraries needed by the extraction functions


import re
import pandas as pd
from bs4 import BeautifulSoup
import tldextract


In [45]:

# Regex patterns for URL, email, IP, and href detection

# http:// or https://, an optional "www.", one character that is neither whitespace
# nor one of / $ . ? #, then any single character, then everything up to the next
# whitespace. Because the tail only stops at whitespace, a match taken from HTML can
# include trailing quotes or angle brackets.
url_regex = r'https?://(?:www\.)?[^\s/$.?#].[^\s]*'
# An href attribute (single- or double-quoted) whose value is an http(s) URL;
# the URL itself is capture group 1.
href_regex = r'href=["\'](https?://[^\s"\'<>]+)["\']'
# Four dot-separated groups of 1-3 digits, optionally preceded by http:// or https://.
# The groups are not range-checked, so values above 255 also match.
ip_regex = r'(https?://)?\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
# local-part@domain.tld, where the final label is at least two letters.
email_regex = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

In [46]:
#Helper functions for the HTML feature extraction

#1) Check if there is a @ in the url (Bool)
def has_at_in_urls(urls):
    """Return True if any URL contains "@" without containing an email address.

    Args:
        urls: Iterable of URL strings.

    Returns:
        bool: True as soon as one URL has an "@" character but no substring
        matching ``email_regex`` (searched case-insensitively); False
        otherwise, including when ``urls`` is empty.
    """
    # Compiled once up front so every URL is checked against the same matcher.
    email_matcher = re.compile(email_regex, re.IGNORECASE)
    return any(
        "@" in candidate and email_matcher.search(candidate) is None
        for candidate in urls
    )


def number_attachments(html):
    """Count tags in ``html`` that reference a downloadable file.

    A tag is counted when it is an ``<a>`` (``href``), ``<embed>`` (``src``)
    or ``<object>`` (``data``) whose attribute value ends with one of the
    known file extensions. The comparison is case-insensitive, so
    ``REPORT.PDF`` counts the same as ``report.pdf``.

    Args:
        html: HTML markup as a string.

    Returns:
        int: Total number of matching tags across the three tag types.
    """
    attachment_extensions = ('.pdf', '.doc', '.ppt', '.zip', '.txt', '.jpg', '.png', '.gif')

    def _is_attachment(value):
        # value is None when the tag does not carry the attribute at all.
        return bool(value) and value.lower().endswith(attachment_extensions)

    soup = BeautifulSoup(html, 'html.parser')

    # (tag name, attribute that holds the file reference)
    tag_attribute_pairs = (('a', 'href'), ('embed', 'src'), ('object', 'data'))
    return sum(
        len(soup.find_all(tag, attrs={attribute: _is_attachment}))
        for tag, attribute in tag_attribute_pairs
    )
#Uses find_all to find the tags that can reference attachments: <a>, <embed>, <object> (<img> and <iframe> are not counted)

#3) Check CSS in header (value)
def count_css_links(html):
    """Count the CSS sources present in ``html``.

    Args:
        html: HTML markup as a string.

    Returns:
        int: Number of ``<link rel="stylesheet">`` tags plus number of
        ``<style>`` tags.
    """
    parsed = BeautifulSoup(html, "html.parser")
    stylesheet_links = parsed.find_all("link", rel="stylesheet")
    style_tags = parsed.find_all("style")
    return len(stylesheet_links) + len(style_tags)

#Uses the parsed HTML to find every <link> tag with rel=stylesheet and every <style> tag


#4) Check for external resources (value)
def count_external_resources(html):
    """Count tags in ``html`` that reference a resource.

    Args:
        html: HTML markup as a string.

    Returns:
        int: Number of tags (of any name) that have a ``src`` attribute plus
        number of ``<link>`` tags that have an ``href`` attribute.
    """
    parsed = BeautifulSoup(html, "html.parser")
    tags_with_src = parsed.find_all(src=True)
    links_with_href = parsed.find_all("link", href=True)
    return len(tags_with_src) + len(links_with_href)
#Counts every tag that has a src attribute plus every <link> tag that has an href attribute, as the number of external resources


#5) Check for HTML content (bool)
def html_content_str(html):
    """Report whether ``html`` contains the substring "html" in any letter case.

    Args:
        html: Text to scan.

    Returns:
        bool: True if "html" occurs anywhere in the text (case-insensitive),
        otherwise False.
    """
    html_marker = re.compile(r'html', re.IGNORECASE)
    return html_marker.search(html) is not None
#r'html' matches the literal substring "html" anywhere in the input (not only inside an <html> tag); re.search scans through the string looking for the first match
#re.IGNORECASE makes the search case-insensitive

#6) Check for HTML form (bool)
def html_form(html):
    """Return True if ``html`` contains an opening or closing ``<form>`` tag.

    Opening tags are recognised with or without attributes (for example
    ``<form action="/login" method="post">``), and the match is
    case-insensitive.

    Args:
        html: HTML markup as a string.

    Returns:
        bool: True if a form tag is found, otherwise False.
    """
    # <\s*(?:/\s*)?  opening bracket, optional whitespace, optional "/" for </form>
    # form(?=[\s/>]) the tag name must end here, so <formula> does not match
    # [^>]*>         skips any attributes up to the closing bracket
    form_tag = re.compile(r'<\s*(?:/\s*)?form(?=[\s/>])[^>]*>', re.IGNORECASE)
    return form_tag.search(html) is not None
#Searches for a form tag: the opening <, optional whitespace, an optional / (so that </form> also matches), the tag name form, and the closing >

#7) Check for iframe Form (bool)
def iframe(html):
    """Return True if ``html`` contains an opening or closing ``<iframe>`` tag.

    Opening tags are recognised with or without attributes (for example
    ``<iframe src="..." width="0">``), and the match is case-insensitive.

    Args:
        html: HTML markup as a string.

    Returns:
        bool: True if an iframe tag is found, otherwise False.
    """
    # <\s*(?:/\s*)?    opening bracket, optional whitespace, optional "/" for </iframe>
    # iframe(?=[\s/>]) the tag name must end here, so longer names do not match
    # [^>]*>           skips any attributes up to the closing bracket
    iframe_tag = re.compile(r'<\s*(?:/\s*)?iframe(?=[\s/>])[^>]*>', re.IGNORECASE)
    return iframe_tag.search(html) is not None
#same type of searching as the form but looking for <iframe> and </iframe>

#8) Check for IPs in URLS (bool)
def ips_in_urls(urls):
    """Return True if any URL contains text matching ``ip_regex``.

    Args:
        urls: Iterable of URL strings.

    Returns:
        bool: True as soon as one URL matches the pattern; False otherwise,
        including when ``urls`` is empty.
    """
    return any(re.search(ip_regex, candidate) is not None for candidate in urls)
#Loops over the urls list; returns True as soon as one URL matches the pattern defined by ip_regex, otherwise False

#9) Check for Javascript Block (value)
def count_javascript_blocks(html):
    """Count the ``<script>`` tags in ``html``.

    Args:
        html: HTML markup as a string.

    Returns:
        int: Number of ``<script>`` tags found by the parser.
    """
    script_tags = BeautifulSoup(html, "html.parser").find_all("script")
    return len(script_tags)
#Uses find_all to locate every <script> tag and returns how many there are

#10) Check for URLs in the email
def extract_urls_from_html(html):
    """Return every substring of ``html`` that matches ``url_regex``.

    Args:
        html: Text to scan; it is treated as a plain string, not parsed as HTML.

    Returns:
        list[str]: The non-overlapping matches in the order they occur.
    """
    url_matcher = re.compile(url_regex)
    return url_matcher.findall(html)
    
#re.findall searches the entire HTML string and returns every match of url_regex, the pattern used for matching URLs
#re is a tool for defining complex string-matching patterns (it works on plain text), whereas BeautifulSoup is a DOM-like parser that treats HTML as a structured document



In [48]:
#Define the function that checks each condition for the contents of a .txt file

def process_file(html):
    """Compute the feature row for one document.

    Args:
        html: Document content as a string.

    Returns:
        pandas.DataFrame: A single-row frame (index 0) with one column per
        feature, in this order: '@ in URLs', 'Attachments', 'CSS',
        'External Resources', 'HTML Content', 'Html Form', 'Html iFrame',
        'IPs in URLs', 'JavaScript', 'URLs'.
    """
    # Extract URLs once; the URL-based features below all reuse this list.
    urls = extract_urls_from_html(html)

    feature_dict = {
        '@ in URLs': has_at_in_urls(urls),
        'Attachments': number_attachments(html),
        'CSS': count_css_links(html),
        'External Resources': count_external_resources(html),
        'HTML Content': html_content_str(html),
        'Html Form': html_form(html),
        'Html iFrame': iframe(html),
        'IPs in URLs': ips_in_urls(urls),
        'JavaScript': count_javascript_blocks(html),
        'URLs': len(urls)
    }

    # Build the one-row frame directly from the record: columns follow the
    # dict's key order and each column takes the dtype of its value (bool or
    # int) rather than object.
    return pd.DataFrame([feature_dict])