In [20]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import argparse
import tldextract  # Need to install: pip install tldextract


def get_outlinks(url):
    """
    Extract all outlinks from a given URL.
    
    Args:
        url (str): The URL to extract links from.
        
    Returns:
        list: A list of all unique outlinks found on the page.
    """
    # Initialize an empty list to store the outlinks
    outlinks = []
    
    try:
        # Send HTTP request to the specified URL
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        
        # Check if the request was successful
        response.raise_for_status()
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Get the base domain for identifying external links
        base_domain = urlparse(url).netloc
        
        # Find all anchor tags and extract the href attribute
        for link in soup.find_all('a'):
            href = link.get('href')
            
            # Skip if the href is None or empty
            if not href:
                continue
                
            # Convert relative URLs to absolute URLs
            absolute_url = urljoin(url, href)
            
            # Parse the URL to extract components
            parsed_url = urlparse(absolute_url)
            
            # Skip fragment identifiers that point to the same page
            if not parsed_url.netloc and not parsed_url.path and parsed_url.fragment:
                continue
                
            # Add to the list if it's not already there
            if absolute_url not in outlinks:
                outlinks.append(absolute_url)
        
        return outlinks
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []


url = "https://www.usa.gov/agency-index/b#B"

def get_unique_links(url):
    # Get the outlinks
    links = get_outlinks(url)

    # Extract the true top-level domains using tldextract
    extracted_domains = []
    for link in links:
        extract_result = tldextract.extract(link)
        # Combine domain and suffix for the true TLD (e.g., "example.gov")
        tld = f"{extract_result.domain}.{extract_result.suffix}"
        extracted_domains.append(tld)

    # Ensure each link ends with a valid government tld
    valid_tlds = ['.gov', '.mil', '.edu']
    filtered_domains = [domain for domain in extracted_domains if domain.endswith(tuple(valid_tlds))]

    # Convert to set to remove duplicates
    unique_domains = list(set(filtered_domains))

    # Print the number of unique links
    print(f"Found {len(unique_domains)} unique domains on {url}")
    return unique_domains

all_links = []
for i in ['','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']:
    all_links.extend(get_unique_links(f"https://www.usa.gov/agency-index/{i}#{i.upper() if i else ''}"))
all_links

Found 33 unique domains on https://www.usa.gov/agency-index/#
Found 25 unique domains on https://www.usa.gov/agency-index/b#B
Found 52 unique domains on https://www.usa.gov/agency-index/c#C
Found 39 unique domains on https://www.usa.gov/agency-index/d#D
Found 19 unique domains on https://www.usa.gov/agency-index/e#E
Found 45 unique domains on https://www.usa.gov/agency-index/f#F
Found 8 unique domains on https://www.usa.gov/agency-index/g#G
Found 11 unique domains on https://www.usa.gov/agency-index/h#H
Found 22 unique domains on https://www.usa.gov/agency-index/i#I
Found 16 unique domains on https://www.usa.gov/agency-index/j#J
Found 2 unique domains on https://www.usa.gov/agency-index/k#K
Found 7 unique domains on https://www.usa.gov/agency-index/l#L
Found 22 unique domains on https://www.usa.gov/agency-index/m#M
Found 65 unique domains on https://www.usa.gov/agency-index/n#N
Found 34 unique domains on https://www.usa.gov/agency-index/o#O
Found 19 unique domains on https://www.usa.go

['archives.gov',
 'usagm.gov',
 'cdc.gov',
 'ahrq.gov',
 'army.mil',
 'atf.gov',
 'abilityone.gov',
 'arc.gov',
 'usa.gov',
 'visitthecapitol.gov',
 'gsa.gov',
 'abmc.gov',
 'uscourts.gov',
 'afrh.gov',
 'usaid.gov',
 'state.gov',
 'militaryonesource.mil',
 'acus.gov',
 'usda.gov',
 'achp.gov',
 'aoc.gov',
 'arctic.gov',
 'hhs.gov',
 'usadf.gov',
 'af.mil',
 'clinicaltrials.gov',
 'ttb.gov',
 'access-board.gov',
 'justice.gov',
 'nih.gov',
 'africom.mil',
 'americorps.gov',
 'acl.gov',
 'blm.gov',
 'dol.gov',
 'atf.gov',
 'bsee.gov',
 'consumerfinance.gov',
 'usa.gov',
 'gsa.gov',
 'uscourts.gov',
 'bop.gov',
 'bea.gov',
 'moneyfactory.gov',
 'treasury.gov',
 'bls.gov',
 'state.gov',
 'bjs.gov',
 'doc.gov',
 'bpa.gov',
 'usbg.gov',
 'bia.gov',
 'census.gov',
 'bep.gov',
 'usbr.gov',
 'ttb.gov',
 'bts.gov',
 'boem.gov',
 'cftc.gov',
 'ocwr.gov',
 'cdc.gov',
 'cpsc.gov',
 'csce.gov',
 'csb.gov',
 'cfa.gov',
 'usccr.gov',
 'army.mil',
 'house.gov',
 'ojp.gov',
 'consumerfinance.gov',
 'us

In [30]:
import requests

def get_subdomains(domain):
    url = "https://subdomain-scan1.p.rapidapi.com/"

    querystring = {"domain":domain}

    headers = {
        "x-rapidapi-key": "b7cbfd0d70msh57e831a843630a8p17fb54jsn3e722d00668d",
        "x-rapidapi-host": "subdomain-scan1.p.rapidapi.com"
    }

    response = requests.get(url, headers=headers, params=querystring)

    return response.json()
all_subdomains = []
for link in all_links:
    all_subdomains.extend(get_subdomains(link))

all_subdomains = list(set(all_subdomains))




