# Download Latest Immigration Enforcement Data



In [1]:
!python -m pip install selenium beautifulsoup4 pandas requests openpyxl

Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m18.

In [7]:
###----Necessary Libraries----###

import os
import re
import time
from urllib.parse import urljoin, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [8]:
def configure_browser_options():
    """Build the Chrome options used for headless browsing.

    Returns:
        A Chrome ``Options`` object with the ``--headless``, ``--no-sandbox``
        and ``--disable-dev-shm-usage`` arguments added, in that order.
    """
    opts = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        opts.add_argument(flag)
    return opts

def get_page_content(url):
    """Open ``url`` in a Chrome driver and return the resulting page source.

    The driver is created with the options from
    ``configure_browser_options()``. After navigating, the function waits,
    scrolls to the bottom of the document, waits again, and then reads
    ``driver.page_source``. The driver is always quit, even on failure.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as reported by the driver.
    """
    driver = webdriver.Chrome(options=configure_browser_options())
    try:
        driver.get(url)
        time.sleep(5)  # fixed pause so the page has time to load
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # fixed pause after scrolling to the bottom
        return driver.page_source
    finally:
        # Runs after the return value has been read, so the source is
        # captured before the browser goes away.
        driver.quit()

def extract_download_links(html_content):
    """Collect anchors whose href ends in a spreadsheet or CSV extension.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A list of ``(href, text)`` tuples, in document order, for every
        ``<a href=...>`` whose href ends with ``.xls``, ``.xlsx`` or ``.csv``
        (case-insensitive). ``text`` is the anchor's text with surrounding
        whitespace stripped.
    """
    file_pattern = re.compile(r'\.(xlsx?|csv|xls)$', re.IGNORECASE)
    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a', href=True)
    return [
        (anchor['href'], anchor.text.strip())
        for anchor in anchors
        if file_pattern.search(anchor['href'])
    ]

def get_latest_file_info(links):
    """Pick the most recent download link and return its full URL and label.

    Each href is searched for its first ``YYYY-MM`` or ``YYYY_MM`` token and
    links are ranked by the numeric ``(year, month)`` pair, so the separator
    character does not influence the ordering. Links without such a token
    always rank below dated links; among undated links the lexicographically
    greatest href wins. When several dated links share the same year/month,
    the one appearing first in ``links`` wins.

    The input sequence is not modified.

    Args:
        links: Sequence of ``(href, label)`` tuples, e.g. the output of
            ``extract_download_links``.

    Returns:
        ``(full_url, label)`` for the selected link. Relative hrefs are
        resolved against ``https://ohss.dhs.gov``; absolute URLs are returned
        as they are.

    Raises:
        IndexError: If ``links`` is empty.
    """
    if not links:
        raise IndexError("links is empty; there is no file to select")

    def extract_date_key(text_or_url):
        match = re.search(r'(\d{4})[-_](\d{2})', text_or_url)
        if match is None:
            # Undated: rank below every dated link, tie-break on the text.
            return (0, 0, 0, text_or_url)
        # Compare as integers so '2025_01' vs '2025-02' is decided by the
        # month, not by the ordering of '_' and '-'.
        return (1, int(match.group(1)), int(match.group(2)), "")

    # max() returns the first maximal element, matching what a stable
    # descending sort would put at index 0, without mutating the caller's list.
    latest_suffix, latest_label = max(
        links, key=lambda link: extract_date_key(link[0])
    )

    # urljoin handles root-relative ('/x'), bare relative ('x') and
    # protocol-relative ('//host/x') hrefs, and leaves absolute URLs alone.
    base_domain = "https://ohss.dhs.gov"
    full_url = urljoin(base_domain, latest_suffix)

    return full_url, latest_label




In [9]:
def download_file(url, destination_folder=".", timeout=30):
    """Download ``url`` to ``<destination_folder>/latest_data.<ext>``.

    The extension is taken from the path component of the URL (query string
    and fragment are ignored) and lower-cased, so ``.../data.XLSX?v=2`` is
    saved as ``latest_data.xlsx``. If the URL path has no extension the file
    is saved as ``latest_data`` and the returned extension is ``""``.

    Args:
        url: Absolute URL of the file to fetch.
        destination_folder: Directory to write into; created if missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        ``(local_filename, file_ext)`` — the path written and the lower-cased
        extension without the leading dot.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a non-success HTTP status.
    """
    url_path = urlsplit(url).path
    file_ext = os.path.splitext(url_path)[1].lstrip('.').lower()
    filename = f"latest_data.{file_ext}" if file_ext else "latest_data"

    if destination_folder:
        os.makedirs(destination_folder, exist_ok=True)
    local_filename = os.path.join(destination_folder, filename)

    print(f"Attempting to download: {url}")
    # The context manager releases the streamed connection even on error.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Raise an exception for bad status codes

        with open(local_filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f"File downloaded to: {local_filename}")
    return local_filename, file_ext

In [10]:
def load_data_to_dataframe(filepath, file_type):
    """Load a spreadsheet or CSV file into a pandas DataFrame.

    ``file_type`` is matched case-insensitively and a leading dot is ignored,
    so ``"xlsx"``, ``"XLSX"`` and ``".xlsx"`` are all accepted. This keeps the
    loader consistent with the case-insensitive link filter in
    ``extract_download_links``.

    Args:
        filepath: Path of the file to read.
        file_type: File extension: ``xlsx``/``xls`` (read with
            ``pd.read_excel``) or ``csv`` (read with ``pd.read_csv``).

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If ``file_type`` is not one of the supported extensions.
    """
    normalized_type = str(file_type).strip().lstrip('.').lower()

    if normalized_type in ('xlsx', 'xls'):
        return pd.read_excel(filepath)
    if normalized_type == 'csv':
        return pd.read_csv(filepath)
    raise ValueError("Unsupported file format provided.")

def main():
    """Run the pipeline: scrape the page, pick the latest file, load, preview.

    Steps, each announced with a progress message on stdout:
      1. Fetch the monthly-tables page with ``get_page_content``.
      2. Collect spreadsheet/CSV links with ``extract_download_links``;
         stop early if there are none.
      3. Choose the most recent link with ``get_latest_file_info``.
      4. Download it with ``download_file`` and load it with
         ``load_data_to_dataframe``, then print the first rows.

    Errors raised during step 4 are reported on stdout rather than
    propagated. Returns ``None``.
    """
    target_url = "https://ohss.dhs.gov/topics/immigration/immigration-enforcement/monthly-tables"

    print("Fetching page content...")
    html = get_page_content(target_url)

    print("Extracting download links...")
    candidates = extract_download_links(html)
    if not candidates:
        print("No downloadable files found.")
        return

    print("Identifying the most recent file...")
    latest_file_url, file_label = get_latest_file_info(candidates)
    print(f"Most recent file identified: {file_label} from {latest_file_url}")

    try:
        saved_path, saved_ext = download_file(latest_file_url)

        print("Loading data into DataFrame...")
        preview_source = load_data_to_dataframe(saved_path, saved_ext)

        print("\nSuccessfully loaded data. Here's a preview:")
        print(preview_source.head())
    except Exception as e:
        # Dispatch in the same priority order as separate except clauses:
        # network errors first, then value errors, then everything else.
        if isinstance(e, requests.exceptions.RequestException):
            print(f"Error during file download: {e}")
        elif isinstance(e, ValueError):
            print(f"Data loading error: {e}")
        else:
            print(f"An unexpected error occurred: {e}")

# Entry point: run the full fetch/download/preview pipeline only when this
# code is executed directly (``__name__`` is "__main__"), not when imported.
if __name__ == "__main__":
    main()

Fetching page content...
Extracting download links...
Identifying the most recent file...
Most recent file identified: Immigration Enforcement and Legal Processes Monthly Tables - November 2024 from https://ohss.dhs.gov/sites/default/files/2025-01/2025_0116_ohss_immigration-enforcement-and-legal-processes-tables-november-2024.xlsx
Attempting to download: https://ohss.dhs.gov/sites/default/files/2025-01/2025_0116_ohss_immigration-enforcement-and-legal-processes-tables-november-2024.xlsx
File downloaded to: ./latest_data.xlsx
Loading data into DataFrame...

Successfully loaded data. Here's a preview:
                   Table of Contents  \
0                                NaN   
1  Click link for corresponding tab:   
2                                NaN   
3                           Category   
4                         Encounters   

                                          Unnamed: 1     Unnamed: 2  
0                                                NaN            NaN  
1            