# Download Latest Immigration Enforcement Data
This notebook scrapes the DHS OHSS monthly immigration enforcement data page, finds the latest dataset, and reads it into a pandas DataFrame.

In [1]:
!python -m pip install selenium beautifulsoup4 pandas requests openpyxl



In [2]:
import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# These libraries are for:

# selenium: controls the web browser to open the page and scroll
# BeautifulSoup: parses the HTML content
# pandas: handles reading Excel or CSV files
# requests: downloads files from the internet
# time: adds delays (waiting for page to load)
# re: regular expressions to match dates or file types

In [3]:
# Setup headless Chrome
# Headless mode allows automation without opening the Chrome UI.
# --no-sandbox and --disable-dev-shm-usage are stability options often used in cloud environments or VMs.
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Page that lists the monthly immigration enforcement tables.
url = "https://ohss.dhs.gov/topics/immigration/immigration-enforcement/monthly-tables"

# Open the target page with the Chrome browser driver.
driver = webdriver.Chrome(options=options)
try:
    driver.get(url)

    # Wait and scroll.
    # Many websites only load their full content once the user scrolls, so we
    # simulate that with JavaScript (window.scrollTo) and pause before and
    # after to give the page time to render.
    time.sleep(5)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)

    # driver.page_source is the current HTML of the page.
    page_html = driver.page_source
finally:
    # Always close the browser, even when loading or scrolling raised, so a
    # failed run does not leave a Chrome process behind.
    driver.quit()

# soup holds a searchable tree of all HTML elements on the page.
soup = BeautifulSoup(page_html, 'html.parser')

In [4]:
# Extract file links
# Start from every <a> element (hyperlink) on the page that carries an href.
download_section = soup.find_all('a', href=True)

# Keep only the anchors whose href ends in .xlsx, .xls or .csv (matched
# case-insensitively), storing each one as a (URL, link text) tuple.
file_links = []
for anchor in download_section:
    href = anchor['href']
    if re.search(r'\.(xlsx?|csv|xls)$', href, re.IGNORECASE) is None:
        continue
    file_links.append((href, anchor.text.strip()))

def extract_date_key(text):
    """Return a sortable ``YYYY-MM`` key for the first year-month found in ``text``.

    The first occurrence of four digits, a ``-`` or ``_`` separator, and two
    digits is used (e.g. ``2025-01`` or ``2025_01``).

    Args:
        text: A URL or filename to search.

    Returns:
        The match normalised to ``"YYYY-MM"``, or ``""`` when ``text``
        contains no such pattern.

    The separator is normalised because ``"_"`` sorts after ``"-"``, so a raw
    ``"2025_01"`` would otherwise compare greater than ``"2025-12"``. The empty
    string is returned for undated text so that, in a descending sort, undated
    links land after every dated one instead of being compared as raw URLs
    (where e.g. ``"https://..."`` sorts above any ``"20xx-xx"`` key).
    """
    match = re.search(r'(\d{4})[-_](\d{2})', text)
    if match is None:
        return ''
    year, month = match.groups()
    return f"{year}-{month}"

# Sort all links by their extracted date key, newest first. list.sort is
# stable, so links that share a key keep the order they had on the page.
file_links.sort(key=lambda x: extract_date_key(x[0]), reverse=True)

# Fail with an explicit message when the page yielded no matching links,
# instead of an unexplained "list index out of range" on the next line.
if not file_links:
    raise IndexError("No .xlsx, .xls or .csv download links were found on the page")

# Pick the most recent file (first after sorting).
# urljoin resolves the href against the page URL: root-relative ("/sites/..."),
# page-relative and protocol-relative ("//host/...") hrefs all become absolute,
# while an href that is already absolute is returned unchanged.
latest_url_suffix, label = file_links[0]
latest_url = urljoin(url, latest_url_suffix)
print("Latest file URL:", latest_url)

Latest file URL: https://ohss.dhs.gov/sites/default/files/2025-01/2025_0116_ohss_immigration-enforcement-and-legal-processes-tables-november-2024.xlsx


In [5]:
# Download and read the file
# Take the file extension from the URL (e.g. xlsx, csv). It is lower-cased
# because the link filter matches extensions case-insensitively, while the
# reader selection further down compares against lower-case names.
# The extension is also used to build the local filename.
file_ext = latest_url.split('.')[-1].lower()
local_filename = f"latest_file.{file_ext}"

# Download the file from the URL.
# The timeout keeps a stalled connection from hanging the notebook forever.
r = requests.get(latest_url, timeout=60)

# Raise on an HTTP error status (e.g. 404) so an error page is never saved
# to disk and mistaken for the dataset.
r.raise_for_status()

# Write the downloaded content to a local file.
with open(local_filename, 'wb') as f:
    f.write(r.content)

# Pick the pandas reader from the file extension:
#   xlsx / xls -> pandas.read_excel()  (with default arguments only the first
#                 sheet is read; .xlsx needs openpyxl, legacy .xls needs the
#                 xlrd package, which the install cell does not list)
#   csv        -> pandas.read_csv()
#   otherwise  -> ValueError
# The comparison is case-insensitive so that a link ending in e.g. ".XLSX",
# which the case-insensitive link filter accepts, is not rejected here.
ext = file_ext.lower()
if ext in ('xlsx', 'xls'):
    df = pd.read_excel(local_filename)
elif ext == 'csv':
    df = pd.read_csv(local_filename)
else:
    raise ValueError("Unsupported file format")