In [1]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup

# Import Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#### Test Scraper

In [None]:
# Launch a Chrome session through Splinter and open the NUFORC index
# that lists every report
browser = Browser('chrome')
url = 'https://nuforc.org/subndx/?id=all'
browser.visit(url)

# Check (for up to 3 seconds) whether the report table is present on the page
table_loaded = browser.is_element_present_by_css('table#table_1', wait_time=3)

if not table_loaded:
    # Nothing to parse: report the problem and release the browser
    print("There was an issue loading the table")
    browser.quit()
else:
    # Snapshot the rendered page and hand it to BeautifulSoup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Take the first table-body row as a sample report to inspect
    first_row = soup.select_one('tbody tr')
    print(first_row)

In [None]:
# Select individual elements from the sample row captured by the previous cell.
# The lookups run inside try/finally so the Chrome session is always released,
# even when a selector does not match and the extraction raises.
try:
    link = first_row.find('a')['href']
    occured = first_row.find('td', class_='column-occurred').text
    city = first_row.find('td', class_='column-city').text
    state = first_row.find('td', class_='column-state').text
    country = first_row.find('td', class_='column-country').text
    shape = first_row.find('td', class_='column-shape').text
    summary = first_row.find('td', class_='column-summary').text
    # True only when the has-image cell reads exactly "Y"
    media = first_row.find('td', class_='column-hasimage').text == "Y"
    explaination = first_row.find('td', class_='column-explanation').text

    print(link)
    print(occured)
    print(city)
    print(state)
    print(country)
    print(shape)
    print(summary)
    print(media)
    print(explaination)
finally:
    # Quit your browsing session
    browser.quit()

#### Scrape first 100 rows

In [None]:
# Address of the NUFORC index that lists every report
url = 'https://nuforc.org/subndx/?id=all'

# Start a new Chrome session with Splinter and load the index
browser = Browser('chrome')
browser.visit(url)

# Fixed pause to give the page time to load
time.sleep(3)

# Then check (for up to 3 more seconds) that the report table is present;
# the next cell branches on this flag
table_loaded = browser.is_element_present_by_css('table#table_1', wait_time=3)

In [13]:
# Parse the table rendered in the open browser session, export the rows to CSV,
# and always close the browser afterwards (even if parsing or the export fails).
try:
    if table_loaded:
        # Initialize an empty list to store all rows
        all_data = []

        # Scrape the website and parse the page source with BeautifulSoup
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Find the table rows
        rows = soup.select('tbody tr')

        base_url = 'https://nuforc.org'
        for row in rows:
            anchor = row.find('a')

            # Skip rows without a report link (for example a placeholder row);
            # indexing ['href'] on a missing anchor would raise TypeError
            if anchor is None:
                continue

            # Extract individual elements from each row
            link = base_url + anchor['href']
            occured = row.find('td', class_='column-occurred').text
            city = row.find('td', class_='column-city').text
            state = row.find('td', class_='column-state').text
            country = row.find('td', class_='column-country').text
            shape = row.find('td', class_='column-shape').text
            summary = row.find('td', class_='column-summary').text
            # True only when the has-image cell reads exactly "Y"
            media = row.find('td', class_='column-hasimage').text == "Y"
            explanation = row.find('td', class_='column-explanation').text

            # Append the data as a dictionary
            all_data.append({
                "Report Link": link,
                "Sighting DateTime": occured,
                "City": city,
                "State Province": state,
                "Country": country,
                "UFO Shape": shape,
                "Report Summary": summary,
                "Media": media,
                "Explanation": explanation
            })

        # Convert the list of dictionaries into a pandas DataFrame
        ufo_df = pd.DataFrame(all_data)

        # Export the DataFrame to a CSV file, creating the output folder
        # first so the export also works on a fresh checkout
        output_path = Path('data/small_dataset.csv')
        output_path.parent.mkdir(parents=True, exist_ok=True)
        ufo_df.to_csv(output_path, index=False)
    else:
        print("There was an issue loading the table")
finally:
    # Quit your browsing session
    browser.quit()

In [14]:
# Preview the first few rows of the scraped DataFrame as a sanity check
ufo_df.head()

Unnamed: 0,Report Link,Sighting DateTime,City,State Province,Country,UFO Shape,Report Summary,Media,Explanation
0,https://nuforc.org/sighting/?id=188754,04/07/2025 07:25,Coral Springs,FL,USA,Changing,"Flying, rotating black boomerang shaped object...",False,
1,https://nuforc.org/sighting/?id=188751,04/06/2025 21:15,Lone Jack,MO,USA,Light,Saw a light in the sky that appeared and disap...,False,
2,https://nuforc.org/sighting/?id=188753,04/06/2025 21:12,Delray Beach,FL,USA,Light,Diffuse anomalys light,True,
3,https://nuforc.org/sighting/?id=188758,04/06/2025 20:30,Hyde,England,United Kingdom,Changing,"Very bright light, would fade then grow bright...",False,
4,https://nuforc.org/sighting/?id=188756,04/06/2025 20:26,,,United Kingdom,Orb,Appeared 1 by 1 in a line,True,Chinese Lantern?


#### Web Scraping all 2025 sightings (multiple pages)

In [15]:
# Accumulator for the report rows gathered across the result pages
all_data = []

# Site root (prefix for report links) and the index page to scrape
base_url = 'https://nuforc.org'
url = "https://nuforc.org/subndx/?id=all"

# Start Chrome under Selenium (webdriver.Firefox() would do the same for Firefox)
driver = webdriver.Chrome()
driver.get(url)

# Shared explicit wait with a 10 second timeout; block until the report
# table is present in the DOM
wait = WebDriverWait(driver, 10)
table_locator = (By.CSS_SELECTOR, 'table#table_1')
wait.until(EC.presence_of_element_located(table_locator))

<selenium.webdriver.remote.webelement.WebElement (session="ce8b6e070fdc87175708cb1838e4972d", element="f.A70220328ECFFCD36CF41823C653965A.d.1863066CAD369BC693EFEEAC42464E53.e.25")>

In [16]:
# Walk through the paginated report table, appending one dictionary per report
# to `all_data`. The driver is closed in `finally`, so the browser is released
# exactly once whether the loop finishes, stops early, or raises.

# Upper bound on how many result pages to visit
max_pages = 15

try:
    for page_num in range(max_pages):
        # Wait until the report cells have been rendered on the current page
        try:
            wait.until(lambda driver: driver.execute_script(
                "return document.querySelectorAll('tbody td.column-occurred').length > 0;"
            ))
            element_count = driver.execute_script("return document.querySelectorAll('tbody td.column-occurred').length;")
            print(f"Number of rows found: {element_count}")
        except Exception as e:
            print(f"Error: {e}")
            # Stop paging but keep the rows gathered so far. Breaking (rather
            # than calling exit()) lets the cleanup below run and leaves the
            # notebook kernel alone.
            break

        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.select('tbody tr')

        for row in rows:
            columns = row.find_all('td')
            anchor = row.find('a')

            # Skip rows that lack the report link or the ten cells indexed
            # below (for example a placeholder row); indexing them would raise
            if anchor is None or len(columns) < 10:
                continue

            # Extract individual elements from each row
            link = base_url + anchor['href']
            occured = columns[1].text
            city = columns[2].text
            state = columns[3].text
            country = columns[4].text
            shape = columns[5].text
            summary = columns[6].text
            reported = columns[7].text
            # True only when the media cell reads exactly "Y"
            media = columns[8].text == "Y"
            explanation = columns[9].text

            # Append the data as a dictionary
            all_data.append({
                "Report Link": link,
                "Sighting DateTime": occured,
                "City": city,
                "State Province": state,
                "Country": country,
                "UFO Shape": shape,
                "Report Summary": summary,
                "Reported DateTime": reported,
                "Media": media,
                "Explanation": explanation
            })

        print(f"Finised scraping page {page_num + 1}.")

        # The last requested page has been scraped; no need to page forward
        if page_num == max_pages - 1:
            break

        # Try to click the "Next" button
        try:
            # element_to_be_clickable only returns once the button is both
            # displayed and enabled, so no separate visibility check is needed
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.paginate_button.next')))

            # NOTE(review): assumes a DataTables-style pager that marks the
            # final page by adding a "disabled" class to the Next button while
            # it still counts as clickable; clicking it would re-scrape the
            # same page. Confirm against the live site.
            button_classes = (next_button.get_attribute('class') or '').split()
            if 'disabled' in button_classes:
                print("Next button is not clickable.")
                break

            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            next_button.click()
            time.sleep(3)  # Wait for the next page to load
        except Exception as e:
            print(f"Error clicking the 'Next' button: {e}")
            break
finally:
    # Close the browser
    driver.quit()

print("Data scraping complete.")

Number of rows found: 100
Finised scraping page 1.
Number of rows found: 100
Finised scraping page 2.
Number of rows found: 100
Finised scraping page 3.
Number of rows found: 100
Finised scraping page 4.
Number of rows found: 100
Finised scraping page 5.
Number of rows found: 100
Finised scraping page 6.
Number of rows found: 100
Finised scraping page 7.
Number of rows found: 100
Finised scraping page 8.
Number of rows found: 100
Finised scraping page 9.
Number of rows found: 100
Finised scraping page 10.
Number of rows found: 100
Finised scraping page 11.
Number of rows found: 100
Finised scraping page 12.
Number of rows found: 100
Finised scraping page 13.
Number of rows found: 100
Finised scraping page 14.
Number of rows found: 100
Finised scraping page 15.
Data scraping complete.


In [17]:
# Convert the list of dictionaries into a pandas DataFrame
ufo_2025_df = pd.DataFrame(all_data)

# Convert the 'Sighting DateTime' column to datetime to allow for correct sorting.
# Values that do not match the format become NaT and are dropped by the year filter.
ufo_2025_df['Sighting DateTime'] = pd.to_datetime(ufo_2025_df['Sighting DateTime'], format='%m/%d/%Y %H:%M', errors='coerce')

# Keep only sightings from the year 2025, sorted with the most recent first
is_2025 = ufo_2025_df['Sighting DateTime'].dt.year == 2025
ufo_2025_df = ufo_2025_df[is_2025].sort_values(by='Sighting DateTime', ascending=False)

# Sanity check replacing the former mid-cell `tail(10)` / `count()` expressions,
# which produced no output because only a cell's last expression is displayed:
# the earliest sightings must now sit at the bottom of the frame
assert ufo_2025_df['Sighting DateTime'].is_monotonic_decreasing, "sightings are not sorted newest-first"

# Export the DataFrame to a CSV file, creating the output folder first so the
# export also works on a fresh checkout
output_path = Path('data/sightings_2025.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
ufo_2025_df.to_csv(output_path, index=False)

print("Data exported to 'data/sightings_2025.csv'.")

Data exported to 'data/sightings_2025.csv'.


#### Scraping Reports by Month page

In [18]:
# Address of the NUFORC index page scraped in this section
url = 'https://nuforc.org/ndx/?id=event'

# Start a new Chrome session with Splinter and load the page
browser = Browser('chrome')
browser.visit(url)

# Give the table up to 2 seconds to appear; the result of the check is not used
browser.is_element_present_by_css('#primary table', wait_time=2)

# Snapshot the rendered page and build a BeautifulSoup object from it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [19]:
# Parse the reports-by-month table from the already-built `soup`, export it to
# CSV, and always close the browser afterwards (even if parsing fails).
try:
    # Initialize an empty list to store all rows
    page_data = []

    # Find the table
    table = soup.select_one('#primary table')
    if table is None:
        # Fail with a clear message instead of an AttributeError on None
        raise RuntimeError("Could not find the '#primary table' element on the page")

    # Find the table rows
    rows = table.select('tbody tr')

    # Skip the first row containing the column names
    rows = rows[1:]

    for row in rows:
        # Extract individual elements from each row
        elements = row.find_all('td')

        # Skip rows that do not carry both a date cell and a count cell
        if len(elements) < 2:
            continue

        date = elements[0].text
        count = elements[1].text

        # Append the data as a dictionary
        page_data.append({
            "Date": date,
            "Count": count,
        })

    # Convert the list of dictionaries into a pandas DataFrame
    reports_raw_df = pd.DataFrame(page_data)

    # Split the 'Date' text on '/' into its year and month parts
    date_parts = reports_raw_df['Date'].str.split('/', expand=True)

    # Build the final frame without the original 'Date' column. Year and month
    # both use the nullable Int64 dtype so a value that fails to parse becomes
    # <NA> instead of making the integer cast raise.
    reports_by_month_df = pd.DataFrame({
        'Sighting Year': pd.to_numeric(date_parts[0], errors='coerce').astype('Int64'),
        'Sighting Month': pd.to_numeric(date_parts[1], errors='coerce').astype('Int64'),
        'Count': reports_raw_df['Count'].astype(int),
    })

    # Export the DataFrame to a CSV file, creating the output folder first so
    # the export also works on a fresh checkout
    output_path = Path('data/reports_by_month.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    reports_by_month_df.to_csv(output_path, index=False)
finally:
    # Close the browser
    browser.quit()

print("Data scraping complete.")

Data scraping complete.
