In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

BASE_URL = "https://dps.arkansas.gov/law-enforcement/arkansas-state-police/directors-office/asp-commission/asp-commission-meeting-minutes/"

response = requests.get(BASE_URL)
print("Status code:", response.status_code)

# Just show a small snippet so we know we got HTML back
print(response.text[:500])


Status code: 403
<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr><center>nginx</center>
</body>
</html>



In [2]:
import requests

BASE_URL = "https://dps.arkansas.gov/law-enforcement/arkansas-state-police/directors-office/asp-commission/asp-commission-meeting-minutes/"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/129.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

response = requests.get(BASE_URL, headers=headers)
print("Status code:", response.status_code)
print(response.text[:500])


Status code: 200
<!DOCTYPE html>
<html lang="en-US">
<head><meta charset="UTF-8"><script>if(navigator.userAgent.match(/MSIE|Internet Explorer/i)||navigator.userAgent.match(/Trident\/7\..*?rv:11/i)){var href=document.location.href;if(!href.match(/[?&]nowprocket/)){if(href.indexOf("?")==-1){if(href.indexOf("#")==-1){document.location.href=href+"?nowprocket=1"}else{document.location.href=href.replace("#","?nowprocket=1#")}}else{if(href.indexOf("#")==-1){document.location.href=href+"&nowprocket=1"}else{document.loca


In [3]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, "html.parser")

# Find all PDF links that look like meeting minutes
links = soup.find_all("a", href=True)

print("Total links found:", len(links))

# Show the first 10 link texts + URLs so we can inspect them
for link in links[:10]:
    print("-", link.get_text(strip=True), "->", link["href"])


Total links found: 662
- Skip to content -> #content
- The Official Website of the State of Arkansas -> https://portal.arkansas.gov/
- State Directory -> https://portal.arkansas.gov/state-directory/
- All State Agencies -> https://portal.arkansas.gov/agencies/
- Elected Officials -> https://portal.arkansas.gov/elected-officials/
- Arkansas Code -> https://advance.lexis.com/container?config=00JAA3ZTU0NTIzYy0zZDEyLTRhYmQtYmRmMS1iMWIxNDgxYWMxZTQKAFBvZENhdGFsb2cubRW4ifTiwi5vLw6cI1uX&crid=95c15fdc-50b6-43db-a19a-aa07301703e9
- State Employees -> https://portal.arkansas.gov/state-employees/
- Help Center -> https://portal.arkansas.gov/help-center/
- Accessibility & Settings -> https://portal.arkansas.gov/acceptable-use/#accessibility
- State Directory -> https://portal.arkansas.gov/state-directory/


In [4]:
from urllib.parse import urljoin

# Filter links that look like meeting minutes PDFs
minutes_links = []
for a in links:
    href = a["href"]
    if href.lower().endswith(".pdf") and "meeting-minutes" in href.lower():
        minutes_links.append(a)

print("Meeting-minutes PDF links found:", len(minutes_links))

# Preview the first 20
for a in minutes_links[:20]:
    text = a.get_text(strip=True)
    full_url = urljoin(BASE_URL, a["href"])
    print("-", text, "->", full_url)


Meeting-minutes PDF links found: 31
- September 11, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-September-11-2025.pdf
- August 14, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-August-14-2025.pdf
- July 10, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-July-10-2025.pdf
- June 12, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-June-12-2025.pdf
- May 8, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-May-8-2025.pdf
- April 10, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-April-10-2025.pdf
- March 13, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-March-13-2025.pdf
- February 14, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-February-14-2025.pdf
- January 16, 2025 -> https://dps.arkansas.gov/wp-cont

In [5]:
from urllib.parse import urljoin

months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

minutes_links = []

for a in links:
    href = a["href"]
    text = a.get_text(strip=True)

    # Check: PDF + text contains a month name (looks like a meeting date)
    if href.lower().endswith(".pdf") and any(month in text for month in months):
        minutes_links.append(a)

print("Meeting-minutes-style PDF links found:", len(minutes_links))

# Preview a few from the start
print("\n--- First 10 ---")
for a in minutes_links[:10]:
    print("-", a.get_text(strip=True), "->", urljoin(BASE_URL, a["href"]))

# And a few from the end (should include 2013)
print("\n--- Last 10 ---")
for a in minutes_links[-10:]:
    print("-", a.get_text(strip=True), "->", urljoin(BASE_URL, a["href"]))


Meeting-minutes-style PDF links found: 170

--- First 10 ---
- September 11, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-September-11-2025.pdf
- August 14, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-August-14-2025.pdf
- July 10, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-July-10-2025.pdf
- June 12, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-June-12-2025.pdf
- May 8, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-May-8-2025.pdf
- April 10, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-April-10-2025.pdf
- March 13, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-March-13-2025.pdf
- February 14, 2025 -> https://dps.arkansas.gov/wp-content/uploads/ASP-Commission-Meeting-Minutes-February-14-2025.pdf
- January 16, 2025 -> https:/

In [6]:
import pandas as pd
from urllib.parse import urljoin

rows = []

for a in minutes_links:
    meeting_date = a.get_text(strip=True)
    href = a["href"]
    pdf_url = urljoin(BASE_URL, href)  # make sure it's a full URL

    # Extract the year from the visible date text (last 4 characters)
    year = meeting_date[-4:]

    rows.append({
        "year": year,
        "meeting_date": meeting_date,
        "pdf_url": pdf_url
    })

df = pd.DataFrame(rows)

print("Total rows:", len(df))
df.head(10)


Total rows: 170


Unnamed: 0,year,meeting_date,pdf_url
0,2025,"September 11, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
1,2025,"August 14, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
2,2025,"July 10, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
3,2025,"June 12, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
4,2025,"May 8, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
5,2025,"April 10, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
6,2025,"March 13, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
7,2025,"February 14, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
8,2025,"January 16, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...
9,2024,"December 12, 2024",https://dps.arkansas.gov/wp-content/uploads/AS...


In [7]:
df['year'].value_counts()

year
2015    16
2020    14
2019    14
2022    13
2021    13
2017    13
2016    13
2014    13
2013    13
2024    12
2023    12
2018    11
2025     9
all)     2
 201     1
ing)     1
Name: count, dtype: int64

In [8]:
import re
from urllib.parse import urljoin

clean_rows = []
bad_texts = []

for a in minutes_links:
    meeting_date = a.get_text(strip=True)
    href = a["href"]
    pdf_url = urljoin(BASE_URL, href)

    # Look for a 4-digit year at the end of the string
    m = re.search(r'(\d{4})\s*$', meeting_date)
    if m:
        year = m.group(1)
        clean_rows.append({
            "year": year,
            "meeting_date": meeting_date,
            "pdf_url": pdf_url
        })
    else:
        bad_texts.append(meeting_date)

df = pd.DataFrame(clean_rows)

print("Total clean rows:", len(df))
print("\nYear counts:")
print(df["year"].value_counts().sort_index())

print("\nTexts that did NOT match a year at the end (for your info):")
for t in bad_texts:
    print("-", t)


Total clean rows: 166

Year counts:
year
2013    13
2014    13
2015    16
2016    13
2017    13
2018    11
2019    14
2020    14
2021    13
2022    13
2023    12
2024    12
2025     9
Name: count, dtype: int64

Texts that did NOT match a year at the end (for your info):
- August 15, 2023 (Special Conference Call)
- January 20, 2023 (Special Conference Call)
- July 19, 201
- February 26, 2016 (Special Called Meeting)


In [9]:
import re
from urllib.parse import urljoin
import pandas as pd

rows = []
still_bad = []

for a in minutes_links:
    meeting_date = a.get_text(strip=True)
    href = a["href"]
    pdf_url = urljoin(BASE_URL, href)

    # 1) Try to find a 4-digit year in the visible text
    years_in_text = re.findall(r'20\d{2}', meeting_date)
    year_from_text = years_in_text[-1] if years_in_text else None

    # 2) If not found, try to find a 4-digit year in the href
    years_in_href = re.findall(r'20\d{2}', href)
    year_from_href = years_in_href[-1] if years_in_href else None

    year = year_from_text or year_from_href

    # 3) Manual fix for the "July 19, 201" typo
    if year is None and "July 19, 201" in meeting_date:
        year = "2018"

    # If we still don't have a year, keep track and skip it
    if year is None:
        still_bad.append((meeting_date, href))
        continue

    # Special meeting flag
    is_special = (
        "Special Conference Call" in meeting_date
        or "Special Called Meeting" in meeting_date
    )

    rows.append({
        "year": year,
        "meeting_date": meeting_date,
        "pdf_url": pdf_url,
        "Special Conference Call": is_special
    })

df = pd.DataFrame(rows)

print("Total rows:", len(df))
print("\nYear counts:")
print(df["year"].value_counts().sort_index())

print("\nSample of special meetings:")
print(df[df["Special Conference Call"]].head())

print("\nEntries we still couldn't parse a year from (if any):")
for text, href in still_bad:
    print("-", text, "->", href)


Total rows: 170

Year counts:
year
2013    13
2014    13
2015    16
2016    14
2017    13
2018    12
2019    14
2020    14
2021    13
2022    13
2023    14
2024    12
2025     9
Name: count, dtype: int64

Sample of special meetings:
     year                                meeting_date  \
25   2023   August 15, 2023 (Special Conference Call)   
33   2023  January 20, 2023 (Special Conference Call)   
125  2016  February 26, 2016 (Special Called Meeting)   

                                               pdf_url  \
25   https://dps.arkansas.gov/wp-content/uploads/AS...   
33   https://dps.arkansas.gov/wp-content/uploads/Mi...   
125  https://dps.arkansas.gov/wp-content/uploads/20...   

     Special Conference Call  
25                      True  
33                      True  
125                     True  

Entries we still couldn't parse a year from (if any):


In [10]:
print("Shape of df (rows, columns):", df.shape)

print("\nColumns:")
print(df.columns.tolist())

print("\nFirst 5 rows:")
display(df.head())

print("\nLast 5 rows:")
display(df.tail())

print("\nYear counts:")
display(df["year"].value_counts().sort_index())


Shape of df (rows, columns): (170, 4)

Columns:
['year', 'meeting_date', 'pdf_url', 'Special Conference Call']

First 5 rows:


Unnamed: 0,year,meeting_date,pdf_url,Special Conference Call
0,2025,"September 11, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...,False
1,2025,"August 14, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...,False
2,2025,"July 10, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...,False
3,2025,"June 12, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...,False
4,2025,"May 8, 2025",https://dps.arkansas.gov/wp-content/uploads/AS...,False



Last 5 rows:


Unnamed: 0,year,meeting_date,pdf_url,Special Conference Call
165,2013,"June 3, 2013",https://dps.arkansas.gov/wp-content/uploads/20...,False
166,2013,"May 9, 2013",https://dps.arkansas.gov/wp-content/uploads/20...,False
167,2013,"April 11, 2013",https://dps.arkansas.gov/wp-content/uploads/20...,False
168,2013,"March 14, 2013",https://dps.arkansas.gov/wp-content/uploads/20...,False
169,2013,"February 14, 2013",https://dps.arkansas.gov/wp-content/uploads/20...,False



Year counts:


year
2013    13
2014    13
2015    16
2016    14
2017    13
2018    12
2019    14
2020    14
2021    13
2022    13
2023    14
2024    12
2025     9
Name: count, dtype: int64

In [11]:
output_filename = "asp_commission_meeting_minutes.csv"
df.to_csv(output_filename, index=False)
print(f"Saved CSV to: {output_filename}")


Saved CSV to: asp_commission_meeting_minutes.csv
