In [54]:
import json
import os
import pandas as pd
# import uuid
import requests
from bs4 import BeautifulSoup
import time
from typing import Dict
from my_types import MinutesData

from urllib.parse import parse_qs, urlparse, urlsplit


In [55]:
contents = os.listdir("data")

In [16]:
all_data = []
for x in contents:
    if ".json" not in x:
        continue
    with open(f"data/{x}", "r") as f:
        part_data = json.load(f)
        all_data += part_data

In [17]:
len(all_data)

1252

In [18]:
with open("all_data.json", "w") as f:
    json.dump(all_data, f)

In [19]:
df = pd.DataFrame.from_dict(all_data)

In [20]:
df.head()

Unnamed: 0,meeting_type,location,datetime_iso,links
0,Accessibility Advisory Committee Meeting,Electronic Meeting,2021-11-03T16:00:00,[{'aria_label': 'Accessibility Advisory Commit...
1,Accessibility Advisory Committee Meeting,Electronic Meeting,2021-10-06T16:00:00,[{'aria_label': 'Accessibility Advisory Commit...
2,Accessibility Advisory Committee Meeting,Electronic Meeting,2021-09-08T16:00:00,[{'aria_label': 'Accessibility Advisory Commit...
3,Accessibility Advisory Committee Meeting,Electronic Meeting,2021-06-02T16:00:00,[{'aria_label': 'Accessibility Advisory Commit...
4,Accessibility Advisory Committee Meeting,Electronic Meeting,2021-05-05T16:00:00,[{'aria_label': 'Accessibility Advisory Commit...


In [23]:
df["links"][0]

[{'aria_label': 'Accessibility Advisory Committee Meeting November 03, 2021',
  'href': 'Meeting.aspx?Id=1ef4a741-9ba3-444b-9993-ba8198548511&lang=English',
  'link_text': 'Accessibility Advisory Committee Meeting',
  'url': 'https://pub-peterborough.escribemeetings.com/Meeting.aspx?Id=1ef4a741-9ba3-444b-9993-ba8198548511&lang=English'},
 {'aria_label': 'Agenda Cover Page (PDF) for Accessibility Advisory Committee Meeting 03 November 2021',
  'href': 'FileStream.ashx?DocumentId=30040',
  'link_text': 'PDF',
  'url': 'https://pub-peterborough.escribemeetings.com/FileStream.ashx?DocumentId=30040'},
 {'aria_label': 'Agenda (HTML) for Accessibility Advisory Committee Meeting 03 November 2021',
  'href': 'Meeting.aspx?Id=1ef4a741-9ba3-444b-9993-ba8198548511&Agenda=Agenda&lang=English',
  'link_text': 'HTML',
  'url': 'https://pub-peterborough.escribemeetings.com/Meeting.aspx?Id=1ef4a741-9ba3-444b-9993-ba8198548511&Agenda=Agenda&lang=English'}]

In [24]:
meeting = df["links"][0][0]

In [25]:
meeting["href"]

'Meeting.aspx?Id=1ef4a741-9ba3-444b-9993-ba8198548511&lang=English'

In [27]:
# the first link in the list is always the agenda
df["agenda_url"] = df["links"].map(lambda x: x[0]["url"])

In [28]:
df["agenda_url"][100]

'https://pub-peterborough.escribemeetings.com/Meeting.aspx?Id=14cfe5e7-5396-4ae6-bf89-1a25e58df245&lang=English'

In [29]:
links = df["links"][10]

In [30]:
# if minutes is in aria_label.lower() then it is the minutes
# link text gives file type
def get_minues_url(links, file_type="html"):
    "file_type in ['html', 'pdf']"
    for x in links:
        if "minutes" not in x["aria_label"].lower():
            continue
        if file_type == x["link_text"].lower():
            return x["url"]
        return ""
    return ""

def is_cancelled(links):
    for x in links:
        if "cancellation" in x["link_text"].lower():
            return True
    return False

def is_cancelled(links):
    for x in links:
        if "cancellation" in x["link_text"].lower():
            return True
    return False

def get_video_url(links):
    for x in links:
        if x["link_text"].lower() == "video":
            return x["url"]
    return ""

In [31]:
df["minutes_html_url"] = df["links"].map(lambda x: get_minues_url(x, "html"))
df["minutes_pdf_url"] = df["links"].map(lambda x: get_minues_url(x, "pdf"))
df["cancelled"] = df["links"].map(is_cancelled)
df["video_url"] = df["links"].map(get_video_url)

In [32]:
df.columns

Index(['meeting_type', 'location', 'datetime_iso', 'links', 'agenda_url',
       'minutes_html_url', 'minutes_pdf_url', 'cancelled', 'video_url'],
      dtype='object')

In [37]:
def get_id(url: str) -> str:
    "Get the meeting id used by the external site"
    query_dict = dict(parse_qs(urlsplit(url).query))
    if "Id" not in query_dict:
        # these seem to be urls for cancellation notices
        # @todo: fix my scraping program since cancelled meetings should still have agendas
        return "blah"
    return query_dict["Id"][0]

df["id"] = df["agenda_url"].map(lambda url: get_id(url))

In [38]:
df.query("id == 'blah'")["agenda_url"][11]

'https://pub-peterborough.escribemeetings.com/FileStream.ashx?DocumentId=27054'

In [39]:
cols_of_interest = ['id', 'meeting_type', 'location', 'datetime_iso', 'agenda_url', 'minutes_html_url', 'minutes_pdf_url', 'cancelled', 'video_url']

In [40]:
df[cols_of_interest].to_json("all_data_flat.json", orient="records")

In [48]:
all_meeting_types = list(set(df["meeting_type"]))

In [49]:
with open("all_meeting_types.json", "w") as f:
    json.dump(all_meeting_types, f)

In [50]:
with open("all_data_flat.json", "r") as f:
    all_data_flat = json.load(f)

In [51]:
def get_minutes_url_from_page(agenda_text: str) -> str:
    soup = BeautifulSoup(agenda_text, "html.parser")
    attachment_divs = soup.find_all("div", attrs={"class": "AgendaItemAttachment"})
    for attachment in attachment_divs:
        if "minutes" in attachment.text.lower():
            a = attachment.find("a")
            href = a.attrs["href"]
            return "https://pub-peterborough.escribemeetings.com/" + href

def req_agenda_get_minutes_url(agenda_url: str) -> str:
    time.sleep(2)
    resp = requests.get(agenda_url)
    return get_minutes_url(resp.text)

In [52]:
with open("scraped_minutes_urls.csv", "w") as f:
    f.write("id,minues_pdf_url\n")
    for n, meeting in enumerate(all_data_flat[:]):
        print(n)
        minutes_pdf_url = req_agenda_get_minutes_url(meeting["agenda_url"])
        f.write(f"{meeting['id']},{minutes_pdf_url}\n")


0


NameError: name 'get_minutes_url' is not defined

In [None]:
    agenda_url = df["agenda_url"][0]


In [154]:
req_agenda_get_minutes_url(df["agenda_url"][0])

'https://pub-peterborough.escribemeetings.com/filestream.ashx?DocumentId=30207'

In [147]:
a = attachment_divs[0].find("a")

'filestream.ashx?DocumentId=30207'

In [141]:
soup.prettify

<bound method Tag.prettify of <!DOCTYPE html>

<html lang="en">
<head><meta charset="utf-8"/><meta content="IE=edge;chrome=1" http-equiv="X-UA-Compatible"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><title>
	Peterborough Environmental Advisory Committee - November 17, 2021
</title><script src="/bundles/modernizr?v=inCVuEFe6J4Q07A0AcRsbJic_UE5MwpRMNGcOtk94TE1"></script>
<script src="/bundles/jquery?v=8Oos0avDZyPg-cbyVzvkIfERIE1DGSe3sRQdCSYrgEQ1"></script>
<script src="/bundles/Meeting?v=vaEFLlb0HrNgYY1eIGsAWequ3k350RU5S70y-_3tc701"></script>
<script crossorigin="anonymous" integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo" src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js"></script>
<link href="/Content/css?v=E-sIopF6j7YKm1FVNJShk6CNqZG85Ni4yI98X1mDEqU1" rel="stylesheet"/>
<link href="./FileStream.ashx?DocumentId=30214" rel="stylesheet" type="text/css"/><link href="https://maxst.icons8.com/vue-static/la

In [None]:
requests.

In [53]:
df_minutes = pd.read_csv("scraped_minutes_urls.csv")

In [177]:
records = list(df_minutes.to_records(index=False))

In [180]:
url = records[1][1]

In [181]:
url

'https://pub-peterborough.escribemeetings.com/filestream.ashx?DocumentId=29972'

In [208]:
def get_doc_id(url: str) -> str:
    parsed_url = urlparse(url)
    doc_id = parse_qs(parsed_url.query)['DocumentId'][0]
    return doc_id

def save_pdf_locally(url: str):
    resp = requests.get(url)
    doc_id = get_doc_id(url)
    pdf_fname = f"{doc_id}.pdf"
    with open(f"minutes/{doc_id}.pdf", "wb") as f:
        f.write(resp.content)


In [213]:
for n, record in enumerate(records[3:]):
    print(n, end=", ")
    url = record[1]
    if not url or url == "None":
        continue
    save_pdf_locally(url)
    time.sleep(2)

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 

In [225]:
minutes_dict: Dict[str, MinutesData] = {}

for record in records:
    meeting_id = record[0]
    url = record[1]
    if url == "None":
        continue
    doc_id = get_doc_id(url)
    pdf_fname = f"{doc_id}.pdf"
    html_fname = f"{doc_id}.html"
    minutes_data: MinutesData = {"pdf_fname": pdf_fname, "html_fname": html_fname, "url": url}
    minutes_dict[meeting_id] = minutes_data

In [227]:
json.dump(minutes_dict, open("minutes_dict.json", "w"))