## Scrape meetings

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
# Raise pandas' display limits so wide frames and long text cells are shown in full
# (up to 100 columns / 100 rows; no truncation of individual cell text).
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)

In [4]:
# Date stamp (YYYYMMDD, local clock) for the moment this cell runs.
today = pd.Timestamp.now().strftime("%Y%m%d")

In [5]:
# Show the date stamp computed above (a YYYYMMDD string).
today

'20231225'

---

## Scrape data

#### Headers for the request

In [6]:
# User-Agent string identifying the client as desktop Firefox on Linux.
browser_user_agent = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"
)

# NOTE(review): Access-Control-* are normally CORS *response* headers set by a server;
# sending them on a request presumably has no effect. Kept unchanged — confirm
# whether they can be dropped.
cors_headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
}

# Headers passed along with each page request.
headers = {**cors_headers, "User-Agent": browser_user_agent}

#### Loop through meeting IDs to scrape their pages into a list of dictionaries

In [7]:
%%time

# Range of meeting IDs to probe (end is exclusive); IDs with no usable page are skipped.
FIRST_MEETING_ID = 30
LAST_MEETING_ID_EXCLUSIVE = 16000

# Give up on a single request after this many seconds instead of hanging the whole run.
REQUEST_TIMEOUT_SECONDS = 30


def _first_paragraph_text(soup, needle):
    """Return the stripped text of the first <p> whose text contains `needle`, or ""."""
    paragraph = soup.find(lambda tag: tag.name == "p" and needle in tag.text)
    return paragraph.text.strip() if paragraph is not None else ""


def _parse_meeting(meeting_id, soup):
    """Build one meeting record (a dict) from a parsed meeting-details page.

    Raises AttributeError, IndexError, KeyError or TypeError when a required element
    is absent: the <h1> title, the "mb-4" format <span>, the <iframe> whose src
    contains "q=", or any of the three "lead-small" paragraphs.
    """
    lead_paragraphs = soup.find_all("p", class_="lead-small")
    return {
        "id": meeting_id,
        "title": soup.find("h1").text.title(),
        "format": soup.find_all("span", class_="mb-4")[0].text,
        # The address is whatever follows "q=" in the embedded iframe's src.
        "address": soup.find("iframe")["src"].split("q=", 1)[1],
        # Optional fields: empty string when no matching paragraph exists.
        "virtual": _first_paragraph_text(soup, "ID"),
        "date_time": _first_paragraph_text(soup, "@"),
        "details": _first_paragraph_text(soup, "This meeting"),
        # "check" is stripped from these paragraphs; in tags it also acts as the
        # separator between entries, so it becomes "; ".
        "tags": lead_paragraphs[0]
        .text.replace("\ncheck", "")
        .replace("check", "; ")
        .strip(),
        "language": lead_paragraphs[2].text.replace("check", "").strip(),
        "type": lead_paragraphs[1].text.replace("check", "").strip(),
    }


dict_list = []
# IDs whose request failed at the network level (timeout, connection error, ...),
# kept so they can be inspected or retried rather than silently lost.
failed_ids = []

# One Session reuses the underlying connection across the ~16k requests.
with requests.Session() as session:
    session.headers.update(headers)

    for d in tqdm(range(FIRST_MEETING_ID, LAST_MEETING_ID_EXCLUSIVE)):
        # NOTE(review): the second "?" makes "pmethod=4" part of the id value rather
        # than a separate query parameter. Kept byte-identical because the existing
        # results were produced with this URL — confirm whether "&pmethod=4" was meant.
        url = f"https://lacoaa.org/meeting-details.php?id={d}?pmethod=4"

        try:
            page = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException:
            failed_ids.append(d)
            continue

        soup = BeautifulSoup(page.text, "html.parser")

        try:
            data_dict = _parse_meeting(d, soup)
        except (AttributeError, IndexError, KeyError, TypeError):
            # Page lacks the expected meeting markup: skip this ID. Catching only
            # parse-related errors lets KeyboardInterrupt and real bugs surface.
            continue

        dict_list.append(data_dict)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15970/15970 [53:10<00:00,  5.01it/s]

CPU times: user 10min 36s, sys: 57.1 s, total: 11min 33s
Wall time: 53min 10s





#### Read list of dictionaries into a dataframe

In [8]:
# One row per scraped meeting; the dictionary keys become the column names.
src = pd.DataFrame(data=dict_list)

In [9]:
# Inspect the scraped records as a table.
src

Unnamed: 0,id,title,format,address,virtual,date_time,details,tags,language,type
0,30,Spin The Bottle,In Person,"2930 Hyperion Ave Los Angeles, CA 90027",,Thursday @ 8:00PM - 9:00PM,,Open; Speaker; Tag Participation,English,LGBTQ
1,36,Daily Reflections,In Person,"3510 Baldwin Park Blvd Baldwin Park, CA 91706",,Monday @ 7:00PM - 8:30PM,This meeting is in a church.,Open,English,No Types Set
2,37,As Bill Sees It,In Person,"4743 Maine Ave Baldwin Park, CA 91706",,Wednesday @ 6:00PM - 7:15PM,This meeting is in a commercial center.,As Bill Sees It,English,No Types Set
3,44,Bellflower Big Book Group,Hybrid,"9603 Belmont St Bellflower, CA 90706",ID: 563 370 404PH: 12133388477436554PW: bigbook,Monday @ 7:30PM,,Open; Speaker,English,No Types Set
4,48,Roxbury Group,In Person,"471 S Roxbury Dr Beverly Hills, CA 90212",,Tuesday @ 8:00PM - 9:00PM,,Discussion; Open; Speaker,English,No Types Set
...,...,...,...,...,...,...,...,...,...,...
2740,15502,We Are Responsible Group,In Person,"4032 Whitsett Avenue Studio City, CA 91604",,Saturday @ 12:00PM - 1:00PM,,Birthday; Closed; Literature,English,No Types Set
2741,15503,Am Attitude Adjustment,Hybrid,"510 S 2nd Ave Covina, CA 91723",ID: 816-161-354PH: 1(669) 900-6833PW: 816816,Thursday @ 6:15AM - 7:15AM,,Closed,English,No Types Set
2742,15505,Women'S As Bill Sees It,In Person,"510 S 2nd Ave Covina , CA 91723",,Monday @ 6:30PM - 7:30PM,,Closed,English,Women
2743,15506,Women'S Mixed Book,In Person,"510 S 2nd Ave Covina, CA 91723",,Friday @ 6:30PM - 7:30PM,,Closed,English,Women


---

## Processing

#### Get ZIP Codes from addresses

In [10]:
# Take the 5-digit ZIP at the end of each address. Matching digits anchored to the end
# (instead of slicing the last five characters) tolerates trailing whitespace and
# ZIP+4 suffixes, and yields NaN rather than arbitrary text when no ZIP is present.
src["zip_code"] = src["address"].str.extract(
    r"\b(\d{5})(?:-\d{4})?\s*$", expand=False
)

#### Split day/time into separate columns

In [11]:
# Split e.g. "Monday @ 7:30PM - 8:30PM" into day / time, then time into start / end.
# n=1 caps every split at two pieces, so a row with an extra separator cannot create
# a third column and break the two-column assignment. reindex pads the result to two
# columns when no row contains the separator (e.g. no meeting lists an end time).
day_time_parts = (
    src["date_time"].str.split(" @ ", n=1, expand=True).reindex(columns=[0, 1])
)
src[["day", "time"]] = day_time_parts

start_end_parts = (
    src["time"].str.split(" - ", n=1, expand=True).reindex(columns=[0, 1])
)
src[["start", "end"]] = start_end_parts

#### Meeting URL

In [12]:
# Link back to each meeting's detail page, built from its numeric id.
meeting_ids_as_text = src["id"].astype(str)
src["url"] = "https://lacoaa.org/meeting-details.php?id=" + meeting_ids_as_text

In [13]:
# Shorten the "In Person" label to "Person" (plain substring replacement).
src["format"] = src["format"].str.replace(
    pat="In Person", repl="Person", regex=False
)

In [14]:
# Drop rows whose date_time text contains "ID:" (i.e. the paragraph picked up for
# date_time was not a plain "day @ time" string).
date_time_has_id = src["date_time"].str.contains("ID:")
src = src.loc[~date_time_has_id].copy()

#### Clean dataframe that excludes women's meetings and non-English meetings

In [15]:
# Columns carried into the cleaned frame, in output order.
clean_columns = [
    "title",
    "day",
    "start",
    "end",
    "format",
    "zip_code",
    "address",
    "virtual",
    "details",
    "tags",
    "type",
    "url",
]

# Keep English-language meetings whose type is not "Women", then narrow the columns.
english_non_women = src.query("language == 'English' and type != 'Women'")
df = english_non_women.loc[:, clean_columns].copy()

---

## Analysis

#### Monday listings in a specific ZIP code (90045)

In [23]:
# Meetings in ZIP 90045 that take place on Mondays.
monday_meetings_in_90045 = df.query("zip_code == '90045' and day == 'Monday'")
monday_meetings_in_90045

Unnamed: 0,title,day,start,end,format,zip_code,address,virtual,details,tags,type,url
830,Westchester Monday Night Speakers,Monday,7:30PM,8:30PM,Hybrid,90045,"6323 W 80th St Westchester , CA 90045",ID: 827 8300 2767PH: (669) 900-6833PW: 118033,,No Formats Set,No Types Set,https://lacoaa.org/meeting-details.php?id=12532


#### Meetings by day

In [17]:
# Number of listings (non-null urls) for each day of the week.
meetings_per_day = df.groupby("day")["url"].count().reset_index(name="count")
meetings_per_day

Unnamed: 0,day,count
0,Friday,216
1,Monday,216
2,Saturday,183
3,Sunday,196
4,Thursday,210
5,Tuesday,209
6,Wednesday,196


#### Meetings by format

In [18]:
# Number of listings (non-null urls) for each meeting format.
meetings_per_format = df.groupby("format")["url"].count().reset_index(name="count")
meetings_per_format

Unnamed: 0,format,count
0,Hybrid,184
1,Person,1242


---

## Export

In [19]:
# Write the cleaned listings to a date-stamped CSV. The output folder is created first
# because DataFrame.to_csv raises if the parent directory does not exist, which would
# otherwise lose the result of the long scrape at the very last step.
output_path = Path("data/processed") / f"friends_of_bill_meetings_{today}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)