In [69]:
# Imports
import pandas as pd
import json
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import datetime as dt

def copy_json_file(source_file_path, destination_file_path):
    """
    Function to take the source path of a json file and make
    a copy of the json file to the destination_file_path.
    """
    # Step 1: Open and read the JSON file
    with open(source_file_path, 'r') as json_file:
        # Load the JSON content into a Python data structure
        data = json.load(json_file)

    # Step 2: Create or open the destination JSON file and write the data to it
    with open(destination_file_path, 'w') as destination_json_file:
        # Write the data to the destination file
        json.dump(data, destination_json_file, indent=4)  # You can use indent for pretty formatting if desired
    
    return

def save_json_file(data, file_path):
    # Step 1: Create or open the destination JSON file and write the data to it
    with open(file_path, 'w') as json_file:
        # Write the data to the destination file
        json.dump(data, json_file, indent=4)  # You can use indent for pretty formatting if desired

def standardize_text(string):
    """
    Initially designed for handling strings containing times.
    Removes periods, lowercases it, and removes spaces.
    """
    # Remove punctuation
    string = string.replace('.', '')

    # Lowercase
    string = string.lower()

    # Remove spaces
    string = string.replace(' ', '')

    # Replace noon with 12
    string = string.replace('noon', '12pm')
    
    return string

def parse_time_to_timestamp(time_str):
    """
    Given a string containing time(s), extract the time
    in the string and convert it to HH:MM form (with :MM
    optional), and specifying am or pm.
    """

    # Regular expression pattern to match time formats
    time_pattern = r'(\d{1,2}(?::\d{2})?)\s?(am|pm)?'

    # Match the time components
    match = re.match(time_pattern, time_str, re.IGNORECASE)
    if match:
        # Extract hours and minutes
        hours_minutes = match.group(1)
        am_pm = match.group(2)

        if am_pm:
            am_pm = am_pm.lower()

        # Convert hours to 24-hour format if needed
        if ':' in hours_minutes:
            hours, minutes = hours_minutes.split(':')
            if am_pm == 'am':
                hours = str(int(hours))
                time_str = f'{hours}:{minutes}'
            elif am_pm == 'pm':
                hours = str(int(hours) + 12)
                time_str = f'{hours}:{minutes}'
        elif ':' not in hours_minutes:
            hours = hours_minutes
            if am_pm == 'am':
                if int(hours) == 12:
                    time_str = '00:00'
                elif len(hours) == 1:
                    hours = hours.zfill(2)
                    time_str = f'{hours}:00'
                else:
                    time_str = f'{hours}:00'
            elif am_pm == 'pm':
                if int(hours) == 12:
                    time_str = f'{hours}:00'
                else:
                    hours = str(int(hours) + 12)
                    time_str = f'{hours}:00'

        # Convert the time string to a datetime timestamp
        try:
            timestamp = datetime.strptime(time_str, '%H:%M').time()
            return timestamp
        except ValueError:
            return None

    return None

def get_sorting_index(date, start_time, end_time):
    """
    Given a start_time and end_time, get a timestamp (then convert)
    to string of the start time if possible, otherwise the end time,
    which will be used as the sorting index to sort events.
    """
    # If we have the date, start with that as the sorting index
    if date:
        time_sort = date
        
        # If we have the start time, use that to sort also
        if start_time:
            time_sort += parse_time_to_timestamp(start_time)
            # # Convert the datetime object to a string in a specific format
            # try:
            #     time_sort = time_sort.strftime("%H:%M:%S")
            # except AttributeError:
            #     time_sort = None
        
        # Otherwise use the end time if we have it
        elif end_time:
            time_sort += parse_time_to_timestamp(end_time)
            # # Convert the datetime object to a string in a specific format
            # time_sort = time_sort.strftime("%H:%M:%S")
    
        else:
            time_sort = None

    else:
        return None

def next_occurrence_of_day(day_name):
    """
    Given a day of the week, return date of the next occurrence
    of that day, inclusive of the current day (e.g if it's Saturday,
    today's date will be returned for 'sat' or 'saturday').
    """
    
    # Get the current date
    current_date = datetime.now().date()

    # Define a dictionary to map day names to their corresponding numbers
    days = {
        "mon": 0, "monday": 0,
        "tue": 1, "tuesday": 1, "tues": 1,
        "wed": 2, "wednesday": 2,
        "thu": 3, "thursday": 3, "thur": 3, "thurs": 3,
        "fri": 4, "friday": 4,
        "sat": 5, "saturday": 5,
        "sun": 6, "sunday": 6
    }

    # Get the current day of the week as a number (0 = Monday, 6 = Sunday)
    current_day = current_date.weekday()

    # Get the day of the week specified
    specified_day = days.get(day_name.lower(), None)

    if specified_day is None:
        return None

    # Calculate the number of days until the specified day
    days_until_next_occurrence = (specified_day - current_day) % 7

    # Calculate the date of the next occurrence
    next_date = current_date + timedelta(days=days_until_next_occurrence)

    return next_date

def extract_date(date_str):
    
    # Lowercase the date
    date = date_str.lower()

    # If there is no month in the date string, get the date of the next occurrence of day of the week
    if not any(m in date for m in [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december',
    ]):
        return next_occurrence_of_day(date)

    # Get year
    years = []
    for i in range(4):
        if str(int(pd.Timestamp.now().year) + i) in date:
            years.append(pd.Timestamp.now().year + i)
            date = date.replace(str(pd.Timestamp.now().year + i), '')
    if len(years) > 0:
        year = min(years)
    else:
        year = pd.Timestamp.now().year

    # Remove days of the week and comma/space
    for string in ('mon', 'mo', 'tues', 'tue', 'tu', 'wed', 'we', 'thurs', 'thu', 'th', 'fri', 'sat', 'sa', 'sun', 'su', 'day', ','):
        date = date.replace(string, '')
    date = date.split('–')[0]
    date = date.strip()

    # Get month
    month_to_num_dict = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12
    }
    month_num = month_to_num_dict[date[:3]]

    # Get day
    day = int(date.split(' ')[1])

    if month_num and day and year:
        return dt.date(year, month_num, day)
    
    return None

def get_de_young_events():
    """
    Uses BeautifulSoup to scrape event info from the de Young
    Museum and Legion of Honor's calendar.
    """
    print("Collecting events from the de Young & Legion of Honor...")

    # Collect event info
    events_list = []

    # Iterate through the pages
    for i in range(1, 10):

        url = "https://www.famsf.org/calendar" + f"?page={i}"
    
        # Send a GET request to fetch the webpage content
        response = requests.get(url)
        html_content = response.content
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find elements a class
        group_elements = soup.find_all(class_="mt-24 xl:mt-32")

        # If no pages left, exit loop
        if len(group_elements) == 0: # this will be 0 when we've gone through all the pages
            break
        
        for e in group_elements:
            
            # Extract title
            title = e.find("a").find("h3").get_text().strip()
            
            # Extract link
            link = e.find("a").get("href")
            
            # Extract date info
            date = e.find(class_="mt-12 text-secondary f-subheading-1").get_text()

            def extract_time(string):

                # Standardize string
                string = standardize_text(string)
                
                # If there is \\ in it, there's a time
                if '\\' in string:
                
                    # Split by \\ and choose [-1], then by , and choose [0], and remove spaces
                    string = string.split('\\')[-1].split(',')[0].replace(' ', '')
                    
                    # If a hyphen ("–") is in it, there is a start and end time, split by dash
                    if '–' in string:
                        start_time = string.split('–')[0]
                        end_time = string.split('–')[1]
                
                    # Elif a + is in it, there are two start times, split by +
                    elif '+' in string:
                        start_time = string.split('+')[0]
                        end_time = None
                    
                    # Otherwise
                    else:
                        start_time = string
                        end_time = None

                else:
                    start_time = None
                    end_time = None

                return start_time, end_time

            # Extract time
            start_time, end_time = extract_time(date)

            print('Date:', date)
            print()
            
            # Get sorting index
            if date[:7].lower() == 'through': # if through a certain date, it's current/happening now
                time_sort = pd.Timestamp.now().date().strftime('%Y-%m-%d %H:%M:%S')
            # Otherwise, use the date string
            else:
                # Extract date
                date = extract_date(date)
                # print('Ext:', date) # TEMP
                # print() # TEMP
                time_sort = get_sorting_index(date, start_time, end_time)
            
            # if not time_sort:
            #     print('Date:', date)
            #     print('Ext:', extract_date(date))
            #     print('Standardized date:', standardize_text(date))
            #     print('Start time:', start_time)
            #     print('End time:', end_time)
            #     print('date[:7]', date[:7])
            #     print('Sort time:', time_sort)
            #     print()

            # Extract venue
            try:
                venue = e.find(class_="text-inherit pt-2 ml-8").get_text()
            except AttributeError:
                venue = "unknown"
            
            # Add tags
            tags = []
            event_type = e.find(class_="text-inherit pt-2").get_text().lower() # this is Exhibition or Event
            if event_type == "exhibition":
                tags.append("exhibition")
            if "tour" in title.lower():
                tags.append("tour")
            if "family" in title.lower():
                tags.append("family")
            if "youngster" in title.lower():
                tags.append("family")
            if "reading" in title.lower():
                tags.append("reading")
            if "concert" in title.lower():
                tags.append("audio")
            if "song bath" in title.lower():
                tags.append("audio")
            if "workshop" in title.lower():
                tags.append("workshop")
            if "free" in title.lower():
                tags.append("free")
            if "opening" in title.lower():
                tags.append("opening")
            if "member" in title.lower():
                tags.append("members only")
            if "symposium" in title.lower():
                tags.append("symposium")
            if "lecture" in title.lower():
                tags.append("talk")
            if "talk" in title.lower():
                tags.append("talk")
            if "conversation" in title.lower():
                tags.append("talk")
            if "party" in title.lower():
                tags.append("party")
            if "queer" in title.lower():
                tags.append("queer")
            if "virtual" in title.lower():
                tags.append("virtual")
            
            # Collect data
            events_list.append(
                {
                    "Title": title,
                    "Links": [{
                        "Link": link,
                        "Text": "Event Page",
                    }],
                    "Date": date,
                    "StartTime": start_time,
                    "EndTime": end_time,
                    "TimeSort": time_sort,
                    "Venue": venue,
                    "Tags": list(set(tags)) # get unique list of tags
                }
            )

    print("Completed. Collected {:,} events.".format(len(events_list)))
    return events_list

def get_berkeley_art_center_events():
    """
    Uses BeautifulSoup to scrape event info from Berkeley Art 
    Center's calendar.
    """
    print("Collecting events from Berkeley Art Center...")
    
    # Collect event info
    events_list = []
    
    # URL of the website to scrape
    url = "https://www.berkeleyartcenter.org/calendar"
    
    # Send a GET request to fetch the webpage content
    response = requests.get(url)
    html_content = response.content
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # We'll use this later to identify dates
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

    # Get elements
    elements = soup.find_all(class_="col sqs-col-6 span-6")

    # Iterate through elements
    for e in elements:
        
        # If looking at past events, stop the loop
        if "past events" in e.find_previous("h1").text.lower():
            break
        
        # Otherwise, we're looking at current events --> collect events
        else:
            h3s = e.find_all("h3")
            
            # If there are any h3 elements
            if len(h3s) > 0:
                
                # Identify title
                title = h3s[0].get_text().strip()

                # Tag events and collect dates
                dates = []
                tags = []
                # Iterate through h3 elements
                for h in h3s:
                    if any(x in h.get_text().lower() for x in days_of_week):
                        dates.append(h.get_text().strip())
                    if "opening" in h.get_text().lower():
                        tags.append("opening")
                    if "conversation" in h.get_text().lower():
                        tags.append("talk")
                    if "talk" in h.get_text().lower():
                        tags.append("talk")
                    if "dialogue" in h.get_text().lower():
                        tags.append("talk")
                    if "performance" in h.get_text().lower():
                        tags.append("performance")
                    if "workshop" in h.get_text().lower():
                        tags.append("workshop")
                    if "party" in h.get_text().lower():
                        tags.append("party")
                    if "queer" in h.get_text().lower():
                        tags.append("queer")
                    if "virtual" in h.get_text().lower():
                        tags.append("virtual")
                    if "zoom" in h.get_text().lower():
                        tags.append("virtual")
                
                # Combine dates
                date = " | ".join(dates)
                date_text = date # copy date text to use for other purpose
                date = date.replace(" on zoom.", "")
                
                # Identify location
                try:
                    "berkeley art center" in h3s[2].get_text().lower()
                    venue = h3s[2].get_text()
                except IndexError:
                    venue = "Berkeley Art Center"
                else:
                    venue = "Berkeley Art Center"
                if "on zoom" in date_text.lower():
                    venue = "Virtual"
                
                # Get links
                links = []
                link_elements = e.find_all("a")
                if len(link_elements) > 0:
                    for l in link_elements:
                        link_url = l.get("href")
                        if "eventbrite" in link_url.lower():
                            links.append({
                                "Link": link_url,
                                "Text": "Eventbrite"
                            })
                        elif ("berkeleyartcenter" in link_url.lower() and link_url.lower() != "https://www.berkeleyartcenter.org/upcoming-exhibitions"):
                            links.append({
                                "Link": link_url,
                                "Text": "Event Page"
                            })
                        else:
                            links.append({
                                "Link": link_url,
                                "Text": "unknown"
                            })
                
                def extract_time(string):
        
                    # Standardize string
                    string = standardize_text(string)
                    
                    # If there is a hyphen, that indicates there is a start and end time
                    if '–' in date:
                        # Get the start date from the left side of the hyphen
                        start_time = string.split('–')[0]
                        # Get the start date from the right side of the hyphen
                        try:
                            end_time = string.split('–')[1]
                        except IndexError:
                            end_time = None
                        # If the end time is AM, then the start time must be AM
                        if 'am' in end_time:
                            start_time += 'am'
                        # If the end time is PM
                        else:
                            # If the hour of the start time is before the hour of the end time, it must be PM
                            if int(start_time) < int(end_time.replace('am', '').replace('pm', '')):
                                start_time += 'pm'
                            # If the hour is after, it must be AM
                            else:
                                start_time += 'am'
                    
                    # If there is no end time / just a start time
                    else:
                        start_time = string
                        end_time = None

                    return start_time, end_time
                
                # Extract time
                start_time, end_time = extract_time(date.split(' ')[-1])
                
                # Get sorting index
                date = None # come back to this!!! need to parse dates, just using start/end times currently 
                time_sort = get_sorting_index(date, start_time, end_time)

                # Collect event data
                events_list.append(
                    {
                        "Title": title,
                        "Links": links,
                        "Date": date,
                        "StartTime": start_time,
                        "EndTime": end_time,
                        "TimeSort": time_sort,
                        "Venue": venue,
                        "Tags": list(set(tags)) # get unique list of tags
                    }
                )
    print("Completed. Collected {:,} events.".format(len(events_list)))
    return events_list

def is_event_recurring(event_str):
    """
    Function that takes in a string containing date and time information
    about an event and identifies whether it is a recurring event or not.
    """
    # Check if the string contains "through" (case insensitive)
    if re.search(r'through', event_str, re.IGNORECASE):
        return True

    # Check if the string contains a "–" or "," between two days of the week or abbreviations
    if re.search(r'(\w{3} – \w{3}|\w{3}, \w{3})', event_str):
        return True

    # Check if the string contains a "+" and has more than one day of the week or abbreviations
    if re.search(r'\+\s*(\w{3}|\w{3},)+', event_str):
        return True

    return False

def main():
    """
    Function that:
        1. Saves a copy of existing data
        2. Scrapes data from the de Young Museum & Legion of Honor
        3. Scrapes data from the Berkeley Art Center
        4. Saves data as json
    """

    # Save a copy of existing json data
    try:
        copy_json_file('website/data.json', 'website/data_copy.json')
    except FileNotFoundError:
        pass

    # Instantiate list to collect data
    data = []

    # Scrape data from the de Young Museum & Legion of Honor
    de_young_data = get_de_young_events()
    data += de_young_data

    # # Scrape data from the Berkeley Art Center
    # berkeley_art_center_data = get_berkeley_art_center_events()
    # data += berkeley_art_center_data

    return data

#     # Save data as json
#     save_json_file(data, 'website/data.json')
#     print("."*15)
#     print("Saved {:,} events.".format(len(data)))

#     return

# if __name__ == "__main__":
#     main()


In [70]:
data = main()

Collecting events from the de Young & Legion of Honor...
Date: Through Sep 15, 2024

Date: Through May 5, 2024

Date: Sat \ 4 pm

Date: Sat \ 9:30 am – 5:15 pm

Date: Tues – Fri, Sun \ Noon – 1 pm, 2 – 3 pm

Date: Through Oct 15, 2023

Date: Through Oct 1, 2023

Date: Sat \ 9:30 am – 5:15 pm

Date: Through Sep 24, 2023

Date: Tues – Sun \ 11:30 am + 1:30 pm

Date: Sat \ noon – 12:40 pm, 3 – 3:40 pm

Date: Through Mar 24, 2024

Date: Through Dec 31, 2023

Date: Sat, Sun, Tues \ 11:30 am – 1:30 pm

Date: Select Saturdays \ 10:30 am

Date: Through Feb 25, 2024

Date: Select Saturdays \ 11 am – 3 pm

Date: Through Mar 10, 2024

Date: Select Saturdays \ 11 am – 3 pm

Date: Sat, Sun, Free Tues \ 11:30 am + 1 pm

Date: Sat, Sep 9 \ 2 – 3:30 pm



TypeError: unsupported operand type(s) for +=: 'datetime.date' and 'NoneType'

In [56]:
def next_occurrence_of_day(day_name):
    """
    Given a day of the week, return date of the next occurrence
    of that day, inclusive of the current day (e.g if it's Saturday,
    today's date will be returned for 'sat' or 'saturday').
    """
    
    # Get the current date
    current_date = datetime.now().date()

    # Define a dictionary to map day names to their corresponding numbers
    days = {
        "mon": 0, "monday": 0,
        "tue": 1, "tuesday": 1, "tues": 1,
        "wed": 2, "wednesday": 2,
        "thu": 3, "thursday": 3, "thur": 3, "thurs": 3,
        "fri": 4, "friday": 4,
        "sat": 5, "saturday": 5,
        "sun": 6, "sunday": 6
    }

    # Get the current day of the week as a number (0 = Monday, 6 = Sunday)
    current_day = current_date.weekday()

    # Get the day of the week specified
    specified_day = days.get(day_name.lower(), None)

    if specified_day is None:
        return "Invalid day name"

    # Calculate the number of days until the specified day
    days_until_next_occurrence = (specified_day - current_day) % 7

    # Calculate the date of the next occurrence
    next_date = current_date + timedelta(days=days_until_next_occurrence)

    return next_date

# Example usage:
day_name = "Wednesday"  # Replace with the day of the week you want
next_date = next_occurrence_of_day(day_name)
print(f"The next {day_name} is on {next_date}")


The next Wednesday is on 2023-09-13


In [55]:
next_occurrence_of_day('sat')

datetime.date(2023, 9, 9)

In [40]:
data

[{'Title': 'Nampeyo and the Sikyátki Revival',
  'Links': [{'Link': 'https://www.famsf.org/exhibitions/nampeyo-and-sikyatki-revival',
    'Text': 'Event Page'}],
  'Date': 'Through Sep 15, 2024',
  'StartTime': None,
  'EndTime': None,
  'TimeSort': '2023-09-09 00:00:00',
  'Venue': 'de Young',
  'Tags': ['exhibition']},
 {'Title': 'Lhola Amira: Facing the Future',
  'Links': [{'Link': 'https://www.famsf.org/exhibitions/lhola-amira',
    'Text': 'Event Page'}],
  'Date': 'Through May 5, 2024',
  'StartTime': None,
  'EndTime': None,
  'TimeSort': '2023-09-09 00:00:00',
  'Venue': 'de Young',
  'Tags': ['exhibition']},
 {'Title': 'Organ Concert',
  'Links': [{'Link': 'https://www.famsf.org/events/organ-concert',
    'Text': 'Event Page'}],
  'Date': 'Sat \\ 4 pm',
  'StartTime': '4pm',
  'EndTime': None,
  'TimeSort': '2023-09-09 00:00:00',
  'Venue': 'Legion of Honor',
  'Tags': ['audio']},
 {'Title': 'Free Saturdays at the Legion of Honor',
  'Links': [{'Link': 'https://www.famsf.org/

In [42]:
for e in data:
    print('Date:', e['Date'])
    print('StartTime:', e['StartTime'])
    print('EndTime:', e['EndTime'])
    print('TimeSort:', e['TimeSort'])
    print()

Date: Through Sep 15, 2024
StartTime: None
EndTime: None
TimeSort: 2023-09-09 00:00:00

Date: Through May 5, 2024
StartTime: None
EndTime: None
TimeSort: 2023-09-09 00:00:00

Date: Sat \ 4 pm
StartTime: 4pm
EndTime: None
TimeSort: 2023-09-09 00:00:00

Date: Sat \ 9:30 am – 5:15 pm
StartTime: 9:30am
EndTime: 5:15pm
TimeSort: 2023-09-09 00:00:00

Date: Tues – Fri, Sun \ Noon – 1 pm, 2 – 3 pm
StartTime: 12pm
EndTime: 1pm
TimeSort: 2023-09-09 00:00:00

Date: Through Oct 15, 2023
StartTime: None
EndTime: None
TimeSort: 2023-09-09 00:00:00

Date: Through Oct 1, 2023
StartTime: None
EndTime: None
TimeSort: 2023-09-09 00:00:00

Date: Sat \ 9:30 am – 5:15 pm
StartTime: 9:30am
EndTime: 5:15pm
TimeSort: 2023-09-09 00:00:00

Date: Through Sep 24, 2023
StartTime: None
EndTime: None
TimeSort: 2023-09-09 00:00:00

Date: Tues – Sun \ 11:30 am + 1:30 pm
StartTime: 11:30am
EndTime: None
TimeSort: 2023-09-09 00:00:00

Date: Sat \ noon – 12:40 pm, 3 – 3:40 pm
StartTime: 12pm
EndTime: 12:40pm
TimeSort: 202

In [4]:
dates = []
for d in data:
    if ':' not in d['Date'] and 'am' not in d['Date'] and 'pm' not in d['Date']:
        if 'through' not in d['Date'].lower():
            dates.append(d['Date'])
dates

['Mon, Sep 11',
 'Sep 30, 2023 – Jan 7, 2024',
 'Oct 7, 2023 – Jan 7, 2024',
 'Wed, Oct 11 – Sun, Oct 15',
 'Nov 19, 2023 – Feb 11, 2024',
 'Jan 20 – Aug 11, 2024',
 'Feb 17 – Jul 7, 2024',
 'Mar 16 – Jul 21, 2024',
 'Mar 30 – Aug 25, 2024',
 'Apr 6 – Aug 18, 2024',
 'Sat, May 4',
 'Friday, september 8, 6–8PM',
 'Tuesday, August 29 from 7–8PM']

In [5]:
# Regex match MMM D or MMM DD, choose first match

# Regex match for year YYYY

# If year, that's the year

    # Turn into datetime

# Otherwise
    # It's the current year
    # Turn into datetime




In [16]:
def extract_and_convert_date(date_str):
    # Define regular expression pattern for matching MMM D or MMM DD, and year YYYY
    date_pattern = r'([A-Za-z]{3}\s\d{1,2}(?:,\s?\d{4})?)'  # Match MMM D or MMM DD, and optional YYYY

    # Attempt to find a date match in the input string
    date_match = re.search(date_pattern, date_str)
    print(date_match)

    if date_match:
        date_str = date_match.group(1)  # Extract the date portion

        # Check if a year is present in the date string
        if ',' in date_str:
            date_str = date_str.split(',')[0].strip()  # Remove comma and trim spaces

        # Parse the date string into a datetime object
        year = datetime.now().year  # Default to current year
        if re.search(r'\d{4}', date_str):
            year_match = re.search(r'\d{4}', date_str)
            year = int(year_match.group(0))  # Extract the year if present

        date_obj = datetime.strptime(date_str, '%b %d')
        date_obj = date_obj.replace(year=year)  # Set the year

    else:
        # If no valid date is found, you can handle it as needed
        date_obj = None

    return date_obj

# Example usage:
date_str = "Sep 15, 2024"
result = extract_and_convert_date(date_str)

if result:
    # If a valid date is found, you can convert it to a string for JSON
    date_as_string = result.strftime('%Y-%m-%d')
    print("Datetime:", result)
    print("Date as String for JSON:", date_as_string)
else:
    print("No valid date found in the input string.")


<re.Match object; span=(0, 12), match='Sep 15, 2024'>
Datetime: 2023-09-15 00:00:00
Date as String for JSON: 2023-09-15


In [8]:
pd.Timestamp.now().year

2023

In [9]:
for i in range(1,4):
    print(i)

1
2
3


In [17]:
def extract_date(date_str):
    
    # Lowercase the date
    date = date_str.lower()

    # Get year
    years = []
    for i in range(4):
        if str(int(pd.Timestamp.now().year) + i) in date:
            years.append(pd.Timestamp.now().year + i)
            date = date.replace(str(pd.Timestamp.now().year + i), '')
    if len(years) > 0:
        year = min(years)
    else:
        year = pd.Timestamp.now().year

    # Remove days of the week and comma/space
    for string in ('mon', 'mo', 'tues', 'tue', 'tu', 'wed', 'we', 'thurs', 'thu', 'th', 'fri', 'sat', 'sa', 'sun', 'su', 'day', ','):
        date = date.replace(string, '')
    date = date.split('–')[0]
    date = date.strip()

    # Get month
    month_to_num_dict = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12
    }
    month_num = month_to_num_dict[date[:3]]

    # Get day
    day = int(date.split(' ')[1])

    if month_num and day and year:
        return dt.date(year, month_num, day)
    
    return None

In [12]:
dt.date(2021, 3, 2)

datetime.date(2021, 3, 2)

In [18]:
for d in dates:
    # print(d)
    print(extract_date(d))
    # print(extract_and_convert_date(d))
    print()
    

2023-09-11

2023-09-30

2023-10-07

2023-10-11

2023-11-19

2024-01-20

2024-02-17

2024-03-16

2024-03-30

2024-04-06

2023-05-04

2023-09-08

2023-08-29



In [158]:
def parse_time_to_timestamp(time_str):
    """
    Given a string containing time(s), extract the time
    in the string and convert it to HH:MM form (with :MM
    optional), and specifying am or pm.
    """

    # Regular expression pattern to match time formats
    time_pattern = r'(\d{1,2}(?::\d{2})?)\s?(am|pm)?'

    # Match the time components
    match = re.match(time_pattern, time_str, re.IGNORECASE)

    if match:
        # Extract hours and minutes
        hours_minutes = match.group(1)
        am_pm = match.group(2)

        if am_pm:
            am_pm = am_pm.lower()

        # Convert hours to 24-hour format if needed
        if ':' in hours_minutes:
            hours, minutes = hours_minutes.split(':')
            if am_pm == 'am':
                hours = str(int(hours))
                time_str = f'{hours}:{minutes}'
            elif am_pm == 'pm':
                hours = str(int(hours) + 12)
                time_str = f'{hours}:{minutes}'
        elif ':' not in hours_minutes:
            hours = hours_minutes
            if am_pm == 'am':
                if int(hours) == 12:
                    time_str = '00:00'
                elif len(hours) == 1:
                    hours = hours.zfill(2)
                    time_str = f'{hours}:00'
                else:
                    time_str = f'{hours}:00'
            elif am_pm == 'pm':
                if int(hours) == 12:
                    time_str = f'{hours}:00'
                else:
                    hours = str(int(hours) + 12)
                    time_str = f'{hours}:00'

        # Convert the time string to a datetime timestamp
        try:
            timestamp = datetime.strptime(time_str, '%H:%M').time()
            return timestamp
        except ValueError:
            return None

    return None

In [164]:
parse_time_to_timestamp('12pm')

datetime.time(12, 0)

In [None]:
# Maybe add "DateTimeDisplayText" for the date/time text that is displayed

In [212]:
# Imports
import pandas as pd
import json
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import datetime as dt

def copy_json_file(source_file_path, destination_file_path):
    """
    Function to take the source path of a json file and make
    a copy of the json file to the destination_file_path.
    """
    # Step 1: Open and read the JSON file
    with open(source_file_path, 'r') as json_file:
        # Load the JSON content into a Python data structure
        data = json.load(json_file)

    # Step 2: Create or open the destination JSON file and write the data to it
    with open(destination_file_path, 'w') as destination_json_file:
        # Write the data to the destination file
        json.dump(data, destination_json_file, indent=4)  # You can use indent for pretty formatting if desired
    
    return

def save_json_file(data, file_path):
    # Step 1: Create or open the destination JSON file and write the data to it
    with open(file_path, 'w') as json_file:
        # Write the data to the destination file
        json.dump(data, json_file, indent=4)  # You can use indent for pretty formatting if desired

def standardize_text(string):
    """
    Initially designed for handling strings containing times.
    Removes periods, lowercases it, and removes spaces.
    """
    # Remove punctuation
    string = string.replace('.', '')

    # Lowercase
    string = string.lower()

    # Remove spaces
    string = string.replace(' ', '')

    # Replace noon with 12
    string = string.replace('noon', '12pm')
    
    return string

def parse_time_to_timestamp(time_str):
    """
    Given a string containing time(s), extract the time
    in the string and convert it to HH:MM form (with :MM
    optional), and specifying am or pm.
    """

    # Regular expression pattern to match time formats
    time_pattern = r'(\d{1,2}(?::\d{2})?)\s?(am|pm)?'

    # Match the time components
    match = re.match(time_pattern, time_str, re.IGNORECASE)
    if match:
        # Extract hours and minutes
        hours_minutes = match.group(1)
        am_pm = match.group(2)

        if am_pm:
            am_pm = am_pm.lower()

        # Convert hours to 24-hour format if needed
        if ':' in hours_minutes:
            hours, minutes = hours_minutes.split(':')
            if am_pm == 'am':
                hours = str(int(hours))
                time_str = f'{hours}:{minutes}'
            elif am_pm == 'pm':
                hours = str(int(hours) + 12)
                time_str = f'{hours}:{minutes}'
        elif ':' not in hours_minutes:
            hours = hours_minutes
            if am_pm == 'am':
                if int(hours) == 12:
                    time_str = '00:00'
                elif len(hours) == 1:
                    hours = hours.zfill(2)
                    time_str = f'{hours}:00'
                else:
                    time_str = f'{hours}:00'
            elif am_pm == 'pm':
                if int(hours) == 12:
                    time_str = f'{hours}:00'
                else:
                    hours = str(int(hours) + 12)
                    time_str = f'{hours}:00'

        # Convert the time string to a datetime timestamp
        try:
            timestamp = datetime.strptime(time_str, '%H:%M').time()
            return timestamp
        except ValueError:
            return None

    return None

def get_sorting_index(date, start_time, end_time):
    """
    Use the date to sort if we have that. Use the time also
    (start time if available otherwise end time) if we have that.

    Expects date in the form YYYY-MM-DD 00:00:00.
    Expects dater_time in the the form HHam/pm or Ham/pm.
    """
    time_sort = None
    
    # If we have the date, use it as the sorting index
    if date:
        time_sort = date

        # These are potential formats the times could be in
        formats = ['%I%p', '%I:%M%p', '%I%P', '%I:%M%P']
            
        # If we have the start time, use that to sort also
        if start_time:
            for fmt in formats:
                try:
                    # Try to parse time with current format
                    parsed_time = datetime.strptime(start_time, fmt).time()
                    time_sort = datetime.combine(time_sort, parsed_time)
                except ValueError:
                    # If parsing fails, continue to next format
                    continue
        
        # Otherwise use the end time if we have it
        elif end_time:
            for fmt in formats:
                try:
                    # Try to parse time with current format
                    parsed_time = datetime.strptime(end_time, fmt).time()
                    time_sort = datetime.combine(time_sort, parsed_time)
                except ValueError:
                    # If parsing fails, continue to next format
                    continue

    return time_sort

def get_de_young_events():
    """
    Uses BeautifulSoup to scrape event info from the de Young
    Museum and Legion of Honor's calendar.
    """
    print("Collecting events from the de Young & Legion of Honor...")

    # Collect event info
    events_list = []

    # Iterate through the pages
    for i in range(1, 10):

        url = "https://www.famsf.org/calendar" + f"?page={i}"
    
        # Send a GET request to fetch the webpage content
        response = requests.get(url)
        html_content = response.content
        
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find elements a class
        group_elements = soup.find_all(class_="mt-24 xl:mt-32")

        # If no pages left, exit loop
        if len(group_elements) == 0: # this will be 0 when we've gone through all the pages
            break
        
        for e in group_elements:
            
            # Extract title
            title = e.find("a").find("h3").get_text().strip()
            
            # Extract link
            link = e.find("a").get("href")
            
            # Extract date info
            date = e.find(class_="mt-12 text-secondary f-subheading-1").get_text()

            def extract_time(string):

                # Standardize string
                string = standardize_text(string)
                
                # If there is \\ in it, there's a time
                if '\\' in string:
                
                    # Split by \\ and choose [-1], then by , and choose [0], and remove spaces
                    string = string.split('\\')[-1].split(',')[0].replace(' ', '')
                    
                    # If a hyphen ("–") is in it, there is a start and end time, split by dash
                    if '–' in string:
                        start_time = string.split('–')[0]
                        end_time = string.split('–')[1]
                
                    # Elif a + is in it, there are two start times, split by +
                    elif '+' in string:
                        start_time = string.split('+')[0]
                        end_time = None
                    
                    # Otherwise
                    else:
                        start_time = string
                        end_time = None

                else:
                    start_time = None
                    end_time = None

                return start_time, end_time

            # Extract time
            start_time, end_time = extract_time(date)

            # Extract venue
            try:
                venue = e.find(class_="text-inherit pt-2 ml-8").get_text()
            except AttributeError:
                venue = "unknown"
            
            # Add tags
            tags = []
            event_type = e.find(class_="text-inherit pt-2").get_text().lower() # this is Exhibition or Event
            if event_type == "exhibition":
                tags.append("exhibition")
            if "tour" in title.lower():
                tags.append("tour")
            if "family" in title.lower():
                tags.append("family")
            if "youngster" in title.lower():
                tags.append("family")
            if "reading" in title.lower():
                tags.append("reading")
            if "concert" in title.lower():
                tags.append("audio")
            if "song bath" in title.lower():
                tags.append("audio")
            if "workshop" in title.lower():
                tags.append("workshop")
            if "free" in title.lower():
                tags.append("free")
            if "opening" in title.lower():
                tags.append("opening")
            if "member" in title.lower():
                tags.append("members only")
            if "symposium" in title.lower():
                tags.append("symposium")
            if "lecture" in title.lower():
                tags.append("talk")
            if "talk" in title.lower():
                tags.append("talk")
            if "conversation" in title.lower():
                tags.append("talk")
            if "party" in title.lower():
                tags.append("party")
            if "queer" in title.lower():
                tags.append("queer")
            if "virtual" in title.lower():
                tags.append("virtual")
            
            # Collect data
            events_list.append(
                {
                    "Title": title,
                    "Links": [{
                        "Link": link,
                        "Text": "Event Page",
                    }],
                    "Date": date,
                    "StartTime": start_time,
                    "EndTime": end_time,
                    "Venue": venue,
                    "Tags": list(set(tags)) # get unique list of tags
                }
            )

    print("Completed. Collected {:,} events.".format(len(events_list)))
    return events_list

def get_berkeley_art_center_events():
    """
    Uses BeautifulSoup to scrape event info from Berkeley Art 
    Center's calendar.
    """
    print("Collecting events from Berkeley Art Center...")
    
    # Collect event info
    events_list = []
    
    # URL of the website to scrape
    url = "https://www.berkeleyartcenter.org/calendar"
    
    # Send a GET request to fetch the webpage content
    response = requests.get(url)
    html_content = response.content
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # We'll use this later to identify dates
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

    # Get elements
    elements = soup.find_all(class_="col sqs-col-6 span-6")

    # Iterate through elements
    for e in elements:
        
        # If looking at past events, stop the loop
        if "past events" in e.find_previous("h1").text.lower():
            break
        
        # Otherwise, we're looking at current events --> collect events
        else:
            h3s = e.find_all("h3")
            
            # If there are any h3 elements
            if len(h3s) > 0:
                
                # Identify title
                title = h3s[0].get_text().strip()

                # Tag events and collect dates
                dates = []
                tags = []
                # Iterate through h3 elements
                for h in h3s:
                    if any(x in h.get_text().lower() for x in days_of_week):
                        dates.append(h.get_text().strip())
                    if "opening" in h.get_text().lower():
                        tags.append("opening")
                    if "conversation" in h.get_text().lower():
                        tags.append("talk")
                    if "talk" in h.get_text().lower():
                        tags.append("talk")
                    if "dialogue" in h.get_text().lower():
                        tags.append("talk")
                    if "performance" in h.get_text().lower():
                        tags.append("performance")
                    if "workshop" in h.get_text().lower():
                        tags.append("workshop")
                    if "party" in h.get_text().lower():
                        tags.append("party")
                    if "queer" in h.get_text().lower():
                        tags.append("queer")
                    if "virtual" in h.get_text().lower():
                        tags.append("virtual")
                    if "zoom" in h.get_text().lower():
                        tags.append("virtual")

                def extract_date_to_timestamp(s):
                    """
                    Extracts a date from a given string and converts it to a timestamp.
                    
                    The function searches for a date in the format "day of the week, month day, year"
                    within the provided string. The ", year" portion is optional. If the year is not
                    specified, the current year is assumed.
                    """
                    
                    pattern = r'\b(?P<weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s(?P<month>January|February|March|April|May|June|July|August|September|October|November|December)\s(?P<day>\d{1,2})(,\s(?P<year>\d{4}))?\b'
                    
                    match = re.search(pattern, s, re.IGNORECASE)
                    if match:
                        # Extract components
                        month = match.group("month")
                        day = int(match.group("day"))
                        year = int(match.group("year")) if match.group("year") else datetime.now().year
                        
                        # Convert to datetime object (and then back to string so it's json-serializable)
                        dt = datetime.strptime(f"{month} {day} {year}", "%B %d %Y")
                        return dt
                    
                    return None
                
                # Combine dates and extract date
                date_str = " | ".join([d.replace('.', '').replace('–', '-').replace('-', '-').replace(' from', ',') for d in dates])
                date_text = date_str # copy date text to use for tags
                date_str = date_str.replace(" on zoom.", "")
                date_display_text = ' '.join([word if word in [None] else word.capitalize() for word in date_str.split()]) # include words in list if we don't want to capitalize them
                date = extract_date_to_timestamp(date_str)
                
                # Identify location
                try:
                    "berkeley art center" in h3s[2].get_text().lower()
                    venue = h3s[2].get_text()
                except IndexError:
                    venue = "Berkeley Art Center"
                else:
                    venue = "Berkeley Art Center"
                if "on zoom" in date_text.lower():
                    venue = "Virtual"
                
                # Get links
                links = []
                link_elements = e.find_all("a")
                if len(link_elements) > 0:
                    for l in link_elements:
                        link_url = l.get("href")
                        if "eventbrite" in link_url.lower():
                            links.append({
                                "Link": link_url,
                                "Text": "Eventbrite"
                            })
                        elif ("berkeleyartcenter" in link_url.lower() and link_url.lower() != "https://www.berkeleyartcenter.org/upcoming-exhibitions"):
                            links.append({
                                "Link": link_url,
                                "Text": "Event Page"
                            })
                        else:
                            links.append({
                                "Link": link_url,
                                "Text": "unknown"
                            })
                
                def extract_time(string):
        
                    # Standardize string
                    string = standardize_text(string)
                    string = string.replace('–', '-').replace('-', '-')
                    
                    # If there is a hyphen, that indicates there is a start and end time
                    if '-' in string:
                        # Get start and end times
                        times = string.split('-')
                        start_time = times[0]
                        # Get the start date from the right side of the hyphen
                        try:
                            end_time = times[1]
                        except IndexError:
                            end_time = None
                        # If the end time is AM, then the start time must be AM
                        if 'am' in end_time:
                            start_time += 'am'
                        # If the end time is PM
                        else:
                            # If the hour of the start time is before the hour of the end time, it must be PM
                            if int(start_time) < int(end_time.replace('am', '').replace('pm', '')):
                                start_time += 'pm'
                            # If the hour is after, it must be AM
                            else:
                                start_time += 'am'
                    
                    # If there is no end time / just a start time
                    else:
                        start_time = string
                        end_time = None

                    return start_time, end_time
                
                # Extract time
                start_time, end_time = extract_time(date_str.split(' ')[-1])

                # Get sorting index
                time_sort = get_sorting_index(date, start_time, end_time)

                # Collect event data
                events_list.append(
                    {
                        "Title": title,
                        "Links": links,
                        "Date": date.strftime('%Y-%m-%d %H:%M:%S'),
                        "DateText": date_display_text,
                        "StartTime": start_time,
                        "EndTime": end_time,
                        "TimeSort": None,
                        "Venue": venue,
                        "Tags": list(set(tags)) # get unique list of tags
                    }
                )
    print("Completed. Collected {:,} events.".format(len(events_list)))
    return events_list

def is_event_recurring(event_str):
    """
    Function that takes in a string containing date and time information
    about an event and identifies whether it is a recurring event or not.
    """
    # Check if the string contains "through" (case insensitive)
    if re.search(r'through', event_str, re.IGNORECASE):
        return True

    # Check if the string contains a "–" or "," between two days of the week or abbreviations
    if re.search(r'(\w{3} – \w{3}|\w{3}, \w{3})', event_str):
        return True

    # Check if the string contains a "+" and has more than one day of the week or abbreviations
    if re.search(r'\+\s*(\w{3}|\w{3},)+', event_str):
        return True

    return False

def main():
    """
    Function that:
        1. Saves a copy of existing data
        2. Scrapes data from the de Young Museum & Legion of Honor
        3. Scrapes data from the Berkeley Art Center
        4. Saves data as json
    """

    # Save a copy of existing json data
    try:
        copy_json_file('website/data.json', 'website/data_copy.json')
    except FileNotFoundError:
        pass

    # Instantiate list to collect data
    data = []

    # Scrape data from the de Young Museum & Legion of Honor
    de_young_data = get_de_young_events()
    data += de_young_data

    # Scrape data from the Berkeley Art Center
    berkeley_art_center_data = get_berkeley_art_center_events()
    data += berkeley_art_center_data

    return data

#     # Save data as json
#     save_json_file(data, 'website/data.json')
#     print("."*15)
#     print("Saved {:,} events.".format(len(data)))

#     return

# if __name__ == "__main__":
#     main()


In [213]:
data = main()

Collecting events from the de Young & Legion of Honor...
Completed. Collected 50 events.
Collecting events from Berkeley Art Center...
Completed. Collected 2 events.


In [214]:
data[-2:]

[{'Title': 'OÑI OCAN: A RITUAL PERFORMANCE BY COURTNEY DESIREE MORRIS',
  'Links': [{'Link': 'https://www.eventbrite.com/e/oni-ocan-a-ritual-performance-by-courtney-desiree-morris-tickets-697681232347?aff=oddtdtcreator',
    'Text': 'Eventbrite'}],
  'Date': '2023-09-08 00:00:00',
  'DateText': 'Friday, September 8, 6-8pm',
  'StartTime': '6pm',
  'EndTime': '8pm',
  'TimeSort': None,
  'Venue': 'Berkeley Art Center',
  'Tags': ['performance']},
 {'Title': 'Community Dinner',
  'Links': [{'Link': 'https://www.eventbrite.com/e/705586196307?aff=oddtdtcreator',
    'Text': 'Eventbrite'}],
  'Date': '2023-09-14 00:00:00',
  'DateText': 'Thursday, September 14, 6-9pm',
  'StartTime': '6pm',
  'EndTime': '9pm',
  'TimeSort': None,
  'Venue': 'Berkeley Art Center',
  'Tags': []}]

In [180]:
def get_sorting_index(date, start_time, end_time):
    """
    Use the date to sort if we have that. Use the time also
    (start time if available otherwise end time) if we have that.

    Expects date in the form YYYY-MM-DD 00:00:00.
    Expects dater_time in the the form HHam/pm or Ham/pm.
    """
    time_sort = None
    
    # If we have the date, use it as the sorting index
    if date:
        time_sort = date

        # These are potential formats the times could be in
        formats = ['%I%p', '%I:%M%p', '%I%P', '%I:%M%P']
            
        # If we have the start time, use that to sort also
        if start_time:
            for fmt in formats:
                try:
                    # Try to parse time with current format
                    parsed_time = datetime.strptime(start_time, fmt).time()
                    time_sort = datetime.combine(time_sort, parsed_time)
                except ValueError:
                    # If parsing fails, continue to next format
                    continue
        
        # Otherwise use the end time if we have it
        elif end_time:
            for fmt in formats:
                try:
                    # Try to parse time with current format
                    parsed_time = datetime.strptime(end_time, fmt).time()
                    time_sort = datetime.combine(time_sort, parsed_time)
                except ValueError:
                    # If parsing fails, continue to next format
                    continue

    return time_sort

In [182]:
get_sorting_index(None, '6pm', '10pm')

In [None]:
# These are potential formats the time could be in
formats = ['%I%p', '%I:%M%p', '%I%P', '%I:%M%P']
for fmt in formats:
  try:
    # Try to parse time with current format
    parsed_time = datetime.strptime(time_str, fmt).time()
    # return datetime.combine(current_date, parsed_time)
    return parsed_time
except ValueError:
    # If parsing fails, continue to next format
    continue

In [152]:
def time_string_to_datetime(time_str):
    # These are potential formats your time could be in
    formats = ['%I%p', '%I:%M%p', '%I%P', '%I:%M%P']
    
    for fmt in formats:
        try:
            # Try to parse time with current format
            # Use today's date, and only change the time
            current_date = datetime.now().date()
            parsed_time = datetime.strptime(time_str, fmt).time()
            # return datetime.combine(current_date, parsed_time)
            return parsed_time
        except ValueError:
            # If parsing fails, continue to next format
            continue
    raise ValueError(f"Time {time_str} is not in a recognized format")

# Testing the function
test_times = ['7am', '07am', '7:30am', '07:30am', '8pm', '08pm', '8:45pm', '08:45pm']

for t in test_times:
    print(f"{t} -> {time_string_to_datetime(t)}")


7am -> 07:00:00
07am -> 07:00:00
7:30am -> 07:30:00
07:30am -> 07:30:00
8pm -> 20:00:00
08pm -> 20:00:00
8:45pm -> 20:45:00
08:45pm -> 20:45:00


In [153]:
time_string_to_datetime('10pm')

datetime.time(22, 0)