# Lab 3 - Web Scraping & One-Hot Encoding

In [1]:
import re
import requests

In [2]:
re_title = r'<title>(.*?)<\/title>'
re_email = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9._-]+\.[a-zA-Z]{2,}'
re_phone = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'

re_address = r'\d+\s+[A-Za-z\s]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*[A-Z0-9]{3,6}'
re_address = r'\d{1,3}.?\d*\s[a-zA-Z]{2,}\s?[a-zA-Z]{0,},?\s?[a-zA-Z]+,?\s?[A-Z]{2,3}\s?\d{5,6}'
re_address = r'\d{1,5}\s[\w\s]+,\s[\w\s]+,\s[A-Z]{2},\s?[A-Z0-9\s]+'
re_address = r'(\d{1,5}[\w\s.,\-]+(St|Street|Rd|Road|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Lane|Ln|Way|Court|Plaza|Circle|Crescent)[\w\s.,\-]+,\s?[A-Za-z\s]+,\s?[A-Z]{2,3}\s?\d{5,6})'
re_address = r'(\d{1,5}[\w\s.,\-]+(St|Street|Rd|Road|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Lane|Ln|Way|Court|Plaza|Circle|Crescent)[\w\s.,\-]+,\s?[A-Za-z\s]+,\s?[A-Za-z\s]+,\s?[A-Z]{2,3}\s?\d{5,6})'
re_address = r'(\d{1,6}\s+[a-zA-Z0-9\s.,\'-]+(?:St|Street|Rd|Road|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Lane|Ln|Way|Court|Circle|Crescent|Campus|Loop|Parkway|Trail|Plaza|Place|Mall|Park|Square)[\s.,]+[A-Za-z\s]+,\s?[A-Za-z\s]+,\s?(?:[A-Z]{2,3}\s?\d{5}|\w{2,3}\d{1,2}\w{1,2}\s?\d{1,2}\w{2,3})?)'

In [3]:
urls = {
    'Loyalist' : 'https://loyalistcollege.com/about/contact-us/',
    'Seneca' : 'https://www.senecapolytechnic.ca/news-and-events/media-releases.html',
    'OntarioTech' : 'https://ontariotechu.ca/',
    'Centennial' : 'https://www.centennialcollege.ca/about-centennial/contact-us',
    'Georgian' : 'https://www.georgiancollege.ca/about-georgian/campuses/barrie-campus/#contact'
}

In [4]:
for college, url in urls.items():
    # Fetching Data from the given `URL`.
    response = requests.get(url)

    # Getting HTML Content as Text.
    content = response.text

    # Extracting Page-Title from the Website.
    titles = re.search(re_title, content)

    # Extracting Email from the Website.
    emails = re.findall(re_email, content)

    # Extracting Phone-Number from the Website.
    phones = re.findall(re_phone, content)

    # Extracting Address from the Website.
    addresses = re.findall(re_address, content)

    print({
        'College' : college,
        'URL' : url,
        'Title' : titles.group(1) if titles else 'No Title Found',
        'Email' : emails,
        'Phone' : phones,
        'Address' : addresses
    })

{'College': 'Lambton', 'URL': 'https://www.lambtoncollege.ca/', 'Title': 'Lambton College Home | Lambton College', 'Email': ['info@lambtoncollege.ca', 'info@lambtoncollege.ca'], 'Phone': ['519-542-7751', '6383849322', '519-542-7751', '519-542-7751', '613-462-8431', '416-485-2098', '905-890-7833'], 'Address': ['1457 London Road, Sarnia, ON, ']}
{'College': 'Loyalist', 'URL': 'https://loyalistcollege.com/about/contact-us/', 'Title': 'Contact us - Loyalist College', 'Email': ['fippa@loyalistcollege.com', 'info@loyalistcollege.com', 'hbrown@loyalistcollege.com', 'hbrown@loyalistcollege.com', 'communications@loyalistcollege.com', 'communications@loyalistcollege.com', 'AccessAbility@loyalistcollege.com', 'AccessAbility@loyalistcollege.com', 'admissions@loyalistcollege.com', 'admissions@loyalistcollege.com', 'internationaladmissions@loyalistcollege.com', 'internationaladmissions@loyalistcollege.com', 'athletics@loyalistcollege.com', 'athletics@loyalistcollege.com', 'awards@loyalistcollege.com

In [1]:
import requests
import re

# List of URLs to scrape
urls = [
    "https://loyalistcollege.com/about/contact-us/",
    "https://www.senecapolytechnic.ca/news-and-events/media-releases.html",
    "https://ontariotechu.ca/",
    "https://www.centennialcollege.ca/about-centennial/contact-us",
    "https://www.georgiancollege.ca/about-georgian/campuses/barrie-campus/#contact"
]

# Regular expression for street address, city, province, and postal code
address_pattern = r"(\d+\s\w+\s(?:[A-Za-z]+(?:\s\w+)*)),?\s+([A-Za-z]+(?:\s[A-Za-z]+)*),?\s*(ON|QC|NS|NB|MB|BC|PE|SK|AB|NL|NT|YT|NU)?\s*(\w\d\w\s?\d\w\d)?"

# Function to fetch content from a URL and search for address-like patterns using regex
def fetch_and_extract_address(url):
    try:
        # Fetch HTML content from the URL
        response = requests.get(url)
        response.raise_for_status()  # Ensure the request was successful

        # Extract all text from the HTML content
        text = response.text

        # Use regex to search for address patterns in the text
        matches = re.findall(address_pattern, text, re.IGNORECASE)

        if matches:
            # Clean up and join the matched parts of the address
            addresses = [" ".join([part for part in match if part]).strip() for match in matches]
            return f"Found addresses: {', '.join(set(addresses))}"
        else:
            return "No address found."
    except requests.exceptions.RequestException as e:
        return f"Failed to fetch {url}: {e}"

# Iterate over each URL and extract address information
for url in urls:
    address_info = fetch_and_extract_address(url)
    print(f"URL: {url} -> {address_info}")

URL: https://loyalistcollege.com/about/contact-us/ -> Found addresses: 50 milliseconds to fix a bug in chrome, 2 of the Ontario Colleges of Applied Arts and Technology Act for recruiting and related purposes, 911 and Public Safety Communications
URL: https://www.senecapolytechnic.ca/news-and-events/media-releases.html -> Found addresses: 2 million in applied research funding, 19 Vaccination Clinic Re, 19 by donating equipment and supplies to hospitals, 5 million from federal government to boost applied research centres, 15 times and counting, 2 million from the Government of Canada to help women prepare for new careers, 19 vaccination clinic to serve North York residents, 000 from the City of Toronto to continue Newnham Campus greening project, 2024 All rights reserved, 000 donation from Scotiabank, 000 for applied research benefiting urban farmers, 1750 Finch Avenue East, 000 boost from TD Bank Group to help recharge more careers, 19 does not slow down reconciliation at Seneca, 000 gr

In [9]:
import requests
import re

# List of URLs to scrape
urls = [
    "https://loyalistcollege.com/about/contact-us/",
    "https://www.senecapolytechnic.ca/news-and-events/media-releases.html",
    "https://ontariotechu.ca/",
    "https://www.centennialcollege.ca/about-centennial/contact-us",
    "https://www.georgiancollege.ca/about-georgian/campuses/barrie-campus/#contact"
]

# Regular expression to find content in <p> tags containing <br> tags (addresses)
address_pattern = r'<p>(.*?(?:<br\s*/?>)+.*?)</p>'

# Regex to remove unnecessary elements (like scripts and styles) from the HTML content
cleanup_pattern = r'<(script|style).*?>.*?</\1>|<!--.*?-->|<.*?>'

# Function to fetch HTML content and clean unnecessary tags
def fetch_and_clean_html(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        html_content = response.text

        # Remove script, style tags, and HTML comments
        cleaned_content = re.sub(cleanup_pattern, '', html_content, flags=re.DOTALL)

        return cleaned_content
    except requests.exceptions.RequestException as e:
        return None

# Function to extract addresses from cleaned HTML
def extract_address(cleaned_html):
    # Search for the address pattern inside <p> tags with <br> tags
    matches = re.findall(address_pattern, cleaned_html, re.DOTALL)

    if matches:
        # Further clean and format the address by removing <br> tags and extraneous spaces
        addresses = [re.sub(r'<br\s*/?>', ', ', match.strip()) for match in matches]
        # Remove any remaining HTML tags from the addresses
        clean_addresses = [re.sub(r'<.*?>', '', addr).strip() for addr in addresses]
        return f"Found addresses: {', '.join(set(clean_addresses))}"
    else:
        return "No address found."

# Iterate over each URL, clean the HTML, and extract address information
for url in urls:
    cleaned_html = fetch_and_clean_html(url)
    
    if cleaned_html:
        address_info = extract_address(cleaned_html)
        print(f"URL: {url} -> {address_info}")
    else:
        print(f"Failed to fetch content from {url}")

URL: https://loyalistcollege.com/about/contact-us/ -> No address found.
URL: https://www.senecapolytechnic.ca/news-and-events/media-releases.html -> No address found.
URL: https://ontariotechu.ca/ -> No address found.
URL: https://www.centennialcollege.ca/about-centennial/contact-us -> No address found.
URL: https://www.georgiancollege.ca/about-georgian/campuses/barrie-campus/#contact -> No address found.
