In [11]:
import os
import random
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time


def crawl_site(start_url, visited_urls_file, urls_file, not_visiting_urls_file):
    
    # 初始化集合
    not_visiting_urls = set()
    visited_urls = set()
    urls_to_visit = set([start_url])

    # 在函数的开头打开文件，并在整个函数执行期间保持打开状态
    with open(urls_file, 'a+', encoding='utf-8') as urls_f, \
        open(visited_urls_file, 'a+', encoding='utf-8') as visited_urls_f, \
        open(not_visiting_urls_file, 'r', encoding='utf-8') as not_visiting_urls_f:
        # 读取 urls_file 文件并更新 urls_to_visit 集合
        urls_f.seek(0)
        for url in urls_f:
            urls_to_visit.add(url.strip())

        # 读取 visited_urls_file 文件并更新 visited_urls 集合
        visited_urls_f.seek(0)
        for url in visited_urls_f:
            visited_urls.add(url.strip())
            
        not_visiting_urls_f.seek(0)
        for url in not_visiting_urls_f:
            not_visiting_urls.add(url.strip())

        # 从 urls_to_visit 中去除已访问过的 URLs
        urls_to_visit.difference_update(visited_urls)

        while urls_to_visit:
            current_url = urls_to_visit.pop()

            # 只有当current_url不在visited_urls里时才进行爬取
            if current_url not in visited_urls and all(sub_url not in current_url for sub_url in not_visiting_urls):
                print(f"Visiting: {current_url}")
                # 在尝试访问链接前，添加随机延迟
                time.sleep(random.randint(1, 3))

                try:
                    response = requests.get(current_url)
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # 提取并存储所有文字内容
                    text_content = soup.get_text()

                    # 移除current_url中的"https:"，并为文件路径准备
                    sanitized_url = current_url.replace('https://', '').replace('http://', '')
                    content_directory = f"scraping_data/{sanitized_url}/"
                    content_file_path = os.path.join(content_directory, "content.txt") #加个scraping_data

                    # 确保目录存在
                    os.makedirs(content_directory, exist_ok=True)

                    with open(content_file_path, 'w', encoding='utf-8') as content_f:
                        content_f.write(f"URL: {current_url}\n{text_content}\n{'='*100}\n\n")

                    # 查找并处理所有链接
                    for link in soup.find_all('a', href=True):
                        absolute_link = urljoin(current_url, link['href'])
                        if (absolute_link not in visited_urls) and \
                        (absolute_link not in urls_to_visit) and \
                        ("melbconnect" in absolute_link) and \
                        all(sub_url not in absolute_link for sub_url in not_visiting_urls):
                            urls_to_visit.add(absolute_link)
                            urls_f.write(f"{absolute_link}\n")

                    visited_urls.add(current_url)
                    visited_urls_f.write(f"{current_url}\n")
                except requests.RequestException as e:
                    print(f"Error during requests to {current_url}: {str(e)}")


if __name__ == "__main__":
    # start_url = 'https://cis.unimelb.edu.au/'
    start_url = 'https://melbconnect.com.au/'
    visited_urls_file = 'scraping_urls/visited_urls.txt'
    urls_file = 'scraping_urls/urls.txt'
    not_visiting_urls_file = 'scraping_urls/not_visiting_urls.txt'
    crawl_site(start_url, visited_urls_file, urls_file, not_visiting_urls_file)

Visiting: mailto:team@melbconnectcoworking.com.au
Error during requests to mailto:team@melbconnectcoworking.com.au: No connection adapters were found for 'mailto:team@melbconnectcoworking.com.au'
Visiting: mailto:ai-assurance@cis.unimelb.edu.au
Error during requests to mailto:ai-assurance@cis.unimelb.edu.au: No connection adapters were found for 'mailto:ai-assurance@cis.unimelb.edu.au'
Visiting: mailto:concierge-melbconnect@unimelb.edu.au
Error during requests to mailto:concierge-melbconnect@unimelb.edu.au: No connection adapters were found for 'mailto:concierge-melbconnect@unimelb.edu.au'
Visiting: http://www.cis.unimelb.edu.au
Error during requests to http://www.cis.unimelb.edu.au: HTTPConnectionPool(host='www.cis.unimelb.edu.au', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11117e850>: Failed to establish a new connection: [Errno 60] Operation timed out'))


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

# 初始化Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# 打开目标网页
driver.get('https://melbconnect.com.au/')


# 找到所有的动态加载的div元素
dynamic_divs = driver.find_elements(By.CSS_SELECTOR, 'div.js-card-gallery')

# 遍历每个div并获取其内容
for div in dynamic_divs:
    # 可以根据需要获取更多属性
    entry_type = div.get_attribute('data-entry-types')
    order = div.get_attribute('data-order')
    ids = div.get_attribute('data-ids')
    results_per_page = div.get_attribute('data-results-per-page')
    print(f'Entry Type: {entry_type}, Order: {order}, IDs: {ids}, Results per page: {results_per_page}')
    
    # 如果需要获取div中的文本内容
    print('Text Content:', div.text)

# 关闭浏览器
driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

# 初始化Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# 打开目标网页
driver.get('https://melbconnect.com.au/')

# 等待页面JavaScript加载完毕
time.sleep(5)  # 根据实际情况调整等待时间

# 找到所有的动态加载的div元素
dynamic_divs = driver.find_elements(By.CSS_SELECTOR, 'div.js-card-gallery')

# 遍历每个div
for div in dynamic_divs:
    # 在每个div中查找所有的<a>标签
    links = div.find_elements(By.TAG_NAME, 'a')
    # 输出每个链接的href属性
    for link in links:
        url = link.get_attribute('href')
        print('Found URL:', url)

# 关闭浏览器
driver.quit()

In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

# 初始化Chrome WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# 打开目标网页
driver.get('https://melbconnect.com.au/')

# 等待页面JavaScript加载完毕
time.sleep(5)  # 根据实际情况调整等待时间

# 找到所有的动态加载的div元素
dynamic_divs = driver.find_elements(By.CSS_SELECTOR, 'div.js-card-gallery')

# 遍历每个div
for div in dynamic_divs:
    # 在每个div中查找所有的<a>标签
    links = div.find_elements(By.TAG_NAME, 'a')
    # 输出每个链接的href属性
    for link in links:
        url = link.get_attribute('href')
        print('Found URL:', url)

# 关闭浏览器
driver.quit()

Found URL: https://melbconnect.com.au/events/the-future-of-legal-practice-in-the-face-of-ai-with-stuart-fuller
Found URL: https://melbconnect.com.au/events/2024-map-launch
Found URL: https://melbconnect.com.au/events/wattle-fellowship-spotlight-2024-3
Found URL: https://melbconnect.com.au/community/atomos
Found URL: https://melbconnect.com.au/community/melbourne-climate-futures
Found URL: https://melbconnect.com.au/community/world-view
Found URL: https://melbconnect.com.au/discovery/shadow-players-sharpen-ai-weapons-in-online-war-whistleblower-warns
Found URL: https://melbconnect.com.au/discovery/ilana-bean-on-the-birth-of-medical-illustration
Found URL: https://melbconnect.com.au/discovery/kath-dolan
