In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv

# Crawl the paginated project table on rootdata.com and store one CSV row per
# project (name, tags, ecosystems, detail-page link) in project_list.csv.

TOTAL_PAGES = 430  # Number of list pages to walk through

driver = webdriver.Chrome()
try:
    driver.get('https://www.rootdata.com/zh/Projects')  # Open the webpage

    # Initialize WebDriverWait
    wait = WebDriverWait(driver, 20)

    # Open the CSV file in write mode
    with open('project_list.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(['Name', 'Tag', 'Ecology', 'Hyperlink'])

        for i in range(TOTAL_PAGES):
            # NOTE(review): the first page is parsed right after driver.get()
            # without an explicit wait; later pages rely on the fixed sleep
            # below. Confirm the table is rendered by the time it is read.
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Every table row of the current page
            for row in soup.find_all('tr', {'role': 'row'}):
                # Extract name
                element = row.find('a', {'class': 'list_name animation_underline'})
                name = element.text.strip() if element else 'N/A'

                # Extract tag
                tag_element = row.find('div', {'class': 'tag_list'})
                tag_text = tag_element.text.strip() if tag_element else 'N/A'

                # Extract ecologies: the alt text of each icon, or 'None'
                # when the cell is missing or holds no images
                ecology_div = row.find('div', {'class': 'd-flex flex-row chain_list justify-end'})
                ecology_imgs = ecology_div.find_all('img') if ecology_div else []
                if ecology_imgs:
                    ecology_text = ', '.join(img.get('alt', 'N/A') for img in ecology_imgs)
                else:
                    ecology_text = 'None'

                # Extract hyperlink (first anchor in the row that has an href)
                link_element = row.find('a', href=True)
                if link_element:
                    hyperlink = "https://www.rootdata.com" + link_element['href']
                else:
                    hyperlink = 'N/A'

                # Skip rows (e.g. header rows) where any field is exactly 'N/A'
                if 'N/A' not in (name, tag_text, ecology_text, hyperlink):
                    writer.writerow([name, tag_text, ecology_text, hyperlink])

            # After processing the current page, move on unless it was the last one
            if i < TOTAL_PAGES - 1:
                button_element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.btn-next')))
                # Click through JavaScript rather than WebElement.click()
                driver.execute_script("arguments[0].click();", button_element)
                time.sleep(5)  # Wait for the next page to load
finally:
    # Always close the browser, even when a page fails to load or parse
    driver.quit()


In [1]:
import csv
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Function to extract text between specific tags
def extract_text(soup, tag, attrs, default='N/A'):
    """Return the stripped text of the first element matching ``tag``/``attrs``.

    Args:
        soup: Object exposing ``find(tag, attrs)`` (a parsed page or a sub-tree).
        tag: Tag name to search for.
        attrs: Attribute filter passed straight to ``find``.
        default: Value returned when nothing matches.

    Returns:
        The matched element's ``text`` with surrounding whitespace removed,
        or ``default`` when ``find`` yields nothing.
    """
    match = soup.find(tag, attrs)
    if not match:
        return default
    return match.text.strip()

# Function to extract all hyperlinks between specific tags
def extract_hyperlinks(soup, tag, attrs):
    """Collect ``https`` link targets from inside the first matching container.

    Args:
        soup: Object exposing ``find(tag, attrs)``.
        tag: Tag name of the container element.
        attrs: Attribute filter used to locate the container.

    Returns:
        A list with the ``href`` of every anchor under the container whose
        value starts with ``https``, in the order ``find_all`` yields them.
        An empty list when no container matches.
    """
    container = soup.find(tag, attrs)
    if not container:
        return []

    targets = []
    for anchor in container.find_all('a', href=True):
        href = anchor['href']
        # Relative links and plain http links are left out
        if href.startswith('https'):
            targets.append(href)
    return targets

# Function to extract significant events
def extract_significant_events(soup):
    """Build ``"<date> - <description>"`` strings for the page's event entries.

    Every ``div`` with class ``d-flex flex-column pb-4 content`` is treated as
    one event. Its date comes from a ``p.date`` child and its description from
    a ``p.desc`` child. Entries where either part is missing, or where the
    stripped text is literally ``'N/A'``, are left out.

    Returns:
        A list of formatted event strings in the order the sections were found.
    """
    def _text_or_na(node):
        # Same fallback the rest of the notebook uses for absent elements
        return node.text.strip() if node else 'N/A'

    formatted = []
    for block in soup.find_all('div', {'class': 'd-flex flex-column pb-4 content'}):
        when = _text_or_na(block.find('p', {'class': 'date'}))
        what = _text_or_na(block.find('p', {'class': 'desc'}))
        if when == 'N/A' or what == 'N/A':
            continue
        formatted.append(f"{when} - {what}")
    return formatted

# Function to extract similar projects
def extract_similar_projects(soup):
    """Return the page's ``h4.mb-1`` headings as a ``#``-prefixed, comma-separated string.

    Every ``h4`` element with class ``mb-1`` found in ``soup`` is included, so
    the result is only as specific as that selector.

    Returns:
        A string such as ``"#Foo, #Bar"``, or an empty string when no heading
        matches.
    """
    headings = soup.find_all('h4', {'class': 'mb-1'})
    return ', '.join('#' + heading.text.strip() for heading in headings)

# Enrich project_list.csv: visit every project's detail page and rewrite the
# file with the extra detail columns appended to each row.

# Read the CSV file and extract hyperlinks
with open('project_list.csv', 'r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    header = next(reader)  # Read the header row
    rows = [row for row in reader]  # Read all data rows

# Add the new header fields only when the file still has the four list columns
if len(header) == 4:
    header.extend(['Project Description', 'Official Hyperlink List', 'Details', 'Founded Date', 'Funders', 'Related News Links', 'Twitter Hyperlink', 'Followers', 'Following', 'Significant Events', 'Similar Projects'])

# Results go to a temporary file that replaces project_list.csv only after the
# whole crawl succeeded, so a failure midway never destroys the input list.
temp_path = 'project_list.csv.tmp'

# Initialize the webdriver
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)

try:
    with open(temp_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)

        for row in rows:
            if len(row) < 4:
                # Blank or truncated line: there is no hyperlink to crawl
                print(f"Skipping malformed row: {row!r}")
                continue
            # Only the first four columns are inputs; anything after them is
            # detail data from an earlier run and gets re-crawled.
            name, tag, ecology, hyperlink = row[:4]

            # Open the project detail page
            driver.get(hyperlink)
            time.sleep(5)  # Wait for the page to load
            pageSource = driver.page_source
            soup = BeautifulSoup(pageSource, 'html.parser')

            # Extracting required information
            project_description = extract_text(soup, 'p', {'class': 'detail_intro'})
            official_hyperlinks = extract_hyperlinks(soup, 'div', {'class': 'links d-flex flex-row flex-wrap'})
            details = extract_text(soup, 'p', {'class': 'pt-4'})
            founded_date = extract_text(soup, 'span', {'class': 'info_text'})
            funders = [funder.text.strip() for funder in soup.find_all('h2', {'class': 'ml-2'})]
            related_news_links = extract_hyperlinks(soup, 'div', {'class': 'list'})
            related_news_links = ', '.join(related_news_links) if related_news_links else ''

            # Extracting Twitter hyperlink
            twitter_hyperlink = 'N/A'
            twitter_info = soup.find('h4', {'class': 'x_name singe-line'})
            if twitter_info:
                twitter_link = twitter_info.find_next('a', {'class': 'x_link'})
                if twitter_link:
                    # .get() keeps the 'N/A' fallback when the anchor has no href
                    twitter_hyperlink = twitter_link.get('href', 'N/A')

            # Extracting followers and following counts
            followers = 'N/A'
            following = 'N/A'
            analysis_section = soup.find('div', {'class': 'analysis d-flex align-center px-4 pt-6'})
            if analysis_section:
                followers_span = analysis_section.find('span', string='Followers')
                if followers_span:
                    followers_value = followers_span.find_next('span', {'class': 'analyze_value'})
                    if followers_value:  # find_next returns None when no value span follows
                        followers = followers_value.text.strip()
                following_span = analysis_section.find('span', string='Following')
                if following_span:
                    following_value = following_span.find_next('span', {'class': 'analyze_value'})
                    if following_value:
                        following = following_value.text.strip()

            # Extracting significant events
            significant_events = extract_significant_events(soup)

            # Extracting similar projects
            similar_projects = extract_similar_projects(soup)

            # Write the list columns followed by the crawled detail columns
            writer.writerow([
                name, tag, ecology, hyperlink, project_description, ', '.join(official_hyperlinks), details, founded_date,
                ', '.join(funders), related_news_links, twitter_hyperlink, followers, following,
                ' | '.join(significant_events), similar_projects
            ])
finally:
    # Always close the browser, even when a page fails to load or parse
    driver.quit()

# Reached only when every row was processed: swap the finished file into place
os.replace(temp_path, 'project_list.csv')
