# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file is valid Python)
import csv
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def _read_first_column(csv_path):
    """Return the usable first-column values of the CSV file at ``csv_path``.

    Values are stripped of surrounding whitespace. Blank lines (which
    ``csv.reader`` yields as empty rows) and rows whose first cell is empty
    are skipped: indexing an empty row raises ``IndexError``, and an empty
    string would later match every URL and every text via ``'' in text``.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(csv_path, newline='') as csvfile:
        return [
            row[0].strip()
            for row in csv.reader(csvfile)
            if row and row[0].strip()
        ]


# Wind project names to look for in press release URLs
wind_project_names = _read_first_column('wind_projects.csv')

# Words to search for in the press release text
search_words = _read_first_column('search_words.csv')

# Starting URLs of the websites to crawl
website_urls = _read_first_column('website_urls.csv')

# Seconds to wait for any single HTTP response. requests applies no timeout by
# default, so one stalled server would otherwise hang the whole crawl.
REQUEST_TIMEOUT = 30

# Press release links that matched a wind project and at least one search word
# (each link is stored once)
pr_links = []

# Pages that have already been downloaded, so no page is fetched twice
visited_urls = set()

# URLs that are (or were) waiting in website_urls, so a link found on many
# pages is queued only once
queued_urls = set(website_urls)

# Press release candidates that were already downloaded and checked
checked_pr_links = set()

# Lower-cased match terms: both the URL and the page text are lower-cased
# before comparison, so the terms must be too. Empty terms are dropped because
# '' is a substring of everything.
project_terms = [name.lower() for name in wind_project_names if name]
search_terms = [word.lower() for word in search_words if word]


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed, or ``None`` on failure.

    Network errors, invalid URLs and HTTP error statuses are reported and
    skipped so a single bad page does not abort the whole crawl.
    """
    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Skipping {page_url}: {exc}')
        return None
    return BeautifulSoup(response.content, 'html.parser')


def _resolve_link(page_url, href):
    """Return ``href`` as an absolute http(s) URL without fragment, or ``None``.

    ``urljoin`` resolves every kind of link relative to the page it was found
    on ('/a', 'a.html', '../a', '//host/a', or an already absolute URL).
    Non-web schemes such as ``mailto:`` or ``javascript:`` and malformed URLs
    yield ``None``. The fragment is removed because 'page#top' and 'page' are
    the same document.
    """
    try:
        parts = urlparse(urljoin(page_url, href.strip()))
    except ValueError:
        return None
    if parts.scheme not in ('http', 'https'):
        return None
    return parts._replace(fragment='').geturl()


def _is_press_release_candidate(link):
    """Return True if ``link`` looks like a press release about a known wind project."""
    lowered = link.lower()
    return 'press-release' in lowered and any(
        term in lowered for term in project_terms
    )


def _mentions_search_word(pr_url):
    """Return True if the content of the page at ``pr_url`` contains a search word.

    Pages that cannot be downloaded or that have no ``<div class="content">``
    element are treated as not matching.
    """
    pr_soup = _fetch_soup(pr_url)
    if pr_soup is None:
        return False
    content_node = pr_soup.find('div', {'class': 'content'})
    if content_node is None:
        return False
    content = content_node.text.lower()
    return any(term in content for term in search_terms)


# Crawl every site in a single pass. website_urls doubles as the work queue:
# same-site links found on a page are appended to it, and the index-based loop
# picks them up, so no second pass over the list is needed.
# NOTE: the crawl has no depth or page limit.
next_index = 0
while next_index < len(website_urls):
    url = website_urls[next_index]
    next_index += 1

    # A URL can be listed twice in the CSV; download it only once
    if url in visited_urls:
        continue
    visited_urls.add(url)

    soup = _fetch_soup(url)
    if soup is None:
        continue

    site = urlparse(url).netloc

    # Inspect every link on the page
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue

        full_url = _resolve_link(url, href)

        # Only follow links that stay on the same site as the current page
        if full_url is None or urlparse(full_url).netloc != site:
            continue

        # Download and check each press release candidate once, and record it
        # once no matter how many projects or search words it matches
        if full_url not in checked_pr_links and _is_press_release_candidate(full_url):
            checked_pr_links.add(full_url)
            if _mentions_search_word(full_url):
                pr_links.append(full_url)

        # Queue the page itself so its links are crawled as well
        if full_url not in queued_urls:
            queued_urls.add(full_url)
            website_urls.append(full_url)

# pr_links can contain the same URL more than once; keep the first occurrence
# of each (in discovery order) so every release is reported and counted once
unique_pr_links = list(dict.fromkeys(pr_links))


def _text_or_placeholder(node, placeholder='N/A'):
    """Return the text of a BeautifulSoup node, or ``placeholder`` if the node is None."""
    return placeholder if node is None else node.text


# Write the report to output.txt. UTF-8 is set explicitly so press release
# text with non-ASCII characters cannot fail on the platform default encoding.
with open('output.txt', 'w', encoding='utf-8') as f:
    if not unique_pr_links:
        f.write("No press releases found\n")
    else:
        f.write(f"Found {len(unique_pr_links)} press releases\n")
        # Scrape and write every press release; the extraction and the writes
        # must stay inside this loop, otherwise only the last one is reported
        for pr_link in unique_pr_links:
            try:
                # requests has no default timeout; 30 s keeps a stalled
                # server from hanging the report
                pr_response = requests.get(pr_link, timeout=30)
                pr_response.raise_for_status()
            except requests.RequestException as exc:
                f.write(f'Could not retrieve {pr_link}: {exc}\n')
                f.write('------------------------\n')
                continue

            # Parse the HTML content using BeautifulSoup
            pr_soup = BeautifulSoup(pr_response.content, 'html.parser')

            # find() returns None when an element is missing, so fall back to
            # a placeholder instead of failing on `.text`
            title = _text_or_placeholder(pr_soup.find('h1'))
            date = _text_or_placeholder(pr_soup.find('span', {'class': 'date'}))
            content = _text_or_placeholder(pr_soup.find('div', {'class': 'content'}))

            # Write the extracted information to the file
            f.write(f'Title: {title}\n')
            f.write(f'Date: {date}\n')
            f.write(f'Content: {content}\n')
            f.write('------------------------\n')