<a href="https://colab.research.google.com/github/smartgh0/ESGpython/blob/main/ESGpython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from urllib.parse import urljoin, urlparse

# Prompt the user to input the website URL
website_url = input("Enter the website URL: ")

# Ensure the URL has a scheme (http:// or https://)
if not website_url.startswith(('http://', 'https://')):
    website_url = 'https://' + website_url

download_folder = "pdf_downloads"  # Set the folder where the PDFs will be downloaded

# Create the download folder if it doesn't exist
if not os.path.exists(download_folder):
    os.makedirs(download_folder)

# Set to store the URLs of downloaded PDFs to avoid duplicates
downloaded_pdfs = set()

# Set to store visited URLs to avoid revisits and infinite loops
visited_links = set()

def download_pdf(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status
        filename = url.split('/')[-1]
        filepath = os.path.join(download_folder, filename)
        if not os.path.isfile(filepath):  # Check if file already exists
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

def visit_link(link):
    if link in visited_links:
        return
    visited_links.add(link)
    try:
        response = requests.get(link)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all PDF links on the page
        pdf_links = soup.find_all('a', href=lambda href: href and href.lower().endswith('.pdf'))
        for pdf_link in pdf_links:
            pdf_url = urljoin(response.url, pdf_link['href'])
            if pdf_url not in downloaded_pdfs:  # Avoid duplicates
                download_pdf(pdf_url)
                downloaded_pdfs.add(pdf_url)
        # Find all links on the page and visit them recursively
        page_links = soup.find_all('a', href=True)
        for page_link in page_links:
            href = page_link['href']
            absolute_url = urljoin(response.url, href)
            parsed_url = urlparse(absolute_url)
            # Stay within the same website domain and avoid revisiting links
            if parsed_url.netloc == urlparse(website_url).netloc and absolute_url not in visited_links:
                visit_link(absolute_url)
    except requests.exceptions.RequestException as e:
        print(f"Could not parse link: {link}: {e}")

# Start visiting from the initial URL
visit_link(website_url)

# Convert the downloaded PDFs list to a pandas dataframe
df = pd.DataFrame(list(downloaded_pdfs), columns=['Downloaded PDFs'])

# Print the dataframe
print(df.head())


Enter the website URL: www.bp.com
Downloaded bp-second-quarter-2024-results.pdf
Downloaded bp-energy-outlook-2024.pdf
Downloaded bp-sustainability-report-2023.pdf
Downloaded bp-annual-report-and-form-20f-2023.pdf
Downloaded bp-net-zero-progress-update-2024.pdf
Downloaded careers-bp-candidate-support-application-process.pdf
Downloaded careers-interviewing-at-bp-a-guide-for-candidates.pdf
Downloaded careers-bp-candidate-support-telephone-interview-tips.pdf
Downloaded careers-bp-candidate-support-top-tips.pdf
Downloaded First%20quarter%202023%20results.pdf
Downloaded bp-net-zero-progress-aims-update-2024.pdf
Downloaded bp-ar2024-strategic-report-performance-against-our-strategy.pdf
Downloaded bp-US-Impact-Report-Investing-in-America.pdf
Downloaded bp-investor-handout-2q24.pdf
Downloaded bp-esg-investor-pack.pdf
Downloaded bp-gulf-of-mexico-cash-flow-schedule.pdf
Downloaded bp-investor-update-2023-plenary.pdf
Downloaded bp-investor-update-2023-oil.pdf
Downloaded bp-investor-update-2023-gas



Downloaded 1_3_route_bpgent_nfe.pdf




Downloaded Algemene%20Voorwaarden%20voor%20het%20gebruik%20van%20de%20bp%20tankkaart_2024%20.pdf
Downloaded Prijs-%20en%20vergoedingenoverzicht%20bp-tankpassen.BE_V1_2024.pdf
Downloaded ORS%20PrijslijstV29_07-2024_.pdf




Downloaded bp-gives-go-ahead-for-sixth-operated-hub-kaskida-in-the-us-gulf-of-mexico.pdf




Downloaded bp-gulf-of-mexico-fact-sheet.pdf
Downloaded bp-diversity-equity-and-inclusion-report-2023.pdf
Downloaded bp%20in%20America.pdf
Downloaded bp%20in%20California.pdf
Downloaded bp%20in%20Colorado.pdf
Downloaded bp%20in%20Illinois.pdf
Downloaded bp%20in%20Indiana.pdf
Downloaded bp%20in%20Kansas.pdf
Downloaded bp-in-kentucky.pdf
Downloaded bp%20in%20Louisiana.pdf
Downloaded bp%20in%20New%20Jersey.pdf
Downloaded bp%20in%20Ohio.pdf
Downloaded bp%20in%20Pennsylvania.pdf
Downloaded bp%20in%20Texas.pdf
Downloaded bp%20in%20Washington.pdf
Downloaded bp-biogas-fact-sheet.pdf
Downloaded bp-pulse-fact-sheet.pdf
Downloaded bpx-energy-fact-sheet.pdf
Downloaded bp-convenience-and-mobility-fact-sheet.pdf
Downloaded bp-refineries-fact-sheet.pdf
Downloaded bp-solar-energy-fact-sheet.pdf
Downloaded bp-trading-and-shipping-fact-sheet.pdf
Downloaded bp-wind-energy-fact-sheet.pdf
Downloaded bp-america-inc-comments-on-new-york-cap-and-invest-pre-proposal-draft.pdf
Downloaded comments-of-bp-america-i

KeyboardInterrupt: 