In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from io import BytesIO
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPDF
from PIL import Image

# Function to download an SVG and render it to PDF (plus a PNG alongside)
def convert_svg_to_png(svg_url, output_path):
    """Download an SVG and render it to a PDF, with a PNG written next to it.

    Args:
        svg_url: URL of the SVG image to download.
        output_path: Destination path of the PDF file. The PNG is written to
            the same path with its suffix replaced by ``.png``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the downloaded content cannot be parsed as SVG.
    """
    import warnings
    from pathlib import Path

    from reportlab.graphics import renderPM

    # A timeout keeps one unresponsive host from hanging the whole run.
    response = requests.get(svg_url, timeout=30)
    response.raise_for_status()

    # svg2rlg returns None instead of raising when it cannot parse the input.
    drawing = svg2rlg(BytesIO(response.content))
    if drawing is None:
        raise ValueError(f"Could not parse SVG downloaded from {svg_url!r}")

    renderPDF.drawToFile(drawing, output_path)

    # Pillow cannot read PDF files, so rasterize the drawing itself rather
    # than re-opening the PDF that was just written.
    png_path = Path(output_path).with_suffix('.png')
    try:
        renderPM.drawToFile(drawing, str(png_path), fmt='PNG')
    except renderPM.RenderPMError as exc:
        # The PNG is an optional extra; keep the PDF and report the problem.
        # NOTE(review): renderPM needs a raster backend installed — confirm
        # which one is available in the target environment.
        warnings.warn(f"PNG rendering skipped for {svg_url!r}: {exc}")

# Helper: turn a scraped label into something usable as a file name
def _safe_filename(name):
    """Return *name* with whitespace collapsed and reserved characters replaced."""
    import re

    # Scraped cell text can span several lines; collapse every whitespace run.
    collapsed = ' '.join(name.split())
    # Characters that are not allowed in file names on Windows or POSIX.
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', collapsed).strip(' .')
    return cleaned or 'unnamed'


# Function to scrape data from HTML and save to Excel
def scrape_and_save_to_excel(html_url, excel_output_path):
    """Scrape image/name pairs from an HTML table and save them to Excel.

    Every ``<tr>`` after the first one is inspected: the ``src`` of the image
    in the first cell and the text of the second cell are collected. Each
    image is converted via ``convert_svg_to_png`` and the resulting PDF path
    is stored in a ``pdf_path`` column next to ``name`` and ``svg_url``.

    Args:
        html_url: URL of the page containing the table.
        excel_output_path: Path of the Excel file to write.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    from urllib.parse import urljoin

    # Send a GET request to the URL; fail early on HTTP errors
    response = requests.get(html_url, timeout=30)
    response.raise_for_status()

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract data from the HTML table
    data = []
    for row in soup.find_all('tr')[1:]:  # assuming the first row contains headers
        columns = row.find_all('td')
        if len(columns) < 2:
            continue
        image = columns[0].find('img')  # adjust this based on the actual HTML structure
        source = image.get('src') if image is not None else None
        if not source:
            # Rows without an image cannot be converted; skip them.
            continue
        data.append({
            'name': columns[1].text.strip(),
            # Resolve relative image paths against the page URL.
            'svg_url': urljoin(html_url, source),
        })

    # Create a Pandas DataFrame (explicit columns keep the headers when empty)
    df = pd.DataFrame(data, columns=['name', 'svg_url'])

    # Convert SVG images to PDF and save them
    pdf_paths = []
    for name, svg_url in zip(df['name'], df['svg_url']):
        pdf_output_path = f"{_safe_filename(name)}.pdf"
        convert_svg_to_png(svg_url, pdf_output_path)
        pdf_paths.append(pdf_output_path)
    df['pdf_path'] = pdf_paths

    # Save the DataFrame to Excel
    df.to_excel(excel_output_path, index=False)

# Demo run: scrape the page below and write the collected rows to a workbook
html_url = 'https://companiesmarketcap.com/tech/largest-tech-companies-by-market-cap/'
excel_output_path = 'output.xlsx'
scrape_and_save_to_excel(
    html_url=html_url,
    excel_output_path=excel_output_path,
)




Collecting svglib
  Using cached svglib-1.5.1-py3-none-any.whl
Collecting reportlab
  Using cached reportlab-4.0.8-py3-none-any.whl.metadata (1.4 kB)
Collecting lxml (from svglib)
  Using cached lxml-4.9.4-cp310-cp310-win_amd64.whl.metadata (3.8 kB)
Using cached reportlab-4.0.8-py3-none-any.whl (1.9 MB)
Using cached lxml-4.9.4-cp310-cp310-win_amd64.whl (3.8 MB)
Installing collected packages: reportlab, lxml, svglib
Successfully installed lxml-4.9.4 reportlab-4.0.8 svglib-1.5.1
