In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def get_absolute_favicon_url(base_url, relative_url):
    """Resolve a favicon reference against the page it was found on.

    Args:
        base_url: URL of the page that declared the favicon.
        relative_url: The favicon reference; may already be absolute.

    Returns:
        ``relative_url`` unchanged when it starts with ``http://`` or
        ``https://``; otherwise the result of joining it onto ``base_url``.
        Returns None (after printing the error) if resolution raises.
    """
    try:
        is_absolute = relative_url.startswith(('http://', 'https://'))
        return relative_url if is_absolute else urljoin(base_url, relative_url)
    except Exception as e:
        # Best-effort helper: report the problem and let the caller carry on.
        print(f"Error converting relative URL to absolute: {e}")
        return None

def get_favicon_url(url):
    """Fetch a page and return the absolute URL of the favicon it declares.

    The page is requested with up to 5 retries on 500/502/503/504 responses
    and a 5 second timeout. The first ``<link>`` tag with ``rel="icon"`` (or,
    failing that, ``rel="shortcut icon"``) that carries an ``href`` is used.

    Args:
        url: Address of the page to inspect.

    Returns:
        The favicon URL resolved against ``url``, or None when the request
        fails (the error is printed) or the page declares no usable favicon.
    """
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)

    try:
        # The context manager closes the session's connection pool; this
        # function runs once per row, so an unclosed session leaks each time.
        with requests.Session() as session:
            session.mount('http://', adapter)
            session.mount('https://', adapter)

            # NOTE(review): verify=False disables TLS certificate validation.
            # Acceptable for scraping public icons; do not reuse for anything
            # that sends or trusts sensitive data.
            response = session.get(url, verify=False, timeout=5)
            response.raise_for_status()
            html = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching favicon for {url}: {e}")
        return None

    soup = BeautifulSoup(html, 'html.parser')
    # href=True skips <link rel="icon"> tags that have no href attribute;
    # indexing such a tag with ['href'] raised an uncaught KeyError before.
    favicon_tag = (
        soup.find('link', rel='icon', href=True)
        or soup.find('link', rel='shortcut icon', href=True)
    )
    if favicon_tag is None:
        return None

    # Surrounding whitespace is not part of the URL and would corrupt the join.
    favicon_href = favicon_tag['href'].strip()
    if not favicon_href:
        return None
    return get_absolute_favicon_url(url, favicon_href)

# Load the site list, look up one favicon URL per row of the 'url' column,
# and persist the augmented table for the download step.
df = (
    pd.read_csv('data.csv')
    .assign(favicon_url=lambda frame: frame['url'].apply(get_favicon_url))
)
df.to_csv('data2.csv', index=False)

Error fetching favicon for https://futurecorp.london: HTTPSConnectionPool(host='futurecorp.london', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x15ea68c50>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
Error fetching favicon for https://momentfactory.com: HTTPSConnectionPool(host='momentfactory.com', port=443): Max retries exceeded with url: / (Caused by ResponseError('too many 500 error responses'))
Error fetching favicon for http://www.zhestkov.com: HTTPConnectionPool(host='www.zhestkov.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x168b91d90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
Error fetching favicon for http://urmston.xyz: 436 Client Error:  for url: http://urmston.xyz/
Error fetching favicon for https://www.minmax.d

###  Save Images

In [10]:
import os
# Name of the directory that receives the downloaded icon files.
img_folder = 'img'

# Reload the table written to data2.csv by the scraping cell.
df = pd.read_csv(filepath_or_buffer='data2.csv')

# Create the output directory; a no-op when it already exists.
os.makedirs(name=img_folder, exist_ok=True)

def download_and_save_favicon(url, save_folder, filename=None):
    """Download ``url`` and write the response body into ``save_folder``.

    The request uses up to 5 retries on 500/502/503/504 responses and a
    5 second timeout. Failures are reported with ``print`` and never raised,
    so one bad URL cannot abort a batch of downloads.

    Args:
        url: Absolute URL of the favicon to download.
        save_folder: Directory the file is written into.
        filename: Optional name for the saved file. When omitted, the text
            after the last "/" of ``url`` is used (query string included).

    Returns:
        The path of the written file, or None if the URL was unusable, the
        download failed, or the file could not be written.
    """
    if not isinstance(url, str) or not url.strip():
        # e.g. NaN read back from a CSV column with missing values.
        print(f"Error downloading favicon for {url}: not a usable URL")
        return None

    if filename is None:
        # Extract filename from URL
        filename = url.split("/")[-1]
    # NOTE(review): many sites serve ".../favicon.ico", so URL-derived names
    # collide and a later download overwrites an earlier one. Pass `filename`
    # explicitly when one file per site is required.
    if filename in ("", ".", ".."):
        # A URL ending in "/" (or in a dot segment) yields a name that makes
        # the target path a directory; open() would raise instead of saving.
        print(f"Error downloading favicon for {url}: cannot derive a filename from the URL")
        return None
    target_path = os.path.join(save_folder, filename)

    # Configure retry strategy for network issues
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)

    try:
        # Close the session (and its pooled connections) once the body is read.
        with requests.Session() as session:
            session.mount('http://', adapter)
            session.mount('https://', adapter)

            # NOTE(review): verify=False disables TLS certificate validation.
            response = session.get(url, verify=False, timeout=5)
            response.raise_for_status()
            content = response.content

        # Save the favicon image to the specified folder
        with open(target_path, 'wb') as f:
            f.write(content)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading favicon for {url}: {e}")
        return None
    except OSError as e:
        # Filesystem problems (missing folder, invalid name, permissions).
        print(f"Error saving favicon for {url}: {e}")
        return None

    print(f"Successfully downloaded and saved favicon for {url}")
    return target_path

# Missing favicon URLs come back from read_csv as NaN, and NaN is truthy, so
# a plain `if favicon_url:` check would still attempt to download it.
# Drop missing values explicitly and skip anything that is not a non-empty
# string before requesting it.
for favicon_url in df['favicon_url'].dropna():
    if isinstance(favicon_url, str) and favicon_url:
        download_and_save_favicon(favicon_url, img_folder)

Successfully downloaded and saved favicon for https://rectangle.design/wp-content/themes/rectangle/favicon-32x32.png
Successfully downloaded and saved favicon for https://rulesrulesrules.studio/favicon.ico
Successfully downloaded and saved favicon for https://bus.group/assets/favicon-4TLS4Q2E.png
Successfully downloaded and saved favicon for https://www.datocms-assets.com/8504/1549364939-rndrlogo.jpg?auto=format&h=16&w=16
Successfully downloaded and saved favicon for https://veravandeseyp.com/site/templates//images/favicon.ico
Successfully downloaded and saved favicon for http://www.fredericbrodbeck.de/favicon.ico
Successfully downloaded and saved favicon for https://cdn.sanity.io/images/lrtvpz57/production/3d716c36bddc0d5a2fdf43379dec8315dd4109a5-320x320.svg?w=512&h=512&fm=png
Successfully downloaded and saved favicon for https://www.onformative.com/favicon.ico
Successfully downloaded and saved favicon for https://studiomoniker.com/assets/images/favico.png
Successfully downloaded and 