In [5]:
import os
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Per-request timeout, in seconds, passed explicitly to every session.get() call
REQUEST_TIMEOUT = 10

# Retry policy: up to 3 retries with a 0.3 backoff factor, triggered by the
# listed HTTP status codes and applied only to the listed request methods.
retry_strategy = Retry(
    total=3,
    backoff_factor=0.3,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# One shared session; the same retrying adapter serves both URL schemes.
session = requests.Session()
for scheme in ("http://", "https://"):
    session.mount(scheme, adapter)

# Directory containing the JSON files that list the pages to crawl
json_dir = "../resources"

# One {"url": <page url>, "href": <link target>} record per anchor found
data = []

# sorted(): os.listdir() returns entries in arbitrary order; sorting keeps the
# crawl order -- and therefore the row order of `data` -- stable between runs.
for file_name in sorted(os.listdir(json_dir)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(json_dir, file_name)
    try:
        # Explicit UTF-8: without it open() falls back to the locale encoding
        with open(file_path, 'r', encoding="utf-8") as json_file:
            json_content = json.load(json_file)
    except Exception as e:
        # Best effort: report the unreadable/invalid file and move on
        print(f"Error loading file {file_path}: {e}")
        continue

    # A top-level list is taken as the rows themselves; a top-level object is
    # expected to hold them under the "urls" key (absent key -> no rows).
    if isinstance(json_content, list):
        rows = json_content
    elif isinstance(json_content, dict):
        rows = json_content.get("urls", [])
    else:
        rows = None

    # Covers scalar/string top-level JSON and a non-list "urls" value (e.g.
    # null), either of which would otherwise raise and abort the whole run.
    if not isinstance(rows, list):
        print(f"Skipping file {file_path}: no list of rows found")
        continue

    for row in rows:
        # Rows must be objects carrying a "url" key; anything else is
        # reported and skipped instead of raising AttributeError on .get().
        if not isinstance(row, dict):
            print(f"Skipping non-object row in {file_path}: {row!r}")
            continue

        url = row.get("url")
        if not url:
            continue  # Skip if no URL is provided

        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Record every anchor that has an href attribute, value as-is
            for link in soup.find_all('a', href=True):
                href = link['href']
                data.append({"url": url, "href": href})
        except Exception as e:
            # Best effort: one failing page must not stop the crawl
            print(f"Error processing URL {url}: {e}")

# Build the result table. Columns are named explicitly so the frame (and the
# CSV header) is always url,href -- even when `data` is empty, where
# pd.DataFrame(data) alone would produce a column-less frame and a blank file.
df = pd.DataFrame(data, columns=["url", "href"])

# Write the table next to the input JSON files, without the index column
csv_path = os.path.join(json_dir, "hrefs_table.csv")
df.to_csv(csv_path, index=False)
print(f"Data saved to {csv_path}")

Error processing URL https://www.sevendays.nl/: HTTPSConnectionPool(host='www.sevendays.nl', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x715ba8902ab0>: Failed to resolve 'www.sevendays.nl' ([Errno -2] Name or service not known)"))
Data saved to ../resources/hrefs_table.csv


In [6]:
import torch

# Report whether PyTorch sees a CUDA device; when it does, name device 0.
if not torch.cuda.is_available():
    print("No GPU is available.")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU is available: {gpu_name}")

No GPU is available.
