In [9]:
import json
import requests
import os
from io import BytesIO
from zipfile import ZipFile
from PIL import Image
from IPython.display import display, FileLink


In [15]:
def parse_har(file_content):
    """Collect thumbnail image URLs from the text of a HAR capture.

    Args:
        file_content: JSON text (anything ``json.loads`` accepts) laid out as
            ``log -> entries -> [ {request: {url: ...}}, ... ]``.

    Returns:
        A list, in entry order, of every request URL that contains both the
        substring ``'img_'`` and the substring ``'thumb'``.

    Raises:
        json.JSONDecodeError: if ``file_content`` is not valid JSON.
        KeyError: if ``log``, ``entries``, ``request`` or ``url`` is missing.
    """
    entries = json.loads(file_content)['log']['entries']
    # Generator keeps the per-entry lookup lazy, one URL at a time.
    request_urls = (entry['request']['url'] for entry in entries)
    return [
        candidate
        for candidate in request_urls
        if 'img_' in candidate and 'thumb' in candidate
    ]

def replace_thumbnail_urls(urls):
    """Turn thumbnail URLs into their full-size ("main") counterparts.

    Args:
        urls: iterable of URL strings.

    Returns:
        A new list, same order as ``urls``, where every occurrence of the
        substring ``'thumb'`` in each URL has been replaced with ``'main'``.
    """
    full_size_urls = []
    for thumb_url in urls:
        # str.replace swaps *every* 'thumb' in the URL, not just the first.
        full_size_urls.append(thumb_url.replace('thumb', 'main'))
    return full_size_urls

def sanitize_filename(url):
    """Return the file name (last path segment) of a URL.

    The query string and fragment are removed *before* the path is split, so
    a ``/`` inside a query value (e.g. ``?next=/a/b.png``) cannot be mistaken
    for a path separator.

    Args:
        url: URL string, with or without a query string / fragment.

    Returns:
        The text after the last ``/`` of the URL's path. This is an empty
        string when the path ends with ``/``.
    """
    # Drop '#fragment' first, then '?query'; what remains is scheme/host/path.
    without_fragment = url.split('#', 1)[0]
    path_part = without_fragment.split('?', 1)[0]
    # The file name is whatever follows the final slash of the path.
    return path_part.rsplit('/', 1)[-1]

def download_images(urls, output_folder, timeout=30):
    """Download each URL into ``output_folder`` and return the saved paths.

    Args:
        urls: iterable of image URLs to fetch.
        output_folder: directory that receives the files; created (including
            parents) when it does not exist yet.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        List of file paths that were actually written, in download order.
        URLs whose request failed (network error or HTTP error status) are
        reported on stdout and skipped, so the list never points at a saved
        error page.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    image_paths = []
    for url in urls:
        image_name = sanitize_filename(url)
        image_path = os.path.join(output_folder, image_name)
        try:
            response = requests.get(url, timeout=timeout)
            # Treat 4xx/5xx as failures instead of saving the error body.
            response.raise_for_status()
        except requests.RequestException as exc:
            # One bad URL should not abort a long batch; report and move on.
            print(f"Skipping {image_name}: {exc}")
            continue
        with open(image_path, 'wb') as f:
            f.write(response.content)
        image_paths.append(image_path)
    return image_paths


def display_images(image_paths):
    """Render each image file inline in the notebook.

    Args:
        image_paths: iterable of paths to image files readable by Pillow.

    Returns:
        None. Each image is shown via ``IPython.display.display``.
    """
    for image_path in image_paths:
        # The context manager closes the underlying file as soon as the image
        # has been rendered, so a long list does not pile up open handles.
        with Image.open(image_path) as img:
            display(img)

In [22]:
# Input HAR capture and the folder that receives the downloaded images.
har_file_path = 'schools.procareconnect.com_2023.har'
output_folder = "downloaded_images/2023/"
# `os` comes from the imports cell at the top of the notebook; it is not
# re-imported here so all dependencies stay declared in one place.
os.makedirs(output_folder, exist_ok=True)

In [23]:
# Read the HAR capture. HAR files are UTF-8 JSON, so the encoding is given
# explicitly instead of relying on the platform default (e.g. cp1252 on
# Windows), which can fail or mis-decode non-ASCII characters.
with open(har_file_path, 'r', encoding='utf-8') as file:
    har_content = file.read()

# Pull the thumbnail URLs out of the capture, then point them at the
# full-size variant of each picture.
urls = parse_har(har_content)
print(f"Found {len(urls)} image URLs.")
main_urls = replace_thumbnail_urls(urls)
print("Replaced thumbnail links with main picture links.")

# Fetch every full-size image into the configured output folder.
print("Downloading images...")
image_paths = download_images(main_urls, output_folder)
print(f"Images downloaded to '{output_folder}'.")

Found 748 image URLs.
Replaced thumbnail links with main picture links.
Downloading images...
Images downloaded to 'downloaded_images/2023/'.
