## Download the snapshots
Now that we have the snapshots timestamp, we can download the snapshots from the Wayback Machine.

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.wait import WebDriverWait
from pathlib import Path
import time


### Load the data

In [2]:
########################
## Read URLs from a JSON file
########################

import json
URL_PATH = "urls_with_snapshots.json"

# INITIALIZE THE URL LIST
url_list = []

with open(URL_PATH, "r") as f:
    url_list = json.load(f)

print(f"Read {len(url_list)} URLs from {URL_PATH}")
print("First 5 URLs:")
print(url_list)
for url in url_list[:5]:
    print(url)


Read 20 URLs from urls_with_snapshots.json
First 5 URLs:
[{'id': '4da862f1', 'url': 'http://www.voicenet.com/~squeeze/contras.html', 'snapshots': [{'timestamp': 19961222204926, 'statuscode': 200, 'url': 'https://web.archive.org/web/19961222204926if_/http://www.voicenet.com/~squeeze/contras.html'}, {'timestamp': 19970406223306, 'statuscode': 200, 'url': 'https://web.archive.org/web/19970406223306if_/http://www.voicenet.com/~squeeze/contras.html'}, {'timestamp': 19970615113827, 'statuscode': 200, 'url': 'https://web.archive.org/web/19970615113827if_/http://www.voicenet.com/~squeeze/contras.html'}, {'timestamp': 19970804051208, 'statuscode': 200, 'url': 'https://web.archive.org/web/19970804051208if_/http://www.voicenet.com/~squeeze/contras.html'}, {'timestamp': 19991012124911, 'statuscode': 200, 'url': 'https://web.archive.org/web/19991012124911if_/http://www.voicenet.com/~squeeze/contras.html'}, {'timestamp': 19991122034535, 'statuscode': 200, 'url': 'https://web.archive.org/web/19991122

### Iterate over the snapshots and download them
We iterate over a list of website entries, where each entry contains a URL, an ID, and a collection of snapshots. For each entry, the script:
- Creates a unique directory based on the entry's ID within a predefined base directory.
  - We cannot use the URL as the directory name because it may contain characters that are not allowed in directory names, such as slashes and colons.
- Iterates through each snapshot associated with this entry.
- For every snapshot, it:
  - Builds a directory for that specific snapshot based on its timestamp.
  - Attempts to download the website snapshot from its URL, retrying up to three times in case of failure.
  - Saves the downloaded HTML content into a file within the snapshot's directory.
  - Pause between each snapshot download to manage request frequency, and extends this pause in case of download errors.

In [None]:
import os
import requests
import time

DATA_BASE_PATH = "data"
sleep_time_on_error = 5
sleep_time_per_snapshot = 1

total_entry_count = len(url_list)

for i, entry in enumerate(url_list):
    print(f"Processing entry: {i+1}/{total_entry_count}")
    url = entry["url"]
    id = entry["id"]
    snapshots =   entry["snapshots"]

    print(f"Processing URL: {url}")

    # create a directory for the URL
    url_path = os.path.join(DATA_BASE_PATH, id)
    os.makedirs(url_path, exist_ok=True)

    for snapshot in snapshots:
        snapshot_url = snapshot["url"]
        snapshot_timestamp = snapshot["timestamp"]
        # download the website snapshot from the url

        # make a directory for the snapshot
        snapshot_dir = os.path.join(url_path, str(snapshot_timestamp))
        os.makedirs(snapshot_dir, exist_ok=True)

        max_retries = 3

        for i in range(max_retries):
            try:
                response = requests.get(snapshot_url)
                print(f"Downloaded snapshot from {snapshot_url}")
                snapshot_path = os.path.join(snapshot_dir, "snapshot.html")
                with open(snapshot_path, "w") as f:
                    f.write(response.text)
                break

            except Exception as e:
                print(f"Error downloading snapshot: {e}")
                print(f"Sleeping for {sleep_time_on_error} seconds")
                time.sleep(sleep_time_on_error)
                continue

        print(f"Sleeping for {sleep_time_per_snapshot} seconds")
        time.sleep(sleep_time_per_snapshot)
        
        

### Congrates! You have downloaded the snapshots.
Check out your `data` folder to see the downloaded snapshots.