In [58]:
import hashlib
# Target site whose archived captures are looked up in the cells below.
# To run the notebook against another site, swap in one of the
# commented-out alternatives.
# url = "http://www.geocities.com/Heartland/Fields/5727/"
# url = "http://www.olink.com.cn/mylove/"
url = "http://www.pagina12web.com.ar/"

def get_md5(url):
    """Return the hexadecimal MD5 digest of ``url``.

    The string is encoded with ``str.encode()`` defaults before hashing.
    The digest is used as an identifier for the URL, not for security.
    """
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()

# The digest doubles as the identifier of the URL in later cells.
url_md5 = get_md5(url)

print(f"URL: {url}")
print(f"MD5(id): {url_md5}")

URL: http://www.pagina12web.com.ar/
MD5(id): 32c0821cd06d384d330e179d76095a08


In [59]:
import requests

def make_wm_cdx_url(url, from_time="19960101", to_time="20051231"):
    """Build the Wayback Machine CDX API query URL for ``url``.

    Args:
        url: Address whose captures should be listed.
        from_time: Start of the time range, sent as the ``from`` parameter.
        to_time: End of the time range, sent as the ``to`` parameter.

    Returns:
        The CDX endpoint URL with the query string already encoded, e.g.
        https://web.archive.org/cdx/search/cdx?url=example.com&from=19960101&to=20051231
    """
    query = dict(url=url)
    query["from"] = from_time
    query["to"] = to_time

    # Preparing a request (without sending it) lets `requests` do the
    # query-string encoding for us.
    cdx_request = requests.Request(
        "GET",
        "https://web.archive.org/cdx/search/cdx",
        params=query,
    )
    return cdx_request.prepare().url


# Query URL for the target site, using the default time range.
wayback_cdx_url = make_wm_cdx_url(url)
print("Wayback CDX URL:", wayback_cdx_url, sep="\n")



Wayback CDX URL:
https://web.archive.org/cdx/search/cdx?url=http%3A%2F%2Fwww.pagina12web.com.ar%2F&from=19960101&to=20051231


In [60]:
import pandas as pd

# Load the CDX listing straight from the query URL. The response is read as
# whitespace-separated text without a header row, so the column names are
# supplied here.
dataframe = pd.read_csv(
    wayback_cdx_url,
    names=[
        "urlkey",
        "timestamp",
        "original",
        "mimetype",
        "statuscode",
        "digest",
        "length",
    ],
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    sep=r"\s+",
)

print("Dataframe:")
print(dataframe.head())


Dataframe:
                 urlkey       timestamp                           original  \
0  ar,com,pagina12web)/  20030713132201  http://www.pagina12web.com.ar:80/   
1  ar,com,pagina12web)/  20030719183604  http://www.pagina12web.com.ar:80/   
2  ar,com,pagina12web)/  20031011110432  http://www.pagina12web.com.ar:80/   
3  ar,com,pagina12web)/  20031026120712  http://www.pagina12web.com.ar:80/   
4  ar,com,pagina12web)/  20031119012503  http://www.pagina12web.com.ar:80/   

    mimetype  statuscode                            digest  length  
0  text/html         200  VBPAYVNGYNLDVYCTTSHTL53PPGGMEIUU    5470  
1  text/html         200  4BSBISB6E5SYHRK6P5BULV7VJFS25YAN    5203  
2  text/html         200  7UO4Y24WMPWPZHJMXQIFTPJ5WLPJWK4M    5571  
3  text/html         200  SB2GVRIAJWHPU2ZACELGRZPM2DVII7ON    5624  
4  text/html         200  PRGQHSKH4FVILXTOH5RUTMC2THHZBQCJ    5440  


In [61]:
# One dict per CDX row, keyed by the column names of `dataframe`.
snapshot_list = dataframe.to_dict("records")
print(f"Found {len(snapshot_list)} snapshots.")
print("Snapshot list:")
print(snapshot_list)


def _is_http_ok(statuscode):
    """Return True when ``statuscode`` represents HTTP 200.

    The value's type depends on what pandas inferred for the column: an int
    when every row is numeric, but a str (or float) as soon as one row holds
    a non-numeric or missing status. Comparing with ``!= 200`` directly would
    then reject every row, so the value is normalised first.
    """
    try:
        return int(float(statuscode)) == 200
    except (TypeError, ValueError, OverflowError):
        # Non-numeric placeholders and NaN are simply "not OK".
        return False


# Keep only successful captures and build their replay URL. The "if_"
# suffix after the timestamp is part of the replay URL format used here.
useful_snapshots = [
    {
        "timestamp": snapshot["timestamp"],
        "url": f"https://web.archive.org/web/{snapshot['timestamp']}if_/{url}",
    }
    for snapshot in snapshot_list
    if _is_http_ok(snapshot["statuscode"])
]

print(f"Kept {len(useful_snapshots)} snapshots.")
print("Useful snapshots:")
print(useful_snapshots)


Found 316 snapshots.
Snapshot list:
[{'urlkey': 'ar,com,pagina12web)/', 'timestamp': 20030713132201, 'original': 'http://www.pagina12web.com.ar:80/', 'mimetype': 'text/html', 'statuscode': 200, 'digest': 'VBPAYVNGYNLDVYCTTSHTL53PPGGMEIUU', 'length': 5470}, {'urlkey': 'ar,com,pagina12web)/', 'timestamp': 20030719183604, 'original': 'http://www.pagina12web.com.ar:80/', 'mimetype': 'text/html', 'statuscode': 200, 'digest': '4BSBISB6E5SYHRK6P5BULV7VJFS25YAN', 'length': 5203}, {'urlkey': 'ar,com,pagina12web)/', 'timestamp': 20031011110432, 'original': 'http://www.pagina12web.com.ar:80/', 'mimetype': 'text/html', 'statuscode': 200, 'digest': '7UO4Y24WMPWPZHJMXQIFTPJ5WLPJWK4M', 'length': 5571}, {'urlkey': 'ar,com,pagina12web)/', 'timestamp': 20031026120712, 'original': 'http://www.pagina12web.com.ar:80/', 'mimetype': 'text/html', 'statuscode': 200, 'digest': 'SB2GVRIAJWHPU2ZACELGRZPM2DVII7ON', 'length': 5624}, {'urlkey': 'ar,com,pagina12web)/', 'timestamp': 20031119012503, 'original': 'http:/

In [62]:
import os
import time
from IPython.display import clear_output


# Root folder for everything this notebook downloads.
DATA_DIR = "example-data"
# Per-site folder, named after the MD5 identifier of the target URL (`url_md5`).
URL_DIR = os.path.join(DATA_DIR, url_md5)

            
def download_file(url, dest, retries=3, sleep=1, timeout=30):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    Args:
        url: Address to fetch.
        dest: Path of the file to create.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between failed attempts.
        timeout: Seconds to wait for the server before an attempt is
            considered failed, so a stalled connection cannot hang the cell.

    Returns:
        None. Failures are reported with ``print`` and never raised, so a
        single bad resource does not stop a batch of downloads.
    """
    if os.path.isfile(dest):
        print(f"File {dest} already exists.")
        return
    print(f"Downloading {url} to {dest}")
    # Write to a side file first so that an interrupted or failed write can
    # never leave a truncated `dest` that later runs would treat as complete.
    partial_dest = f"{dest}.part"
    for i in range(retries):
        print(f"Attempt {i+1}/{retries}")
        try:
            response = requests.get(url, timeout=timeout)
            # Treat HTTP 4xx/5xx as failures instead of saving the error
            # page as if it were the requested resource.
            response.raise_for_status()
            with open(partial_dest, "wb") as f:
                f.write(response.content)
            os.replace(partial_dest, dest)
            return
        except Exception as e:
            print(f"Error: {e}")
            # No point in waiting once the last attempt has failed.
            if i + 1 < retries:
                time.sleep(sleep)
    print(f"Giving up on {url} after {retries} attempts.")

# Seconds to wait after downloading a snapshot before requesting the next one.
sleep_time_per_snapshot = 1

# Number of download attempts per snapshot.
max_retries = 3

for snapshot in useful_snapshots:
    snapshot_url = snapshot["url"]
    print(snapshot_url)

    # One folder per capture: <URL_DIR>/<timestamp>/snapshot.html
    snapshot_dir = os.path.join(URL_DIR, str(snapshot["timestamp"]))
    os.makedirs(snapshot_dir, exist_ok=True)
    snapshot_file = os.path.join(snapshot_dir, "snapshot.html")

    # Remember whether a request will actually be made, so that re-running
    # the cell over an already-downloaded set does not sleep once per file.
    already_downloaded = os.path.isfile(snapshot_file)
    # Pass the configured retry count; previously `max_retries` was defined
    # but never used, so changing it had no effect.
    download_file(snapshot_url, snapshot_file, retries=max_retries)

    if not already_downloaded:
        time.sleep(sleep_time_per_snapshot)
    clear_output()



https://web.archive.org/web/20030713132201if_/http://www.pagina12web.com.ar/
Downloading snapshot (attempt 1/3)
Downloaded https://web.archive.org/web/20030713132201if_/http://www.pagina12web.com.ar/ to example-data/32c0821cd06d384d330e179d76095a08/20030713132201/snapshot.html
Downloading snapshot (attempt 2/3)
File example-data/32c0821cd06d384d330e179d76095a08/20030713132201/snapshot.html already exists.
Downloading snapshot (attempt 3/3)
File example-data/32c0821cd06d384d330e179d76095a08/20030713132201/snapshot.html already exists.
https://web.archive.org/web/20030719183604if_/http://www.pagina12web.com.ar/
Downloading snapshot (attempt 1/3)
Downloaded https://web.archive.org/web/20030719183604if_/http://www.pagina12web.com.ar/ to example-data/32c0821cd06d384d330e179d76095a08/20030719183604/snapshot.html
Downloading snapshot (attempt 2/3)
File example-data/32c0821cd06d384d330e179d76095a08/20030719183604/snapshot.html already exists.
Downloading snapshot (attempt 3/3)
File example-dat

KeyboardInterrupt: 

In [None]:
import json
import os
import time
import urllib
import urllib.parse

import bs4
import requests
from IPython.display import clear_output


DATA_DIR = "example-data"
WAYBACK_BASE_URL = "https://web.archive.org"

# Recursively gather the path of every *.html file stored under DATA_DIR.
html_files = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(DATA_DIR)
    for name in names
    if name.endswith(".html")
]


            
def download_file(url, dest, retries=3, sleep=1, timeout=30):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    NOTE(review): this notebook defines ``download_file`` in two cells; keep
    the copies in sync or move the function into a shared module.

    Args:
        url: Address to fetch.
        dest: Path of the file to create.
        retries: Maximum number of attempts.
        sleep: Seconds to wait between failed attempts.
        timeout: Seconds to wait for the server before an attempt is
            considered failed, so a stalled connection cannot hang the cell.

    Returns:
        None. Failures are reported with ``print`` and never raised, so a
        single bad resource does not stop a batch of downloads.
    """
    if os.path.isfile(dest):
        print(f"File {dest} already exists.")
        return
    print(f"Downloading {url} to {dest}")
    # Write to a side file first so that an interrupted or failed write can
    # never leave a truncated `dest` that later runs would treat as complete.
    partial_dest = f"{dest}.part"
    for i in range(retries):
        print(f"Attempt {i+1}/{retries}")
        try:
            response = requests.get(url, timeout=timeout)
            # Treat HTTP 4xx/5xx as failures instead of saving the error
            # page as if it were the requested resource.
            response.raise_for_status()
            with open(partial_dest, "wb") as f:
                f.write(response.content)
            os.replace(partial_dest, dest)
            return
        except Exception as e:
            print(f"Error: {e}")
            # No point in waiting once the last attempt has failed.
            if i + 1 < retries:
                time.sleep(sleep)
    print(f"Giving up on {url} after {retries} attempts.")

print(f"Found {len(html_files)} HTML files.")

# Tags whose `src` attribute is collected as an embedded resource.
# "img" and "bgsound" are included so that the image and MIDI sections below
# can actually see <img src=...> and <bgsound src=...> elements.
SOURCE_TAGS = [
    "link", "script", "iframe", "embed", "audio", "video", "source", "track",
    "object", "img", "bgsound",
]
MIDI_EXTENSIONS = [".mid"]
AUDIO_EXTENSIONS = [".mp3", ".wav", ".ogg", ".aac", ".flac", ".alac", ".aiff", ".dsd", ".wma", ".opus", ".m4a"]
IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".bmp", ".ico", ".tiff", ".tif"]
VIDEO_EXTENSIONS = [".mp4", ".webm", ".ogg", ".avi", ".flv", ".mov", ".wmv", ".mkv"]
SWF_EXTENSIONS = [".swf"]


def _write_json(data, path):
    """Write ``data`` to ``path`` as indented JSON."""
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def _extract_metadata(soup):
    """Return the url/title/description record for a parsed page.

    Missing <title> or description tags yield ``None`` instead of raising.
    """
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None

    description_tag = soup.find("meta", attrs={"name": "description"})
    # .get() tolerates a <meta name="description"> without a content attribute.
    description = description_tag.get("content") if description_tag is not None else None

    # NOTE(review): `url` is the notebook-level variable set in the first
    # cell, so every metadata.json records that single URL even if DATA_DIR
    # holds snapshots of other sites - confirm this is intended.
    return {
        "url": url,
        "title": title,
        "description": description,
    }


def _extract_sources(soup):
    """Return the absolute URL of every ``src`` found on the SOURCE_TAGS elements."""
    sources = []
    for tag_name in SOURCE_TAGS:
        for element in soup.find_all(tag_name):
            src = element.get("src")
            if not src:
                continue
            try:
                # urljoin gives the same result as plain concatenation for
                # root-relative values ("/web/..."), and additionally keeps
                # absolute and protocol-relative URLs intact.
                sources.append(urllib.parse.urljoin(WAYBACK_BASE_URL, src))
            except ValueError:
                print("Skipping malformed src:", src)
    return sources


def _select_by_extension(sources, extensions, label, json_path):
    """Filter ``sources`` by file extension, report the count and save the list.

    Matching is case-insensitive so that e.g. "SONG.MID" is not missed.
    """
    suffixes = tuple(extensions)
    matches = [src for src in sources if src.lower().endswith(suffixes)]
    print(f"Found {len(matches)} {label}, saving results to {json_path}")
    _write_json(matches, json_path)
    return matches


def _download_all(urls, dest_dir, plural_label, singular_label):
    """Download every URL in ``urls`` into ``dest_dir`` (created if needed)."""
    os.makedirs(dest_dir, exist_ok=True)
    print(f"Dowloading {len(urls)} {plural_label} to {dest_dir}")
    for resource_url in urls:
        print(f"Downloading {singular_label}:", resource_url)
        download_file(resource_url, os.path.join(dest_dir, os.path.basename(resource_url)))


# For every downloaded page: parse it, then write metadata, links and the
# per-media-type resource lists next to it, downloading MIDI files and images.
for file_path in html_files:
    print("Processing file:", file_path)
    html_dir = os.path.dirname(file_path)

    # Read raw bytes and let BeautifulSoup detect the encoding. Opening the
    # file in text mode fails on pages that are not valid UTF-8.
    try:
        with open(file_path, "rb") as f:
            html_content = f.read()
    except OSError as e:
        print("Can't read file:", file_path, f"({e})")
        continue

    soup = bs4.BeautifulSoup(html_content, "html.parser")

    ########## Extract basic metadata ##########
    _write_json(_extract_metadata(soup), os.path.join(html_dir, "metadata.json"))

    ########## Extract links ##########
    links = [anchor.get("href") for anchor in soup.find_all("a") if anchor.get("href")]
    _write_json(links, os.path.join(html_dir, "links.json"))

    ########## Extract sources ##########
    sources = _extract_sources(soup)
    sources_path = os.path.join(html_dir, "sources.json")
    print(f"Found {len(sources)} sources, saving results to {sources_path}")
    _write_json(sources, sources_path)

    ########## Extract MIDI URLs ##########
    midi_urls = _select_by_extension(
        sources, MIDI_EXTENSIONS, "MIDI files", os.path.join(html_dir, "midi.json")
    )
    _download_all(midi_urls, os.path.join(html_dir, "mid"), "MIDI files", "MIDI file")

    ########## Extract audio ##########
    audio_urls = _select_by_extension(
        sources, AUDIO_EXTENSIONS, "audio files", os.path.join(html_dir, "audio.json")
    )

    ########## Extract image URLs ##########
    image_urls = _select_by_extension(
        sources, IMAGE_EXTENSIONS, "images", os.path.join(html_dir, "images.json")
    )
    _download_all(image_urls, os.path.join(html_dir, "images"), "images", "image")

    ########## Extract videos ##########
    video_urls = _select_by_extension(
        sources, VIDEO_EXTENSIONS, "videos", os.path.join(html_dir, "videos.json")
    )

    ########## Extract Shockwave files ##########
    swf_urls = _select_by_extension(
        sources, SWF_EXTENSIONS, "Shockwave files", os.path.join(html_dir, "swf.json")
    )

    # Keep the cell output short: only the most recent file's log stays visible.
    clear_output()
     

      

