In [26]:
import pathlib
import requests
import subprocess
import os

from bs4 import BeautifulSoup
from typing import List, Tuple
from urllib.parse import urlparse, parse_qs

In [3]:
# the kinds of Swift data that the download functions accept as their `dtype` argument
swift_available_data = [
    'auxil',
    'bat',
    'log',
    'uvot',
    'xrt',
]

In [4]:
def search_swift_dead_portal(search: str) -> List[Tuple[str, str]]:
    """
        Searches the dead Swift portal by target name

        Target names aren't necessarily standardized, so a single object may
        show up under multiple target names.

        Parameters
        ----------
        search : string
            The target name to search for, e.g. 'catalina'

        Returns
        -------
        list of (target id, target name) tuples
            One tuple per matching target.  The list is empty when the page
            that comes back has no results table.
    """

    base_search_url = 'https://www.swift.ac.uk/dead_portal/getobject.php'

    # let requests build the query string so spaces, '/', '&', etc. in the name get escaped;
    # 'Search Names' is sent as 'Search+Names', the same as the hand-built url used before
    search_params = {'name': search, 'submit': 'Search Names'}
    page_html = requests.get(base_search_url, params=search_params)

    # a search with exactly one match is redirected straight to that target's download page,
    # so there is no results table - the target id and name are in the final url's query instead
    final_query = parse_qs(urlparse(page_html.url).query)
    if 'tid' in final_query:
        # the results table lists ids zero-padded (e.g. '00034470') but the redirect url does
        # not (e.g. '34469'); pad so both paths agree - TODO confirm the width is always 8
        tid = final_query['tid'][0].zfill(8)
        tname = final_query.get('name', [search])[0]
        return [(tid, tname)]

    search_soup = BeautifulSoup(page_html.text, features="lxml")

    # get the main results table; if it is missing (e.g. the error page for zero matches)
    # there is nothing to report
    results_table = search_soup.find("table", {"class": "chTable"})
    if results_table is None:
        return []

    # ignore the first row with the names of the columns, and the last row with links for all of the data
    table_rows = results_table.find_all("tr")[1:-1]

    all_targets = []
    for row in table_rows:
        tid_cell = row.find("td", {"headers": "row_targ"})
        tname_cell = row.find("td", {"headers": "row_name"})
        # skip rows that don't have both cells filled in rather than crashing on them
        if tid_cell is None or tname_cell is None:
            continue
        if not tid_cell.contents or not tname_cell.contents:
            continue
        # .contents is a list, each cell has only one element in it, so take contents[0];
        # str() turns the parser's string type into a plain str, matching the return annotation
        all_targets.append((str(tid_cell.contents[0]), str(tname_cell.contents[0])))

    return all_targets

In [5]:
def get_swift_wget_commands(tid: str, dtype: str, overwrite: bool) -> List[str]:
    """
        Builds the wget commands that download one type of data for a target id

        A target id may have several observations in their own directories,
        named {target id}001/, {target id}002/, etc.  The server knows how many
        there are, so we ask it for its download script and reuse the urls in it.

        Parameters
        ----------
        tid : string
            The target ID, e.g. '00020405'
        dtype : string
            The type of data, e.g. 'uvot'
        overwrite : bool
            If False, wget is told not to replace files that already exist

        Returns
        -------
        list of strings
            One wget command line per wget line found in the server's script
    """

    # -nc ==> no clobber: don't replace already downloaded files
    clobber_flag = '' if overwrite is not False else '-nc'

    # this page returns a script with wget commands to download our data
    script_url = f'https://www.swift.ac.uk/archive/download.sh?reproc=1&tid={tid}&source=obs&subdir={dtype}'
    script_text = requests.get(script_url).text

    # -q ==> quiet mode, no output
    # -w 2 ==> wait 2 seconds between files
    # -nH ==> don't create a directory based on the host, in this case no folder named www.swift.ac.uk/
    # --cut-dirs=2 ==> remove the /archive/reproc/ folders on the server from being created locally
    # -r ==> recursive: grab everything under this folder on the server
    # --reject ... ==> specify files that we don't want from the server
    wget_prefix = 'wget ' + clobber_flag + ' -q -w 2 -nH --cut-dirs=2 -r --no-parent --reject index.html*,robots.txt* '

    adjusted_wget_commands = []
    for script_line in script_text.splitlines():
        if 'wget' not in script_line:
            continue
        # the url is the last whitespace-separated token of the server's command
        adjusted_wget_commands.append(wget_prefix + script_line.split()[-1])

    return adjusted_wget_commands

In [6]:
def swift_download_uncompressed(tid: str, dtype: str, dest_dir: pathlib.Path = None, overwrite: bool = False) -> None:
    """
        Downloads the uncompressed Swift data of one target id with wget

        Parameters
        ----------
        tid : string
            The target ID to be downloaded, e.g. '00020405'
        dtype : string
            The type of data being downloaded, e.g. 'uvot'
        dest_dir : pathlib.Path
            Directory to place files, created if it does not exist yet.
            If None, the current working directory is used.
        overwrite : bool
            Whether or not to overwrite files that already exist
    """

    # check to make sure we're asking for a type of data that exists
    if dtype not in swift_available_data:
        print(f"Unsupported data type {dtype}!\nTry one of: {', '.join(swift_available_data)}.")
        return None

    # get our download commands from the server; an empty list means there is nothing to run
    wget_commands = get_swift_wget_commands(tid=tid, dtype=dtype, overwrite=overwrite)
    if not wget_commands:
        print("No wget commands to execute, skipping downloads...")
        return

    # run wget inside the destination folder instead of changing this process's working
    # directory, so the notebook never ends up in the wrong folder if a download raises
    if dest_dir is None:
        work_dir = pathlib.Path.cwd()
    else:
        work_dir = pathlib.Path(dest_dir)
        work_dir.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {dtype} data of target id {tid} to {work_dir.resolve()} ...")

    # run each command to grab the individual observations for this target id
    for command in wget_commands:
        presult = subprocess.run(command.split(), cwd=work_dir)
        if presult.returncode != 0:
            print(f"Non-zero return code {presult.returncode} for {command}!")

In [7]:
def swift_download_compressed(tid: str, tname: str, dtype: str, archive_type: str, dest_dir: pathlib.Path, overwrite: bool = False) -> None:

    """
        Downloads an archive of Swift data from swift.ac.uk to dest_dir

        Parameters
        ----------
        tid : string
            The target ID to be downloaded, e.g. '00020405'
        tname: string
            The name of the target, e.g. 'CometC/2013US10(Catalina)'
        dtype: string
            The type of data being downloaded, e.g. 'uvot'
        archive_type: string
            One of 'zip' or 'tar' to download the corresponding type
            ('zip' is recognized but only prints a message, see below)
        dest_dir: pathlib.Path
            Directory to place files, created if it does not exist yet.
            If None, the current working directory is used.
        overwrite: bool
            Whether or not to overwrite the file if it already exists
    """

    # check to make sure we're asking for a type of data that exists
    if dtype not in swift_available_data:
        print(f"Unsupported data type {dtype}!\nTry one of: {', '.join(swift_available_data)}.")
        return None

    # check if the archive type is valid
    available_archive_types = ['tar', 'zip']
    if archive_type not in available_archive_types:
        print(f"Unsupported archive type {archive_type}!\nTry one of: {', '.join(available_archive_types)}.")
        return

    # nothing is downloaded for zip, so bail out before touching any folders
    if archive_type == 'zip':
        print(f"Downloading .zip archives is broken server-side so is currently unsupported.")
        return

    # name the archive with the target id and data type, because the server returns 'download.tar' no matter what
    out_file_stem = pathlib.Path(tid + f"_{dtype}")

    # the archive is written relative to the working directory, so move into dest_dir for the
    # download and always move back, even if the download raises
    old_cwd = os.getcwd()
    try:
        if dest_dir is not None:
            dest_dir = pathlib.Path(dest_dir)
            dest_dir.mkdir(parents=True, exist_ok=True)
            os.chdir(dest_dir)
        swift_download_compressed_tar(tid=tid, tname=tname, dtype=dtype, out_file_stem=out_file_stem, overwrite=overwrite)
    finally:
        os.chdir(old_cwd)

    return

In [8]:
def swift_download_compressed_tar(tid: str, tname: str, dtype: str, out_file_stem: pathlib.Path, overwrite: bool) -> None:
    """
        Downloads a .tar archive of Swift data from swift.ac.uk

        Parameters
        ----------
        tid : string
            The target ID to be downloaded, e.g. '00020405'
        tname: string
            The name of the target, only used to build the Referer header and in messages
        dtype: string
            The type of data being downloaded, e.g. 'uvot'
        out_file_stem: pathlib.Path
            Path of the output file without its extension; '.tar' is appended
        overwrite: bool
            Whether or not to overwrite the file if it already exists

        Nothing is written if the server answers with an error status.
    """

    # append the extension instead of using with_suffix(), which would replace
    # everything after a '.' that happens to be in the stem
    out_file = out_file_stem.with_name(out_file_stem.name + '.tar')
    if out_file.exists() and overwrite is False:
        print(f"Found {str(out_file)} and overwriting was forbidden, skipping download.")
        return

    # build our urls and params to send the server
    swift_referer_base_url = 'https://www.swift.ac.uk/archive/prepdata.php'
    swift_download_portal_base_url = 'https://www.swift.ac.uk/archive/download.tar'

    referer_url = f"{swift_referer_base_url}?tid={tid}&source=obs&name={tname}&referer=portal"
    params = {
        'reproc': '1',
        'tid': tid,
        'source': 'obs',
        'subdir': dtype,
    }

    # lie to the server: send the headers a browser coming from the portal page would send
    request_header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': referer_url,
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Sec-GPC': '1',
    }

    print(f"Attempting to download {tid} of {tname} to {out_file}, please wait ...")
    response = requests.get(swift_download_portal_base_url, params=params, headers=request_header)
    print(f"Requested data from {response.url}, response code {response.status_code} ...")

    # don't save an error page as if it were an archive - a bogus .tar would also make
    # later runs skip this download because the file already exists
    if not response.ok:
        print(f"Download of {tid} failed with response code {response.status_code}, not writing {str(out_file)}.")
        return

    with open(out_file, 'wb') as f:
        f.write(response.content)

    print(f"Wrote {str(out_file)}.")

    return

In [9]:
def swift_download_all_results(search: str, dtype_list: List[str], dest_dir: pathlib.Path, download_type: str, overwrite=False) -> None:
    """
        Searches the dead Swift portal and downloads data for every target found

        Parameters
        ----------
        search : string
            The target name to search for, e.g. 'CometCatalinaOrbit2'
        dtype_list : list of strings
            The types of data to download, e.g. ['auxil', 'log', 'bat'].
            A single string such as 'uvot' is treated as a one-element list.
        dest_dir : pathlib.Path
            Directory to place files
        download_type : string
            One of 'uncompressed', 'tar', or 'zip'
        overwrite : bool
            Whether or not to overwrite files that already exist
    """

    allowed_download_types = ['uncompressed', 'tar', 'zip']
    if download_type not in allowed_download_types:
        print(f"Unsupported download type {download_type}!\nTry one of: {', '.join(allowed_download_types)}.")
        return

    # a bare string would otherwise be looped over one character at a time
    if isinstance(dtype_list, str):
        dtype_list = [dtype_list]

    swift_results = search_swift_dead_portal(search)
    if not swift_results:
        print(f"No targets found for search {search}, nothing to download.")
        return

    for dtype in dtype_list:
        for tid, tname in swift_results:
            print(f"{dtype=}\t{tid=}\t{tname=}")
            if download_type == 'uncompressed':
                swift_download_uncompressed(tid=tid, dtype=dtype, dest_dir=dest_dir, overwrite=overwrite)
            else:
                # the only other allowed download types are the archive types 'tar' and 'zip'
                swift_download_compressed(tid=tid, tname=tname, dtype=dtype, archive_type=download_type, dest_dir=dest_dir, overwrite=overwrite)

    print("Downloads complete!")

In [11]:
# keep the Swift downloads together in one folder under the user's Downloads directory
download_path = pathlib.Path.home().joinpath('Downloads', 'swift')

swift_download_all_results(
    search='CometCatalinaOrbit2',
    dtype_list=['auxil', 'log', 'bat'],
    dest_dir=download_path,
    download_type='tar',
)

dtype='auxil'	tid='00034470'	tname='CometCatalinaOrbit2'
Found 00034470_auxil.tar and overwriting was forbidden, skipping download.
dtype='auxil'	tid='00034471'	tname='CometCatalinaOrbit2'
Found 00034471_auxil.tar and overwriting was forbidden, skipping download.
dtype='log'	tid='00034470'	tname='CometCatalinaOrbit2'
Found 00034470_log.tar and overwriting was forbidden, skipping download.
dtype='log'	tid='00034471'	tname='CometCatalinaOrbit2'
Found 00034471_log.tar and overwriting was forbidden, skipping download.
dtype='bat'	tid='00034470'	tname='CometCatalinaOrbit2'
Found 00034470_bat.tar and overwriting was forbidden, skipping download.
dtype='bat'	tid='00034471'	tname='CometCatalinaOrbit2'
Found 00034471_bat.tar and overwriting was forbidden, skipping download.
Downloads complete!


In [13]:
# look up every target whose name matches 'catalina'
swift_results = search_swift_dead_portal(search='catalina')

In [14]:
# show the (target id, target name) pairs that the search returned
print(swift_results)

[('00020405', 'CometC/2013US10(Catalina)'), ('00033369', 'CometC/2013US10(Catalina)'), ('00033517', 'CometC/2013US10Catalina'), ('00033518', 'CometC/2013US10Catalina'), ('00033554', 'CometC/2013US10Catalina'), ('00033555', 'CometC/2013US10Catalina'), ('00033757', 'CometC/2013US10Catalina'), ('00033758', 'CometC/2013US10Catalina'), ('00033759', 'CometC/2013US10Catalina'), ('00033760', 'CometC/2013US10Catalina'), ('00033822', 'C/2013US10(Catalina)'), ('00033824', 'C/2013US10(Catalina)'), ('00033826', 'C/2013US10(Catalina)'), ('00033827', 'C/2013US10(Catalina)'), ('00033931', 'C/2013US10(Catalina)'), ('00033932', 'C/2013US10(Catalina)'), ('00033933', 'C/2013US10(Catalina)'), ('00033934', 'C/2013US10(Catalina)'), ('00033935', 'C/2013US10(Catalina)'), ('00033936', 'C/2013US10(Catalina)'), ('00034020', 'C/2013US10(Catalina)'), ('00034021', 'C/2013US10(Catalina)'), ('00034022', 'C/2013US10(Catalina)'), ('00034023', 'C/2013US10(Catalina)'), ('00034024', 'C/2013US10(Catalina)'), ('00034025', 'C

In [20]:
def one_result_guy(result=None):
    """
        Placeholder: ignores `result` and always returns True

        The default is None rather than the notebook variable page_html_one_result,
        because a default value is evaluated when the def statement runs and the
        variable does not exist yet when the notebook is run top to bottom.
    """
    # TODO: decide what this should check about `result`
    return True

In [18]:
# build the search url for a name that appears to match a single target
base_search_url = 'https://www.swift.ac.uk/dead_portal/getobject.php'
search_url = f"{base_search_url}?name=CometCatalinaOrbit1&submit=Search+Names"

# fetch the page and keep the whole response so its final url can be inspected
page_html_one_result = requests.get(search_url)

In [22]:
# show the url the response actually came from, to compare it with the search url requested
print(page_html_one_result.url)

https://www.swift.ac.uk/archive/prepdata.php?tid=34469&source=obs&name=CometCatalinaOrbit1&referer=portal&cone=1


In [28]:
# split the response's url into its components (scheme, host, path, query, ...)
o = urlparse(page_html_one_result.url)

In [29]:
# parse the query string into a dict mapping each parameter name to a list of its values
query = parse_qs(o.query)

In [30]:
# show the parsed query parameters
print(query)

{'tid': ['34469'], 'source': ['obs'], 'name': ['CometCatalinaOrbit1'], 'referer': ['portal'], 'cone': ['1']}
