# Interactive Environment for Grabbing Tadpoles Images

I spent a long time trying to automate the whole thing, but time was running out. This is designed to be a semi-interactive environment: you log in and navigate to the appropriate months, and the script does the rest.

In [None]:
import datetime
import hashlib
import json
import logging
import os
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from webdriver_manager.chrome import ChromeDriverManager

# Give the root logger a default handler so the `logging.*` calls made while
# processing reports are actually emitted.
logging.basicConfig()

In [None]:
# Tadpoles credentials. Prefer exporting TADPOLES_USERNAME / TADPOLES_PASSWORD
# in the environment so real credentials never get saved inside the notebook;
# the placeholders are only used when those variables are not set.
TADPOLES_USERNAME = os.environ.get('TADPOLES_USERNAME', 'YOUR_TADPOLES_USERNAME')
TADPOLES_PASSWORD = os.environ.get('TADPOLES_PASSWORD', 'YOUR_TADPOLES_PASSWORD')

# Page the login flow starts from.
TADPOLES_URL = 'https://www.tadpoles.com/home_or_work'

# Directory the browser downloads photos/videos into.
DOWNLOAD_DIR = os.path.join(os.getcwd(), 'intermediate')
# JSON-lines log with one entry per downloaded photo.
LOGFILE = os.path.join(DOWNLOAD_DIR, 'log.json')
# Directory reserved for the final output.
OUTPUT_DIR = os.path.join(os.getcwd(), 'final')
# Seconds to pause after closing a daily report before opening the next one.
DAILY_REPORT_DELAY = 2


In [None]:
def md5(fname):
    "see: https://stackoverflow.com/a/3431838/57626"
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

In [None]:
# Make sure the download directory exists before Chrome is pointed at it and
# before anything tries to list it or write the log file into it.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
# Download straight into DOWNLOAD_DIR without prompting, and allow the page to
# trigger several downloads in a row.
prefs = {
    "download.default_directory": DOWNLOAD_DIR,
    "download.prompt_for_download": False,
    "profile.default_content_settings.popups": 0,
    "profile.content_settings.exceptions.automatic_downloads.*.setting": 1
}

options.add_experimental_option("prefs", prefs)
# `chrome_options=` is the deprecated spelling of this keyword; `options=` is
# the supported one.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

driver.get('https://www.tadpoles.com/')

In [None]:
def login(driver, url: str, username: str, password: str) -> None:
    """Walk through the Tadpoles login flow as a parent.

    Starting from the Tadpoles landing page no longer seems to work, so the
    browser is first sent to ``url`` (expected to be
    https://tadpoles.com/home_or_work). From there the parent option and the
    Tadpoles sign-in are chosen, the credentials are typed in, and the form is
    submitted. A progress line is printed after every step.

    args:
        driver: the webdriver instance to drive
        url: page to start the login flow from
        username: e-mail address of the account
        password: password of the account
    """
    driver.get(url)

    # Each step: (xpath to wait for, what to do with the element, text to type,
    # progress message printed afterwards).
    steps = (
        ('//div[@data-bind="click: chooseParent"]', 'click', None,
         "clicked on parent button"),
        ("//img[@data-bind='click:chooseTadpoles']", 'click', None,
         "clicked on tadpoles button"),
        ("//input[@placeholder='your email address']", 'type', username,
         "entered email address"),
        ("//input[@type='password']", 'type', password,
         "entered password"),
        ("//button[@type='submit']", 'click', None,
         "submitted login attempt"),
    )

    for xpath, action, text, message in steps:
        target = wait_by_xpath(driver, xpath)
        if action == 'type':
            target.send_keys(text)
        else:
            target.click()
        print(message)

In [None]:
def click_header_element(driver, title: str='dailyreports') -> None:
    """Simple helper function to click on the "Daily Reports" tab.

    You may not actually need this or you might, it depends on the mood
    of Tadpoles on the day. I wish I was making that up.

    args:
        driver: the webdriver instance to use
        title: name passed to ``changeSelection`` in the header link's
            ``data-bind`` attribute; the default selects the daily reports tab
    """
    # The header links are only identifiable by their knockout click binding.
    xpath = f"//a[@data-bind=\"click: function(){{changeSelection('{title}')}}\"]"
    header = driver.find_element_by_xpath(xpath)
    header.click()

In [None]:
def wait_by_xpath(driver, xpath: str, max_delay: int=10) -> WebElement:
    """Wait for an element matching ``xpath`` to be present and return it.

    In a PWA you're often left waiting for elements to show up before moving
    onto the next step in the script. The wait is built on
    ``presence_of_element_located``, so it is satisfied as soon as the element
    is located in the page.

    args:
        driver: the webdriver instance to poll
        xpath: XPath expression identifying the element
        max_delay: timeout, in seconds, handed to ``WebDriverWait``
    """
    locator = (By.XPATH, xpath)
    waiter = WebDriverWait(driver, max_delay)
    element_is_present = EC.presence_of_element_located(locator)
    return waiter.until(element_is_present)

In [None]:
def wait_for_complete_download(filename: str,
                               ext: str='.crdownload',
                               timeout: float=None,
                               poll_interval: float=1) -> None:
    """Wait for a file to be completely downloaded by selenium before continuing.

    This probably isn't the best way to handle this task as it polls the file
    system with `time.sleep` between checks. In the case of Selenium with
    Chrome, the download is complete once the `.crdownload` file no longer
    exists and the final file exists.

    This function is a little specific to Chrome because obviously Firefox
    and other WebDrivers don't have temporary files with the extension
    .crdownload.

    args:
        filename: full path of the file the download will end up at
        ext: suffix the browser appends to `filename` while downloading
        timeout: maximum number of seconds to wait; `None` (the default)
            waits indefinitely
        poll_interval: seconds to sleep between two checks

    raises:
        TimeoutError: if `timeout` is given and the download has not
            completed in time
    """
    partial = filename + ext
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        print("waiting for file {filename}".format(filename=filename))
        # Done only when the final file is there AND the partial one is gone.
        if os.path.isfile(filename) and not os.path.isfile(partial):
            return
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                "download of {filename} did not complete within {timeout} seconds".format(
                    filename=filename, timeout=timeout))
        time.sleep(poll_interval)

def save_image_element(driver, element: WebElement, filename: str) -> None:
    """Ask the browser to download the resource an element points at.

    For some odd reason there isn't a way to save images from Selenium. Thus, even
    though the actual image can be seen in the media tab, it can't be saved. Instead,
    this creates a detached link element in the page that, when clicked, saves the
    resource to the browser's download folder under `filename`.

    The URL is taken from the first of the `href`, `src` and `rel` attributes
    that the element has.

    see:
    - https://stackoverflow.com/q/15018372/57626
    - https://stackoverflow.com/q/31472754/57626

    args:
        driver: the webdriver instance whose page runs the download script
        element: the element pointing at the resource to save
        filename: name to give the downloaded file

    raises:
        ValueError: if the element has none of the supported attributes
    """
    href = None
    for attr_name in ('href', 'src', 'rel'):
        href = element.get_attribute(attr_name)
        if href is not None:
            break
    if href is None:
        raise ValueError('No idea how to process this tag')

    # Pass the URL and file name as script arguments instead of splicing them
    # into the JavaScript source: a quote or backslash in either value would
    # otherwise break (or inject into) the script.
    script = """var link = document.createElement('a');
    link.setAttribute('href', arguments[0]);
    link.setAttribute('download', arguments[1]);
    link.click();
    """
    driver.execute_script(script, href, filename)

def process_daily_report(driver, download_dir: str=DOWNLOAD_DIR) -> None:
    """Save the photos, videos and HTML of the daily report that is on screen.

    This function assumes that the daily report is already visible on the screen and that it has
    reached a quiescent state so it won't change anymore.

    Photos and videos are saved through the browser and are waited for in `DOWNLOAD_DIR`
    (the browser's download directory is assumed to be `DOWNLOAD_DIR`). For every photo one JSON
    line with its date, file name, description and MD5 digest is appended to `LOGFILE`. Videos
    are downloaded but not logged.

    args:
        driver: the webdriver instance showing the daily report
        download_dir: directory the `<date> - report.html` copy of the report is written to
    """
    # Both directories are written to below; create them if this is the first run.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    os.makedirs(download_dir, exist_ok=True)

    date_str = driver.find_element_by_xpath("//div[@class='modal-header']/h3/span[2]").text
    # The header only carries a calendar date; it is pinned to noon at UTC-5 to get an
    # unambiguous timezone-aware timestamp.
    date = datetime.datetime.strptime(date_str + ' 12:00 -0500',
                                      '%b %d, %Y %H:%M %z')
    file_date = date.strftime("%Y%m%d")
    logged_date = date.astimezone().replace(microsecond=0).isoformat()

    images = []
    snapshots = driver.find_elements_by_xpath(
        "//ul[@data-bind='foreach: snapshots']//a[@type='image']")
    for snap in snapshots:
        # a missing title attribute is treated like an empty one
        description = snap.get_attribute('title') or ''
        if description.startswith(' - '):
            description = description[3:]

        if not description:
            # best effort: fall back to the text of the neighbouring table cell
            try:
                description = snap.find_element_by_xpath(
                    './/ancestor::td/following-sibling::td').text.strip()
            except WebDriverException:
                description = ''
        images.append({"element": snap, "description": description})

    activities = driver.find_elements_by_xpath(
        "//li/div[@data-bind=\"template: {name: 'view/dailyReport/activity'}\"]//a[@type='image']")

    for act in activities:
        # best effort: activities are described by the preceding table cell
        try:
            description = act.find_element_by_xpath('.//ancestor::td/preceding-sibling::td').text
        except WebDriverException:
            description = ''
        images.append({"element": act, "description": description})

    with open(LOGFILE, "a", encoding="utf-8") as logfile:
        for num, image in enumerate(images, start=1):
            outfile = f'{file_date} - Photo {num}.jpg'
            outpath = os.path.join(DOWNLOAD_DIR, outfile)

            save_image_element(driver, image["element"], outfile)
            wait_for_complete_download(outpath)
            logging.warning("not setting image parameters for %s", outfile)
            data_block = json.dumps({
                "date": logged_date,
                "outfile": outfile,
                "description": image["description"],
                "md5": md5(outpath)
            }) + "\n"
            logfile.write(data_block)
            # flush per photo so the log survives the script dying mid-report
            logfile.flush()

            # TODO: set the image date/description and add support for image cropping
            # and GPS metadata

    video_elements = driver.find_elements_by_xpath(
        "//ul[@data-bind='foreach: snapshots']//div[@class='play-icon']"
        "/following-sibling::div[1]")
    for num, elem in enumerate(video_elements, start=1):
        outfile = f'{file_date} - Video {num}.mp4'
        save_image_element(driver, elem, outfile)
        wait_for_complete_download(os.path.join(DOWNLOAD_DIR, outfile))

    # get the HTML of the element for later parsing with LXML
    modal = driver.find_element_by_id("dr-modal-printable")
    html = driver.execute_script("return arguments[0].outerHTML;", modal)
    report_path = os.path.join(download_dir, f'{file_date} - report.html')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(html)

In [None]:
def process_all_daily_reports(driver,
                              year: int=None,
                              download_dir: str=DOWNLOAD_DIR) -> None:
    """Open and save every daily report of the month timeline currently shown.

    Days for which `download_dir` already held a file starting with the day's
    `YYYYMMDD` prefix when this function was called are skipped, so an
    interrupted run can be resumed.

    args:
        driver: the webdriver instance showing the month timeline
        year: year the displayed month belongs to. When omitted it is scraped from the page,
            which is very fragile; passing it in is strongly preferred.
        download_dir: directory checked for already-processed days and handed to
            `process_daily_report`
    """
    date_header_xpath = ("//div[@class='hidden modal-open']"
                         "/div[@id='dr-modal-printable']/div[@class='modal-header']/h3/span[2]")

    os.makedirs(download_dir, exist_ok=True)
    downloaded_files = os.listdir(download_dir)
    tiles = driver.find_elements_by_xpath(
        "//div[@class='well left-panel pull-left']/ul[@class='thumbnails']/li")

    # The tiles are in reverse chronological order. This crawls them in chronological order to
    # make it easier if the script dies somewhere to pick it up later.
    for tile in reversed(tiles):
        # get the tile date to see if we've already processed it
        tile_day_month = tile.find_element_by_xpath(".//div/h2").text

        # this is a _very_ fragile way to get the year. You're much better off
        # passing in the year. It seems like the only identifier to show the year
        # is the background color of the element. OUCH.
        if year is None:
            year = int(driver.find_element_by_xpath(
                "//div[@style='background-color: rgb(250, 167, 50);']"
                "//span[@data-bind='text: caption']").text)

        tile_date = datetime.datetime.strptime(f'{tile_day_month}/{year}', '%m/%d/%Y')

        # format two strings:
        #   - `file_date_str` is used for the date in saved files
        #   - `expected_date_text` is used for the date that shows that the report has loaded
        file_date_str = tile_date.strftime('%Y%m%d')
        # Built by hand instead of with '%-d': the un-padded day directive is a
        # platform-specific strftime extension and is rejected on Windows.
        expected_date_text = f'{tile_date:%b} {tile_date.day}, {tile_date:%Y}'

        print(f"Processing report for {expected_date_text}")
        # see if there are already files in the directory with that starting name
        if any(name.startswith(file_date_str) for name in downloaded_files):
            print(f"already processed date {file_date_str}")
            continue

        tile.click()

        # wait up to 10 seconds for the daily report to become present
        # we can discard the unused return value
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, date_header_xpath))
        )

        # wait up to 10 seconds for the actual date to be filled in
        # we can discard the unused return value
        WebDriverWait(driver, 10).until(
            EC.text_to_be_present_in_element((By.XPATH, date_header_xpath),
                                             expected_date_text)
        )

        # process the daily report
        process_daily_report(driver, download_dir=download_dir)

        # close the daily report
        modal_exit = driver.find_element_by_xpath(
            "//div[@class='modal-footer']/button[@data-bind='click: $parent.close']")
        modal_exit.click()
        time.sleep(DAILY_REPORT_DELAY)

In [None]:
def get_month_tile_dict(driver) -> dict:
    """Map ``(month_number, year)`` to the month tile element showing it.

    The tiles are read from the second ``thumbnails`` list on the page; the
    text of each tile is expected to be a three-letter month name and a year
    separated by a newline. Entries keep the order the tiles appear in.

    args:
        driver: the webdriver instance to query
    """
    abbreviations = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                     'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    month_number = {name: position
                    for position, name in enumerate(abbreviations, start=1)}

    thumbnail_lists = driver.find_elements_by_xpath("//ul[@class='thumbnails']")
    tiles = thumbnail_lists[1].find_elements_by_xpath(".//div[@class='tile pointable']")

    tiles_by_month = {}
    for tile in tiles:
        month_name, year_text = tile.text.split('\n')
        tiles_by_month[month_number[month_name.lower()], int(year_text)] = tile
    return tiles_by_month
        
def process_all_months(driver, mindate=datetime.date(2000, 1, 1)) -> None:
    """Iterate over all the months to get all the images and reports.

    The month tiles are visited in the reverse of the order they appear on the
    page. A month is skipped when its first day falls before `mindate`.

    args:
        driver: the webdriver instance to use for further calls
        mindate: months whose first day is earlier than this date are skipped
    """
    # TODO: this should be a constant...
    timeline_list_xpath = "//div[@class='well left-panel pull-left']/ul[@class='thumbnails']/li"

    month_tile_dict = get_month_tile_dict(driver=driver)
    for (month, year), tile in reversed(list(month_tile_dict.items())):
        if datetime.date(year, month, 1) < mindate:
            continue
        tile.click()

        # wait up to 10 seconds for the timeline to be present
        # we can discard the unused return value
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, timeline_list_xpath))
        )

        time.sleep(5)
        # The month tile already tells us the year, so hand it over instead of
        # letting the callee scrape it from the page by background colour.
        process_all_daily_reports(driver=driver, year=year)

In [None]:
#
# Log in to Tadpoles: opens TADPOLES_URL and submits TADPOLES_USERNAME /
# TADPOLES_PASSWORD through the login form
#
login(driver, TADPOLES_URL, TADPOLES_USERNAME, TADPOLES_PASSWORD)

In [None]:
#
# Click on the header tab for daily reports (the helper's default title is
# 'dailyreports')
#
click_header_element(driver)

In [None]:
#
# If you have a daily report up, this will download all the images for that individual report
#
# process_daily_report(driver)


In [None]:
#
# this will download all the reports for a given month
#
# process_all_daily_reports(driver)

In [None]:
#
# This will download the reports of every month whose first day is on or
# after the date specified as mindate (a month is skipped when its first day
# falls before mindate; without mindate the default is 2000-01-01).
#
# In most cases, this is the cell that will execute most of the work for
# the project.
#
# Example restricted to recent months:
# process_all_months(driver, mindate=datetime.date(2018, 7, 30))
process_all_months(driver)