# Poll [Terafarm Delivery Update] published sheet and store actual delivery times for each location

In [1]:
from urllib.request import urlopen
from pprint import pprint
import sched, time
from datetime import datetime
import itertools

from bs4 import BeautifulSoup
import pandas as pd
import html2text

# Published page that is downloaded on every poll.
url = (
    'https://docs.google.com/document/d/e/'
    '2PACX-1vQ8jVXdAQmXfYJ4YIz6-NQtdPEjyn53oLggW5EphnpDTf3qabI4G3DwqYRc-TpNlq8Itfxgd5W8rWcC'
    '/pub'
)

# Zone headings looked for in the page text; the first entry also marks where
# the delivery information starts.
delivery_zones = [
    'NORTH PENINSULA',
    'SOUTH PENINSULA',
    'EAST BAY',
    'MONTEREY PENINSULA',
]

# String that marks the end of the delivery information in the page text.
termination_tag = 'Published by'

# Status value that is skipped when changes are logged.
waiting_status_flag = 'Waiting:'

# Delay, in seconds, between two polls of the page.
poll_interval_secs = 30


def get_arrival_status(url):
    """Fetch the published delivery page and return its status table.

    The page text between the first zone heading (``delivery_zones[0]``) and
    ``termination_tag`` is read as a sequence of zone headings and
    (status, location) string pairs.

    Args:
        url: Address of the published page to download.

    Returns:
        A ``pandas.DataFrame`` with columns ``['zone', 'location', 'status']``,
        one row per (status, location) pair found in the page. The frame is
        empty when the first zone heading does not occur in the page text.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as resource:
        raw = resource.read()
        # The response may not declare a charset; fall back to UTF-8 instead
        # of passing None to decode().
        charset = resource.headers.get_content_charset() or 'utf-8'
    content = raw.decode(charset)

    def extract_text(html):
        """Return the visible page strings that lie in the delivery section."""
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup(['script', 'style']):
            tag.decompose()
        # clean up characters BeautifulSoup didn't catch
        strings = (text.replace('\xa0', '') for text in soup.stripped_strings)
        # drop the head: everything before the first zone heading
        section = itertools.dropwhile(lambda text: text != delivery_zones[0], strings)
        # drop the tail: the termination tag and everything after it
        section = itertools.takewhile(lambda text: text != termination_tag, section)
        # lone ':' strings appear once delivery started and ETAs are posted
        return [text for text in section if text != ':']

    # Flatten the text so each row is (zone, location, status). In the page
    # text a status string comes first and its location string follows it.
    zone = None
    rows = []
    remaining = iter(extract_text(content))
    for status in remaining:
        if status in delivery_zones:
            zone = status
            continue
        location = next(remaining, None)
        if location is None:
            # A trailing status without a location cannot form a row; stop
            # here rather than fail with an IndexError.
            break
        rows.append((zone, location, status))
    return pd.DataFrame(rows, columns=['zone', 'location', 'status'])

In [3]:
import os

filename = f"deliveries/{datetime.now().date()}.csv"
# Polling stops rescheduling itself once the local hour reaches this value.
stop_polling_hour = 13

# The file is opened for appending below, which does not create directories.
os.makedirs(os.path.dirname(filename), exist_ok=True)

old_arrival_status = get_arrival_status(url)
s = sched.scheduler(time.time, time.sleep)


def _format_row(values):
    """Return one output line: the row values followed by the current time."""
    return ', '.join([str(value) for value in values] + [str(datetime.now()) + '\n'])


with open(filename, "a+") as deliveries:
    # The file is appended to, so a re-run on the same day must not write a
    # second header: only write it when the file is still empty.
    deliveries.seek(0, 2)  # 2 == SEEK_END
    if deliveries.tell() == 0:
        deliveries.write('zone,location, status,datetime\n')

    # store snapshot of initial stations with waiting times
    for _, row in old_arrival_status.iterrows():
        deliveries.write(_format_row(row))
    deliveries.flush()

    # main scheduler function, it schedules itself before it ends
    def do_something(sc):
        """Poll the page once, log rows that are new, then reschedule."""
        global old_arrival_status

        try:
            arrival_status = get_arrival_status(url)
        except OSError as error:
            # urllib's URLError derives from OSError; a failed fetch should
            # not end polling for the rest of the run.
            print(f"poll failed, retrying at next interval: {error}")
        else:
            # Only rows that were absent from the previous poll are changes.
            # Rows that merely disappeared or were replaced are not logged
            # again with a fresh timestamp.
            previous_rows = {tuple(row) for _, row in old_arrival_status.iterrows()}
            wrote_any = False
            for _, row in arrival_status.iterrows():
                if tuple(row) in previous_rows:
                    continue
                if row.status == waiting_status_flag:
                    continue
                new_line = _format_row(row)
                deliveries.write(new_line)
                print(new_line)
                wrote_any = True
            if wrote_any:
                deliveries.flush()
            old_arrival_status = arrival_status

        if datetime.now().hour < stop_polling_hour:
            sc.enter(poll_interval_secs, 1, do_something, (sc,))

    s.enter(poll_interval_secs, 1, do_something, (s,))
    s.run()