Imports

In [None]:
%%capture
%load_ext autoreload
%autoreload 2

In [None]:
from   utilities import load_config, get_browser
import itertools
import more_itertools
from   more_itertools import unique_everseen as uniq
import re
from   itertools import chain
from   numpy.random import randn as gauss_noise
import time

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException, NoSuchWindowException, ElementClickInterceptedException, WebDriverException

# Get links to scrape from Google

In [None]:
# Import the Google API client builder and load the configuration file.
#  The configuration is indexed below by 'api_key' and 'spreadsheet_id'
#  when the Sheets request is built.
from googleapiclient.discovery import build
config = load_config()

In [None]:
# The parsing strategy is brute force:
#  walk the hierarchy of lists/dicts returned by the Sheets API and
#  hand back every leaf value found anywhere inside it
def flatten(data):
    '''
        Collect every leaf value nested inside `data`

        A dict contributes its values, a list contributes its items, and
        anything else is treated as a leaf.
          - For a leaf, the return is the one-element list [data].
          - For a dict or list, the return is an iterator over all of the
            leaves found below it, in traversal order.
    '''
    # Leaves end the recursion
    if not isinstance(data, (dict, list)):
        return [data]

    children = list(data.values()) if isinstance(data, dict) else data

    # Flatten each child first, then stitch the pieces together
    flattened_children = []
    for child in children:
        flattened_children.append(flatten(child))

    return chain.from_iterable(flattened_children)

In [None]:
# Use regexes to extract URLs from the sheet
#  The patterns are raw strings so that '\-' and '\.' reach the regex
#  engine untouched instead of being parsed as (invalid) string escapes.
_URL_PATTERN            = re.compile(r'https?://[A-Za-z0-9%\-_@./:]+')
_MEDIUM_PROFILE_PATTERN = re.compile(r'medium\.com/@[^/]*/?$')
_GOOGLE_HOST_PATTERN    = re.compile(r'[/.]google\.com/')

def parse_medium_articles(datum):
    '''
        Extract an article URL from a single spreadsheet value

        Parameters:
          datum - any leaf value taken from the Sheets API response

        Returns the first URL found in `datum`, or None when
          - `datum` is not a string,
          - `datum` contains no http:// or https:// URL,
          - the URL is a bare medium.com profile (medium.com/@name with
            nothing after the name), or
          - the URL points at a google.com host (e.g. Google Docs).

        The URL stops at the first character outside of letters, digits
        and %-_@./: so a query string, if present, is not included.
    '''
    # Things that are not strings are not URLs
    if not isinstance(datum, str):
        return None

    # A URL needs a scheme and at least one character after it, so text
    #  that merely contains the letters 'http' is not mistaken for a link
    match = _URL_PATTERN.search(datum)
    if match is None:
        return None
    url = match.group(0)

    # Bad URLs are those with a name followed by nothing
    #  This is the case if someone links to their medium profile and
    #  not literal articles
    #   Future to do: fix this
    if _MEDIUM_PROFILE_PATTERN.search(url):
        return None

    # Throw out URLs that are a reference to Google Docs
    #  (google.com itself or any of its subdomains)
    if _GOOGLE_HOST_PATTERN.search(url):
        return None

    return url
    

In [None]:
# Call the Google Sheets API (v4); the response is expected to be a dict
#  - authenticate with the API key taken from the config file
#  - look the spreadsheet up by the id taken from the config file
#  - includeGridData = True is passed so that, per the Sheets API docs, the
#    grid (cell) data is returned along with the spreadsheet itself
#  - execute() is what actually sends the request
service        = build('sheets', 'v4', developerKey=config['api_key'])
spreadsheet_id = config['spreadsheet_id']
request        = service.spreadsheets().get(spreadsheetId=spreadsheet_id, includeGridData = True)
response       = request.execute()

In [None]:
# Run every leaf value of the response through the URL parser, drop the
#  values that did not yield a URL, and keep the first occurrence of each URL
links_detected = list(uniq(
    url
    for url in map(parse_medium_articles, flatten(response))
    if url
))

In [None]:
from pathlib import Path

In [None]:
# Load the URLs recorded by earlier runs (one per line).
#  When the record file does not exist yet, nothing has been clapped.
if Path('clapped_urls.txt').exists():
    with open('clapped_urls.txt', 'r') as f:
        clapped_urls = [line.strip() for line in f.readlines()]
else:
    clapped_urls = []

In [None]:
# Keep only the links that have not been clapped yet.
#  The detection order is preserved (and duplicates dropped) so that the
#  URLs are visited in the same order on every run; going through a set
#  would make the order arbitrary.
_already_clapped = set(clapped_urls)
to_clap = [url for url in dict.fromkeys(links_detected) if url not in _already_clapped]

## Do the actual clapping

In [None]:
def make_xpath(child = None):
    '''
        Build the XPath used to locate parts of the clapping widget

        child selects what the expression points at:
          - 'svg'       : a button that directly contains an svg of width 33
          - 'button'    : a button below the container div; the container
                          must hold a button whose text contains 'clap'
          - 'animation' : a div styled with an animation below the
                          container div
          - otherwise   : the container div itself
        The container div is the one with class "n o" that has a button
        somewhere below it.
    '''
    # The svg lookup does not go through the container div at all
    if child == 'svg':
        return '''//button[./*[name()='svg' and @width=33]]'''

    wants_button = child == 'button'

    # What to select underneath the container div
    if wants_button:
        suffix = '//button'
    elif child == 'animation':
        suffix = '''//div[contains(@style, 'animation')]'''
    else:
        suffix = ''

    # Only the 'button' lookup insists on the button text
    button_predicate = './/button'
    if wants_button:
        button_predicate += '''[contains(text(), 'clap')]'''

    return '//div[(@class = "n o") and ' + button_predicate + ']' + suffix

In [None]:
def get_xpath_with_waiting(browser, time_out = 2, poll_frequency = 0.5, child = None):
    '''
        Wait for the element described by make_xpath(child) and return it

        The browser is polled every poll_frequency seconds for up to
        time_out seconds. The wait only requires the element to be present.
        The caller is expected to handle the TimeoutException raised when
        the element does not show up in time.
    '''
    waiter   = WebDriverWait(browser, time_out, poll_frequency = poll_frequency)
    locator  = (By.XPATH, make_xpath(child = child))
    is_there = EC.presence_of_element_located(locator)
    return waiter.until(is_there)

In [None]:
def clap_for_url(browser, url, target_time = 3):
    '''
        Clicks on the 'clap' button on a given URL
        
        Attempts to find the clap button and clap on it. Attempts to 
        verify the success of the clapping and count the number already given.
          - On success, proceed to clap up to the maximum of 50 times.
            The wait time between claps is randomized to emulate human behavior
            with a goal of the total approximate time for a full 50 clicks being the
            target_time.
          - On failure, attempt to provide a diagnosis to the user. Then return to 
            the main flow which will record the results and proceed to the next URL.

        Parameters:
          browser     - the selenium browser used to load and click the page
          url         - the article to clap for
          target_time - approximate number of seconds a full 50 claps should take

        Returns True when the URL needs no further attention (clapped, already
        maximally clapped, or recognized as the user's own article) and False
        when the clapping could not be carried out.

        Timeouts and intercepted first clicks are handled here. Other selenium
        errors (for example a closed window or a stale element) are left to
        the caller.
    '''
    max_claps = 50

    browser.get(url)
    
    # Try to find the button for clapping. Wait up to 2 seconds after the page
    #  loads for the contents of the page to load. If that fails, click on the
    #  svg button and look for the clap button again.
    try:
        clapper = get_xpath_with_waiting(browser, child = 'button')
    except TimeoutException:
        try:
            clapper = get_xpath_with_waiting(browser, child = 'svg')
            clapper.click()
            clapper = get_xpath_with_waiting(browser, time_out = 5, child = 'button')
        except TimeoutException:
            # Report and move on; do not drop into the debugger, which would
            #  stall the whole run on a single bad page
            print(f'Unable to find the clapping button on {url}.\n'
                   '  Possible issues:\n'
                   '  (1) Medium changed the internal structure of the page\n'
                   '  (2) Nobody has clapped on this article before\n'
                   '  (3) Page load was too slow.'
                 )
            return False
    
    if clapper.text.find('claps') != -1:
        print(f'This url {url} appears to be your own article. So it will not be clicked on.')
        return True
    
    # Click on it once. If something is covering the button, wait a second
    #  and retry, giving up after max_fails attempts.
    max_fails = 10
    for _ in range(max_fails):
        try:
            clapper.click()
        except ElementClickInterceptedException:
            time.sleep(1)
        else:
            break
    else:
        print(f'Something stopped/intercepted the click on {url}')
        return False
    
    # On success, the number of clicks already given will be displayed in an animation.
    #  If so, collect that information
    #  On failure to collect the information, assume that clicking has failed
    try:
        elt = WebDriverWait(browser, 1, poll_frequency = 0.1).until(
                 EC.presence_of_element_located(
                     (By.XPATH, make_xpath(child = 'animation'))
                 )
               )
    except TimeoutException:
        print(f'Something went wrong after clicking on the clapping button in {url}\n'
               '  Possible issues:\n'
               '  (1) You are not logged in\n'
               '  (2) This is your own article. You cannot clap for yourself.'
                    ' After all, what is the sound of one hand clapping?\n'
             )
        return False

    # The animation text is expected to be a plain number. Anything else is
    #  treated as a failure rather than crashing the whole run.
    count_text = elt.text
    try:
        num_clicks_already = int(count_text)
    except ValueError:
        print(f'Could not read the number of claps already given on {url} '
              f'(found {count_text!r} instead of a number).')
        return False
    
    # The wait time is normalized to emulate human behavior
    #  This is totally unneccesary but fun
    #  It is clamped at zero because time.sleep rejects negative values
    def wait_time():
        mu    =  target_time / max_claps
        sigma =  mu / 5
        return max(0.0, mu + gauss_noise() * sigma)
    
    clicks_to_do = max_claps - num_clicks_already
    if clicks_to_do <= 0:
        print(f'This article is already maximally clapped! At url {url}')
        return True
    
    for _ in range(clicks_to_do):
        clapper.click()
        time.sleep(wait_time())
                
    print(f'Succesfully clicked on {url}')
    return True

In [None]:
def clap_for_urls(urls):
    '''
        Clap for every URL in urls, recording the successes

        Opens a browser and visits the URLs one at a time. Each URL that
        clap_for_url reports as successful is appended to clapped_urls.txt.
        After a failure the program pauses for a few seconds before moving on.
        Closing the browser window terminates the run early.
        Once every URL has been visited, medium.com is loaded and left open
        for ten seconds before the browser is released.
    '''
    with get_browser() as browser, open('clapped_urls.txt', 'a') as success_file:
        # Without a browser there is nothing to do
        if not browser:
            print('Program Terminating')
            return

        for url in urls:
            # Clapping errors are dealt with inside clap_for_url. What is
            #  caught here is the user closing the browser (taken as a wish
            #  to stop) and an element going stale mid-run.
            try:
                clapped = clap_for_url(browser, url)
            except NoSuchWindowException:
                print('It looks like you closed the window. Program will terminate')
                return
            except StaleElementReferenceException:
                print(f'This is an unanticipated error on url {url}.\n'
                      'In the Rumsfeld classfication it '
                      'is a known unknown. Please report this error to the authorities. '
                      'Proceeding to the next url')
                clapped = False

            # Failure: give the user a chance to close the window
            if not clapped:
                pause_seconds = 5
                print(f'Program will wait {pause_seconds} seconds before proceeding to the next URL.'
                       ' Close the medium.com window at any time to terminate.\n')
                time.sleep(pause_seconds)
                continue

            # Success: remember the URL so it is skipped on the next run
            success_file.write(f'{url}\n')
            time.sleep(1)

        browser.get('https://medium.com')
        time.sleep(10)
    print('All Done!')