# Pipulate Functions: Stuff you can do from a Google Sheet

In [None]:
"""Functions for conducting SEO and other data investigations."""

In [None]:
import re, sys, os, shelve
import requests
from collections import namedtuple
from html.parser import HTMLParser
from urllib.parse import urlparse, quote, quote_plus, urljoin
import notebook_finder
import goodsheet
import private

## Global values shared by the functions in this file

In [None]:
# Every Pipulate function that sends a value to a cell wraps it in this tuple.
Response = namedtuple('Response', ['ok', 'status_code', 'text'])
# Module-wide proxy state, rebound through `global` by the proxy helpers.
proxy = None
proxies = None

## Pipulate funcs return Response(ok=True, status_code='200', text='string')

In [None]:
def foo(**kwargs):
    """Minimal template to copy and paste when writing a new Pipulate function.

    Any function in this file may be used as a column name in Google Sheets;
    the text it returns is what gets written into the spreadsheet.
    A Pipulate function accepts **kwargs unless a decorator sits above it.
    Decorators and support functions keep the individual functions short.
    At the very least, return the three-field Response tuple shown here."""
    template_result = Response(ok=True, status_code='200', text='bar')
    return template_result

In [None]:
def url(passed_in_func):
    """Decorator that hands a Pipulate function the pre-fetched HTML of a row.

    Placing @url above a Pipulate function lets it take a single ``html``
    argument instead of **kwargs. The wrapper reads ``row_dict['response']``
    and forwards its ``.text`` to the decorated function as ``html``.
    This suits screen-scraping functions such as Title.
    This function must appear in this file before anything decorated by it.

    Args:
        passed_in_func: callable accepting one keyword argument, ``html``.

    Returns:
        A wrapper accepting **row_dict that returns whatever
        ``passed_in_func`` returns.

    Raises:
        KeyError: if the row has no 'response' entry.
    """
    from functools import wraps

    # wraps() keeps the decorated function's own __name__ and docstring
    # instead of reporting every decorated function as "requests_wrapper".
    @wraps(passed_in_func)
    def requests_wrapper(**row_dict):
        html = row_dict['response'].text
        return passed_in_func(html=html)
    return requests_wrapper

In [None]:
@url
def Title(html):
    """Canonical example: pull the text of the title tag out of a page.

    Because it is decorated with @url, this function takes ``html`` rather
    than **kwargs; the decorator extracts the pre-fetched HTML from the row
    and passes only that along.
    The same pattern works for any TEXT NODE in the HTML (not attributes).
    Pipulate functions are case insensitive, so this could also be 'title'."""
    title_response = extract_text_node(html=html, tag='title')
    return title_response

In [None]:
def realurl(**row_dict):
    """Return the actual URL retrieved after resolving all redirects.

    Args:
        **row_dict: row values; 'url' is the address to resolve.

    Returns:
        Response with text set to the final URL on success. If 'url' is
        missing or empty, a '400' Response; if the request itself fails,
        a '500' Response whose text is the error message.
    """
    target = row_dict.get('url')
    if not target:
        return Response(ok=False, status_code='400', text='No input')
    try:
        response = requests.get(target)
    except requests.exceptions.RequestException as error:
        # When the request raises there is no response object to inspect,
        # so report the failure itself rather than touching `response`.
        return Response(ok=False, status_code='500', text=str(error))
    return Response(ok=True, status_code='200', text=response.url)

In [None]:
def apexdomain(**row_dict):
    """Usually return the apex (registered) domain of a URL.

    Heuristic: when the second-to-last host label is shorter than four
    characters (as in "co.uk") the last three labels are kept, otherwise
    the last two. This is a guess, not a public-suffix lookup.

    Args:
        **row_dict: row values; 'url' is the address to inspect.

    Returns:
        Response whose text is the guessed domain, "Can't find" when the URL
        has no usable host name, or 'No input' (status '400') when 'url' is
        missing or empty.
    """
    path = row_dict.get('url')
    if not path:
        return Response(ok=True, status_code='400', text='No input')
    # hostname is None for inputs without a scheme/netloc (e.g. "example.com").
    hostname = urlparse(path).hostname
    labels = hostname.split(".") if hostname else []
    if len(labels) < 2:
        return Response(ok=True, status_code='200', text="Can't find")
    keep = 3 if len(labels[-2]) < 4 else 2
    return Response(ok=True, status_code='200', text=".".join(labels[-keep:]))

In [None]:
def gaorganic(**kwargs):
    """Look up the ga:organicSearches metric for the path of a URL.

    Args:
        **kwargs: row values. 'url' supplies the page path; 'ids',
            'startdate' and 'enddate' are passed straight to the Analytics
            request.

    Returns:
        Response whose text is the second column of the first returned row
        (the metric value), 'Not found' when no rows come back, or
        'Did not execute' when the request raises.
    """
    path = urlparse(kwargs['url']).path
    # Commas and semicolons are the OR / AND operators of the filter syntax,
    # so literal ones inside the path must be backslash-escaped.
    escaped = path.replace(",", "\\,").replace(";", "\\;")
    page_filter = "ga:pagePath==%s" % escaped
    service = goodsheet.create_google_service(api_name="analytics", version="v3")
    ga_request = service.data().ga().get(
        ids=kwargs['ids'],
        start_date=kwargs['startdate'],
        end_date=kwargs['enddate'],
        metrics='ga:organicSearches',
        dimensions='ga:pagePath',
        filters=page_filter,
        start_index='1',
        max_results='100'
    )
    try:
        ga_response = ga_request.execute()
    except Exception:
        return Response(ok=False, status_code='500', text="Did not execute")
    rows = ga_response.get('rows') if ga_response else None
    if rows:
        return Response(ok=True, status_code='200', text=rows[0][1])
    return Response(ok=False, status_code='200', text='Not found')

## Work in Progress

In [None]:
def verbatim100(**kwargs):
    """Fetch and cache the 100-result verbatim search page for a quoted title.

    The query URL is built with buildquery() so that it matches the key
    serpinspector() and extracthit() later look up in the 'verbatim100' shelf.

    Args:
        **kwargs: row values; 'title' is the phrase to search for.

    Returns:
        Response with text "cached" when the page is in the cache afterwards;
        Response(ok=False, status_code='500', text=None) when the title is
        missing or the fetch did not succeed.
    """
    title = kwargs.get('title')
    if not title:
        return Response(ok=False, status_code='500', text=None)
    search = buildquery(title, rpp=100, quote=True)
    try:
        response = url_cacher(search, 'verbatim100')
    except Exception:
        response = None
    # url_cacher signals failure by returning False rather than raising,
    # so the result has to be checked explicitly.
    if not response:
        print("Error retreiving URL.")
        return Response(ok=False, status_code='500', text=None)
    return Response(ok=True, status_code='200', text="cached")

In [None]:
def serpinspector(**kwargs):
    """Return the set of host names linked from a cached search-result page.

    Only the 'verbatim100' shelf is consulted; nothing is fetched here.

    Args:
        **kwargs: row values; 'title' is the phrase whose cached results
            should be inspected.

    Returns:
        Response whose text is a set of host names found in result links,
        "Not Found" (status '400') when the query is not cached, or 'None'
        (status '500') when no title was supplied.
    """
    if 'title' not in kwargs:
        return Response(ok=True, status_code='500', text='None')
    search = buildquery(kwargs['title'], rpp=100, quote=True)
    # Raw string: "\?" in a normal string literal is an invalid escape.
    pattern = re.compile(r'<a href="/url\?q=(.*?)&')
    with shelve.open('verbatim100') as db:
        if search not in db:
            return Response(ok=True, status_code='400', text="Not Found")
        response = db[search]
        landing_pages = pattern.findall(response.text)
        hosts = {urlparse(page).hostname for page in landing_pages}
        return Response(ok=True, status_code='200', text=hosts)
In [None]:
def comparesets(**kwargs):
    """Intersect the serpinspector host names with the monitored host names.

    Args:
        **kwargs: row values; 'serpinspector' is either a set of host names
            or its string form as stored in a spreadsheet cell.

    Returns:
        Response whose text is the non-empty intersection (a set), or the
        string "{}" when nothing overlaps. When 'serpinspector' is absent the
        Response has status '500' and text '{}'.
    """
    import ast

    if 'serpinspector' not in kwargs:
        return Response(ok=True, status_code='500', text='{}')
    a_set = kwargs['serpinspector']
    if isinstance(a_set, str):
        # NOTE(security): this used to be eval() on cell text, which executes
        # arbitrary code typed into the sheet. literal_eval only accepts
        # literals; anything else (e.g. "Not Found") counts as "no hosts".
        try:
            a_set = ast.literal_eval(a_set)
        except (ValueError, SyntaxError):
            a_set = set()
    # A cell holding "{}" parses as a dict; treat non-collections as empty.
    if not isinstance(a_set, (set, frozenset, list, tuple)):
        a_set = set()
    check_list = {urlparse(x).hostname for x in private.monitor_list()}
    intersection = set(a_set) & check_list
    if intersection:
        return Response(ok=True, status_code='200', text=intersection)
    return Response(ok=True, status_code='200', text="{}")

In [None]:
def buildquery(title, rpp=100, quote=False):
    """Compose the search URL for a title.

    Args:
        title: phrase to search for.
        rpp: value placed in the ``num`` query parameter.
        quote: when true, wrap the phrase in double quotes before encoding.

    Returns:
        The full query URL as a string.
    """
    phrase = '"%s"' % title if quote else title
    return '%s?num=%s&tbs=li%%3A1&q=%s' % (
        'https://www.google.com/search', rpp, quote_plus(phrase))

In [None]:
def extracthit(**kwargs):
    """Return the cached result links that contain any monitored host name.

    Args:
        **kwargs: row values. 'title' selects the cached search page;
            'comparesets' is the collection of host names to look for,
            either as a set or as its string form from a spreadsheet cell.

    Returns:
        Response whose text is the set of matching result URLs (possibly
        empty). 'None' (status '500') when no title is supplied and
        "Not Found" (status '400') when the search page is not cached.
    """
    import ast

    if 'title' not in kwargs:
        return Response(ok=True, status_code='500', text='None')
    search = buildquery(kwargs['title'], rpp=100, quote=True)
    response = cached_url(search, 'verbatim100')
    if response is None:
        # cached_url returns None on a cache miss.
        return Response(ok=True, status_code='400', text="Not Found")
    landing_pages = re.findall(r'<a href="/url\?q=(.*?)&', response.text)
    # The value actually consumed is 'comparesets', so that is the key that
    # must be tested for (the old code tested for 'serpinspector').
    a_set = kwargs.get('comparesets', ())
    if isinstance(a_set, str):
        # NOTE(security): literal_eval replaces eval() on cell text, which
        # would execute arbitrary code typed into the sheet.
        try:
            a_set = ast.literal_eval(a_set)
        except (ValueError, SyntaxError):
            a_set = ()
    if not isinstance(a_set, (set, frozenset, list, tuple)):
        a_set = ()
    hits = set()
    for spot_me in a_set:
        if not isinstance(spot_me, str):
            continue  # e.g. a None host name
        for spot_in in landing_pages:
            if spot_me in spot_in:
                hits.add(spot_in)
    return Response(ok=True, status_code='200', text=hits)

In [None]:
def cached_url(url, dbname):
    """Look up a URL in a shelve database without fetching anything.

    Args:
        url: key to look up.
        dbname: file name passed to shelve.open.

    Returns:
        The stored object for ``url``, or None when the key is absent.
    """
    with shelve.open(dbname) as cache:
        return cache.get(url)

In [None]:
def url_cacher(url, dbname):
    """Return the response for a URL, fetching and caching it if necessary.

    Args:
        url: address to fetch; also the key in the shelf.
        dbname: file name passed to shelve.open.

    Returns:
        The cached or freshly fetched response object on success. On failure
        (request error or non-ok response) the function sleeps for five
        minutes and returns False; it does not retry by itself, so the
        caller decides whether to call again.
    """
    from time import sleep
    with shelve.open(dbname) as db:
        if url in db:
            # Cache hit: nothing to write back.
            return db[url]
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            response = None
        if response is not None and response.ok:
            db[url] = response
            return response
    # The shelf is closed before the back-off so the database file is not
    # held open for the whole sleep.
    print("Sleeping 5 minutes before retrying.")
    sleep(300)
    return False

## Populate() funcs return lists of lists [['A1','B1','C1'],['A2','B2','C2']]

In [None]:
def populate_from_gsc(site, start, end):
    """Fetch query/page rows from Search Console as a list of lists.

    Up to four pages of 5000 rows are requested; paging stops early at the
    first response that has no 'rows'.

    Args:
        site: value passed as ``siteUrl``.
        start: start date; converted with str() for the request.
        end: end date; converted with str() for the request.

    Returns:
        A header row followed by one
        [keyword, url, position, impressions, clicks, ctr] row per result,
        sorted by clicks and then impressions, both descending.
    """
    service = goodsheet.create_google_service(api_name="webmasters", version="v3")
    units = 5000
    limit = 4
    total = units * limit
    raw_rows = []
    for its, aset in enumerate(range(0, total, units)):
        sofar = (its + 1) * units
        print("Fetching %s of %s from Search Console..." % (sofar, total))
        request = {
            'startDate': str(start),
            'endDate': str(end),
            'dimensions': ['query', 'page'],
            'rowLimit': str(units),
            'startRow': str(aset)
        }
        mydata = service.searchanalytics().query(siteUrl=site, body=request).execute()
        if not (mydata and 'rows' in mydata):
            break
        # extend() keeps this linear; sum(list_of_lists, []) is quadratic.
        raw_rows.extend(mydata['rows'])
    list_of_rows = []
    print("Transforming dicts to lists", end="")
    for index, item in enumerate(raw_rows):
        if index % 1000 == 0:
            print('.', end='')
        list_of_rows.append([item['keys'][0], item['keys'][1], item['position'],
                             item['impressions'], item['clicks'], item['ctr']])
    # Sort once after all rows are collected; the sort is stable, so the
    # result equals re-sorting after every append, without the repeated cost.
    list_of_rows.sort(key=lambda row: (-row[4], -row[3]))
    columns = [['keyword', 'url', 'position', 'impressions', 'clicks', 'ctr']]
    print("Returning a big list!")
    return columns + list_of_rows

In [None]:
def populate_from_ga(prepath, profileid, start, end):
    """Fetch pages with organic searches from Analytics as a list of lists.

    Experiment at https://ga-dev-tools.appspot.com/query-explorer/

    Args:
        prepath: string prepended to every returned page path.
        profileid: value passed as ``ids``.
        start: value passed as ``start_date``.
        end: value passed as ``end_date``.

    Returns:
        A header row followed by [url, organicsearches, sessions, bounces]
        rows, or [['ERROR']] when the request fails or returns no rows.
    """
    # The factory lives in the goodsheet module; the bare name is not
    # defined in this file.
    service = goodsheet.create_google_service(api_name="analytics", version="v3")
    ga_request = service.data().ga().get(
        ids=profileid,
        start_date=start,
        end_date=end,
        metrics='ga:organicSearches,ga:sessions,ga:bounces',
        dimensions='ga:pagePath',
        sort='-ga:organicSearches,-ga:sessions',
        filters='ga:organicSearches>0',
        samplingLevel='HIGHER_PRECISION',
        start_index='1',
        max_results='10000'
    )
    try:
        ga_response = ga_request.execute()
    except Exception:
        return [['ERROR']]
    if not ga_response or 'rows' not in ga_response:
        return [['ERROR']]
    header = [['url', 'organicsearches', 'sessions', 'bounces']]
    body = [[prepath + page, searches, sessions, bounces]
            for page, searches, sessions, bounces in ga_response['rows']]
    return header + body

In [None]:
def one_page_crawl(url, regex=None):
    """Return the same-site links found on a page, optionally filtered.

    Args:
        url: page to fetch; relative links are resolved against it.
        regex: optional pattern; when given, only links matching it
            (case-insensitively) are kept.

    Returns:
        [['URL'], [link], [link], ...] with duplicates removed and links in
        sorted order, or [['ERROR']] when the page cannot be fetched or no
        link qualifies.
    """
    from bs4 import BeautifulSoup
    try:
        html = requests.get(url).text
    except requests.exceptions.RequestException:
        return [['ERROR']]
    parts = urlparse(url)
    lookfor = "%s://%s/" % (parts.scheme, parts.hostname)
    matcher = re.compile(regex, flags=re.I) if regex else None
    nodupes = set()
    for link in BeautifulSoup(html, "lxml").find_all("a"):
        url_fragment = link.get("href")
        # Anchors without an href would otherwise resolve to the page's own
        # URL and be reported as a link.
        if not url_fragment or not isinstance(url_fragment, str):
            continue
        href = urljoin(url, url_fragment)
        # A prefix test keeps only links on this site; a substring test also
        # accepts foreign URLs that merely mention the site in their query.
        if len(href) <= len(lookfor) or not href.startswith(lookfor):
            continue
        if matcher is None or matcher.search(href):
            nodupes.add(href)
    if not nodupes:
        return [['ERROR']]
    return [['URL']] + [[href] for href in sorted(nodupes)]

In [None]:
def populate_from_atom(sheet_name, tab_name, feed_url):
    """Return Atom feed entries whose links are not already in a worksheet.

    Args:
        sheet_name: name of the spreadsheet to open.
        tab_name: name of the worksheet inside it.
        feed_url: address of the Atom feed.

    Returns:
        A list of [title, link, updated] rows for entries not yet present.
        [['ERROR']] when the worksheet cannot be opened, None when the feed
        cannot be fetched.
    """
    try:
        sheet_tab = goodsheet.oauth().open(sheet_name).worksheet(tab_name)
        rows = sheet_tab.row_count
        cols = sheet_tab.col_count
        end_range = sheet_tab.get_addr_int(rows, cols)
    except:
        return [['ERROR']]
    try:
        feed = requests.get(feed_url).text
    except:
        return None
    # Only the last 100 rows are checked once the sheet grows beyond that;
    # otherwise checking starts at row 2.
    first_row = rows - 100 if rows > 100 else 2
    start_range = sheet_tab.get_addr_int(first_row, 1)
    cells = sheet_tab.range('%s:%s' % (start_range, end_range))
    # Every third cell starting with the second one -- presumably the link
    # column of a three-column sheet; verify against the sheet layout.
    known_links = [cell.value for cell in cells][1::3]
    from bs4 import BeautifulSoup
    fresh_entries = []
    for entry in BeautifulSoup(feed, "lxml").find_all("entry"):
        row = [entry.find("title").text,
               entry.find("link").attrs['href'],
               entry.find("updated").text]
        if row[1] not in known_links:
            fresh_entries.append(row)
    return fresh_entries

## Support Functions that are not called from sheets or workflows

In [None]:
def extract_text_node(html, tag):
    """Extract the text of the first ``tag`` element found in ``html``.

    This is for simple text-node screen scraping, such as for the Title tag.
    Unlike most support functions it returns the same Response() named tuple
    as spreadsheet-facing functions, so that those can stay short:

        @url
        def Title(html):
            return extract_text_node(html=html, tag="title")

    Matching is case-insensitive and tolerates attributes on the opening tag
    (``<TITLE>`` and ``<title id="x">`` both match).

    Args:
        html: markup to search.
        tag: element name, e.g. "title".

    Returns:
        Response with the stripped text of the first match; text is None
        when nothing matches. A '400' Response when either argument is empty.
    """
    if not html or not tag:
        return Response(ok=False, status_code='400', text=None)
    # \b stops "a" from matching "<abbr>"; [^>]* allows attributes.
    pattern = r'<{0}\b[^>]*>(.*?)</{0}\s*>'.format(re.escape(tag.lower()))
    match = re.search(pattern, html, flags=re.DOTALL | re.IGNORECASE)
    if match:
        return Response(ok=True, status_code='200', text=match.group(1).strip())
    return Response(ok=True, status_code='200', text=None)

In [None]:
class MLStripper(HTMLParser):
    """HTML parser that collects only the text between tags.

    http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    """
    def __init__(self):
        # Let the base class initialise all of its own state instead of
        # relying on reset() alone.
        super().__init__(convert_charrefs=True)
        self.fed = []

    def handle_data(self, d):
        """Remember one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all markup removed and character references decoded."""
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still buffering; without it a
    # trailing fragment such as "AT&T" is silently dropped.
    s.close()
    return s.get_data()


def normalize_whitespace(string):
    """Collapse every run of whitespace (including newlines) to one space."""
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    # \s already covers \r and \n.
    return re.sub(r'\s+', ' ', string)

In [None]:
class common_date_boundaries():
    """Create a selection of dates that can be used as arguments to Analytics or Search Console.

    All values are stored on the instance as strings. ``day`` is the
    reference day, two days before ``today``; the week, month and rolling
    ranges are derived from it.
    """
    def __init__(self, today=None):
        """Compute the boundaries.

        Args:
            today: optional datetime.date to treat as "today"; defaults to
                the current date. Mainly useful for reproducible runs.
        """
        from datetime import date, timedelta
        shift = 2
        if today is None:
            today = date.today()
        day = today - timedelta(days=shift)
        # The ISO week number belongs to the ISO year, which differs from the
        # calendar year around New Year (2021-01-01 is week 53 of 2020).
        # Week boundaries must therefore be anchored on the ISO year.
        iso_year, week = day.isocalendar()[:2]
        year = day.year
        first_day_this_month = day.replace(day=1)
        jan_first = date(iso_year, 1, 1)
        # Monday of ISO week 1: the week that contains the first Thursday.
        if jan_first.weekday() > 3:
            first_week_of_year = jan_first + timedelta(7 - jan_first.weekday())
        else:
            first_week_of_year = jan_first - timedelta(jan_first.weekday())
        prior_weeks_of_year = timedelta(days=(week - 1) * 7)
        # Shift the Monday-based ISO week back one day (Sunday to Saturday).
        week_start = first_week_of_year + prior_weeks_of_year - timedelta(days=1)
        week_end = week_start + timedelta(days=6)
        # "Month" is the last full month before the reference day's month.
        month_end = first_day_this_month - timedelta(days=1)
        month_start = month_end.replace(day=1)
        month_days = (month_end - month_start).days + 1
        start_30_days = day - timedelta(days=30)
        end_30_days = day
        start_90_days = day - timedelta(days=90) - timedelta(days=shift - 1)
        end_90_days = day
        values = {
            'month_days': month_days, 'year': year, 'week': week,
            'first_week_of_year': first_week_of_year,
            'today': today, 'day': day,
            'first_day_this_month': first_day_this_month,
            'month_start': month_start, 'month_end': month_end,
            'week_start': week_start, 'week_end': week_end,
            'start_30_days': start_30_days, 'end_30_days': end_30_days,
            'start_90_days': start_90_days, 'end_90_days': end_90_days,
        }
        # setattr replaces the former exec()-built assignments.
        for name, value in values.items():
            setattr(self, name, str(value))

## Test Functions

In [None]:
def topsites():
    """Fetch the Alexa top US sites and resolve each one's redirects.

    Returns:
        A list of tuples: a header row followed by one
        (original_url, redirect_chain, status_code, resolved_url, '', '')
        tuple per site, where redirect_chain is the list of status codes in
        the response history. The last two fields are left empty.
    """
    print("Fetching the Alexa Top 25 US sites...")
    site = 'http://www.alexa.com/topsites/countries/US'
    response = requests.get(site)
    # [^"]+ stops at the closing quote of the link; the previous greedy
    # (.*)?" ran on to the last quote on the line.
    unresolved_urls = re.findall(r'/siteinfo/([^"]+)"', response.text)
    responses = [requests.get('http://' + x) for x in unresolved_urls]
    flat = [(original, [hop.status_code for hop in resp.history],
             resp.status_code, resp.url, '', '')
            for original, resp in zip(unresolved_urls, responses)]
    returnme = [('original_url', 'redirect_chain', 'status_code',
                 'resolved_url', 'proxy_cache', 'cached_title')] + flat
    return returnme

In [None]:
def endless_proxies():
    """Yield one proxy from the shared generator, rebuilding it when it runs dry.

    The module-level ``proxies`` generator is created on first use and
    re-created once when it is exhausted. Each generator returned by this
    function yields exactly one proxy.
    """
    global proxies
    print('. ', end="")
    if not proxies:
        print("(Fetching and shuffling web proxies...) ", end="")
        proxies = proxy_generator(31)
    exhausted = object()
    picked = next(proxies, exhausted)
    if picked is exhausted:
        print("Ran out of good proxies. Re-creating generator.")
        proxies = proxy_generator(31)
        picked = next(proxies)
    yield picked

def proxy_generator(num_pages):
    """Yield working proxies scraped by hma_scraper, in random order.

    Each scraped proxy is tried with a one-second test request; only those
    that answer without raising are yielded.

    Args:
        num_pages: passed to hma_scraper.scrape_hma.

    Yields:
        Proxy(good, scheme, address, seconds) named tuples, where seconds is
        the elapsed time of the test request.
    """
    import hma_scraper
    from random import shuffle
    from collections import namedtuple
    from urllib.parse import urlparse
    import requests
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    Proxy = namedtuple('Proxy', 'good scheme address seconds')
    proxy_list = hma_scraper.scrape_hma(num_pages)
    shuffle(proxy_list)
    for a_proxy in proxy_list:
        url_parts = urlparse(a_proxy)
        # NOTE(review): requests picks a proxy by the scheme of the target
        # URL, so a proxy keyed under "http" may not be exercised by this
        # https test request -- confirm against the scraped proxy schemes.
        requests_arg = {url_parts.scheme: a_proxy}
        try:
            result = requests.get('https://www.google.com/search?q=test',
                                  proxies=requests_arg, verify=False, timeout=1)
        except Exception:
            # Best effort: a dead or misbehaving proxy is simply skipped.
            continue
        # The yield stays outside the try block so that closing the generator
        # (GeneratorExit) or Ctrl-C is never swallowed by the handler above.
        yield Proxy(True, url_parts.scheme, a_proxy, result.elapsed.total_seconds())

def proxy_cache(**kwargs):
    """Fetch a URL through a working proxy and report which proxy was used.

    The module-level ``proxy`` is reused while it keeps working. When it
    fails, up to 100 replacement proxies are tried in turn.

    Args:
        **kwargs: row values; 'resolved_url' is the address to request.

    Returns:
        Response whose text is the address of the proxy that served the
        request, or a failed Response (ok=False, status '503') when none of
        the attempted proxies worked.
    """
    resolved_url = kwargs['resolved_url']
    import requests
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    global proxy

    def fetched_through(candidate):
        """Return True when the URL can be requested through ``candidate``."""
        try:
            requests.get(resolved_url,
                         proxies={candidate.scheme: candidate.address},
                         timeout=5)
        except Exception:
            # Best effort: any failure just means "try another proxy".
            return False
        return True

    if proxy and fetched_through(proxy):
        return Response(ok=True, status_code='200', text=proxy.address)
    for _ in range(100):
        proxy = next(endless_proxies())
        if fetched_through(proxy):
            return Response(ok=True, status_code='200', text=proxy.address)
    # Previously this message lived in an unreachable try/else branch, so
    # exhausting every attempt still reported success with an untested proxy.
    print("It seems no proxies are working.")
    proxy = None
    return Response(ok=False, status_code='503', text='No working proxy')